Creating clusters of related columns - tsql

I have a table named Stores with columns:
StoreCode NVARCHAR(10),
OldStoreCode NVARCHAR(10)
Here is a sample of my data:
| StoreCode | OldStoreCode |
|-----------|--------------|
| A | B |
| B | A |
| D | E |
| E | F |
| M | K |
| J | K |
| K | L |
|-----------|--------------|
I want to create clusters of related Stores. Related store means there is a one way relation between StoreCodes and OldStoreCodes.
Expected result table:
| StoreCode | ClusterId |
|-----------|-----------|
| A | 1 |
| B | 1 |
| D | 2 |
| E | 2 |
| F | 2 |
| M | 3 |
| K | 3 |
| J | 3 |
| L | 3 |
|-----------|-----------|
There is no maximum number hops. There may be a StoreCode A which has a OldStoreCode B, which has a OldStoreCode C, which has a OldStoreCode D etc.
How can I cluster stores like this?

Try it like this:
EDIT: With changes by OP taken from comment
DECLARE #tbl TABLE(ID INT IDENTITY, StoreCode VARCHAR(100),OldStoreCode VARCHAR(100));
INSERT INTO #tbl VALUES
('A','B'),('B','A'),('D','E'),('E','F'),('M','K'),('J','K'),('K','L');
WITH Related AS
(
SELECT DISTINCT t1.ID,Val
FROM #tbl AS t1
INNER JOIN #tbl AS t2 ON t1.StoreCode=t2.StoreCode
OR t1.OldStoreCode=t2.OldStoreCode
OR t1.OldStoreCode=t2.StoreCode
OR t1.StoreCode=t2.OldStoreCode
CROSS APPLY(SELECT DISTINCT Val
FROM
(VALUES(t1.StoreCode),(t2.StoreCode),(t1.OldStoreCode),(t2.OldStoreCode)) AS A(Val)
) AS valsInCols
)
,ClusterKeys AS
(
SELECT r1.ID
,(
SELECT r2.Val AS [*]
FROM Related AS r2
WHERE r2.ID=r1.ID
ORDER BY r2.Val
FOR XML PATH('')
) AS ClusterKey
FROM Related AS r1
GROUP BY r1.ID
)
,ClusterIds AS
(
SELECT ClusterKey
,MIN(ID) AS ID
FROM ClusterKeys
GROUP BY ClusterKey
)
SELECT r.ID
,r.Val
FROM ClusterIds c
INNER JOIN Related r ON c.ID = r.ID
The result
ID Val
1 A
1 B
3 D
3 E
3 F
5 J
5 K
5 L
5 M

This should do it:
SAMPLE DATA:
IF OBJECT_ID('tempdb..#Temp1') IS NOT NULL
BEGIN
DROP TABLE #Temp1;
END;
CREATE TABLE #Temp1(StoreCode NVARCHAR(10)
, OldStoreCode NVARCHAR(10));
INSERT INTO #Temp1(StoreCode
, OldStoreCode)
VALUES
('A'
, 'B'),
('B'
, 'A'),
('D'
, 'E'),
('E'
, 'F'),
('M'
, 'K'),
('J'
, 'K'),
('K'
, 'L');
QUERY:
;WITH A -- get all distinct new and old storecodes
AS (
SELECT StoreCode
FROM #Temp1
UNION
SELECT OldStoreCode
FROM #Temp1),
B -- give a unique number id to each store code
AS (SELECT rn = RANK() OVER(ORDER BY StoreCode)
, StoreCode
FROM A),
C -- combine the store codes and the unique number id's in one table
AS (SELECT b2.rn AS StoreCodeID
, t.StoreCode
, b1.rn AS OldStoreCodeId
, t.OldStoreCode
FROM #Temp1 AS t
LEFT OUTER JOIN B AS b1 ON t.OldStoreCode = b1.StoreCode
LEFT OUTER JOIN B AS b2 ON t.StoreCode = b2.StoreCode),
D -- assign a row number for each entry in the data set
AS (SELECT rn = RANK() OVER(ORDER BY StoreCode)
, *
FROM C),
E -- derive first and last store in the path
AS (SELECT FirstStore = d2.StoreCode
, LastStore = d1.OldStoreCode
, GroupID = d1.OldStoreCodeId
FROM D AS d1
RIGHT OUTER JOIN D AS d2 ON d1.StoreCodeID = d2.OldStoreCodeId
AND d1.rn - 1 = d2.rn
WHERE d1.OldStoreCode IS NOT NULL) ,
F -- get the stores wich led to the last store with one hop
AS (SELECT C.StoreCode
, E.GroupID
FROM E
INNER JOIN C ON E.LastStore = C.OldStoreCode)
-- combine to get the full grouping
SELECT A.StoreCode, ClusterID = DENSE_RANK() OVER (ORDER BY A.GroupID) FROM (
SELECT C.StoreCode,F.GroupID FROM C INNER JOIN F ON C.OldStoreCode = F.StoreCode
UNION
SELECT * FROM F
UNION
SELECT E.LastStore,E.GroupID FROM E) AS A ORDER BY StoreCode, ClusterID
RESULTS:

Related

Inner join and update the table in one execution DB2

I have two tables, where I would like to update table_2 if the column's value is same and then applying inner join with table1. I would like to do in one execution.
Here I habe table1 and table2, where item_2 of table2 has same value with status = 0. Here I would like to update one of the status with 9.
table1
#|ID| ITEM_1 |Application
-+--+----------+------
1|1| item1 | read
2|2| item1 | write
3|3| item1 | learn
table2
#|ID| ITEM_2 |Description |STATUS
-+--+---------+---------------------
1|10| item1 | des1 | 0
2|11| item1 | des2 | 0
3|12| item1 | des3 | 2
For updating table2, I used lag() function and then inner join with table1.
But here I need to execute two times, first for update and then second for inner join. But I am looking to execute in one time.
update
UPDATE
(
SELECT
T2.*
, lag(ITEM_2, 1, 0) over (order by ITEM_2 ASC) as C2
FROM TABLE_2 T2 where T2.STATUS = 0
)
SET STATUS = 9
WHERE C2 = ITEM_2;
#|ID| ITEM_2 |Description |STATUS
-+--+---------+---------------------
1|10| item1 | des1 | 0
2|11| item1 | des2 | 9
3|12| item1 | des3 | 2
inner join
select T1.ID, T1.ITEM_1, T1.Appliction, T2.ID, T2.ITEM_2, T2.Description, T2.STATUS
from TABLE_1 T1
INNER JOIN TABLE_2 T2 ON T1.ITEM_1 = T2.ITEM_2
where T2.STATUS = 0
ID | ITEM_1 | APPLICTION | ID | ITEM_2 | DESCRIPTION | STATUS
1 | item1 | read | 10 | item1 | des1 | 0
WITH U AS
(SELECT COUNT (1) AS DUMMY FROM NEW TABLE
(UPDATE TABLE_2 A SET STATUS = 9 WHERE EXISTS
(SELECT 1 FROM TABLE_2 B WHERE A.ITEM_2 = B.ITEM_2 AND A.ID > B.ID AND B.STATUS = 0
)))
select T1.ID, T1.ITEM_1, T1.Appliction, T2.ID, T2.ITEM_2, T2.Description, T2.STATUS
from TABLE_1 T1
Inner join TABLE_2 T2 ON T1.ITEM_1 = T2.ITEM_2
where T2.STATUS = 0`
fiddle

Report duplicate data

create table dupt(cat varchar(10), num int)
insert dupt(cat,num) values ('A',1),('A',2),('A',3),
('B',1),('B',2),
('C',1),('C',2), ('C',3),
('D',1),('D',2), ('D',4),
('E',1),('E',2),
('F',1),('F',2)
I need to create a report which finds out duplicate data. From the sample data above, report needs to show that data for cat A is duplicated by cat C (notice the num value and no. of records) and cat B is duplicated by cat E and F. What is the best way to show that?
Example output
-------------
|cat | dupby|
-------------
| A | C |
| B | E, F |
-------------
Updated: switched to traditional set matching using common table expression and the stuff() with select ... for xml path ('') method of string concatenation only on the final results:
;with cte as (
select *
, cnt = count(*) over (partition by cat)
from t
)
, duplicates as (
select
x.cat
, dup_cat = x2.cat
from cte as x
inner join cte as x2
on x.cat < x2.cat
and x.num = x2.num
and x.cnt = x2.cnt
group by x.cat, x2.cat, x.cnt
having count(*) = x.cnt
)
select
d.cat
, dupby = stuff((
select ', '+i.dup_cat
from duplicates i
where i.cat = d.cat
for xml path (''), type).value('.','varchar(8000)')
,1,2,'')
from duplicates d
where not exists (
select 1
from duplicates i
where d.cat = i.dup_cat
)
group by d.cat
rextester demo: http://rextester.com/KHAG98718
returns:
+-----+-------+
| cat | dupby |
+-----+-------+
| A | C |
| B | E, F |
+-----+-------+

Comparing tables and getting non matching values

I'm pretty new to SQL and I can't get this to work I've got these two tables below
Table A Table B
_________________ _________________
| A | 2015-10-4 | B | 2015-11-6
| B | 2015-11-4 | C | 2015-05-4
| C | 2015-05-6 | D | 2015-05-8
| D | 2015-05-7 | C | 2015-05-5
I'm trying to write a stored procedure that will get all letters from table B that has a date less than table A and any letter that doesn't exist in table B.
This is what I have so far
SELECT *
FROM A q JOIN
B c ON q.Letter = c.Letter AND q.Date > c.Date OR c.Letter IS NULL
This returns C but I can't have it return A also. It's confusing to me trying to join and compare tables still.
I do not want duplicate rows, the results I would be expecting would return
| A | 2015-10-4
| C | 2015-05-6
EDIT
I'm running into an issue now where if I have a case like this
Table A Table B
_________________ _________________
| A | 2015-10-4 | B | 2015-11-6
| B | 2015-11-4 | C | 2015-05-4
| C | 2015-05-6 | D | 2015-05-8
| D | 2015-05-7 | C | 2015-05-5
| C | 2015-05-7
It will still return C for some reason. Using a.date > max(b.date) doesn't work because max can't used that way. And I want to assume the max date can be anywhere in the table in table B.
So now my new results would be
| A | 2015-10-4
But I am getting A and C still.
You should use a LEFT JOIN:
SELECT DISTINCT A.letter, A.[Date]
FROM dbo.TableA A
LEFT JOIN dbo.TableB B
ON A.letter = B.letter
WHERE B.[Date] < A.[Date] OR B.letter IS NULL;
UPDATE
You should have explained your requirements as: "get all letters from table B in which every date is lesser than...."
SELECT DISTINCT A.letter, A.[Date]
FROM dbo.TableA A
LEFT JOIN (SELECT letter, MAX([Date]) [Date]
FROM dbo.TableB
GROUP BY letter) B
ON A.letter = B.letter
WHERE B.[Date] < A.[Date] OR B.letter IS NULL;
I would go for a UNION / UNION ALL, so that you get the result subset for the first condition + the ones for the second one.
Something similar to this should do the job:
sqlite> create table A (letter, my_date);
sqlite> create table B (letter, my_date);
sqlite> insert into A values ('A', '2015-10-04');
sqlite> insert into A values ('B', '2015-11-04');
sqlite> insert into A values ('C', '2015-05-06');
sqlite> insert into A values ('D', '2015-05-07');
sqlite> insert into B values ('B', '2015-11-06');
sqlite> insert into B values ('C', '2015-05-04');
sqlite> insert into B values ('D', '2015-05-08');
sqlite> insert into B values ('C', '2015-05-05');
A 2015-10-04
sqlite> select B.* from A, B where A.letter = B.letter and B.my_date < A.my_date UNION ALL select A.* from A where not exists (select 1 from B where B.letter=A.letter);
letter my_date
---------- ----------
C 2015-05-04
C 2015-05-05
A 2015-10-04

joining tables with different length column values

I need to get ID by joining columns of tables with variable length.
Table A has 2 columns ID and PostCode
-----------------
| ID | PostCode |
|----|----------|
| 1 | BR |
|----|----------|
| 2 | WT |
|----|----------|
| 3 | B71 |
|----|----------|
| 4 | BR5 |
|----|----------|
Table B has columns with Name and Full postcode
|------|----------|
| Name | PostCode |
|------|----------|
| Mr X | CR2 5ER |
|------|----------|
| Ms Y | BT2 6ER |
|------|----------|
| XX | B71 4WQ |
|------|----------|
| YY | BR4 8ER |
|------|----------|
| SS | BR5A 5RT |
|------|----------|
I need to get Id's 1 [BR->BR4 8ER], 3 [B71->B71 4WQ] and 4 [BR5->BR5A 5RT]
How do I get to work this?
select A.PostCode, B.PostCode as FullPostCode, B.Name
from A
join B
on substring(B.PostCode,0,len(A.PostCode)) = A.PostCode
Consider the postcode BR29 8LN. If table A has codes B and BR, this postcode will be captured TWICE - not what the OP would want, and not what I wanted.
The below captures everything so long as after the postcode prefix, there is a number thus delimiting the postcode area:
select A.PostCode, B.PostCode as FullPostCode, B.Name
from B
inner join A
on substring(B.PostCode ,0,len(A.PostCode)+1) = A.PostCode
WHERE IsNumeric(substring(B.PostCode ,len(A.PostCode)+1,1)) = 1
This may help.
DECLARE #TableA TABLE (UserID INT,
PostCode VARCHAR(10))
DECLARE #TableB TABLE (Name VARCHAR(10),
PostCode VARCHAR(10))
INSERT INTO #TableA
VALUES
('1', 'BR'),
('2', 'WT'),
('3', 'B71'),
('4', 'BR5')
INSERT INTO #TableB
VALUES
('Mr X', 'CR2 5ER'),
('Ms Y', 'BT2 6ER'),
('XX', 'B71 4WQ'),
('YY', 'BR4 8ER'),
('SS', 'BR5A 5RT');
WITH CTE
AS (
SELECT CAST(UserID AS VARCHAR(10)) AS UserID,
Name,
tb.PostCode,
ta.PostCode AS PostCode2
,
ROW_NUMBER() OVER (PARTITION BY UserID ORDER BY tb.PostCode DESC) AS PcID
FROM #TableA AS ta
JOIN #TableB AS tb
ON ta.PostCode = LEFT(tb.PostCode, LEN(ta.PostCode))
)
, cte2
AS (
SELECT STUFF((SELECT ', ' + c2.UserID + ' [' + c2.PostCode2 + '-' + c2.PostCode + ']'
FROM cte AS c2
WHERE c1.UserID = c2.UserID
AND PcID = 1
FOR XML PATH('')), 1, 2, '') AS PostCodeMatch
FROM cte AS c1
WHERE PcID = 1
)
SELECT DISTINCT STUFF((SELECT ', ' + PostCodeMatch
FROM cte2 AS c2
FOR XML PATH('')), 1, 2, '') AS PostCodeMatch
FROM cte2
You might do something like this:
select A.PostCode, B.PostCode as FullPostCode, B.Name
from A
join B on B.PostCode like A.PostCode + '%'

sql join if value exists in other table then Count it

I have following tables.
Table A
UserID | key 1 | A 2 | B 3 | A 4 | C 5 |
Table B
UserID | Num1 | 501 | 3002 |3 | 1004 | 20
I have query like this
SELECT COUNT(key) AS cnt, key
FROM A
WHERE key <> ''
GROUP BY key
ORDER BY cnt DESC
The results should be something like this
key | cnt A | 2 B | 1 C | 1
What I would like to add is Joining Table B.
If UserID has value in Num in Table B, I would like to count UserID with/Num Grouped by key
Here is desired results
key | cnt | Has Num? A | 2 | 2 B | 1 | 0 C | 1 | 1
I tried to write subquery but I can't attach it to main query. Subquery is something like this.
SELECT COUNT(DISTINCT UserID) AS num
FROM B
LEFT OUTER JOIN A ON B.UserID = A.UserID
WHERE Num <>'' AND key <> ''
GROUP BY key
If I'm understanding this correctly, what you're looking for is a count of the Keys in Table A when they were used by a UserID, and then a count of the number of unique UserIDs in Table B who both appeared in the first Table A query and had a Num.
Try this:
SELECT a.[Key], COUNT(a.[Key]) AS cnt, isNull(SUM(b.bCnt), 0) AS [Has Num?]
FROM #TableA a
LEFT OUTER JOIN (
SELECT b.UserID, 1
FROM #TableB b
WHERE LEN(b.Num) > 0
GROUP BY b.UserID
) b (UserID, bCnt) ON b.UserID = a.UserID
WHERE LEN(a.[Key]) > 0
GROUP BY a.[Key]
This query gives the results that you were expecting.
DECLARE #TableA TABLE(UserID INT, [Key] CHAR(1))
INSERT INTO #TableA VALUES(1,'A'),(2,'B'),(3,'A'),(4,'C'),(5,'')
DECLARE #TableB TABLE(UserID INT, Num INT NULL)
INSERT INTO #TableB VALUES(1,50),(1,300),(2,NULL),(3,100),(4,20)
SELECT x.[Key],x.Cnt,y.[Has Num?]
FROM
( SELECT [Key],Cnt = COUNT([Key])
FROM #TableA
WHERE LEN([Key])>0
GROUP BY [Key]
)X
JOIN
(
SELECT a.[Key],[Has Num?] = COUNT(b.Num)
FROM #TableA a
JOIN #TableB b ON a.UserID = b.UserID
GROUP BY a.[Key]
)Y
ON x.[Key] = Y.[Key]
Key Cnt Has Num?
A 2 3
B 1 0
C 1 1
How about an OUTER APPLY
SELECT [Key], COUNT(a.[Key]) AS cnt, SUM(x.NumCount) AS [Has Num?]
FROM #TableA a
OUTER APPLY (SELECT COUNT(NUM) AS NumCount
FROM #TableB b
WHERE b.UserId = a.UserId AND Num IS NOT NULL
) x
WHERE [Key] <> ''
GROUP BY [Key]
ORDER BY cnt DESC
Result:
Key cnt Has Num?
---- ----------- -----------
A 2 3
B 1 0
C 1 1