JOIN 2 Tables get Unique values from 2 columns - tsql

I have a (i think) complicated problem, and have no idea how to do that in SQL (the whole day). I have turned the logic around a couple of times, and always something is missing.
There is a join between 2 tables that hold different FK references to a 3rd table.
How to join those 2 tables, so i am sure that all FK combinations are presented, and all are unique?
I need to have the 2 FK columns in one, so i can later join to 3rd. nulls are possible. Group by not possible, since i need to know where the record is from (need Id_1 and Id_2 in the result)
here the sample code:
DECLARE #T1 TABLE (Id int, CommonId int, FK_Id_1 int)
DECLARE #T2 TABLE (Id int,CommonId int, FK_Id_2 int)
INSERT INTO #T1 VALUES (1,1,1)
INSERT INTO #T1 VALUES (2,1,2)
INSERT INTO #T1 VALUES (3,2,3)
INSERT INTO #T1 VALUES (4,3,NULL)
INSERT INTO #T1 VALUES (5,4,NULL)
INSERT INTO #T2 VALUES (11,1,1)
INSERT INTO #T2 VALUES (12,2,2)
INSERT INTO #T2 VALUES (13,2,3)
INSERT INTO #T2 VALUES (14,4,5)
SELECT t1.Id as Id_1,t2.Id as Id_2, t1.CommonId, t1.FK_Id_1, t2.FK_Id_2,
COUNT(t1.FK_Id_1) OVER (PARTITION BY t1.FK_Id_1) AS T1_RANK,
COUNT(t2.FK_Id_2) OVER (PARTITION BY t2.FK_Id_2)AS T2_RANK
FROM #T1 t1
FULL JOIN #T2 t2 on t1.CommonId = t2.CommonId
ORDER BY CommonId
This query is returning this:
Id_1 Id_2 CommonId FK_Id_1 FK_Id_2 T1_RANK T2_RANK
----------- ----------- ----------- ----------- ----------- ----------- -----------
1 11 1 1 1 1 2
2 11 1 2 1 1 2
3 12 2 3 2 2 1
3 13 2 3 3 2 1
4 NULL 3 NULL NULL 0 0
5 14 4 NULL 5 0 1
and i need somehow to make it look like this:
Id_1 Id_2 CommonId FK_Id
----------- ----------- ----------- -----------
1 11 1 1
2 11 1 2
3 12 2 2
3 13 2 3
4 NULL 3 NULL
5 14 4 5
I did something like SELECT COALESCE(FK_Id_1,FK_Id_2) AS FK_Id but this is always selecting T1 with priority. I am thinking of some way to switch the priority depending of the duplicated values.
i have a ugly solution that looks like this, but am looking for a better ideas.
;WITH tmp as (
SELECT t1.Id as Id_1,t2.Id as Id_2, t1.CommonId, t1.FK_Id_1, t2.FK_Id_2,
COUNT(t1.FK_Id_1) OVER (PARTITION BY t1.FK_Id_1) AS T1_RANK,
COUNT(t2.FK_Id_2) OVER (PARTITION BY t2.FK_Id_2)AS T2_RANK
FROM #T1 t1
FULL JOIN #T2 t2 on t1.CommonId = t2.CommonId)
SELECT Id_1, Id_2, CommonId,
CASE
WHEN T1_RANK > T2_RANK THEN COALESCE(FK_Id_2,FK_Id_1)
WHEN T2_RANK > T1_RANK THEN COALESCE(FK_Id_1,FK_Id_2)
END AS FK_Id
FROM tmp
ORDER BY CommonId
I don't know if i explained the whole situation correctly, i must join the tables, because i have other columns coming only from T1 and T2 ( can not UNION->DISTINCT - this will select also the NULLs)

just pick the CommonId, and then full join to both tables.
the query below matches 100% to your desired result.
;WITH cte AS (
SELECT CommonId FROM #T1
UNION SELECT CommonId FROM #T2
)
SELECT t1.Id AS Id_1, t2.Id AS Id_2, cte.CommonId, ISNULL(t2.FK_Id_2, t1.FK_Id_1) AS FK_Id
FROM cte
FULL OUTER JOIN #T1 t1 ON cte.CommonId = t1.CommonId
FULL OUTER JOIN #T2 t2 ON cte.CommonId = t2.CommonId
Beware that that the results of the FK_Id changes with the column order in the ISNULL in the predicate
ISNULL(t2.FK_Id_2, t1.FK_Id_1) is not the same as ISNULL(t1.FK_Id_1, t2.FK_Id_2)
In my opinion this alternative version fits better your request, since it's taking both options of FK.
;WITH cte AS (
SELECT CommonId FROM #T1
UNION SELECT CommonId FROM #T2
)
SELECT t1.Id AS Id_1, t2.Id AS Id_2, cte.CommonId, ISNULL(t2.FK_Id_2, t1.FK_Id_1) AS FK_Id--, cte.CommonId, *
FROM cte
FULL OUTER JOIN #T1 t1 ON cte.CommonId = t1.CommonId
FULL OUTER JOIN #T2 t2 ON cte.CommonId = t2.CommonId
UNION
SELECT t1.Id AS Id_1, t2.Id AS Id_2, cte.CommonId, ISNULL(t1.FK_Id_1, t2.FK_Id_2) AS FK_Id--, cte.CommonId, *
FROM cte
FULL OUTER JOIN #T1 t1 ON cte.CommonId = t1.CommonId
FULL OUTER JOIN #T2 t2 ON cte.CommonId = t2.CommonId

Related

Delete duplicate rows with different values in columns

I didn't find my case on the Internet. Tell me how i can delete duplicates if the values are in different columns.
I have a table with a lot of values, for example:
|Id1|Id2|
|89417980|89417978|
|89417980|89417979|
|89417978|89417980|
|89417979|89417980|
I need to exclude duplicates and leave in the answer only:
|Id1|Id2|
|89417980|89417978|
|89417980|89417979|
min/max does not work here, as the values may be different.
I tried to union/join tables on a table/exclude results with temporary tables, but in the end I come to the beginning.
Assuming id1 and id2 are primary keys columns you could try this
DECLARE #tbl table (id1 int, id2 int )
INSERT INTO #tbl
SELECT 89417980, 89417978
UNION SELECT 89417980, 89417979
UNION SELECT 89417978, 89417980
UNION SELECT 89417979, 89417980
SELECT * FROM #tbl
;WITH CTE AS (--Get comparable value as "cs"
SELECT
IIF(id1 > id2, CHECKSUM(id1, id2), CHECKSUM(id2,id1)) as cs
, id1
, id2
, ROW_NUMBER() OVER (order by id1, id2) as rn
FROM #tbl
)
, CTE2 AS ( --Get rows to keep
SELECT MAX (rn) as rn
FROM CTE
GROUP BY cs
HAVING COUNT(*) > 1
)
DELETE tbl -- Delete all except the rows to keep
FROM #tbl tbl
WHERE NOT EXISTS(SELECT 1
FROM CTE2
JOIN CTE ON CTE.rn = CTE2.rn
WHERE CTE.id1 = tbl.id1
AND CTE.id2 = tbl.id2
)
SELECT * FROM #tbl

TSQL -- quick way to get a count across all tables

If database d1 has tables T1,T2,T3,T4 all with the field "Date1".
What is the best way to get a count of all records across all tables with a date older than 3 days ago?
I know one could do unions, I assume there is no nifty syntax that would omit all tables [like a 'parent' object in C++].
Here best may mean more efficient, or just a pleasing syntax in T-SQL.
This is for SSMS 17.7. Microsoft SQL Server 2014 (SP2)
If you know the table names in advance, a simple query on union all will probably be the simplest way:
SELECT COUNT(*)
FROM
(
SELECT Date1
FROM T1
UNION ALL
SELECT Date1
FROM T2
SELECT Date1
FROM T3
SELECT Date1
FROM T4
) As t
WHERE Date1 <= DATEADD(DAY, -3, GETDATE())
If you don't know the table names in advance, you can use information_schema.columns to build the union query dynamically.
Well, you're interested in a parent object, that would be a view, then. You can reuse it for a variety of queries. Alternatively, add more columns if you need them:
CREATE VIEW parent AS
SELECT Date1 FROM t1 UNION ALL
SELECT Date1 FROM t2 UNION ALL
SELECT Date1 FROM t3 UNION ALL
SELECT Date1 FROM t4;
And now, that can be queried in the way you want
SELECT COUNT(*) FROM parent WHERE Date1 <= DATEADD(DAY, -3, GETDATE())
Without UNION?
Since a COUNT without a GROUP BY returns 1 value, why not use CROSS JOIN for once?
SELECT
t1.Cnt AS [T1],
t2.Cnt AS [T2],
t3.Cnt AS [T3],
t4.Cnt AS [T4],
(t1.Cnt + t2.Cnt + t3.Cnt + t4.Cnt) AS [T1234]
FROM
(SELECT COUNT(*) AS Cnt FROM T1 WHERE [Date1] < CAST(GetDate()-3 AS DATE)) AS t1
CROSS JOIN
(SELECT COUNT(*) AS Cnt FROM T2 WHERE [Date1] < CAST(GetDate()-3 AS DATE)) AS t2
CROSS JOIN
(SELECT COUNT(*) AS Cnt FROM T3 WHERE [Date1] < CAST(GetDate()-3 AS DATE)) AS t3
CROSS JOIN
(SELECT COUNT(*) AS Cnt FROM T4 WHERE [Date1] < CAST(GetDate()-3 AS DATE)) AS t4
Or a CROSS APPLY
SELECT
t1.Cnt AS [T1],
t2.Cnt AS [T2],
t3.Cnt AS [T3],
t4.Cnt AS [T4],
(t1.Cnt + t2.Cnt + t3.Cnt + t4.Cnt) AS [T1234]
FROM (SELECT CAST(GetDate()-3 AS DATE) as Dt) d
CROSS APPLY (SELECT COUNT(*) AS Cnt FROM T1 WHERE [Date1] < d.Dt) AS t1
CROSS APPLY (SELECT COUNT(*) AS Cnt FROM T2 WHERE [Date1] < d.Dt) AS t2
CROSS APPLY (SELECT COUNT(*) AS Cnt FROM T3 WHERE [Date1] < d.Dt) AS t3
CROSS APPLY (SELECT COUNT(*) AS Cnt FROM T4 WHERE [Date1] < d.Dt) AS t4
Example snippet for Sql Server:
declare #T1 table (id int primary key identity(1,1), [Date1] date);
declare #T2 table (id int primary key identity(1,1), [Date1] date);
declare #T3 table (id int primary key identity(1,1), [Date1] date);
declare #T4 table (id int primary key identity(1,1), [Date1] date);
insert into #T1 ([Date1]) values (getdate()-6),(getdate()-5),(getdate()-4),(getdate()-3),(getdate()-2),(getdate()-1),(getdate()-0);
insert into #T2 ([Date1]) select top 6 [Date1] from #T1 order by [Date1] desc;
insert into #T3 ([Date1]) select top 5 [Date1] from #T1 order by [Date1] desc;
insert into #T4 ([Date1]) select top 4 [Date1] from #T1 order by [Date1] desc;
SELECT
t1.Cnt AS [T1],
t2.Cnt AS [T2],
t3.Cnt AS [T3],
t4.Cnt AS [T4],
(t1.Cnt + t2.Cnt + t3.Cnt + t4.Cnt) AS [T1234]
FROM
(SELECT COUNT(*) AS Cnt FROM #T1 WHERE [Date1] < CAST(GetDate()-3 AS DATE)) AS t1
CROSS JOIN
(SELECT COUNT(*) AS Cnt FROM #T2 WHERE [Date1] < CAST(GetDate()-3 AS DATE)) AS t2
CROSS JOIN
(SELECT COUNT(*) AS Cnt FROM #T3 WHERE [Date1] < CAST(GetDate()-3 AS DATE)) AS t3
CROSS JOIN
(SELECT COUNT(*) AS Cnt FROM #T4 WHERE [Date1] < CAST(GetDate()-3 AS DATE)) AS t4
Returns:
T1 T2 T3 T4 T1234
3 2 1 0 6
Instead create a view, you can use a CTE (Common Table Expression). It works like a view, but not persists on database. Please try it:
WITH CteDate( Date1 )
AS ( SELECT Date1 FROM t1 UNION ALL
SELECT Date1 FROM t2 UNION ALL
SELECT Date1 FROM t3 UNION ALL
SELECT Date1 FROM t4
)
SELECT COUNT(*) FROM CteDate WHERE Date1 <= DATEADD(DAY, -3, GETDATE())
It works for all SQL Server greater or equal then 2005.

SQL GROUP BY HAVING issue

I have two tables of records that I need to find all of the matches. The tables are based on different Primary Key identifiers, but the data points are exactly the same. I need a fast query that can show me records that are duplicated from the first table to the second. Here is an example of what I am trying to do:
DECLARE #Table1 TABLE (ID INT, Value INT)
DECLARE #Table2 TABLE (ID INT, Value INT)
INSERT INTO #Table1 VALUES (1, 500)
INSERT INTO #Table1 VALUES (2, 500)
INSERT INTO #Table2 VALUES (3, 500)
INSERT INTO #Table2 VALUES (4, 500)
SELECT MAX(x.T1ID)
,MAX(x.T2ID)
FROM (
SELECT T1ID = t1.ID
,T2ID = 0
,t1.Value
FROM #Table1 t1
UNION ALL
SELECT T1ID = 0
,T2ID = t2.ID
,t2.Value
FROM #Table2 t2
) x
GROUP BY x.Value
HAVING COUNT(*) >= 2
The problem with this code is that it returns record 2 in table 1 correlated to record 4 in table 2. I really need it to return record 1 in table 1 correlated to record 3 in table 2. I tried the following:
SELECT MIN(x.T1ID)
,MIN(x.T2ID)
FROM (
SELECT T1ID = t1.ID
,T2ID = 0
,t1.Value
FROM #Table1 t1
UNION ALL
SELECT T1ID = 0
,T2ID = t2.ID
,t2.Value
FROM #Table2 t2
) x
GROUP BY x.Value
HAVING COUNT(*) >= 2
This code does not work either. It returns 0,0.
Is there a way to return the MIN value greater than 0 for both tables?
Might answer my own question. This seems to work. Are there any reasons why I would not do this?
SELECT MIN(t1.ID)
,MIN(t2.ID)
FROM #Table1 t1
INNER JOIN #Table2 t2 ON t1.Value = t2.Value
GROUP BY t1.Value
If you want to see the records in table1 that have matches in table2 then
select *
from #Table1 T1
where exists (select * from #Table2 T2
where T1.ID=T2.ID
-- you would put the complete join clause that defines a match here
)

T-SQL: SQL to get mutual records from a single table

This is kind of difficult to describe. I have a table with two fields: Col1, Col2 with the following data:
Col1, Col2
1 10
1 11
1 12
10 1
11 1
13 1
The values in Col1 can act like a foreign key value in Col2. I want to find all the rows for a given value in Col1 where the value in Col2 appears in Col1 but where it's Col2 value is also the given value. So for example, if I am looking for the value of 1 in Col1, the following rows are returned:
Col1, Col2
1 10
1 11
10 1
11 1
The row with Col1 set to 13 would not be returned because it does not appear in Col2 where Col1 is set to 1.
The sql I created works:
Select T1.*
From Table1 T1
Inner Join Table1 T2 On T1.Col2 = T2.Col1
Where (Exists(Select * From Table1 T3 Where (T2.Col1 = T3.Col1) And (T3.Col2 = T1.Col1)))
And (T1.Col1 = 1)
This returns duplicates of rows. I can always add the DISTINCT keyword and this will remove the duplicates. My question is whether my sql is really the correct way of selecting the records and whether it can be done without the DISTINCT keyword.
I think this may do what you want:
SELECT T1.*
FROM Table1 T1
JOIN Table1 T2 ON (T1.Col2 = T2.Col1) AND (T1.Col1 = T2.Col2)
WHERE T1.Col1 = 1
I think you can drop the JOIN and just use EXISTS
SELECT T1.*
FROM Table1 T1
WHERE 1 IN ( T1.Col1, T1.Col2 )
AND ( EXISTS ( SELECT *
FROM Table1 T2
WHERE ( T2.Col1 = T1.Col2
AND T2.Col2 = T1.Col1
) ) )

TSQL difficult join issue

I struggling with a problem I have in TSQL, I need to get the top 10 results for each user from a table that might contain more than 10 results.
My natural (and procedurally minded) approach is "for each user in table T select the top 10 results ordered by date".
Each time I try to formulate the question in my mind in a set based approach, I keep running into the term "foreach".
Is it possible to do something like this:
SELECT *
FROM table AS t1
INNER JOIN (
SELECT TOP 10 *
FROM table AS t2
WHERE t2.id = t1.id
ORDER BY date DESC
)
Or even
SELECT ( SELECT TOP 10 *
FROM table AS t2
WHERE t2.id = t1.id
ORDER BY date )
FROM table AS t1
Or is there another solution to this using temp tables that I should think about?
EDIT:
Just to be perfectly clear - I need to the top 10 results for each user in the table, e.g. 10 * N where N = number of users.
EDIT:
In response to a suggestion made by RBarryYoung, I'm having an issue, which is best demonstrated with code:
CREATE TABLE #temp (id INT, date DATETIME)
INSERT INTO #temp (id, date) VALUES (1, GETDATE())
INSERT INTO #temp (id, date) VALUES (1, GETDATE())
SELECT *
FROM #temp AS t1
CROSS APPLY (
SELECT TOP 1 *
FROM #temp AS t2
WHERE t2.id = t1.id
ORDER BY t2.date DESC
) AS t2
DROP TABLE #temp
Running this, you can see that this doesn't limit the results to the TOP 1... Am I doing something wrong here?
EDIT:
It seems my last example provided a bit of confusion. Here is an example showing what I want to do:
CREATE TABLE #temp (id INT, date DATETIME)
INSERT INTO #temp (id, date) VALUES (1, GETDATE())
INSERT INTO #temp (id, date) VALUES (1, GETDATE())
INSERT INTO #temp (id, date) VALUES (1, GETDATE())
INSERT INTO #temp (id, date) VALUES (2, GETDATE())
SELECT *
FROM #temp AS t1
CROSS APPLY
(
SELECT TOP 2 *
FROM #temp AS t2
WHERE t2.id = t1.id
ORDER BY t2.date DESC
) AS t2
DROP TABLE #temp
This outputs:
1 2009-08-26 09:05:56.570 1 2009-08-26 09:05:56.583
1 2009-08-26 09:05:56.570 1 2009-08-26 09:05:56.583
1 2009-08-26 09:05:56.583 1 2009-08-26 09:05:56.583
1 2009-08-26 09:05:56.583 1 2009-08-26 09:05:56.583
1 2009-08-26 09:05:56.583 1 2009-08-26 09:05:56.583
1 2009-08-26 09:05:56.583 1 2009-08-26 09:05:56.583
2 2009-08-26 09:05:56.583 2 2009-08-26 09:05:56.583
If I use distinct:
SELECT DISTINCT t1.id
FROM #temp AS t1
CROSS APPLY
(
SELECT TOP 2 *
FROM #temp AS t2
WHERE t2.id = t1.id
ORDER BY t2.date DESC
) AS t2
I get
1
2
I need
1
1
2
Does anyone know if this is possible?
EDIT:
The following code will do this
WITH RowTable AS
(
SELECT
id, date, ROW_NUMBER() OVER (PARTITION BY id ORDER BY date DESC) AS RowNum
FROM #temp
)
SELECT *
FROM RowTable
WHERE RowNum <= 2;
I posted in the comments, but there is no code formatting, so it doesn't look very nice.
Yes, there are several differet good ways to do this in 2005 and 2008. The one most similar to what you are already trying is with CROSS APPLY:
SELECT T2.*
FROM (
SELECT DISTINCT ID FROM table
) AS t1
CROSS APPLY (
SELECT TOP 10 *
FROM table AS t2
WHERE t2.id = t1.id
ORDER BY date DESC
) AS t2
ORDER BY T2.id, date DESC
This then returns the ten most recent entries in [table] (or as many as exist, up to 10), for each distinct [id]. Asumming that [id] corresponds to a user, then this should be exactly what you are asking for.
(edit: slight changes because I did not take into account that T1 and T2 were the same tables and thus there will be multiple duplicate t1.IDs matching multiple duplicate T2.ids.)
select userid, foo, row_number() over (partition by userid order by foo) as rownum from table where rownum <= 10
It is possible, however using nested queries will be slower.
The following will also find the results you are looking for:
SELECT TOP 10 *
FROM table as t1
INNER JOIN table as t2
ON t1.id = t2.id
ORDER BY date DESC
I believe this SO question will answer your question. It's not answering exactly the same question, but I think the solution will work for you too.
Here's a trick I use to do this "top-N-per-group" type of query:
SELECT t1.id
FROM table t1 LEFT OUTER JOIN table t2
ON (t1.user_id = t2.user_id AND (t1.date > t2.date
OR t1.date = t2.date AND t1.id > t2.id))
GROUP BY t1.id
HAVING COUNT(*) < 10
ORDER BY t1.user_id, COALESCE(COUNT(*), 0);