How to exclude row not distinct by 2 fields - tsql

I have some code that returns an unexpected result.
-- Distinct (Field_1, Field_2) pairs over the union of two source queries
-- (the inner SELECT lists and FROM targets are elided in the original post).
-- The GROUP BY already collapses duplicate pairs; DISTINCT is kept as posted.
SELECT DISTINCT
    T.Field_1,
    T.Field_2
FROM
(
    SELECT
    ...
    FROM
    UNION ALL
    SELECT
    ...
    FROM
) AS T
GROUP BY T.Field_1, T.Field_2
result:
Line 3 must be absent because it is not distinct by Field_1 and Field_2. How to do this with some SQL features?

If Field_1 is the only column that has to be unique and you just want the first record for every unique Field_1 value then below is a way to accomplish that.
-- Demo: return one row per distinct Field_1 value.
CREATE TABLE #table (Field_1 VARCHAR(MAX), Field_2 VARCHAR(MAX));

INSERT INTO #table (Field_1, Field_2)
VALUES
    ('Item A', ''),
    ('Item B', ''),
    ('Item B', 'Item A'),
    ('Item C', ''),
    ('Item D', 'Item A');

SELECT DISTINCT
    [b].[Field_1],
    [b].[Field_2]
FROM #table AS [a]
OUTER APPLY (
    SELECT TOP 1
        [c].[Field_1],
        [c].[Field_2]
    FROM #table AS [c]
    WHERE [a].[Field_1] = [c].[Field_1]
    -- TOP without ORDER BY may return any matching row; order explicitly so
    -- that the "first" row per Field_1 is deterministic (lowest Field_2 wins).
    ORDER BY [c].[Field_2]
) AS [b];

DROP TABLE #table;

Related

Postgres Jsonb aggregation

I am trying to achieve following (Result Required) output from POSTGRES jsonb columns, but not getting desired result using "jsonb_agg" function.
I went through this postgres document https://www.postgresql.org/docs/12/functions-json.html, but no luck here.
Also, I am not very experienced with JSON data in Postgres, so please suggest a good resource on JSON formatting in Postgres.
City
JColA
JColB
NY
[{"id":"ID1","name":"ID1_NAME","type":"ID1_TYPE","amount":20.12,"full_name":null},{"id":"ID2","name":"ID2_NAME","type":"ID2_TYPE","amount":11.55,"full_name":null},{"id":"ID1","name":"ID1_NAME","type":"ID1_TYPE","amount":5.45,"full_name":null}]
[{"key":"key1","value":"1"},{"key":"key2","value":"2"},{"key":"key3","value":"3"}]
DC
[{"id":"ID1","name":"ID1_NAME","type":"ID1_TYPE","amount":1.5,"full_name":null},{"id":"ID3","name":"ID3_NAME","type":"ID3_TYPE","amount":1.2,"full_name":null},{"id":"ID4","name":"ID4_NAME","type":"ID4_TYPE","amount":1,"full_name":null}]
[{"key":"key1","value":"1"},{"key":"key1","value":"2"},{"key":"key1","value":"3"}]
DL
[{"id":"ID4","name":"ID4_NAME","type":"ID4_TYPE","amount":1.5,"full_name":null},{"id":"ID2","name":"ID2_NAME","type":"ID2_TYPE","amount":1.2,"full_name":null},{"id":"ID4","name":"ID4_NAME","type":"ID4_TYPE","amount":1,"full_name":null}]
[{"key":"key1","value":"2"},{"key":"key2","value":"2"},{"key":"key3","value":"4"}]
NY
[{"id":"ID1","name":"ID1_NAME","type":"ID1_TYPE","amount":4.5,"full_name":null},{"id":"ID2","name":"ID2_NAME","type":"ID2_TYPE","amount":2.2,"full_name":null},{"id":"ID4","name":"ID4_NAME","type":"ID4_TYPE","amount":6,"full_name":null}]
[{"key":"key4","value":"2"},{"key":"key2","value":"5"},{"key":"key2","value":"4"}]
DC
[{"id":"ID3","name":"ID3_NAME","type":"ID3_TYPE","amount":2.5,"full_name":null},{"id":"ID3","name":"ID3_NAME","type":"ID3_TYPE","amount":2.2,"full_name":null},{"id":"ID4","name":"ID4_NAME","type":"ID4_TYPE","amount":2,"full_name":null}]
[{"key":"key1","value":"2"},{"key":"key2","value":"2"},{"key":"key3","value":"4"}]
Required Result
City
AggJSonColA
AggJsonColB
NY
[{"id":"ID1","name":"ID1_NAME","type":"ID1_TYPE","amount":30.07,"full_name":null},{"id":"ID2","name":"ID2_NAME","type":"ID2_TYPE","amount":13.75,"full_name":null},{"id":"ID4","name":"ID4_NAME","type":"ID4_TYPE","amount":6,"full_name":null}]
[{"key":"key1","value":"1"},{"key":"key2","value":"11"},{"key":"key3","value":"3"}, {"key":"key4","value":"2"}]
DC
[{"id":"ID1","name":"ID1_NAME","type":"ID1_TYPE","amount":1.5,"full_name":null},{"id":"ID3","name":"ID3_NAME","type":"ID3_TYPE","amount":5.9,"full_name":null},{"id":"ID4","name":"ID4_NAME","type":"ID4_TYPE","amount":3,"full_name":null}]
[{"key":"key1","value":"8"},{"key":"key2","value":"2"},{"key":"key3","value":"4"}]
DL
[{"id":"ID4","name":"ID4_NAME","type":"ID4_TYPE","amount":2.5,"full_name":null},{"id":"ID2","name":"ID2_NAME","type":"ID2_TYPE","amount":1.2,"full_name":null}]
[{"key":"key1","value":"2"},{"key":"key2","value":"2"},{"key":"key3","value":"4"}]
You need to break out the arrays with jsonb_to_recordset, rebuild the objects using jsonb_build_object, and aggregate them back up with jsonb_agg
-- Aggregates both JSONB array columns per City.
-- Each derived table (A for JColA, B for JColB) unnests one array column,
-- sums the numeric field per key within a city, rebuilds one object per key
-- and re-aggregates the objects into a single JSONB array. The two results
-- are then joined on City.
SELECT
    A.City,
    A.JColA,
    B.JColB
FROM (
    SELECT
        City,
        jsonb_agg(JColA) AS JColA
    FROM (
        SELECT
            t.City,
            -- jsonb_build_object keeps everything in JSONB instead of
            -- building JSON and converting it inside jsonb_agg.
            jsonb_build_object(
                'id', arr.id,
                'name', arr.name,
                'type', arr.type,
                'amount', SUM(arr.amount),
                'full_name', arr.full_name
            ) AS JColA
        FROM YourTable t
        CROSS JOIN LATERAL jsonb_to_recordset(t.JColA)
            AS arr(id varchar(10), name varchar(100), type varchar(100), amount decimal (18,2), full_name varchar(100))
        GROUP BY
            t.City, arr.id, arr.name, arr.type, arr.full_name
    ) A
    GROUP BY
        City
) A
JOIN (
    SELECT
        City,
        jsonb_agg(JColB) AS JColB
    FROM (
        SELECT
            t.City,
            jsonb_build_object(
                'key', arr."key",
                -- "value" is stored as a JSON string in the input and in the
                -- required result, so the sum is emitted as text as well.
                'value', CAST(SUM(arr.value) AS text)
            ) AS JColB
        FROM YourTable t
        CROSS JOIN LATERAL jsonb_to_recordset(t.JColB)
            AS arr("key" varchar(10), value int)
        GROUP BY
            t.City, arr."key"
    ) B
    GROUP BY
        City
) B ON B.City = A.City;
I feel it's easier to requery the original table again, however if you want to avoid that, you could first aggregate all the arrays together by City, break them back out and re-aggregate.
-- Single scan of YourTable: the innermost derived table collects every row's
-- arrays per City (yielding an array of arrays per column). The two scalar
-- subqueries then flatten that nested array, sum per key, and rebuild one
-- JSONB array per column.
SELECT
    t.City,
    (
        SELECT
            jsonb_agg(A.JColA)
        FROM (
            SELECT
                jsonb_build_object(
                    'id', arr.id,
                    'name', arr.name,
                    'type', arr.type,
                    'amount', SUM(arr.amount),
                    'full_name', arr.full_name
                ) AS JColA
            -- Each element of the aggregated column is one original array.
            FROM jsonb_array_elements(t.JColA) AS outerArr(elem)
            CROSS JOIN LATERAL jsonb_to_recordset(outerArr.elem)
                AS arr(id varchar(10), name varchar(100), type varchar(100), amount decimal (18,2), full_name varchar(100))
            GROUP BY
                arr.id, arr.name, arr.type, arr.full_name
        ) A
    ) AS JColA,
    (
        SELECT
            jsonb_agg(B.JColB)
        FROM (
            SELECT
                jsonb_build_object(
                    'key', arr."key",
                    -- "value" is stored as a JSON string in the input and in
                    -- the required result, so the sum is emitted as text.
                    'value', CAST(SUM(arr.value) AS text)
                ) AS JColB
            FROM jsonb_array_elements(t.JColB) AS outerArr(elem)
            CROSS JOIN LATERAL jsonb_to_recordset(outerArr.elem)
                AS arr("key" varchar(10), value int)
            GROUP BY
                arr."key"
        ) B
    ) AS JColB
FROM (
    SELECT
        t.City,
        jsonb_agg(t.JColA) AS JColA,
        jsonb_agg(t.JColB) AS JColB
    FROM YourTable t
    GROUP BY
        t.City
) t;
db<>fiddle

derived column in Order by in partition

Using Sql Server 2008 R2.
Where there is more than 1 row of type demographic change, I need to delete all but 1 per person, but the types of demographic changes are weighted, with some more important than others. I don't know what the data will hold but if a more important one exists for a particular Contact, I want it to rise to the top.
I tried:
-- Original (failing) attempt: number the demographic-change rows per
-- ContactId/ContractId by weight and return every row except the first.
;WITH cte AS
(
SELECT lastname, firstname, FieldChanged,
-- NOTE(review): the CASE input is the string literal 'FieldChanged', not the
-- FieldChanged column, so none of the WHEN values can match it.
Case 'FieldChanged'
When 'firstname' then 0
When 'lastname' then 0
When 'ssn' then 1
When 'xyz' then 5
End as "Weight"
-- The ORDER BY inside OVER() refers to the alias "weight" defined in this same
-- SELECT list; this is what raises "Invalid column name 'weight'".
, ROW_NUMBER() OVER (PARTITION BY D2.ContactId, D2.ContractId ORDER BY weight asc) AS demorow
FROM MyDATA d2
where d2.FieldChanged in ('firstname', 'lastname', 'ssn', 'xyz')
)
SELECT *
FROM cte
WHERE demorow > 1
This gives me an error: Invalid column name 'weight'.
I think I can't use APPLY since there's no unique key in the source table, which is not under my control.
Update:
-- Sample table and rows for the demographic-change question.
CREATE TABLE dbo.MyTempTable
(
    firstname    varchar(25) NOT NULL,
    lastname     varchar(25) NOT NULL,
    FieldChanged varchar(25),
    ContactId    uniqueidentifier,
    ContractId   uniqueidentifier
);
GO

-- The table name is spelled exactly as in CREATE TABLE so that the script
-- also runs on databases with a case-sensitive collation.
INSERT INTO dbo.MyTempTable
    (firstname, lastname, FieldChanged, ContactId, ContractId)
VALUES
    ('john', 'smith', 'ssn', '688CB150-C7FD-E511-8709-00155D070201', '688CB150-C7FD-E511-8709-00155D070202'),
    ('john', 'smith', 'xyz', '688CB150-C7FD-E511-8709-00155D070201', '688CB150-C7FD-E511-8709-00155D070202'),
    ('mary', 'doe', 'xyz', '688CB150-C7FD-E511-8709-00155D070203', '688CB150-C7FD-E511-8709-00155D070202'),
    ('mary', 'doe', 'firstname', '688CB150-C7FD-E511-8709-00155D070203', '688CB150-C7FD-E511-8709-00155D070202'),
    ('mary', 'doe', 'lastname', '688CB150-C7FD-E511-8709-00155D070203', '688CB150-C7FD-E511-8709-00155D070202'),
    ('mary', 'doe', 'ssn', '688CB150-C7FD-E511-8709-00155D070203', '688CB150-C7FD-E511-8709-00155D070202');
For this data I'd want John Smith's and Mary Doe's respective xyz rows to be selected, as less important than their name change rows.
Update 2:
I think this works:
-- Two-step version: the first CTE computes the weight, the second numbers the
-- rows per ContactId/ContractId by that weight, and the final SELECT returns
-- every row except the first of each group.
;WITH cte AS
(
SELECT lastname, firstname, FieldChanged,
Case FieldChanged
When 'firstname' then 0
When 'lastname' then 0
When 'ssn' then 5
When 'xyz' then 1
else 9
End as "Weight",
ContactId, ContractID
FROM edi..MyDATA d2
-- NOTE(review): this filter lists 'ce_ssn' and 'Policy Number', while the CASE
-- above tests 'ssn' and 'xyz'; rows with the filtered values fall through to
-- the ELSE branch (weight 9). Confirm which set of values is intended.
where d2.FieldChanged in ('firstname', 'lastname', 'ce_ssn', 'Policy Number')
),
cte2 As
(
SELECT *
-- "weight" is visible here because it is a column of cte, not an alias
-- defined in this SELECT list.
, ROW_NUMBER() OVER (PARTITION BY ContactId, ContractId ORDER BY weight asc) AS demorow
FROM cte
)
SELECT *
FROM cte2
WHERE demorow > 1
Column aliases are assigned only after the SELECT list has been evaluated, so they cannot be referenced elsewhere in the same SELECT expression. The one exception is the query's final ORDER BY clause, which may use them; an ORDER BY inside an OVER() window specification is part of the SELECT list and therefore cannot. To use an alias anywhere else, reference it from an outer query.
Here's a quick fix:
-- Step 1 (weighted): attach a numeric weight to each demographic change.
-- Step 2 (ranked): number the rows of each ContactId/ContractId pair by weight.
-- Result: every row except the first-ranked one of each pair.
;WITH weighted AS
(
    SELECT
        d2.lastname,
        d2.firstname,
        d2.FieldChanged,
        CASE
            WHEN d2.FieldChanged IN ('firstname', 'lastname') THEN 0
            WHEN d2.FieldChanged = 'ssn' THEN 1
            WHEN d2.FieldChanged = 'xyz' THEN 5
        END AS [Weight],
        d2.ContactId,
        d2.ContractID
    FROM MyDATA AS d2
    WHERE d2.FieldChanged IN ('firstname', 'lastname', 'ssn', 'xyz')
),
ranked AS
(
    SELECT
        w.*,
        ROW_NUMBER() OVER (
            PARTITION BY w.ContactId, w.ContractId
            ORDER BY w.[Weight] ASC
        ) AS demorow
    FROM weighted AS w
)
SELECT r.*
FROM ranked AS r
WHERE r.demorow > 1
Replace "weight" in the ORDER BY with the full CASE expression. Alternatively, put the main query (without the ORDER BY) in a subquery and compute the row number in the outer query. You should then be able to reference the "weight" column in the ORDER BY.

TSQL: How to concatenate string of GROUPED values

I have come across a lot of threads about this. The suggested solutions all tend to go the same way, but that approach is very inconvenient in my case.
Most of the time something like this is suggested.
-- Table variables must be prefixed with @ (a # prefix denotes a temp table,
-- which is created with CREATE TABLE, not DECLARE).
DECLARE @Actors TABLE ( [Id] INT , [Name] VARCHAR(20) , [MovieId] INT);
DECLARE @Movie TABLE ( [Id] INT, [Name] VARCHAR(20), [FranchiseId] INT );

INSERT INTO @Actors
        ( Id, Name, MovieId )
VALUES  ( 1, 'Sean Connery', 1 ),
        ( 2, 'Gert Fröbe', 1 ),
        ( 3, 'Honor Blackman', 1 ),
        ( 4, 'Daniel Craig', 2 ),
        ( 5, 'Judi Dench', 2 ),
        ( 2, 'Harrison Ford', 3 );

INSERT INTO @Movie
        ( Id, Name, FranchiseId )
VALUES  ( 1, 'Goldfinger', 1 ),
        ( 2, 'Skyfall', 1 ),
        ( 3, 'Return of the Jedi', 2 );

-- One row per movie with a comma-separated list of its actors. The correlated
-- FOR XML PATH('') subquery concatenates ',' + Name for the movie, and STUFF
-- removes the leading comma.
SELECT  m.Name ,
        STUFF(( SELECT  ',' + a_c.Name
                FROM    @Actors a_c
                WHERE   a_c.MovieId = m.Id
              FOR
                XML PATH('')
              ), 1, 1, '') AS ActorNames
FROM    @Actors a
        JOIN @Movie m ON a.MovieId = m.Id
GROUP BY m.Id ,
        m.Name;
The Problem is (how shall I explain?), one does not really access the grouped Items (as Count(), Max(), Min(), ...), one does rebuild the joining pattern of the "outer query" and force in the WHERE statement, that the corresponding values are the same as those in the GROUP BY statement (in the outer query).
If you do not understand what I'm trying to say, I extended the Example above, by one additional table and you will see, that I will also have to extend the "Inner Query"
-- Table variables must be prefixed with @ (a # prefix denotes a temp table,
-- which is created with CREATE TABLE, not DECLARE).
DECLARE @Actors TABLE ( [Id] INT , [Name] VARCHAR(20) , [MovieId] INT);
DECLARE @Movie TABLE ( [Id] INT, [Name] VARCHAR(20), [FranchiseId] INT );
DECLARE @Franchise TABLE ( [Id] INT , [Name] VARCHAR(20));

INSERT INTO @Actors
        ( Id, Name, MovieId )
VALUES  ( 1, 'Sean Connery', 1 ),
        ( 2, 'Gert Fröbe', 1 ),
        ( 3, 'Honor Blackman', 1 ),
        ( 4, 'Daniel Craig', 2 ),
        ( 5, 'Judi Dench', 2 ),
        ( 2, 'Harrison Ford', 3 );

INSERT INTO @Movie
        ( Id, Name, FranchiseId )
VALUES  ( 1, 'Goldfinger', 1 ),
        ( 2, 'Skyfall', 1 ),
        ( 3, 'Return of the Jedi', 2 );

INSERT INTO @Franchise
        ( Id, Name )
VALUES  ( 1, 'James Bond' ),
        ( 2, 'Star Wars' );

-- One row per franchise with a comma-separated list of the actors appearing
-- in any of its movies. The inner query has to repeat the Actors -> Movie
-- join in order to correlate on the franchise key.
SELECT  f.Name ,
        STUFF(( SELECT  ',' + a_c.Name
                FROM    @Actors a_c
                        JOIN @Movie m_c ON a_c.MovieId = m_c.Id
                WHERE   m_c.FranchiseId = f.Id
              FOR
                XML PATH('')
              ), 1, 1, '') AS ActorNames
FROM    @Actors a
        JOIN @Movie m ON a.MovieId = m.Id
        -- Join the movie to its franchise; the original compared
        -- m.FranchiseId with m.Id, which never involves @Franchise.
        JOIN @Franchise f ON m.FranchiseId = f.Id
GROUP BY f.Id ,
        f.Name;
And now, going somewhat further, imagine a huge query, very complicated, several grouping values over many tables. Performance is an issue. I don't want to rebuild the whole joining pattern in the "inner query".
So is there any other way? A way that does not kill performance and you do not have to duplicate the joining pattern?
Contrary to what I said in this comment, you need no GROUP BY clause, nor a WHERE clause, at all!
You simply need the outer SELECT to "iterate" over all franchises (or whatever you want to group by). Then in the inner SELECT, you need some JOINs to get to the franchise key column. Instead of a WHERE clause to filter by the outer franchise's key, simply use the outer franchise key directly in the INNER JOIN:
-- One row per franchise with a ', '-separated, de-duplicated list of actors.
-- Iterating over @Franchise directly removes the need for an outer GROUP BY.
SELECT  f.Name AS FranchiseName,
        COALESCE(STUFF((SELECT DISTINCT ', ' + a.Name
                        FROM   @Actors a
                               JOIN @Movie m ON a.MovieId = m.Id
                        WHERE  m.FranchiseId = f.Id
                        ORDER  BY ', ' + a.Name -- this is optional
                        -- The separator ', ' is two characters long, so strip
                        -- two characters to avoid a leading space.
                        FOR XML PATH('')), 1, 2, ''), '') AS ActorNames
FROM    @Franchise f
Source of information: "High Performance T-SQL Using Window Functions" by Itzik Ben-Gan. Because SQL Server unfortunately does not have an aggregate/window function for concatenating values, the book's author recommends something like the above as the next best solution.
P.S.: I've removed my previous solution that substituted an additional JOIN for a WHERE clause; I am now fairly certain that a WHERE clause is likely to perform better. Nevertheless, I left some evidence of my previous solution (i.e. the striked-through text) because of that reference to a comment I made earlier.

JOIN vs. IN vs. EXISTS

I was reading an article that explained the difference between join and in and exists clause but I got confused with the explanation of different results when using NOT IN vs. NOT EXISTS clause. Can someone clarify why there is a difference between the output for NOT EXISTS clause vs. NOT IN clause? I tried after deleting the NULL row (t2.id = 8) from the table t2 and still got the same result.
Here's the SQL script from the article:
-- Test fixture: t1 holds titles (two of them with a NULL id); t2 holds rows
-- that point at t1 through t1Id (one NULL, some pointing at ids not in t1).
CREATE TABLE t1 (id INT, title VARCHAR(20), someIntCol INT)
GO
CREATE TABLE t2 (id INT, t1Id INT, someData VARCHAR(20))
GO

INSERT INTO t1 (id, title, someIntCol)
          SELECT 1,    'title 1', 5
UNION ALL SELECT 2,    'title 2', 5
UNION ALL SELECT 3,    'title 3', 5
UNION ALL SELECT 4,    'title 4', 5
UNION ALL SELECT NULL, 'title 5', 5
UNION ALL SELECT NULL, 'title 6', 5;

INSERT INTO t2 (id, t1Id, someData)
          SELECT 1,  1,    'data 1'
UNION ALL SELECT 2,  1,    'data 2'
UNION ALL SELECT 3,  2,    'data 3'
UNION ALL SELECT 4,  3,    'data 4'
UNION ALL SELECT 5,  3,    'data 5'
UNION ALL SELECT 6,  3,    'data 6'
UNION ALL SELECT 7,  4,    'data 7'
UNION ALL SELECT 8,  NULL, 'data 8'
UNION ALL SELECT 9,  6,    'data 9'
UNION ALL SELECT 10, 6,    'data 10'
UNION ALL SELECT 11, 8,    'data 11';
And here's the SQL queries and their explanation:
-- NOT IN: returns no rows for this data.
-- "x NOT IN (list)" is TRUE only if x differs from every list value. Comparing
-- x with a NULL list value yields UNKNOWN, so when t2.t1Id contains a NULL
-- (row id = 8) the predicate is FALSE for ids found in the list and UNKNOWN
-- for all others; it is never TRUE.
-- Even without that NULL row the result stays empty here: t1 ids 1-4 all occur
-- in t2.t1Id, and the two t1 rows whose id is NULL compare as UNKNOWN.
SELECT t1.*
FROM t1
WHERE t1.id NOT IN (SELECT t1id FROM t2)
-- NOT EXISTS: only asks whether the correlated subquery returns any row.
-- For the t1 rows with a NULL id the comparison t1.id = t2.t1id is never TRUE,
-- so the subquery is empty and those rows ('title 5', 'title 6') are returned.
SELECT t1.*
FROM t1
WHERE NOT EXISTS (SELECT * FROM t2 WHERE t1.id = t2.t1id)
GO
-- Clean up the demo tables.
DROP TABLE t2
DROP TABLE t1
Here's the link to the article: http://weblogs.sqlteam.com/mladenp/archive/2007/05/18/60210.aspx
Thank you!
As I can see, you can use them as the same thing in a lot of cases, but you can't forget the details behind them.
Probably you can get the same results applying both NOT IN and NOT EXISTS, but you could see differences in query which involve the NULL value. Because NOT EXISTS is the only way to obtain those rows with the NULL value.
You can see it better in this example:
-- Mark one car as having no owner; the car id is a string and must be quoted.
update cars set c_owner = NULL where c_id = 'BMW03444';
Well... Let's try to see if we have any car in stock that has not been sold yet.
-- Counts cars whose owner is not among the customers. A car with a NULL
-- c_owner is not counted, because NULL NOT IN (...) is never TRUE.
select count(*) from cars where c_owner not in (select c_name from customers);
Output:
COUNT(*): 0
Where's the failure? Quite simple. You're not requesting a group of cars whose buyers has not been included in the list. You are simply asking for a car without owner. Anybody, even if he's not in the list. The correct form is:
-- Counts cars for which no customer row matches the car's owner.
SELECT COUNT(*)
FROM cars AS c1
WHERE NOT EXISTS (
    SELECT c_owner
    FROM customers AS c2
    WHERE c2.customer_id = c1.c_owner
);
COUNT(*): 1
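This is because `c_owner NOT IN (...)` compares c_owner with every value in the list. When c_owner is NULL, each comparison yields UNKNOWN rather than TRUE, so the row is filtered out and not counted.
NOT EXISTS only checks whether the correlated subquery returns any row. A NULL c_owner matches no customer, so the subquery is empty, NOT EXISTS is TRUE, and the row is included.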

T-SQL Column values count

Say I have a table A and it has 5 columns (Column1, Column2.. Column5), the values in each column is one char size and stored only
as alphabetic as follows
ID Column1 Column2 Column3 Column4 Column5
1 A C D A B
2 A D A B A
3 B K Q C Q
4 A K E E B
5 F K F F S
I need a count of each different value stored in column1 to column5, I want the following information
Column1 has A's count=3, B's count=1, F's count=1
Column2 has C's count=1, D's count=1, K's count=3
and so on
What is the correct way and format to return these values?
Thanks
You could try:
-- One SELECT per source column: label the column, group by its value and
-- count the rows; the per-column results are stacked with UNION ALL.
SELECT
    'Col1'   AS [Col],
    Column1  AS [Value],
    COUNT(*) AS [Ct]
FROM MyTable
GROUP BY Column1

UNION ALL

SELECT
    'Col2',
    Column2,
    COUNT(*)
FROM MyTable
GROUP BY Column2
...
You will need to write an additional SELECT to UNION for each column you want to aggregate, but it will return the data you are after.
Can you just execute an individual query for each needed column using a GROUP BY?
-- Number of non-NULL occurrences of each distinct Column1 value.
SELECT
    t.Column1,
    COUNT(t.Column1)
FROM TableName AS t
GROUP BY
    t.Column1
You can use unpivot in a derived table (or CTE) and group by column name and value in the outer query.
Try this:
-- Table variables must be prefixed with @ (DECLARE #T TABLE is not valid).
declare @T table
(
  ID int,
  Column1 char(1),
  Column2 char(1),
  Column3 char(1),
  Column4 char(1),
  Column5 char(1)
);

insert into @T (ID, Column1, Column2, Column3, Column4, Column5) values
(1, 'A', 'C', 'D', 'A', 'B'),
(2, 'A', 'D', 'A', 'B', 'A'),
(3, 'B', 'K', 'Q', 'C', 'Q'),
(4, 'A', 'K', 'E', 'E', 'B'),
(5, 'F', 'K', 'F', 'F', 'S');

-- C turns the five value columns into (ID, Col, Val) rows, one per cell.
with C as
(
  select ID, Col, Val
  from (
        select ID, Column1, Column2, Column3, Column4, Column5
        from @T
       ) as T
    unpivot (Val for Col in (Column1, Column2, Column3, Column4, Column5)) as U
)
-- Count how often each value occurs in each source column.
select Col, Val, count(*) as ValCount
from C
group by Col, Val
-- Val is included so that the row order is fully determined.
order by Col, Val;
The union approach is going to serve you best. The individual query for each union would look something like this:
Select Distinct Column1, COUNT(Column1)OVER(PARTITION BY Column1) Col1Count, 'Column1' ColumnName
From ColTable
Union All
...