Find duplicate row "details" in table - tsql

OrderId OrderCode Description
-------------------------------
1 Z123 Stuff
2 ABC999 Things
3 Z123 Stuff
I have duplicates in a table like the above. I'm trying to get a report of which Orders are duplicates, and what Order they are duplicates of, so I can figure out how they got into the database.
So ideally I'd like to get an output something like;
OrderId IsDuplicatedBy
-------------------------
1 3
3 1
I can't work out how to code this in SQL.

You can use the same table twice in one query and join on the fields you need to check against. The condition T1.OrderID <> T2.OrderID is needed so that a row is not reported as a duplicate of itself.
-- Sample data: orders 1 and 3 share the same OrderCode and Description.
-- Table variables are named with a leading @ in T-SQL.
declare @T table (OrderID int, OrderCode varchar(10), Description varchar(50));

insert into @T (OrderID, OrderCode, Description) values
    (1, 'Z123', 'Stuff'),
    (2, 'ABC999', 'Things'),
    (3, 'Z123', 'Stuff');

-- Self-join: pair every row with every OTHER row that has identical details.
select
    T1.OrderID,
    T2.OrderID as IsDuplicatedBy
from @T as T1
inner join @T as T2
    on T1.OrderCode = T2.OrderCode
   and T1.Description = T2.Description
   and T1.OrderID <> T2.OrderID   -- a row must not match itself
order by
    T1.OrderID,
    T2.OrderID;
Result:
OrderID IsDuplicatedBy
1 3
3 1

Related

TSQL Get Item Price History from Item Price Changes

I have a table of item price changes, and I want to use it to create a table of item prices for each date (between the item's launch and end dates).
Here's some code to create the data:-
-- Items with the date window in which each one is on sale.
declare @Item table (item_id int, item_launch_date date, item_end_date date);
insert into @Item (item_id, item_launch_date, item_end_date)
values
    (1, '2001-01-01', '2016-01-01'),
    (2, '2001-01-01', '2016-01-01');

-- One row per price change: the new price and the date it takes effect.
declare @ItemPriceChanges table (item_id int, item_price money, my_date date);
insert into @ItemPriceChanges (item_id, item_price, my_date)
values
    (1, 123.45, '2001-01-01'),
    (1, 345.34, '2001-01-03'),
    (2, 34.34,  '2001-01-01'),
    (2, 23.56,  '2005-01-01'),
    (2, 56.45,  '2016-05-01'),
    (2, 45.45,  '2017-05-01');
What I'd like to see is something like this:-
item_id date price
------- ---- -----
1 2001-01-01 123.45
1 2001-01-02 123.45
1 2001-01-03 345.34
1 2001-01-04 345.34
etc.
2 2001-01-01 34.34
2 2001-01-02 34.34
etc.
Any suggestions on how to write the query?
I'm using SQL Server 2016.
Added:
I also have a calendar table called "dim_calendar" with one row per day. I had hoped to use a windowing function, but the nearest I can find is lead() and it doesn't do what I thought it would do:-
-- Asker's attempt (kept as-is apart from the @ sigils on the table variables).
-- Produces one row per item per calendar day inside the item's launch/end window;
-- the LEFT JOIN attaches a price-change row only on the exact day of a change.
-- LEAD(..., 1, NULL) reads item_price from the NEXT row of the partition, so it
-- does not carry the most recent price forward over days without a change.
select
    i.item_id,
    c.day_date,
    ipc.item_price as item_price_change,
    lead(ipc.item_price, 1, NULL) over (partition by i.item_id order by c.day_date) as item_price
from dim_calendar as c
inner join @Item as i
    on c.day_date between i.item_launch_date and i.item_end_date
left join @ItemPriceChanges as ipc
    on i.item_id = ipc.item_id
   and ipc.my_date = c.day_date
order by
    i.item_id,
    c.day_date;
Thanks
I wrote this prior to your edit. Note that your sample output suggests that an item can have two prices on the day of the price change. The following assumes that an item can only have one price on a price change day and that is the new price.
-- Sample data. These are table variables, so the whole script must run as one batch.
declare @Item table (item_id int, item_launch_date date, item_end_date date);
insert into @Item (item_id, item_launch_date, item_end_date)
values
    (1, '2001-01-01', '2016-01-01'),
    (2, '2001-01-01', '2016-01-01');

declare @ItemPriceChange table (item_id int, item_price money, my_date date);
insert into @ItemPriceChange (item_id, item_price, my_date)
values
    (1, 123.45, '2001-01-01'),
    (1, 345.34, '2001-01-03'),
    (2, 34.34,  '2001-01-01'),
    (2, 23.56,  '2005-01-01'),
    (2, 56.45,  '2016-05-01'),
    (2, 45.45,  '2017-05-01');

-- Show the raw price changes for reference.
SELECT item_id, item_price, my_date FROM @ItemPriceChange;

-- We need a table variable holding all possible date points for the output
DECLARE @DatePointList table (DatePoint date);
DECLARE @StartDatePoint date = '2001-01-01';  -- ISO format: not affected by the session language
DECLARE @MaxDatePoint date = GETDATE();
DECLARE @DatePoint date = @StartDatePoint;

-- One row per day from @StartDatePoint through @MaxDatePoint (inclusive).
WHILE @DatePoint <= @MaxDatePoint BEGIN
    INSERT INTO @DatePointList (DatePoint)
    SELECT @DatePoint;
    SET @DatePoint = DATEADD(DAY, 1, @DatePoint);
END;

-- We can use a CTE to sequence the price changes
WITH ItemPriceChange AS (
    SELECT
        item_id,
        item_price,
        my_date,
        ROW_NUMBER() OVER (PARTITION BY item_id ORDER BY my_date ASC) AS SeqNo
    FROM @ItemPriceChange
)
-- With the price changes sequenced, we can derive from and to dates for each price
-- and use a join to the table of date points to produce the output. Also, use an
-- inner join back to @Item to only return rows for dates that are within the
-- start/end date of the item.
SELECT
    ItemPriceDate.item_id,
    DatePointList.DatePoint,
    ItemPriceDate.item_price
FROM @DatePointList AS DatePointList
INNER JOIN (
    -- Each price is valid from its own change date up to (but excluding) the next
    -- change date; the latest price runs up to @MaxDatePoint.
    SELECT
        ItemPriceChange.item_id,
        ItemPriceChange.item_price,
        ItemPriceChange.my_date AS from_date,
        ISNULL(ItemPriceChange_Next.my_date, @MaxDatePoint) AS to_date
    FROM ItemPriceChange
    LEFT OUTER JOIN ItemPriceChange AS ItemPriceChange_Next
        ON ItemPriceChange_Next.item_id = ItemPriceChange.item_id
       AND ItemPriceChange.SeqNo = ItemPriceChange_Next.SeqNo - 1
) AS ItemPriceDate
    ON DatePointList.DatePoint >= ItemPriceDate.from_date
   AND DatePointList.DatePoint < ItemPriceDate.to_date
INNER JOIN @Item AS item
    ON item.item_id = ItemPriceDate.item_id
   AND DatePointList.DatePoint BETWEEN item.item_launch_date AND item.item_end_date
ORDER BY
    ItemPriceDate.item_id,
    DatePointList.DatePoint;
@AlphaStarOne Perfect! I've modified it to use a windowing function rather than a self-join, but what you've suggested works. Here's my implementation of that in case anyone else needs it:
-- Daily price per item using LEAD instead of a self-join.
-- Each price change is valid from its own date up to (but excluding) the date of
-- the item's next change; the latest change runs up to the current date/time.
SELECT
    ipd.item_id,
    dc.day_date,
    ipd.item_price
FROM dim_calendar dc
INNER JOIN (
    SELECT
        item_id,
        item_price,
        my_date AS from_date,
        isnull(lead(my_date, 1, NULL) over (partition by item_id ORDER BY my_date), getdate()) as to_date
    FROM @ItemPriceChange ipc1
) AS ipd
    ON dc.day_date >= ipd.from_date
   AND dc.day_date < ipd.to_date
-- Restrict the output to days inside the item's launch/end window.
INNER JOIN @Item AS i
    ON i.item_id = ipd.item_id
   AND dc.day_date BETWEEN i.item_launch_date AND i.item_end_date
ORDER BY
    ipd.item_id,
    dc.day_date;

After doing CTE Select Order By and then Update, Update results are not ordered the same (TSQL)

The code is roughly like this:
-- Flag the first four active, not-yet-due rows and return the rows that were updated.
-- [check] is bracketed because CHECK is a reserved keyword in T-SQL.
-- The ORDER BY only decides WHICH four rows TOP picks; it does not order the
-- rows returned by OUTPUT.
WITH cte AS
(
    SELECT TOP 4 id, due_date, [check]
    FROM table_a a
    INNER JOIN table_b b ON a.linkid = b.linkid
    WHERE
        b.status = 1
        AND due_date > GetDate()
    ORDER BY due_date, id
)
UPDATE cte
SET [check] = 1
OUTPUT
    INSERTED.id,
    INSERTED.due_date;
Note: the actual data has same due_date.
When I ran the SELECT statement only inside the cte, I could get the result, for ex: 1, 2, 3, 4.
But after the UPDATE statement, the updated results are: 4, 1, 2, 3
Why is this (order-change) happening?
How to keep or re-order the results back to 1,2,3,4 in this same 1 query?
In MSDN https://msdn.microsoft.com/pl-pl/library/ms177564(v=sql.110).aspx you can read that
There is no guarantee that the order in which the changes are applied
to the table and the order in which the rows are inserted into the
output table or table variable will correspond.
That means you can't solve your problem with only one query, but you can still do what you need in one batch. Because OUTPUT doesn't guarantee the order, you have to save the output in another table and order it after the update. This code will return your output values in the order you expect:
-- Capture the updated rows first, then sort them in a separate SELECT, because
-- the order of rows produced by OUTPUT is not guaranteed.
declare @outputTable table (id int, due_date date);

with cte as (
    -- [check] is bracketed because CHECK is a reserved keyword in T-SQL.
    select top 4 id, due_date, [check]
    from table_a a
    inner join table_b b on a.linkid = b.linkid
    where b.status = 1
      and due_date > GetDate()
    order by due_date, id
)
update cte
set [check] = 1
output inserted.id, inserted.due_date
into @outputTable;

-- Return the captured rows in the same order the CTE used to pick them.
select id, due_date
from @outputTable
order by due_date, id;

How can I SUM distinct records in a Postgres database where there are duplicate records?

Imagine a table that looks like this:
The SQL to get this data was just SELECT *
The first column is "row_id" the second is "id" - which is the order ID and the third is "total" - which is the revenue.
I'm not sure why there are duplicate rows in the database, but when I do a SUM(total), it's including the second entry in the database, even though the order ID is the same, which is causing my numbers to be larger than if I select distinct(id), total - export to excel and then sum the values manually.
So my question is - how can I SUM on just the distinct order IDs so that I get the same revenue as if I exported to excel every distinct order ID row?
Thanks in advance!
Easy - just divide by the count:
-- Per-id total with duplicates collapsed: when every duplicate row of an id
-- carries the same total, sum / row-count gives that single total back.
-- count(*) is used instead of count(id): count(id) is 0 for the group where
-- id IS NULL, which would raise a division-by-zero error.
select id, sum(total) / count(*)
from orders
group by id
See live demo.
Also handles any level of duplication, eg triplicates etc.
You can try something like this (with your example):
Table
-- Sample data: each order id appears twice with the same total but a different row_id.
create table test (
    row_id int,
    id     int,
    total  decimal(15,2)
);

insert into test (row_id, id, total)
values
    (6395,  1509, 112),
    (22986, 1509, 112),
    (1393,  3284, 40.37),
    (24360, 3284, 40.37);
Query
-- Step 1: collapse duplicate (id, total) pairs.
with distinct_records as (
    select distinct id, total
    from test
),
-- Step 2: total per id over the de-duplicated pairs.
totals_per_id as (
    select id, sum(total) as actual_total
    from distinct_records
    group by id
)
-- Step 3: attach every original row_id of the id to its de-duplicated total.
select
    src.id,
    tot.actual_total,
    array_agg(src.row_id) as row_ids
from test as src
inner join totals_per_id as tot
    on src.id = tot.id
group by
    src.id,
    tot.actual_total
Result
| id | actual_total | row_ids |
|------|--------------|------------|
| 1509 | 112 | 6395,22986 |
| 3284 | 40.37 | 1393,24360 |
Explanation
We do not know the reason why orders and totals appear more than once with different row_id values. So, using a common table expression (CTE) introduced by the with ... clause, we get the distinct id and total.
Under the CTE, we use this distinct data to do totaling. We join ID in the original table with the aggregation over distinct values. Then we comma-separate row_ids so that the information looks cleaner.
SQLFiddle example
http://sqlfiddle.com/#!15/72639/3
Create custom aggregate:
-- State-transition function for the dist_sum aggregate below.
--   $1: running sum so far (may be NULL before the first non-NULL value)
--   $2: accepted but never referenced in the body
--   $3: value to add
-- Returns $1 + $3 (treating a NULL $1 as 0) when $3 is not NULL; otherwise
-- returns $1 unchanged.
CREATE OR REPLACE FUNCTION sum_func (
double precision, pg_catalog.anyelement, double precision
)
RETURNS double precision AS
$body$
SELECT case when $3 is not null then COALESCE($1, 0) + $3 else $1 end
$body$
LANGUAGE 'sql';
-- Two-argument aggregate: sum_func is the state function and the state is float8.
-- NOTE(review): presumably the first argument exists only so that DISTINCT can be
-- applied to (key, value) pairs at the call site - confirm against the usage.
CREATE AGGREGATE dist_sum (
pg_catalog."any",
double precision)
(
SFUNC = sum_func,
STYPE = float8
);
And then calc distinct sum like:
-- Sum total once per distinct (id, total) pair across the whole orders table.
SELECT
    dist_sum(DISTINCT id, total)
FROM orders
SQLFiddle
You can use DISTINCT in your aggregate functions:
-- Per id, add up each distinct total value only once.
SELECT
    id,
    SUM(DISTINCT total)
FROM orders
GROUP BY id
Documentation here: https://www.postgresql.org/docs/9.6/static/sql-expressions.html#SYNTAX-AGGREGATES
If we can trust that the total for one order is actually one row, we could eliminate the duplicates in a sub-query by selecting the MAX of the PK id column. An example:
-- Sample data: order 1 is stored three times, order 2 twice.
CREATE TABLE test2 (id int, order_id int, total int);

insert into test2 values
    (1, 1, 50),
    (2, 1, 50),
    (5, 1, 50),
    (3, 2, 100),
    (4, 2, 100);

-- Keep only the row with the highest id per order, then total per order.
select
    order_id,
    sum(total)
from test2 as t
inner join (
    select max(id) as id
    from test2
    group by order_id
) as sq
    on t.id = sq.id
group by order_id
sql fiddle
In difficult cases:
-- Per id: build a JSON object of row_id -> total, then sum its values.
-- The values are cast to numeric rather than int4 so that decimal totals
-- (e.g. 40.37) do not fail with "invalid input syntax for type integer".
-- NOTE(review): the object is keyed by row_id, so rows of the same id that have
-- different row_id values are all kept and summed - confirm this is the intended
-- de-duplication key.
select
    id,
    (
        select sum(value::numeric)
        from jsonb_each_text(jsonb_object_agg(row_id, total))
    ) as total
from orders
group by id
I would suggest just use a sub-Query:
-- Keep one row per "id", then total per "id".
WITH "one_row_per_id" AS (
    SELECT DISTINCT ON ("id") *
    FROM "Database"."Schema"."Table"
)
SELECT
    "a"."id",
    SUM("a"."total")
FROM "one_row_per_id" AS "a"
GROUP BY "a"."id"
The Above will give you the total of each id
Use below if you want the full total of each duplicate removed:
-- Keep one row per "id", then compute a single grand total over those rows.
WITH "one_row_per_id" AS (
    SELECT DISTINCT ON ("id") *
    FROM "Database"."Schema"."Table"
)
SELECT SUM("a"."total")
FROM "one_row_per_id" AS "a"
Using subselect (http://sqlfiddle.com/#!7/cef1c/51):
-- Grand total over distinct (id, total) pairs.
-- The derived table is given an alias because PostgreSQL versions before 16
-- reject a subquery in FROM that has none.
select sum(total)
from (
    select distinct id, total
    from orders
) as distinct_orders
Using CTE (http://sqlfiddle.com/#!7/cef1c/53):
-- Grand total over distinct (id, total) pairs, written with a CTE.
with unique_orders as (
    select distinct
        id,
        total
    from orders
)
select sum(total)
from unique_orders;

Find all records NOT in any blocked range where blocked ranges are in a table

I have a table TaggedData with the following fields and data
ID GroupID Tag MyData
** ******* *** ******
1 Texas AA01 Peanut Butter
2 Texas AA15 Cereal
3 Ohio AA05 Potato Chips
4 Texas AA08 Bread
I have a second table of BlockedTags as follows:
ID StartTag EndTag
** ******** ******
1 AA00 AA04
2 AA15 AA15
How do I select from this to return all data matching a given GroupId but NOT in any blocked range (inclusive)? For the data given if the GroupId is Texas, I don't want to return Cereal because it matches the second range. It should only return Bread.
I did try left joins based queries but I'm not even that close.
Thanks
-- Tagged rows, each belonging to a group.
create table TaggedData (
    ID      int,
    GroupID varchar(16),
    Tag     char(4),
    MyData  varchar(50)
)

-- Inclusive tag ranges that must be excluded from results.
create table BlockedTags (
    ID       int,
    StartTag char(4),
    EndTag   char(4)
)

insert into TaggedData (ID, GroupID, Tag, MyData)
values
    (1, 'Texas', 'AA01', 'Peanut Butter'),
    (2, 'Texas', 'AA15', 'Cereal'),
    (3, 'Ohio ', 'AA05', 'Potato Chips'),
    (4, 'Texas', 'AA08', 'Bread')

insert into BlockedTags (ID, StartTag, EndTag)
values
    (1, 'AA00', 'AA04'),
    (2, 'AA15', 'AA15')

-- Anti-join: a row survives only when no blocked range contains its Tag,
-- i.e. the LEFT JOIN found no match and b.ID is NULL.
select t.*
from TaggedData as t
left join BlockedTags as b
    on t.Tag between b.StartTag and b.EndTag
where b.ID is null
Returns:
ID GroupID Tag MyData
----------- ---------------- ---- --------------------------------------------------
3 Ohio AA05 Potato Chips
4 Texas AA08 Bread
(2 row(s) affected)
So, to match on given GroupID you change the query like that:
-- Rows of the requested group whose Tag falls in no blocked range.
-- @GivenGroupID is a T-SQL variable/parameter supplied by the caller.
select t.*
from TaggedData t
left join BlockedTags b
    on t.Tag between b.StartTag and b.EndTag
where b.ID is null
  and t.GroupID = @GivenGroupID
I prefer NOT EXISTS simply because it gives you more readability, usability and usually better performance on large data (in several cases it gets better execution plans).
It would look like this:
-- Rows of the requested group for which no blocked range contains the Tag.
-- The columns are qualified so the correlation between the outer row (t) and
-- the blocked ranges (b) is explicit.
SELECT t.*
FROM TaggedData AS t
WHERE t.GroupID = @GivenGroupID
  AND NOT EXISTS (
        SELECT 1
        FROM BlockedTags AS b
        WHERE t.Tag BETWEEN b.StartTag AND b.EndTag
      )

T-SQL query, multiple values in a field

I have two tables in a database. The first table tblTracker contains many columns, but the column of particular interest is called siteAdmin and each row in that column can contain multiple loginIDs of 5 digits like 21457, 21456 or just one like 21444. The next table users contains columns like LoginID, fname, and lname.
What I would like to be able to do is take the loginIDs contained in tblTracker.siteAdmin and return fname + lname from users. I can successfully do this when there is only one loginID in the row such as 21444 but I cannot figure out how to do this when there is more than one like 21457, 21456.
Here is the SQL statement I use for when there is one loginID in that column
-- Works only when siteAdmin holds exactly one login id: the join compares the
-- whole siteAdmin value against a single Login_Id.
SELECT b.FName + ' ' + b.LName AS siteAdminName
FROM tblTracker a
LEFT OUTER JOIN users b ON a.siteAdmin = b.Login_Id
However this doesn't work when it tries to join a siteAdmin with more than one LoginID in it
Thanks!
I prefer the number table approach to split a string in TSQL
For this method to work, you need to do this one time table setup:
-- One-time setup: a Numbers table holding consecutive integers starting at 1.
-- sys.objects is cross-joined with itself only to supply enough source rows.
SELECT TOP 10000
    IDENTITY(int, 1, 1) AS Number
INTO Numbers
FROM sys.objects AS s1
CROSS JOIN sys.objects AS s2

ALTER TABLE Numbers
    ADD CONSTRAINT PK_Numbers PRIMARY KEY CLUSTERED (Number)
Once the Numbers table is set up, create this split function:
-- Splits a delimited string into one row per non-empty, trimmed item.
-- Requires the Numbers table (consecutive integers starting at 1).
--   @SplitOn : single-character delimiter
--   @List    : the delimited list to split
-- Returns a one-column table (ListValue); empty items are not returned.
CREATE FUNCTION [dbo].[FN_ListToTable]
(
     @SplitOn char(1)       --REQUIRED, the character to split the @List string on
    ,@List    varchar(8000) --REQUIRED, the list to split apart
)
RETURNS TABLE
AS
RETURN
(
    ----------------
    --SINGLE QUERY-- --this will not return empty rows
    ----------------
    SELECT
        ListValue
    FROM (SELECT
              -- Text between the delimiter at position "number" and the next delimiter.
              LTRIM(RTRIM(SUBSTRING(List2, number+1, CHARINDEX(@SplitOn, List2, number+1)-number - 1))) AS ListValue
          FROM (
                  -- Wrap the list in delimiters so every item is preceded and followed by one.
                  SELECT @SplitOn + @List + @SplitOn AS List2
               ) AS dt
              INNER JOIN Numbers n ON n.Number < LEN(dt.List2)
          -- Keep only the positions that hold a delimiter: each one starts an item.
          WHERE SUBSTRING(List2, number, 1) = @SplitOn
         ) dt2
    WHERE ListValue IS NOT NULL AND ListValue!=''
);
GO
You can now easily split a CSV string into a table and join on it:
-- Demo: empty items between consecutive delimiters are dropped.
SELECT ListValue
FROM dbo.FN_ListToTable(',','1,2,3,,,4,5,6777,,,')
OUTPUT:
ListValue
-----------------------
1
2
3
4
5
6777
(6 row(s) affected)
You can now use a CROSS APPLY to split every row in your table like:
-- Sample users.
DECLARE @users table (LoginID int, fname varchar(5), lname varchar(5))
INSERT INTO @users VALUES (1, 'Sam', 'Jones')
INSERT INTO @users VALUES (2, 'Don', 'Smith')
INSERT INTO @users VALUES (3, 'Joe', 'Doe')
INSERT INTO @users VALUES (4, 'Tim', 'White')
INSERT INTO @users VALUES (5, 'Matt', 'Davis')
INSERT INTO @users VALUES (15,'Sue', 'Me')

-- Sample tracker rows: siteAdmin holds a comma-separated list of login ids.
DECLARE @tblTracker table (RowID int, siteAdmin varchar(50))
INSERT INTO @tblTracker VALUES (1,'1,2,3')
INSERT INTO @tblTracker VALUES (2,'2,3,4')
INSERT INTO @tblTracker VALUES (3,'1,5')
INSERT INTO @tblTracker VALUES (4,'1')
INSERT INTO @tblTracker VALUES (5,'5')
INSERT INTO @tblTracker VALUES (6,'')
INSERT INTO @tblTracker VALUES (7,'8,9,10')
INSERT INTO @tblTracker VALUES (8,'1,15,3,4,5')

-- CROSS APPLY runs the split function once per tracker row, producing one row
-- per login id in the list; each id is then looked up in @users.
SELECT
    t.RowID, u.LoginID, u.fname+' '+u.lname AS YourAdmin
FROM @tblTracker t
    CROSS APPLY dbo.FN_ListToTable(',',t.siteAdmin) st
    LEFT OUTER JOIN @users u ON st.ListValue=u.LoginID --to get all rows even if missing siteAdmin
    --INNER JOIN @users u ON st.ListValue=u.LoginID --to remove rows without any siteAdmin
ORDER BY t.RowID,u.fname,u.lname
OUTPUT:
RowID LoginID YourAdmin
----------- ----------- -----------
1 2 Don Smith
1 3 Joe Doe
1 1 Sam Jones
2 2 Don Smith
2 3 Joe Doe
2 4 Tim White
3 5 Matt Davis
3 1 Sam Jones
4 1 Sam Jones
5 5 Matt Davis
7 NULL NULL
7 NULL NULL
7 NULL NULL
8 3 Joe Doe
8 5 Matt Davis
8 1 Sam Jones
8 15 Sue Me
8 4 Tim White
(18 row(s) affected)