removing duplicate rows and dependencies without cursor

removing duplicate rows and dependencies without cursor - tsql

I have a table that has a long list of duplicated items. I am working on a stored procedure to consolidate them all into one record. Each one of the duplicated items has a number of child tables that should either be deleted, or rekeyed to point at the resulting record. My table has an Id, but the ReadableIdentifier is the column I need to deduplicate.
Id | ReadableIdentifier | Name | UpdatedOn
1 | ABC1234 | Product X | 2014-04-25 16:00:08.000
2 | ABC1234 | Product X | 2014-04-28 16:00:08.000
3 | ABC1234 | Product X | 2014-04-21 16:00:08.000
4 | ABDD9945 | Widget R | 2014-04-25 16:00:08.000
5 | ABDD9945 | Widget R | 2014-04-25 18:45:08.000
As you can see, records 1-3 are duplicates with different Id's and UpdatedOn dates. Same for 4-5. I need to consolidate these into one record, preferring the one with the most recent UpdatedOn date.
End Goal (not showing children tables):
Id | ReadableIdentifier | Name | UpdatedOn
2 | ABC1234 | Product X | 2014-04-28 16:00:08.000
5 | ABDD9945 | Widget R | 2014-04-25 18:45:08.000
I am using a CURSOR to do this, but am wondering if there is a better solution.
-- Consolidates duplicate dbo.Item rows that share a ReadableIdentifier, one group per
-- cursor iteration: the most recently updated row survives, child rows are either deleted
-- or re-pointed at the survivor, and the remaining duplicates are removed.
DECLARE @readableId VARCHAR(50);
DECLARE @itemToPersistId INT;

-- Ids of the rows to remove for the current group. Id is INT so it matches
-- @itemToPersistId and the integer Ids stored in dbo.Item.
CREATE TABLE #itemsToDelete (Id INT NOT NULL PRIMARY KEY);

-- The loop only needs the identifier of each duplicated group, so the cursor returns
-- exactly the one column that FETCH ... INTO receives.
DECLARE dupeCursor CURSOR
FAST_FORWARD
FOR
    SELECT ReadableIdentifier
    FROM dbo.Item
    WHERE ReadableIdentifier IS NOT NULL
    GROUP BY ReadableIdentifier
    HAVING COUNT(*) > 1;

OPEN dupeCursor;
FETCH NEXT FROM dupeCursor INTO @readableId;

WHILE @@FETCH_STATUS = 0
BEGIN
    -- Survivor: latest UpdatedOn; Id breaks ties so the choice is deterministic.
    SELECT TOP (1) @itemToPersistId = Id
    FROM dbo.Item
    WHERE ReadableIdentifier = @readableId
    ORDER BY UpdatedOn DESC, Id DESC;

    -- Start each group with an empty work table.
    TRUNCATE TABLE #itemsToDelete;

    INSERT INTO #itemsToDelete (Id)
    SELECT Id
    FROM dbo.Item
    WHERE ReadableIdentifier = @readableId
      AND Id <> @itemToPersistId;

    --UPDATE CHILDREN TABLES
    DELETE FROM dbo.ItemDetails
    WHERE ItemId IN (SELECT Id FROM #itemsToDelete);

    UPDATE dbo.ItemPurchases
    SET ItemId = @itemToPersistId
    WHERE ItemId IN (SELECT Id FROM #itemsToDelete);

    UPDATE dbo.PurchaseOrders
    SET ItemId = @itemToPersistId
    WHERE ItemId IN (SELECT Id FROM #itemsToDelete);

    DELETE FROM dbo.ItemMetadata
    WHERE ItemId IN (SELECT Id FROM #itemsToDelete);

    --delete Duplicated Items
    DELETE FROM dbo.Item
    WHERE Id IN (SELECT Id FROM #itemsToDelete);

    FETCH NEXT FROM dupeCursor INTO @readableId;
END

CLOSE dupeCursor;
DEALLOCATE dupeCursor;
DROP TABLE #itemsToDelete;
I realize the cursor is most likely the issue, but I'm not sure how to go about updating all of the child tables without using one.

OK, I don't have data to test this against the child tables, but it should work:
-- Set-based consolidation of duplicate dbo.Item rows (no cursor).
-- Within each ReadableIdentifier group the most recently updated row (RowNum = 1) survives;
-- child rows of the other rows are deleted or re-pointed at the survivor, then the
-- duplicates themselves are deleted. Rows with a NULL ReadableIdentifier are left alone,
-- because they are not duplicates of one another.
SET XACT_ABORT ON;  -- any run-time error rolls back the whole consolidation

IF OBJECT_ID('tempdb..#rankedItems') IS NOT NULL
    DROP TABLE #rankedItems;

BEGIN TRANSACTION;

-- Rank the items once so that every later statement works from the same snapshot.
-- Id breaks ties on UpdatedOn so the survivor is chosen deterministically.
SELECT
    Id,
    ReadableIdentifier,
    ROW_NUMBER() OVER (
        PARTITION BY ReadableIdentifier
        ORDER BY UpdatedOn DESC, Id DESC
    ) AS RowNum
INTO #rankedItems
FROM dbo.Item
WHERE ReadableIdentifier IS NOT NULL;

--UPDATE CHILDREN TABLES
DELETE d
FROM dbo.ItemDetails AS d
WHERE EXISTS (
    SELECT 1
    FROM #rankedItems AS dup
    WHERE dup.Id = d.ItemId
      AND dup.RowNum > 1
);

-- Re-key purchases from each duplicate (dup) to the survivor of its group (keep).
UPDATE ip
SET ip.ItemId = keep.Id
FROM dbo.ItemPurchases AS ip
INNER JOIN #rankedItems AS dup
    ON dup.Id = ip.ItemId
   AND dup.RowNum > 1
INNER JOIN #rankedItems AS keep
    ON keep.ReadableIdentifier = dup.ReadableIdentifier
   AND keep.RowNum = 1;

UPDATE po
SET po.ItemId = keep.Id
FROM dbo.PurchaseOrders AS po
INNER JOIN #rankedItems AS dup
    ON dup.Id = po.ItemId
   AND dup.RowNum > 1
INNER JOIN #rankedItems AS keep
    ON keep.ReadableIdentifier = dup.ReadableIdentifier
   AND keep.RowNum = 1;

DELETE m
FROM dbo.ItemMetadata AS m
WHERE EXISTS (
    SELECT 1
    FROM #rankedItems AS dup
    WHERE dup.Id = m.ItemId
      AND dup.RowNum > 1
);

--delete Duplicated Items
DELETE i
FROM dbo.Item AS i
WHERE EXISTS (
    SELECT 1
    FROM #rankedItems AS dup
    WHERE dup.Id = i.Id
      AND dup.RowNum > 1
);

COMMIT TRANSACTION;

DROP TABLE #rankedItems;

Related

Fixing duplicate rows to adhere to constraint

Is there a way to force existing rows to be unique on a column before adding a unique constraint? I am adding this constraint to my db:
create unique index customfields_name_org_id_key
on CustomFields(name, org_id) where deleted is false;
But would like to first find all cases where this constraint wouldn't be met, and add 1 to the name of one of the rows (adding higher numbers if there are more than 2 colliding rows). So, for example,
SELECT name, org_id, deleted,
row_number() OVER (PARTITION BY name, deleted ORDER BY org_id) AS rnum
FROM customfields ORDER BY org_id;
gives me
name | org_id | deleted | rnum
-----------+--------+---------+------
Another | 1 | f | 1
Bad email | 1 | t | 1
Dog? | 1 | f | 1
New | 1 | f | 1
New | 1 | f | 2
New | 1 | t | 1
New field | 1 | t | 1
and I would like
New | 1 | f | 2
To be renamed "New2"
I have written this code:
-- Attempted rename of colliding rows. This is the statement the question is about and it
-- does not behave as intended: there is no WHERE clause tying a CustomFields row to its own
-- row in cf, so each target row is matched against the whole subquery result rather than
-- against its own ranking.
update CustomFields
set name = case
when cf.rnum = 1 or cf.deleted
then cf.name
else cf.name || rnum
end
-- cf: every row numbered within its (name, deleted) group, ordered by org_id
from (SELECT name, org_id, deleted,
row_number() OVER (PARTITION BY name, deleted ORDER BY org_id) AS rnum
FROM customfields ORDER BY org_id) as cf;
But it just takes the first row from the select and renames all the names to "Another". How do I alter this code so that the update works on the corresponding rows in cf?
Sample code: https://www.db-fiddle.com/#&togetherjs=iKSKze0tGm

You could update table by using:
-- Renames rows that would violate the planned unique index on (name, org_id) WHERE deleted
-- IS FALSE: within each colliding group the first row keeps its name and every later row
-- gets its position appended ("New" -> "New2", "New3", ...).
-- Rows are numbered per (name, org_id) among non-deleted rows only, mirroring the index
-- definition, and only rows that actually need a new name are touched.
-- Assumes customfields has a unique id column (TODO confirm); it is used both to order the
-- duplicates deterministically and to match each ranked row back to its table row.
WITH ranked AS (
    SELECT
        id,
        row_number() OVER (PARTITION BY name, org_id ORDER BY id) AS rnum
    FROM customfields
    WHERE deleted IS FALSE
)
UPDATE customfields AS cf
SET name = cf.name || ranked.rnum
FROM ranked
WHERE ranked.id = cf.id
  AND ranked.rnum > 1;
db<>fiddle demo

how can I get all ids starting from a given id recursively in a postgresql table that references itself?

the title may not be very clear so let's consider this example (this is not my code, just taking this example to model my request)
I have a table that references itself (like a filesystem)
id | parent | name
----+----------+-------
1 | null | /
2 | 1 | home
3 | 2 | user
4 | 3 | bin
5 | 1 | usr
6 | 5 | local
Is it possible to make a sql request so if I choose :
1 I will get a table containing 2,3,4,5,6 (because this is the root) so matching :
/home
/home/user
/home/user/bin
/usr
etc...
2 I will get a table containing 3,4 so matching :
/home/user
/home/user/bin
and so on

Use a recursive common table expression. The query always starts from the root and builds, for every row, an array of the ids on its path; the WHERE clause then uses that array to keep only the rows that lie beneath the given id.
For id = 1:
-- Walk the tree top-down from the root. Each row carries the path text built so far and
-- the array of ids on its path; the final filter keeps rows whose path contains id 1,
-- excluding node 1 itself.
WITH RECURSIVE tree (id, parent, name, ids) AS (
    SELECT
        root.id,
        root.parent,
        root.name,
        ARRAY[root.id]
    FROM my_table AS root
    WHERE root.parent IS NULL
    UNION ALL
    SELECT
        child.id,
        child.parent,
        concat(up.name, child.name, '/'),
        up.ids || child.id
    FROM tree AS up
    INNER JOIN my_table AS child
        ON child.parent = up.id
)
SELECT
    tree.id,
    tree.name
FROM tree
WHERE tree.id <> 1
  AND 1 = ANY (tree.ids)
id | name
----+-----------------------
2 | /home/
5 | /usr/
6 | /usr/local/
3 | /home/user/
4 | /home/user/bin/
(5 rows)
For id = 2:
-- Same top-down walk from the root as for id = 1; the final filter keeps rows whose path
-- contains id 2, excluding node 2 itself.
WITH RECURSIVE tree (id, parent, name, ids) AS (
    SELECT
        root.id,
        root.parent,
        root.name,
        ARRAY[root.id]
    FROM my_table AS root
    WHERE root.parent IS NULL
    UNION ALL
    SELECT
        child.id,
        child.parent,
        concat(up.name, child.name, '/'),
        up.ids || child.id
    FROM tree AS up
    INNER JOIN my_table AS child
        ON child.parent = up.id
)
SELECT
    tree.id,
    tree.name
FROM tree
WHERE tree.id <> 2
  AND 2 = ANY (tree.ids)
id | name
----+-----------------------
3 | /home/user/
4 | /home/user/bin/
(2 rows)
Bidirectional query
The question is really interesting. The above query works well but is inefficient, because it visits every node of the tree even when we are asking about a leaf. A more powerful solution is a bidirectional recursive query: the inner query walks from the given node up to the root, while the outer one walks from the node down to its descendants.
-- Bidirectional lookup: inner_query climbs from the chosen node to the root to build that
-- node's full path, then outer_query descends from the node to collect its subtree.
with recursive outer_query(id, parent, name) as (
-- inner_query: start at the requested node and follow parent links upward, prepending
-- each ancestor's name; qid keeps the id of the starting node on every row.
with recursive inner_query(qid, id, parent, name) as (
select id, id, parent, name
from my_table
where id = 2 -- parameter: id of the node whose descendants are wanted
union all
select qid, t.id, t.parent, concat(t.name, '/', q.name)
from inner_query q
join my_table t on q.parent = t.id
)
-- Anchor row of outer_query: the inner row that reached the root (parent is null) holds
-- the complete path. right(name, -1) drops its first character.
select qid, null::int, right(name, -1)
from inner_query
where parent is null
union all
-- Recursive step: append each child's name to its parent's path.
select t.id, t.parent, concat(q.name, '/', t.name)
from outer_query q
join my_table t on q.id = t.parent
)
select id, name
from outer_query
where id <> 2; -- parameter: exclude the starting node itself (same id as above)

Update Count column in Postgresql

I have a single table laid out as such:
id | name | count
1 | John |
2 | Jim |
3 | John |
4 | Tim |
I need to fill out the count column such that the result is the number of times the specific name shows up in the column name.
The result should be:
id | name | count
1 | John | 2
2 | Jim | 1
3 | John | 2
4 | Tim | 1
I can get the count of occurrences of unique names easily using:
SELECT COUNT(name)
FROM table
GROUP BY name
But that doesn't fit into an UPDATE statement due to it returning multiple rows.
I can also get it narrowed down to a single row by doing this:
SELECT COUNT(name)
FROM table
WHERE name = 'John'
GROUP BY name
But that doesn't allow me to fill out the entire column, just the 'John' rows.

you can do that with a common table expression:
-- Pre-aggregate the number of rows per name, then copy each total onto every row that
-- carries that name.
WITH name_totals AS (
    SELECT
        src.name,
        COUNT(*) AS name_count
    FROM the_table AS src
    GROUP BY src.name
)
UPDATE the_table AS tgt
SET "count" = name_totals.name_count
FROM name_totals
WHERE tgt.name = name_totals.name;
Another (slower) option would be to use a correlated subquery:
-- Recount per row: the subquery counts the rows that share the name of the row being
-- updated.
UPDATE the_table AS tgt
SET "count" = (
    SELECT COUNT(*)
    FROM the_table AS same_name
    WHERE same_name.name = tgt.name
);
But in general it is a bad idea to store values that can easily be calculated on the fly:
select id,
name,
count(*) over (partition by name) as name_count
from the_table;

Another method : Using a derived table
-- Derived-table variant: aggregate the occurrences per name in a subquery and join it back
-- to the table being updated.
-- The aggregate has an explicit alias (rather than relying on the implicit column name of
-- an un-aliased count), the grouping column is named instead of positional, and the target
-- column is quoted because it shares its name with the count() function.
UPDATE tb
SET "count" = t.name_count
FROM (
    SELECT
        name,
        COUNT(*) AS name_count
    FROM tb
    GROUP BY name
) AS t
WHERE t.name = tb.name

How to get latest value from table with self inner join

Please see http://sqlfiddle.com/#!6/9254d/3/0
I have two tables, Person and Values, PersonID is the link between them. Each person in the Values table has multiple values per day for every hour. I need to get the latest value for each user. I had a look on SO and what I could find was to get MAX(ValueDate) and then join on that but doesn't work. Join on PersonID didn't work either, not sure what else to try.
The output I need is
Name Value
1fn 1ln 2
2fn 2ln 20
3fn 3ln 200
I don't need the greatest value, I need the latest value for each person. Please share if you have any ideas. Thanks.

Try this:
SQLFIDDLEExample
-- Latest value per person for one organisation on one calendar day.
-- @Org and @MyDate are the two filters; the day is matched with a half-open range
-- [@MyDate, @MyDate + 1 day) so every time of day is included.
DECLARE @Org nvarchar(3);
SET @Org = 'aaa';
DECLARE @MyDate date;
SET @MyDate = CONVERT(date, '2014-09-12');

SELECT
    latest.Name,
    latest.Value AS Revenue
FROM (
    SELECT
        p.FName + ' ' + p.LName AS Name,
        vt.Value,
        -- rnk = 1 marks the most recent value of each person within the filtered range
        ROW_NUMBER() OVER (PARTITION BY vt.PersonID ORDER BY vt.ValueDate DESC) AS rnk
    FROM Person AS p
    -- INNER JOIN states what the query actually does: the WHERE filters on vt columns
    -- already discard persons without a matching value row.
    INNER JOIN ValueTable AS vt
        ON vt.PersonID = p.PersonID
    WHERE vt.ValueDate >= @MyDate
      AND vt.ValueDate < DATEADD(day, 1, @MyDate)
      AND vt.Org = @Org
) AS latest
WHERE latest.rnk = 1
ORDER BY latest.Name ASC;
Result:
| NAME | REVENUE |
|---------|---------|
| 1fn 1ln | 2 |
| 2fn 2ln | 20 |
| 3fn 3ln | 200 |

sql join if value exists in other table then Count it

I have following tables.
Table A
UserID | key
1 | A
2 | B
3 | A
4 | C
5 |
Table B
UserID | Num
1 | 50
1 | 300
2 |
3 | 100
4 | 20
I have query like this
SELECT COUNT(key) AS cnt, key
FROM A
WHERE key <> ''
GROUP BY key
ORDER BY cnt DESC
The results should be something like this
key | cnt
A | 2
B | 1
C | 1
What I would like to add is Joining Table B.
If a UserID has a non-empty Num in Table B, I would like to count that UserID as well, with the counts grouped by key.
Here is desired results
key | cnt | Has Num?
A | 2 | 2
B | 1 | 0
C | 1 | 1
I tried to write subquery but I can't attach it to main query. Subquery is something like this.
SELECT COUNT(DISTINCT UserID) AS num
FROM B
LEFT OUTER JOIN A ON B.UserID = A.UserID
WHERE Num <>'' AND key <> ''
GROUP BY key

If I'm understanding this correctly, what you're looking for is a count of the Keys in Table A when they were used by a UserID, and then a count of the number of unique UserIDs in Table B who both appeared in the first Table A query and had a Num.
Try this:
-- Per key: the number of #TableA rows with that key (cnt) and how many of those users
-- have at least one Num in #TableB ([Has Num?]).
SELECT
    a.[Key],
    COUNT(a.[Key]) AS cnt,
    ISNULL(SUM(withNum.bCnt), 0) AS [Has Num?]
FROM #TableA AS a
LEFT OUTER JOIN (
    -- one row per user that has a Num with LEN > 0; bCnt is a constant 1 so SUM counts users
    SELECT
        src.UserID,
        1 AS bCnt
    FROM #TableB AS src
    WHERE LEN(src.Num) > 0
    GROUP BY src.UserID
) AS withNum
    ON withNum.UserID = a.UserID
WHERE LEN(a.[Key]) > 0
GROUP BY a.[Key]
This query gives the results that you were expecting.

-- Self-contained demo: table variables with the sample data, then key counts joined to
-- per-key Num counts.
DECLARE @TableA TABLE (UserID INT, [Key] CHAR(1));
INSERT INTO @TableA (UserID, [Key])
VALUES (1, 'A'), (2, 'B'), (3, 'A'), (4, 'C'), (5, '');

DECLARE @TableB TABLE (UserID INT, Num INT NULL);
INSERT INTO @TableB (UserID, Num)
VALUES (1, 50), (1, 300), (2, NULL), (3, 100), (4, 20);

SELECT
    x.[Key],
    x.Cnt,
    COALESCE(y.[Has Num?], 0) AS [Has Num?]
FROM (
    -- number of @TableA rows per non-empty key
    SELECT
        [Key],
        Cnt = COUNT([Key])
    FROM @TableA
    WHERE LEN([Key]) > 0
    GROUP BY [Key]
) AS x
-- LEFT JOIN keeps keys whose users have no rows at all in @TableB (reported as 0).
LEFT JOIN (
    -- number of non-NULL Num rows per key; a user with several Num rows is counted once
    -- per row, not once per user
    SELECT
        a.[Key],
        [Has Num?] = COUNT(b.Num)
    FROM @TableA AS a
    INNER JOIN @TableB AS b
        ON a.UserID = b.UserID
    GROUP BY a.[Key]
) AS y
    ON x.[Key] = y.[Key];
Key Cnt Has Num?
A 2 3
B 1 0
C 1 1

How about an OUTER APPLY
-- Per key: the number of #TableA rows (cnt) and the total of each user's non-NULL Num rows
-- in #TableB ([Has Num?]), computed per #TableA row through OUTER APPLY.
SELECT
    a.[Key],
    COUNT(a.[Key]) AS cnt,
    SUM(perUser.NumCount) AS [Has Num?]
FROM #TableA AS a
OUTER APPLY (
    SELECT COUNT(b.Num) AS NumCount
    FROM #TableB AS b
    WHERE b.Num IS NOT NULL
      AND b.UserId = a.UserId
) AS perUser
WHERE a.[Key] <> ''
GROUP BY a.[Key]
ORDER BY cnt DESC
Result:
Key cnt Has Num?
---- ----------- -----------
A 2 3
B 1 0
C 1 1

We Keep Coding

iphone swift flutter scala powershell matlab mongodb postgresql perl eclipse

removing duplicate rows and dependencies without cursor - tsql

Related

Fixing duplicate rows to adhere to constraint

how can I get all ids starting from a given id recursively in a postgresql table that references itself?

Update Count column in Postgresql

How to get latest value from table with self inner join

sql join if value exists in other table then Count it

Categories

Resources