Override data in T-SQL

I'm trying to figure out the most concise way to achieve the following result set. Given this input data:

source     item  bucket      quantity
---------  ----  ----------  --------
forecast   8501  2016-09-01  100
forecast   8528  2016-09-01  100
mandate    8501  2016-09-01  200
mandate    8530  2016-09-01  200

the desired result set is:

item  bucket      quantity
----  ----------  --------
8501  2016-09-01  200
8528  2016-09-01  100
8530  2016-09-01  200

Notice that rows 1 and 3 of the input data consist of the same item+bucket combination. Rows with a source of 'mandate' are supposed to take precedence over rows with a source of 'forecast' for matching item+bucket combinations when generating the result set. Where an item+bucket combination is not duplicated across sources, that row appears in the final result set regardless of its source.
Here is the code for the input data:
declare @t table
(
    source varchar(20) not null,
    item int not null,
    bucket date not null,
    quantity int not null,
    primary key clustered (source, item, bucket)
);

insert into @t values
('forecast', 8501, '9/1/2016', 100),
('forecast', 8528, '9/1/2016', 100),
('mandate', 8501, '9/1/2016', 200),
('mandate', 8530, '9/1/2016', 200);

select * from @t;

Use ROW_NUMBER and order by the [source] field, descending (so "mandate" appears first):

declare @t table
(
    source varchar(20) not null,
    item int not null,
    bucket date not null,
    quantity int not null,
    primary key clustered (source, item, bucket)
);

insert into @t values
('forecast', 8501, '9/1/2016', 100),
('forecast', 8528, '9/1/2016', 100),
('mandate', 8501, '9/1/2016', 200),
('mandate', 8530, '9/1/2016', 200);

;WITH QtyRank AS
(
    SELECT *
        -- partition on the full item+bucket key so each combination is ranked independently
        , qRank = ROW_NUMBER() OVER (PARTITION BY [item], [bucket] ORDER BY [source] DESC)
    FROM @t
)
SELECT *
FROM QtyRank
WHERE QtyRank.qRank = 1;
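Against the sample data this should return one row per item+bucket combination, with the mandate rows winning the ties:

source    item  bucket      quantity  qRank
--------  ----  ----------  --------  -----
mandate   8501  2016-09-01  200       1
forecast  8528  2016-09-01  100       1
mandate   8530  2016-09-01  200       1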

This works:
with overlap as
(
    select t2.*
    from @t t1
    inner join @t t2
        on t1.item = t2.item
        and t1.bucket = t2.bucket
    where t1.source = 'forecast' and t2.source = 'mandate'
)
select t.item, t.bucket, t.quantity
from @t t
left outer join overlap o
    on t.item = o.item
    and t.bucket = o.bucket
where o.item is null
union all
select item, bucket, quantity from overlap;
Not sure it's the most concise approach though.
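A somewhat more compact alternative (my sketch, not from the original thread) is a single anti-join: keep every mandate row, and keep a forecast row only when no mandate row exists for the same item+bucket:

select t.item, t.bucket, t.quantity
from @t t
where t.source = 'mandate'
   or not exists (select 1
                  from @t m
                  where m.source = 'mandate'
                    and m.item = t.item
                    and m.bucket = t.bucket);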

Related

Is it possible to find duplicating records in two columns simultaneously in PostgreSQL?

I have the following database schema (oversimplified):
create sequence partners_partner_id_seq;
create table partners
(
partner_id integer default nextval('partners_partner_id_seq'::regclass) not null primary key,
name varchar(255) default NULL::character varying,
company_id varchar(20) default NULL::character varying,
vat_id varchar(50) default NULL::character varying,
is_deleted boolean default false not null
);
INSERT INTO partners(name, company_id, vat_id) VALUES('test1','1010109191191', 'BG1010109191192');
INSERT INTO partners(name, company_id, vat_id) VALUES('test2','1010109191191', 'BG1010109191192');
INSERT INTO partners(name, company_id, vat_id) VALUES('test3','3214567890102', 'BG1010109191192');
INSERT INTO partners(name, company_id, vat_id) VALUES('test4','9999999999999', 'GE9999999999999');
I am trying to figure out how to return test1, test2 (because the company_id column value duplicates vertically) and test3 (because the vat_id column value duplicates vertically as well).
To put it another way: I need to find duplicated company_id and vat_id records and group them together, so that test1, test2 and test3 end up together, because they duplicate by company_id and vat_id.
So far I have the following query:
SELECT *
FROM (
    SELECT *, LEAD(row, 1) OVER () AS nextrow
    FROM (
        SELECT *, ROW_NUMBER() OVER (w) AS row
        FROM partners
        WHERE is_deleted = false
          AND ((company_id != '' AND company_id IS NOT null) OR (vat_id != '' AND vat_id IS NOT NULL))
        WINDOW w AS (PARTITION BY company_id, vat_id ORDER BY partner_id DESC)
    ) x
) y
WHERE (row > 1 OR nextrow > 1)
  AND is_deleted = false
This successfully shows all company_id duplicates, but does not appear to show the vat_id ones: the test3 row is missing. Is it possible to do this within one query?
Here is a db-fiddle with the schema, data and predefined query reproducing my result.
You can do this with recursion, but depending on the size of your data you may want to iterate instead.
The trick is to make the name just another match key instead of treating it differently than the company_id and vat_id:
create table partners (
    partner_id integer generated always as identity primary key,
    name text,
    company_id text,
    vat_id text,
    is_deleted boolean not null default false
);

insert into partners (name, company_id, vat_id) values
('test1','1010109191191', 'BG1010109191192'),
('test2','1010109191191', 'BG1010109191192'),
('test3','3214567890102', 'BG1010109191192'),
('test4','9999999999999', 'GE9999999999999'),
('test5','3214567890102', 'BG8888888888888'),
('test6','2983489023408', 'BG8888888888888')
;
I added a couple of test cases and left in the lone partner.
with recursive keys as (
    select partner_id,
           array['n_'||name, 'c_'||company_id, 'v_'||vat_id] as matcher,
           array[partner_id] as matchlist,
           1 as size
    from partners
), matchers as (
    select *
    from keys
    union all
    select p.partner_id, c.matcher,
           p.matchlist||c.partner_id as matchlist,
           p.size + 1
    from matchers p
    join keys c
      on c.matcher && p.matcher                  -- the two key arrays overlap
     and not p.matchlist @> array[c.partner_id]  -- partner not already in this chain
), largest as (
    select distinct sort(matchlist) as matchlist -- sort() comes from the intarray extension
    from matchers m
    where not exists (select 1
                      from matchers
                      where matchlist @> m.matchlist
                        and size > m.size)
    -- and size > 1
)
select *
from largest
;
matchlist
-----------
{1,2,3,5,6}
{4}
fiddle
UPDATE: Since the recursive approach did not perform well, here is an iterative example in PL/pgSQL that uses a temporary table:
create temporary table match1 (
    partner_id int not null,
    group_id int not null,
    matchkey uuid not null
);

create index on match1 (matchkey);
create index on match1 (group_id);

insert into match1
select partner_id, partner_id, md5('n_'||name)::uuid from partners
union all
select partner_id, partner_id, md5('c_'||company_id)::uuid from partners
union all
select partner_id, partner_id, md5('v_'||vat_id)::uuid from partners;
do $$
declare
    _cnt bigint;
begin
    loop
        with consolidate as (
            select group_id,
                   min(group_id) over (partition by matchkey) as new_group_id
            from match1
        ), minimize as (
            select group_id, min(new_group_id) as new_group_id
            from consolidate
            group by group_id
        ), doupdate as (
            update match1
            set group_id = m.new_group_id
            from minimize m
            where m.group_id = match1.group_id
              and m.new_group_id != match1.group_id
            returning *
        )
        select count(*) into _cnt from doupdate;

        -- stop once a full pass changes nothing
        if _cnt = 0 then
            exit;
        end if;
    end loop;
end;
$$;
updated fiddle
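Once the loop stops updating rows, each group_id identifies one connected cluster. A follow-up query along these lines (my addition, assuming match1 as populated above) lists the final groups:

select group_id, array_agg(distinct partner_id order by partner_id) as partners
from match1
group by group_id
order by group_id;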

What would be the Postgres equivalent of this T-SQL?

I want to run this either interactively from psql or from code.
create proc recent_orders_by_region
as
select top(3) * from view_orders where region = 'NA' order by order_date desc
select top(3) * from view_orders where region = 'WE' order by order_date desc
select top(3) * from view_orders where region = 'EE' order by order_date desc
You can use rank() over (), but remember that the ORDER BY must be unique; otherwise RANK will assign the same number to multiple rows in a group.
Query with example data
SELECT a.*
FROM (
    SELECT *,
           rank() OVER (
               PARTITION BY region
               ORDER BY order_date DESC, ident DESC
           )
    FROM (
        values
            (1, current_date,   'NA'), (2, current_date-1, 'NA'),
            (3, current_date-1, 'NA'), (4, current_date-4, 'NA'),
            (5, current_date,   'NA1'),(6, current_date-1, 'NA1'),
            (7, current_date-1, 'NA1'),(8, current_date-4, 'NA1')
    ) view_orders (ident, order_date, region)
) a
WHERE rank <= 3
ident is only for demonstration purposes and should be replaced by a real column from the view_orders table.
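To get something callable like the T-SQL proc, one option is to wrap the query in a SQL function, sketched here under the assumption that view_orders exposes ident, order_date and region:

create function recent_orders_by_region()
returns table (ident int, order_date date, region text)
language sql stable
as $$
    select ranked.ident, ranked.order_date, ranked.region
    from (
        select v.*,
               rank() over (partition by v.region
                            order by v.order_date desc, v.ident desc) as rnk
        from view_orders v
        where v.region in ('NA', 'WE', 'EE')
    ) ranked
    where ranked.rnk <= 3;
$$;

-- callable from psql or from application code:
select * from recent_orders_by_region();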

PostgreSQL join to denormalize a table with generate_series

I've this table:
CREATE TABLE "mytable"
( name text, count integer );
INSERT INTO mytable VALUES ('john', 4),('mark',2),('albert',3);
and I would like to "denormalize" the rows in this way:
SELECT name FROM mytable JOIN generate_series(1,4) tmp(a) ON (a<=count)
so that I have a number of rows for each name equal to the count column: 4 rows with john, 2 with mark and 3 with albert.
But I can't use the generate_series() function if I don't know the highest count (in this case 4). Is there a way to do this without knowing MAX(count)?
select name,
generate_series(1,count)
from mytable;
Set-returning functions can be used in the select list and will do a cross join with the row retrieved from the base table.
I think this is an undocumented behaviour that might go away in the future, but I'm not sure about that (I recall some discussion regarding this on the mailing list).
SQLFiddle example
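Since PostgreSQL 9.3 the same result can also be written with an explicit LATERAL join (my addition, not part of the original answer), which avoids leaning on that behaviour:

select m.name
from mytable m
cross join lateral generate_series(1, m.count) as g;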
DROP TABLE ztable ;
CREATE TABLE ztable (zname varchar, zvalue INTEGER NOT NULL);
INSERT INTO ztable(zname, zvalue) VALUES( 'one', 1), ( 'two', 2 ), ( 'three', 3) , ( 'four', 4 );
WITH expand AS (
    WITH RECURSIVE zzz AS (
        SELECT 1::integer AS rnk, t0.zname
        FROM ztable t0
        UNION
        SELECT 1 + rr.rnk, t1.zname
        FROM ztable t1
        JOIN zzz rr ON rr.rnk < t1.zvalue
    )
    SELECT zzz.zname
    FROM zzz
)
SELECT x.*
FROM expand x
;

Getting those records that have a match of all records in another table

Within the realm of this problem I have 3 entities:
User
Position
License
Then I have two relational (many-to-many) tables:
PositionLicense - this one connects Position with License ie. which licenses are required for a particular position
UserLicense - this one connects User with License ie. which licenses a particular user has. But with an additional complexity: user licenses have validity date range (ValidFrom and ValidTo)
The problem
These are input variables:
UserID that identifies a particular User
RangeFrom defines the lower date range limit
RangeTo defines the upper date range limit
What do I need to get? For a particular user (and date range), a list of positions that this user can work at. The catch is that the user must hold all licenses required by a position in order to qualify for it.
I'm having huge problems writing a SQL query to get this list.
If at all possible I would like to do this using a single SQL query (can have additional CTEs of course). If you can convince me that doing it in several queries would be more efficient I'm willing to listen in.
Some workable data
Copy and run this script: 3 users, 3 positions, 6 licenses. Mark and John should have a match but not Jane.
create table [User] (
    UserID int identity not null primary key,
    Name nvarchar(100) not null
)
go
create table Position (
    PositionID int identity not null primary key,
    Name nvarchar(100) not null
)
go
create table License (
    LicenseID int identity not null primary key,
    Name nvarchar(100) not null
)
go
create table UserLicense (
    UserID int not null references [User](UserID),
    LicenseID int not null references License(LicenseID),
    ValidFrom date not null,
    ValidTo date not null,
    check (ValidFrom < ValidTo),
    primary key (UserID, LicenseID)
)
go
create table PositionLicense (
    PositionID int not null references Position(PositionID),
    LicenseID int not null references License(LicenseID),
    primary key (PositionID, LicenseID)
)
go
insert [User] (Name) values ('Mark the mechanic');
insert [User] (Name) values ('John the pilot');
insert [User] (Name) values ('Jane only has arts PhD but not medical.');
insert Position (Name) values ('Mechanic');
insert Position (Name) values ('Pilot');
insert Position (Name) values ('Doctor');
insert License (Name) values ('Mecha');
insert License (Name) values ('Flying');
insert License (Name) values ('Medicine');
insert License (Name) values ('PhD');
insert License (Name) values ('Phycho');
insert License (Name) values ('Arts');
insert PositionLicense (PositionID, LicenseID) values (1, 1);
insert PositionLicense (PositionID, LicenseID) values (2, 2);
insert PositionLicense (PositionID, LicenseID) values (2, 5);
insert PositionLicense (PositionID, LicenseID) values (3, 3);
insert PositionLicense (PositionID, LicenseID) values (3, 4);
insert UserLicense (UserID, LicenseID, ValidFrom, ValidTo) values (1, 1, '20110101', '20120101');
insert UserLicense (UserID, LicenseID, ValidFrom, ValidTo) values (2, 2, '20110101', '20120101');
insert UserLicense (UserID, LicenseID, ValidFrom, ValidTo) values (2, 5, '20110101', '20120101');
insert UserLicense (UserID, LicenseID, ValidFrom, ValidTo) values (3, 4, '20110101', '20120101');
insert UserLicense (UserID, LicenseID, ValidFrom, ValidTo) values (3, 6, '20110101', '20120101');
Resulting solution
I've set up my resulting solution based on the accepted answer, which provides the most simplified solution to this problem. If you'd like to play with the query just hit edit/clone (whether you're logged in or not). What can be changed:
three variables:
two variables to set the date range (@From and @To)
user ID (@User)
you can toggle commented code in the first CTE to switch between fully overlapping user licenses and partially overlapping ones (see the sketch of the two predicates below).
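The two variants presumably differ only in the date predicate applied to UserLicense; a sketch using the variable names above:

-- fully overlapping: the license must cover the whole requested range
where ValidFrom <= @From and ValidTo >= @To

-- partially overlapping: the license merely intersects the requested range
where ValidFrom <= @To and ValidTo >= @From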
This makes a number of assumptions (ignores presence of time in the datetime columns, assumes fairly obvious primary keys) and skips the joins to pull in user name, position details, and the like. (And you implied that the user had to hold all the licenses for the full period specified, right?)
SELECT pl.PositionId
FROM PositionLicense pl
LEFT OUTER JOIN (
    -- All licenses the user holds for the entirety of the specified date range
    SELECT LicenseId
    FROM UserLicense
    WHERE UserId = @UserId
      AND ValidFrom <= @RangeFrom
      AND ValidTo >= @RangeTo
) li
    ON li.LicenseId = pl.LicenseId
GROUP BY pl.PositionId
-- Keep only positions where all required licenses are held by the user
HAVING COUNT(pl.LicenseId) = COUNT(li.LicenseId)
No data so I can't debug or test it, but this or something very close to it should do the trick.
Select ...
From User As U
Cross Join Position As P
Where Exists (
    Select 1
    From PositionLicense As PL1
    Join UserLicense As UL1
        On UL1.LicenseId = PL1.LicenseId
        And UL1.ValidFrom <= @RangeTo
        And UL1.ValidTo >= @RangeFrom
    Where PL1.PositionId = P.Id
        And UL1.UserId = U.Id
    Except
    Select 1
    From PositionLicense As PL2
    Left Join UserLicense As UL2
        On UL2.LicenseId = PL2.LicenseId
        And UL2.ValidFrom <= @RangeTo
        And UL2.ValidTo >= @RangeFrom
        And UL2.UserId = U.Id
    Where PL2.PositionId = P.Id
        And UL2.UserId Is Null
)
If the requirement is that you want users and positions that are valid across the entire range, that is trickier:
With Calendar As
(
    Select @RangeFrom As [Date]
    Union All
    Select DateAdd(d, 1, [Date])
    From Calendar
    Where [Date] <= @RangeTo
)
Select ...
From User As U
Cross Join Position As P
Where Exists (
    Select 1
    From UserLicense As UL1
    Join PositionLicense As PL1
        On PL1.LicenseId = UL1.LicenseId
    Where UL1.UserId = U.Id
        And PL1.PositionId = P.Id
        And UL1.ValidFrom <= @RangeTo
        And UL1.ValidTo >= @RangeFrom
    Except
    Select 1
    From Calendar As C1
    Cross Join User As U1
    Cross Join PositionLicense As PL1
    Where U1.Id = U.Id
        And PL1.PositionId = P.Id
        And Not Exists (
            Select 1
            From UserLicense As UL2
            Where UL2.LicenseId = PL1.LicenseId
                And UL2.UserId = U1.Id
                And C1.Date Between UL2.ValidFrom And UL2.ValidTo
        )
)
Option ( MaxRecursion 0 );
Runnable Version Here
WITH PositionRequirements AS (
    SELECT p.PositionID, COUNT(*) AS LicenseCt
    FROM #Position AS p
    INNER JOIN #PositionLicense AS posl
        ON posl.PositionID = p.PositionID
    GROUP BY p.PositionID
)
, Satisfied AS (
    SELECT u.UserID, posl.PositionID, COUNT(*) AS LicenseCt
    FROM #User AS u
    INNER JOIN #UserLicense AS perl
        ON perl.UserID = u.UserID
        -- AND @Date BETWEEN perl.ValidFrom AND perl.ValidTo
        AND '20110101' BETWEEN perl.ValidFrom AND perl.ValidTo
    INNER JOIN #PositionLicense AS posl
        ON posl.LicenseID = perl.LicenseID
    -- WHERE u.UserID = @UserID -- Not strictly necessary, we can go over all people
    GROUP BY u.UserID, posl.PositionID
)
SELECT PositionRequirements.PositionID, Satisfied.UserID
FROM PositionRequirements
INNER JOIN Satisfied
    ON Satisfied.PositionID = PositionRequirements.PositionID
    AND PositionRequirements.LicenseCt = Satisfied.LicenseCt
You could probably turn this into an inline table-valued function parameterized on effective date.
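A sketch of such a function (my naming, and using the permanent tables from the setup script rather than the temp tables above):

CREATE FUNCTION dbo.PositionsSatisfiedOn (@Date date)
RETURNS TABLE
AS
RETURN
    WITH PositionRequirements AS (
        SELECT pl.PositionID, COUNT(*) AS LicenseCt
        FROM PositionLicense AS pl
        GROUP BY pl.PositionID
    ), Satisfied AS (
        SELECT ul.UserID, pl.PositionID, COUNT(*) AS LicenseCt
        FROM UserLicense AS ul
        INNER JOIN PositionLicense AS pl
            ON pl.LicenseID = ul.LicenseID
        WHERE @Date BETWEEN ul.ValidFrom AND ul.ValidTo
        GROUP BY ul.UserID, pl.PositionID
    )
    SELECT s.UserID, s.PositionID
    FROM PositionRequirements AS pr
    INNER JOIN Satisfied AS s
        ON s.PositionID = pr.PositionID
        AND s.LicenseCt = pr.LicenseCt;
GO

-- usage:
SELECT * FROM dbo.PositionsSatisfiedOn('20110101');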

TSQL: Remove duplicates based on max(date)

I am searching for a query that selects the maximum date (a datetime column) per id and keeps that row's id and row_id; all other rows are to be DELETEd from the source table.
Source Data
id  date        row_id (unique)
1   11/11/2009  1
1   12/11/2009  2
1   13/11/2009  3
2   1/11/2009   4

Expected Survivors

1   13/11/2009  3
2   1/11/2009   4
What query would I need to achieve the results I am looking for?
Tested on PostgreSQL:
delete from table where (id, date) not in (select id, max(date) from table group by id);
There are various ways of doing this, but the basic idea is the same:
- Identify the rows you want to keep
- Compare each row in your table to the ones you want to keep
- Delete any that don't match
DELETE [source]
FROM yourTable AS [source]
LEFT JOIN yourTable AS [keep]
    ON [keep].id = [source].id
    AND [keep].date = (SELECT MAX(date) FROM yourTable WHERE id = [keep].id)
WHERE [keep].id IS NULL

DELETE [yourTable]
FROM [yourTable]
LEFT JOIN
(
    SELECT id, MAX(date) AS date FROM yourTable GROUP BY id
) AS [keep]
    ON [keep].id = [yourTable].id
    AND [keep].date = [yourTable].date
WHERE [keep].id IS NULL

DELETE [source]
FROM yourTable AS [source]
WHERE [source].row_id != (SELECT TOP 1 row_id FROM yourTable WHERE id = [source].id ORDER BY date DESC)

DELETE [source]
FROM yourTable AS [source]
WHERE NOT EXISTS (SELECT id FROM yourTable GROUP BY id HAVING id = [source].id AND MAX(date) = [source].date)
Because you are using SQL Server 2000, you're not able to use the ROW_NUMBER() OVER technique to set up a sequence and identify the top row for each unique id.
So your proposed technique is to use a datetime column to get the top 1 row to remove duplicates. That might work, but there is a possibility that you might still get duplicates having the same datetime value. But that's easy enough to check for.
First check the assumption that all rows are unique based on the id and date columns:
CREATE TABLE #TestTable (rowid INT IDENTITY(1,1), thisid INT, thisdate DATETIME)
INSERT INTO #TestTable (thisid,thisdate) VALUES (1, '11/11/2009')
INSERT INTO #TestTable (thisid,thisdate) VALUES (1, '12/11/2009')
INSERT INTO #TestTable (thisid,thisdate) VALUES (1, '12/12/2009')
INSERT INTO #TestTable (thisid,thisdate) VALUES (2, '1/11/2009')
INSERT INTO #TestTable (thisid,thisdate) VALUES (2, '1/11/2009')
SELECT COUNT(*) AS thiscount
FROM #TestTable
GROUP BY thisid, thisdate
HAVING COUNT(*) > 1
This example returns a value of 2 - indicating that you will still end up with duplicates even after using the date column to remove duplicates. If you return 0, then you have proven that your proposed technique will work.
When de-duping production data, I think one should take some precautions and test before and after. You should create a table to hold the rows you plan to remove so you can recover them easily if you need to after the delete statement has been executed.
Also, it's a good idea to know beforehand how many rows you plan to remove so you can verify the count before and after - and you can gauge the magnitude of the delete operation. Based on how many rows will be affected, you can plan when to run the operation.
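For example, a minimal precaution along these lines (table and column names assumed from the test setup above) snapshots the doomed rows before deleting them:

-- copy the non-surviving rows into a holding table first
SELECT t.*
INTO #RemovedRows
FROM #TestTable t
WHERE t.thisdate <> (SELECT MAX(t2.thisdate)
                     FROM #TestTable t2
                     WHERE t2.thisid = t.thisid);

-- sanity check: how many rows are we about to delete?
SELECT COUNT(*) AS rows_to_remove FROM #RemovedRows;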
To test before the de-duping process, find the occurrences.
-- Get occurrences of duplicates
SELECT COUNT(*) AS thiscount
FROM
#TestTable
GROUP BY thisid
HAVING COUNT(*) > 1
ORDER BY thisid
That gives you the ids that have more than one row. Capture the results of this query into a temporary table and then run a query using SUM to get the total number of rows that are not unique based on your key.
To get the number of rows you plan to delete, you need the count of rows that are duplicate based on your unique key, and the number of distinct rows based on your unique key. You subtract the distinct rows from the count of occurrences. All that is pretty straightforward - so I'll leave you to it.
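In a single query (my shorthand, assuming thisid is the de-dupe key), the expected number of deletions is simply total rows minus distinct keys:

-- rows that will be removed when keeping one row per thisid
SELECT COUNT(*) - COUNT(DISTINCT thisid) AS rows_to_delete
FROM #TestTable;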
Try this
declare @t table (id int, dt datetime, rowid int identity(1,1))

INSERT INTO @t (id,dt) VALUES (1, '11/11/2009')
INSERT INTO @t (id,dt) VALUES (1, '11/12/2009')
INSERT INTO @t (id,dt) VALUES (1, '11/13/2009')
INSERT INTO @t (id,dt) VALUES (2, '11/01/2009')
Query:
delete from @t where rowid not in (
    select t.rowid
    from @t t
    inner join (
        -- join on id as well as the date, so ids that happen to share a max date don't interfere
        select id, MAX(dt) as maxdate
        from @t
        group by id
    ) X
        on t.id = X.id
       and t.dt = X.maxdate
)

select * from @t
Output:
id dt rowid
1 2009-11-13 00:00:00.000 3
2 2009-11-01 00:00:00.000 4
delete from temp where row_id not in (
    select t.row_id
    from temp t
    right join (
        select id, MAX(dt) as dt
        from temp
        group by id
    ) d
        on t.dt = d.dt and t.id = d.id
)
I have tested this answer:
declare @t table (id int, dt datetime, rowid int identity(1,1))

INSERT INTO @t (id,dt) VALUES (1, '11/11/2009')
INSERT INTO @t (id,dt) VALUES (1, '11/12/2009')
INSERT INTO @t (id,dt) VALUES (1, '11/13/2009')
INSERT INTO @t (id,dt) VALUES (2, '11/01/2009')

select * from @t

;WITH T AS (
    select dense_rank() over (partition by id order by dt desc) NO, dt, id, rowid
    from @t
)
DELETE T WHERE NO > 1
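Note that DENSE_RANK keeps every row tied on the maximum dt for an id. If exactly one survivor per id is required even in the presence of ties, a ROW_NUMBER variant with a tiebreaker (my sketch, same table as above) does that:

;WITH T AS (
    select row_number() over (partition by id order by dt desc, rowid desc) NO
    from @t
)
DELETE T WHERE NO > 1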