Postgres Distinct Order by - postgresql

I have to tables that I want to join, order by two timestamps and get as result the distinct values (for several columns). But it doesn't work.
See examples below:
CREATE TABLE t1(myid int, myyear int, mycol int, mdate timestamp);
INSERT INTO t1 VALUES
(11833,2022,1059,'2022-11-03 22:02:00'),(11834,2022,1059,'2022-11-17 19:56:41'),(11832,2021,1058,'2021-11-16 16:38:21'),(11839,2021,1057,'2021-11-10 18:08:09'),(11847,2021,1055,'2022-05-31 12:13:11'),(11847,2021,1055,'2022-05-31 12:13:11'),(11850,2021,1049,'2021-09-29 16:11:31'),(11853,2021,1046,'2022-01-24 11:44:41'),(11855,2021,1045,'2022-01-24 11:38:05'),(11865,2021,1044,'2022-01-24 11:23:51'),(11856,2021,1043,'2022-01-24 11:00:24'),(11840,2021,1042,'2021-11-30 12:28:13'),(11831,2021,1042,'2021-11-30 12:22:30'),(11846,2022,1042,'2022-11-02 15:06:00'),(11829,2022,1036,'2022-11-02 02:37:00'),(11826,2021,1035,'2021-09-24 13:07:48'),(11825,2021,1034,'2021-10-06 08:22:23'),(11830,2022,1033,'2022-11-03 21:18:00'),(11827,2022,1033,'2022-11-15 21:46:04'),(11828,2022,1032,'2022-11-08 16:44:08'),(11824,2022,1031,'2022-10-25 18:09:03'),(11823,2022,1031,'2022-11-02 03:10:00'),(11822,2022,1030,'2022-10-24 14:59:25')
;
CREATE TABLE t2(myid int, name varchar,idate timestamp);
INSERT INTO t2 VALUES
(11833,'Name1684','2023-01-10 15:52:55'),(11834,'Name1727','2023-01-10 15:52:55'),(11832,'Name609','2023-01-10 15:52:54'),(11839,'Name608','2023-01-10 15:52:59'),(11847,'Name606','2023-01-10 15:53:03'),(11847,'Name607','2023-01-10 15:53:03'),(11850,'Name605','2023-01-10 15:53:04'),(11853,'Name604','2023-01-10 15:53:05'),(11855,'Name603','2023-01-10 15:53:06'),(11865,'Name602','2023-01-10 15:53:10'),(11856,'Name601','2023-01-10 15:53:07'),(11840,'Name600','2023-01-10 15:52:59'),(11831,'Name1726','2023-01-10 15:52:53'),(11846,'Name1683','2023-01-10 15:53:03'),(11829,'Name1682','2023-01-10 15:52:52'),(11826,'Name599','2023-01-10 15:52:50'),(11825,'Name598','2023-01-10 15:52:49'),(11830,'Name1681','2023-01-10 15:52:52'),(11827,'Name1725','2023-01-10 15:52:51'),(11828,'Name1680','2023-01-10 15:52:51'),(11824,'Name1678','2023-01-10 15:52:48'),(11823,'Name1679','2023-01-10 15:52:48'),(11822,'Name1677','2023-01-10 15:52:47')
;
Show example which is not working before order and distinct:
Select
*
from t1
join t2
on t1.myid=t2.myid where t1.mycol =1059
=> Gives me this result:
myid
myyear
mycol
mdate
myid
name
idate
11833
2022
1059
2022-11-03 22:02:00
11833
Name1684
2023-01-10 15:52:55
11834
2022
1059
2022-11-17 19:56:41
11834
Name1727
2023-01-10 15:52:55
I want to order first by column mdate, then by idate (both to see the youngest dates) and then see only distinct values of (myyear and mycol)
CREATE TABLE expectedresult(myid int, myyear int,mycol int, mdate timestamp,name varchar,idate timestamp);
INSERT INTO expectedresult VALUES
(11834,2022,1059,'2022-11-17 19:56:41','Name1727','2023-01-10 15:52:55')
myid
myyear
mycol
mdate
name
idate
11834
2022
1059
2022-11-17 19:56:41
Name1727
2023-01-10 15:52:55
This is what I have tried:
create table t3 as(
select distinct on (subq1.myyear,subq1.mycol)
*
from(
Select
t1.myid,
t1.myyear,
t1.mycol,
t1.mdate,
t2.name,
t2.idate
from t1
join t2
on t1.myid=t2.myid
order by t1.mdate desc, t2.idate desc) subq1)
But it "distincts" the wrong row(because a younger mdate is available):
select * from t3 where mycol =1059
myid
myyear
mycol
mdate
name
idate
11833
2022
1059
2022-11-03 22:02:00
Name1684
2023-01-10 15:52:55
here also as fiddle:
https://dbfiddle.uk/eS5FoBeq
Best

SELECT DISTINCT ON (t1.myyear, t1.mycol)
*
FROM
t1
JOIN t2 ON t1.myid = t2.myid
ORDER BY
t1.myyear,
t1.mycol,
t1.mdate DESC,
t2.idate DESC;
or rewrite your query as:
SELECT DISTINCT ON (subq1.myyear, subq1.mycol)
*
FROM (
SELECT
t1.myid,
t1.myyear,
t1.mycol,
t1.mdate,
t2.name,
t2.idate
FROM
t1
JOIN t2 ON t1.myid = t2.myid
ORDER BY
t1.mdate DESC,
t2.idate DESC) subq1
ORDER BY
subq1.myyear,
subq1.mycol,
subq1.mdate DESC,
subq1.idate DESC;
if you distinct on (x,y) then you order by should be order by x,y,z
x, y is the columns that you want to get the unique row.In a group set (x,y), there are many rows, but you only want one, then you need order by z to get the only one row in a group set (x,y) in a deterministic way, otherwise, it will get a random row in a group set(x,y).

In general I try to avoid using distinct.
You can use row number to identify the number of elements with the same "myyear" and "mycol" and order them by newest date and then select the first value (rn =1).
with cte as(
Select
t1.myid,
t1.myyear,
t1.mycol,
t1.mdate,
t2.name,
t2.idate,
ROW_NUMBER() OVER (PARTITION BY myyear, mycol ORDER BY mdate DESC) as rn
from t1
join t2
on t1.myid=t2.myid
subq1)
)
Select *
from cte
where rn = 1

Related

SQL Debugging Help Needed

I am writing a query in Redshift to answer the question "Give the average lifetime spend of users who spent more on their first order than their second order." This is based off of an order_items table which has one row for every item ordered (so an order with 3 items would be represented in 3 rows). Here's a snapshot of the first 10 rows:
First 10 rows of order_items:
Here is my solution:
with
cte1_lifetime as (
select oi.user_id, sum(oi.sale_price) as lifetime_spend
from order_items as oi
group by oi.user_id
),
cte2_order as (
select oi.user_id, oi.order_id, sum(oi.sale_price) as order_total, rank() over(partition by oi.user_id order by oi.created_at) as order_rank
from order_items as oi
group by oi.user_id, oi.order_id, oi.created_at
order by oi.user_id, oi.order_id
),
cte3_first_order as (
select user_id, order_id, order_total
from cte2_order
where order_rank=1
order by user_id, order_id
),
cte4_second_order as (
select user_id, order_id, order_total
from cte2_order
where order_rank=2
order by user_id, order_id
)
select avg(cte1.lifetime_spend) as average_lifetime_spend
from cte1_lifetime as cte1
where exists (
select *
from cte3_first_order as cte3, cte4_second_order as cte4
where cte3.user_id=cte4.user_id
and cte1.user_id=cte3.user_id
and cte3.order_total > cte4.order_total)
And here is the answer key:
WITH
table1 AS
(SELECT user_id, order_id,
SUM(sale_price) OVER (PARTITION BY order_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as order_total,
RANK() OVER (PARTITION BY user_id ORDER BY created_at) AS "sequence"
FROM order_items)
,
table2 AS
(SELECT user_id, SUM(sale_price) AS lifetime_spend
FROM order_items
WHERE EXISTS
(SELECT t1.user_id
FROM table1 t1, table1 t2
WHERE t1.user_id = t2.user_id AND t1.sequence = 1 AND t2.sequence = 2 AND t1.order_total>t2.order_total
AND t1.user_id = order_items.user_id)
GROUP BY 1
ORDER BY 1)
SELECT AVG(lifetime_spend)
FROM table2
These answers yield slightly different results on the same data- an average lifetime spend of $215 vs $220. I'd really like to understand why they are different but so far I can't figure it out. Any ideas?

Date in one table is before date in another table - Postgres

I have a table 1
and Table 2
I need to get the following table where the date from table 1 is the closest (i.e. before) to the date from table 2 by id.
I assume I need to join two table where table1.id=table2.id and table1.date<=table2.date and then, rank to get the 'last' record in that merged table? Is it correct? Is there a simpler way?
You can see structure and result in: dbfiddle
select
distinct on (t1.id)
t1.id,
last_value(t1.type) over (order by to_date(t1.date, 'mm/dd/yyyy') desc)
from
table1 t1 inner join table2 t2 on t1.id = t2.id
where
to_date(t1.date, 'mm/dd/yyyy') <= to_date(t2.date, 'mm/dd/yyyy');

Selecting the 1st and 10th Records Only

Have a table with 3 columns: ID, Signature, and Datetime, and it's grouped by Signature Having Count(*) > 9.
select * from (
select s.Signature
from #Sigs s
group by s.Signature
having count(*) > 9
) b
join #Sigs o
on o.Signature = b.Signature
order by o.Signature desc, o.DateTime
I now want to select the 1st and 10th records only, per Signature. What determines rank is the Datetime descending. Thus, I would expect every Signature to have 2 rows.
Thanks,
I would go with a couple of common table expressions.
The first will select all records from the table as well as a count of records per signature, and the second one will select from the first where the record count > 9 and add row_number partitioned by signature - and then just select from that where the row_number is either 1 or 10:
With cte1 AS
(
SELECT ID, Signature, Datetime, COUNT(*) OVER(PARTITION BY Signature) As NumberOfRows
FROM #Sigs
), cte2 AS
(
SELECT ID, Signature, Datetime, ROW_NUMBER() OVER(PARTITION BY Signature ORDER BY DateTime DESC) As Rn
FROM cte1
WHERE NumberOfRows > 9
)
SELECT ID, Signature, Datetime
FROM cte2
WHERE Rn IN (1, 10)
ORDER BY Signature desc
Because I don't know what your data looks like, this might need some adjustment.
The simplest way here, since you already know your sort order (DateTime DESC) and partitioning (Signature), is probably to assign row numbers and then select the rows you want.
SELECT *
FROM
(
select o.Signature
,o.DateTime
,ROW_NUMBER() OVER (PARTITION BY o.Signature ORDER BY o.DateTime DESC) [Row]
from (
select s.Signature
from #Sigs s
group by s.Signature
having count(*) > 9
) b
join #Sigs o
on o.Signature = b.Signature
order by o.Signature desc, o.DateTime
)
WHERE [Row] IN (1,10)

Grouping consecutive dates in PostgreSQL

I have two tables which I need to combine as sometimes some dates are found in table A and not in table B and vice versa. My desired result is that for those overlaps on consecutive days be combined.
I'm using PostgreSQL.
Table A
id startdate enddate
--------------------------
101 12/28/2013 12/31/2013
Table B
id startdate enddate
--------------------------
101 12/15/2013 12/15/2013
101 12/16/2013 12/16/2013
101 12/28/2013 12/28/2013
101 12/29/2013 12/31/2013
Desired Result
id startdate enddate
-------------------------
101 12/15/2013 12/16/2013
101 12/28/2013 12/31/2013
Right. I have a query that I think works. It certainly works on the sample records you provided. It uses a recursive CTE.
First, you need to merge the two tables. Next, use a recursive CTE to get the sequences of overlapping dates. Finally, get the start and end dates, and join back to the "merged" table to get the id.
with recursive allrecords as -- this merges the input tables. Add a unique row identifier
(
select *, row_number() over (ORDER BY startdate) as rowid from
(select * from table1
UNION
select * from table2) a
),
path as ( -- the recursive CTE. This gets the sequences
select rowid as parent,rowid,startdate,enddate from allrecords a
union
select p.parent,b.rowid,b.startdate,b.enddate from allrecords b join path p on (p.enddate + interval '1 day')>=b.startdate and p.startdate <= b.startdate
)
SELECT id,g.startdate,g.enddate FROM -- outer query to get the id
-- inner query to get the start and end of each sequence
(select parent,min(startdate) as startdate, max(enddate) as enddate from
(
select *, row_number() OVER (partition by rowid order by parent,startdate) as row_number from path
) a
where row_number = 1 -- We only want the first occurrence of each record
group by parent)g
INNER JOIN allrecords a on a.rowid = parent
The below fragment does what you intend. (but it will probably be very slow) The problem is that detecteng (non)overlapping dateranges is impossible with standard range operators, since a range could be split into two parts.
So, my code does the following:
split the dateranges from table_A into atomic records, with one date per record
[the same for table_b]
cross join these two tables (we are only interested in A_not_in_B, and B_not_in_A) , remembering which of the L/R outer join wings it came from.
re-aggregate the resulting records into date ranges.
-- EXPLAIN ANALYZE
--
WITH RECURSIVE ranges AS (
-- Chop up the a-table into atomic date units
WITH ar AS (
SELECT generate_series(a.startdate,a.enddate , '1day'::interval)::date AS thedate
, 'A'::text AS which
, a.id
FROM a
)
-- Same for the b-table
, br AS (
SELECT generate_series(b.startdate,b.enddate, '1day'::interval)::date AS thedate
, 'B'::text AS which
, b.id
FROM b
)
-- combine the two sets, retaining a_not_in_b plus b_not_in_a
, moments AS (
SELECT COALESCE(ar.id,br.id) AS id
, COALESCE(ar.which, br.which) AS which
, COALESCE(ar.thedate, br.thedate) AS thedate
FROM ar
FULL JOIN br ON br.id = ar.id AND br.thedate = ar.thedate
WHERE ar.id IS NULL OR br.id IS NULL
)
-- use a recursive CTE to re-aggregate the atomic moments into ranges
SELECT m0.id, m0.which
, m0.thedate AS startdate
, m0.thedate AS enddate
FROM moments m0
WHERE NOT EXISTS ( SELECT * FROM moments nx WHERE nx.id = m0.id AND nx.which = m0.which
AND nx.thedate = m0.thedate -1
)
UNION ALL
SELECT rr.id, rr.which
, rr.startdate AS startdate
, m1.thedate AS enddate
FROM ranges rr
JOIN moments m1 ON m1.id = rr.id AND m1.which = rr.which AND m1.thedate = rr.enddate +1
)
SELECT * FROM ranges ra
WHERE NOT EXISTS (SELECT * FROM ranges nx
-- suppress partial subassemblies
WHERE nx.id = ra.id AND nx.which = ra.which
AND nx.startdate = ra.startdate
AND nx.enddate > ra.enddate
)
;

Select last value in a month for all given IDs

I have 2 tables, one containing meter IDs, and another containing measurements for some of the meters in the first table. This is the table structure:
MeterConfig:
MeterID (int)
MeterNumber (char[16])
Type (char[25])
Readings:
MeterID (int)
Date (datetime)
Value (numeric(18,6))
I need to get the last reading (and its date) from a given period for each meter, as well as the meter number. I managed to do this in T-SQL, although I'm not particularly pleased with the way I did it using this query:
select distinct
cfg.MeterNumber,
(select top 1 r.Date from Readings as r where r.Date between #startdate and #endDate and r.MeterID = cfg.MeterID order by r.Date desc) as Date,
(select top 1 r.Value from Readings as r where r.Date between #startdate and #endDate and r.MeterID = cfg.MeterID order by r.Date desc) as Value
from
MeterConfig cfg, Readings r1
where cfg.MeterID = r1.MeterID and r1.Date between #startdate and #endDate;
How can I do this more efficiently?
WITH CTE AS (
SELECT mc.MeterID, Date, Value, ROW_NUMBER() OVER (PARTITION BY mc.MeterID ORDER BY Date DESC) as Rank
FROM MeterConfig mc
INNER JOIN Readings rd
ON mc.MeterID = rd.MeterID
WHERE rd.Date BETWEEN #startdate AND #endDate)
SELECT * FROM CTE WHERE Rank = 1
Assuming the dates in Readings are unique (ic include a timestamp), following should be equivalent to your query.
SELECT DISTINCT cfg.MeterNumber
, r1.Date
, r1.Value
FROM MeterConfig cfg
INNER JOIN Readings r1 ON cfg.MeterID = r1.MeterID
INNER JOIN (
SELECT date = MAX(r.Date)
FROM Readings r
WHERE r.Date BETWEEN #StartDate AND #EndDate
) r2 On r2.date = r1.date