GROUP BY column and clause in postgres - postgresql

I would like group the columns of a table with by a column value as well as when another condition is met. For example, with the following table:
Events:
id session_id flags created_at ...
--------------------------------------------
1 100 OTHER ...
2 101 OTHER ...
3 101 NEW_SESSION ...
4 101 OTHER ...
5 101 NEW_SESSION ...
6 100 OTHER ...
7 102 OTHER ...
I want the following result:
session_id events_count first_event_id last_event_id
-------------------------------------------------------
100-0 2 1 6
101-0 1 2 2
101-1 2 3 4
101-2 1 5 5
102-0 1 7 7
The basic idea is that I want to extract sessions from events. They are grouped by session_id. I also want a new session whenever I have the flag NEW_SESSION.
The query is something like this:
SELECT ? as session_id
, count(id) as events_count
, MIN(id) as first_event_id
, MAX(id) last_event_id
GROUP BY session_id
-- , and whenever flags is NEW_SESSION
ORDER BY id
But I dont know how to express the group by condition properly. Any idea ?

Update 2
In the comments I've noticed that you want them unique. Then we can use a variable:
SET #inc := 0;
(
SELECT CONCAT(session_id, '-', !ABS(STRCMP(flags, 'NEW_SESSION'))) AS session_id
, COUNT(id) AS events_count
, MIN(id) AS first_event_id
, MAX(id) last_event_id
FROM events
WHERE flags != 'NEW_SESSION'
GROUP BY events.session_id, events.flags
ORDER BY events.id
) UNION (
SELECT CONCAT(session_id, '-', #inc := #inc + 1) AS session_id
, COUNT(id) AS events_count
, MIN(id) AS first_event_id
, MAX(id) last_event_id
FROM events
WHERE flags = 'NEW_SESSION'
GROUP by events.id
ORDER BY events.id
);
Update
The following prevents grouping for the NEW_SESSION rows:
(
SELECT CONCAT(session_id, '-', !ABS(STRCMP(flags, 'NEW_SESSION'))) AS session_id
, COUNT(id) AS events_count
, MIN(id) AS first_event_id
, MAX(id) last_event_id
FROM events
WHERE flags != 'NEW_SESSION'
GROUP BY events.session_id, events.flags
ORDER BY events.id
) UNION (
SELECT CONCAT(session_id, '-1') AS session_id
, COUNT(id) AS events_count
, MIN(id) AS first_event_id
, MAX(id) last_event_id
FROM events
WHERE flags = 'NEW_SESSION'
GROUP BY id
ORDER BY events.id
);
Original answer
As far as I understand, you are trying to group events by the session IDs and
"whether it's a NEW_SESSION" flag. If it's so, then I'd express it as follows:
SELECT CONCAT(session_id, '-', !ABS(STRCMP(flags, 'NEW_SESSION'))) AS session_id
, COUNT(id) AS events_count
, MIN(id) AS first_event_id
, MAX(id) last_event_id
FROM events
GROUP BY events.session_id, events.flags
ORDER BY events.id;

Related

How to collapse overlapping date periods with acceptable gaps using T-SQL?

We want to group our members' enrollments into "continuous enrollments," allowing for a gap of up to 45 days. I know how to use LEAD to determine if an enrollment should be grouped with the next, but I don't know how to group them. Would it be more appropriate to add 45 to the term date and subtract 45 from the effective date, then check for overlapping date periods? My goal is to have a SQL view that returns the results similar to the final query below. Thank you for your help.
SELECT '101' AS MemID, '2021-01-01' AS EffDate, '2021-01-31' AS TermDate INTO #T1 UNION
SELECT '101', '2021-02-01', '2021-02-28' UNION
SELECT '101', '2021-03-01', '2021-03-31' UNION
SELECT '101', '2021-06-01', '2021-06-30' UNION
SELECT '999', '2021-01-01', '2021-01-15' UNION
SELECT '999', '2021-09-01', '2021-09-28' UNION
SELECT '999', '2021-10-01', '2021-10-31'
SELECT *
, LEAD(EffDate) OVER (PARTITION BY MemID ORDER BY EffDate) AS LeadEffDate
, DATEDIFF(DAY, TermDate, (LEAD(EffDate) OVER (PARTITION BY MemID ORDER BY EffDate))) AS DaysToNextEnrollment
, CASE WHEN (DATEDIFF(DAY, TermDate, (LEAD(EffDate) OVER (PARTITION BY MemID ORDER BY EffDate)))) <= 45 THEN 1 ELSE 0 END AS CombineWithNextRecord
FROM #T1
-- result objective
SELECT 101 AS MemID, '2021-01-01' AS EffDate, '2021-03-31' AS TermDate UNION
SELECT 101, '2021-06-01', '2021-06-30' UNION
SELECT 999, '2021-01-01', '2021-01-15' UNION
SELECT 999, '2021-09-01', '2021-10-31'
I think you are really close. Your question is very similar to
TSQL - creating from-to date table while ignoring in-between steps with conditions with a logic difference on what you want to consider to be the same group.
My basic approach is to use the LAG() function to figure out the previous values for MemID and TermDate and combine that with your 45 day rule to define a group. And finally get the first and last values of each group.
Here is my response to that question modified to your situation.
SELECT
a4.MemID
, CONVERT (DATE, a4.First_EffDate) AS [EffDate]
, CONVERT (DATE, a4.TermDate) AS [TermDate]
FROM (
SELECT
a3.MemID
, a3.EffDate
, a3.TermDate
, a3.MemID_group
, FIRST_VALUE (a3.EffDate) OVER (PARTITION BY a3.MemID_group ORDER BY a3.EffDate) AS [First_EffDate]
, ROW_NUMBER () OVER (PARTITION BY a3.MemID_group ORDER BY a3.EffDate DESC) AS [Row_number]
FROM (
SELECT
a2.MemID
, a2.EffDate
, a2.TermDate
, a2.Previous_MemID
, a2.Previous_TermDate
, a2.New_group
, SUM (a2.New_group) OVER (ORDER BY a2.MemID, a2.EffDate) AS [MemID_group]
FROM (
SELECT
a1.MemID
, a1.EffDate
, a1.TermDate
, a1.Previous_MemID
, a1.Previous_TermDate
---------------------------------------------------------------------------------
-- new group if the MemID is different from the previous row OR
-- if the MemID is the same as the previous row AND it has been more than 45 days
-- between the TermDate of the previous row and the EffDate of the current row
,
IIF((a1.MemID <> a1.Previous_MemID)
OR (
a1.MemID = a1.Previous_MemID
AND DATEDIFF (DAY, a1.Previous_TermDate, a1.EffDate) > 45
)
, 1
, 0) AS [New_group]
---------------------------------------------------------------------------------
FROM (
SELECT
MemID
, EffDate
, TermDate
, LAG (MemID) OVER (ORDER BY MemID) AS [Previous_MemID]
, LAG (TermDate) OVER (PARTITION BY MemID ORDER BY EffDate) AS [Previous_TermDate]
FROM #T1
) a1
) a2
) a3
) a4
WHERE a4.[Row_number] = 1;
Here is the dbfiddle.

Find the first five occurence of unique values and implement them into aggregation functions

As the title said, I need to find the first five occurence of each value in my table and then aggregate them.
Table structure:
user_id
booking_created_time
booking_paid_time
booking_price_amount
Code:
select x.user_id, row_number() over(partition by x.user_id order by x.booking_created_time)
as booking_sequence, x.booking_created_time::date as booking_created_date, x.booking_price_amount,
sum(y.booking_price_amount) as total_booking_price_amount from
(
select user_id, booking_created_time, booking_price_amount from fact_flight_sales
group by user_id, booking_created_time, booking_price_amount
) as x
join
(
select user_id, booking_price_amount
from fact_flight_sales group by user_id, booking_price_amount
) as y
on x.user_id = y.user_id
group by x.user_id, x.booking_created_time, x.booking_price_amount
having count(x.user_id) >= 1 and sum(y.booking_price_amount) >25000000
order by total_booking_price_amount desc, booking_sequence asc;
The output that I have now looks like
Sample output:
user_id
booking_sequence
booking_created_date
booking_price_amount
total_booking_price_amount
sforlongf
1
2017-05-21
8257056
39826576
sforlongf
2
2017-09-19
8449307
39826576
sforlongf
3
2018-01-08
8677950
39826576
sforlongf
4
2018-09-01
4317539
39826576
sforlongf
5
2018-09-16
6196224
39826576
sforlongf
6
2018-12-16
3928500
39826576
smassy0
1
2017-04-09
9109669
33241207
smassy0
2
2017-06-11
2609767
33241207
smassy0
3
2018-03-31
9809016
33241207
smassy0
4
2018-11-02
7223492
33241207
smassy0
5
2018-11-06
4489263
33241207
As you can see, sforlongf has 6 occurences, how do I limit it's occurence to 5 and make it affect the total_booking_price_amount?
I just can't get the table to work! Sorry about that! It works in preview :/
I managed to make it work by adding a number_row() into each subquery and limiting it
select x.user_id, row_number() over(partition by x.user_id order by x.booking_created_time)
as booking_sequence, x.booking_created_time::date as booking_created_date, x.booking_price_amount,
sum(y.booking_price_amount) as total_booking_price_amount from
(
select user_id, booking_created_time, booking_price_amount, row_number() over(partition by
user_id order by booking_created_time)
as booking_sequence from fact_flight_sales
group by user_id, booking_created_time, booking_price_amount
) as x
join
(
select user_id, booking_price_amount, row_number() over(partition by
user_id order by booking_created_time) as booking_sequence
from fact_flight_sales order by booking_created_time
) as y
on x.user_id = y.user_id where x.booking_sequence<=5 and y.booking_sequence<=5
group by x.user_id, x.booking_created_time, x.booking_price_amount
having count(x.user_id) >= 1 and sum(y.booking_price_amount) >25000000
order by total_booking_price_amount desc, booking_sequence asc;

SQL Server - Select with Group By together Raw_Number

I'm using SQL Server 2000 (80). So, it's not possible to use the LAG function.
I have a code a data set with four columns:
Purchase_Date
Facility_no
Seller_id
Sale_id
I need to identify missing Sale_ids. So every sale_id is a 100% sequential, so the should not be any gaps in order.
This code works for a specific date and store if specified. But i need to work on entire data set looping looping through every facility_id and every seller_id for ever purchase_date
declare #MAXCOUNT int
set #MAXCOUNT =
(
select MAX(Sale_Id)
from #table
where
Facility_no in (124) and
Purchase_date = '2/7/2020'
and Seller_id = 1
)
;WITH TRX_COUNT AS
(
SELECT 1 AS Number
union all
select Number + 1 from TRX_COUNT
where Number < #MAXCOUNT
)
select * from TRX_COUNT
where
Number NOT IN
(
select Sale_Id
from #table
where
Facility_no in (124)
and Purchase_Date = '2/7/2020'
and seller_id = 1
)
order by Number
OPTION (maxrecursion 0)
My Dataset
This column:
case when
Sale_Id=0 or 1=Sale_Id-LAG(Sale_Id) over (partition by Facility_no, Purchase_Date, Seller_id)
then 'OK' else 'Previous Missing' end
will tell you which Seller_Ids have some sale missing. If you want to go a step further and have exactly your desired output, then filter out and distinct the 'Previous Missing' ones, and join with a tally table on not exists.
Edit: OP mentions in comments they can't use LAG(). My suggestion, then, would be:
Make a temp table that that has the max(sale_id) group by facility/seller_id
Then you can get your missing results by this pseudocode query:
Select ...
from temptable t
inner join tally N on t.maxsale <=N.num
where not exists( select ... from sourcetable s where s.facility=t.facility and s.seller=t.seller and s.sale=N.num)
> because the only way to "construct" nonexisting combinations is to construct them all and just remove the existing ones.
This one worked out
; WITH cte_Rn AS (
SELECT *, ROW_NUMBER() OVER(PARTITION BY Facility_no, Purchase_Date, Seller_id ORDER BY Purchase_Date) AS [Rn_Num]
FROM (
SELECT
Facility_no,
Purchase_Date,
Seller_id,
Sale_id
FROM MyTable WITH (NOLOCK)
) a
)
, cte_Rn_0 as (
SELECT
Facility_no,
Purchase_Date,
Seller_id,
Sale_id,
-- [Rn_Num] AS 'Skipped Sale'
-- , case when Sale_id = 0 Then [Rn_Num] - 1 Else [Rn_Num] End AS 'Skipped Sale for 0'
, [Rn_Num] - 1 AS 'Skipped Sale for 0'
FROM cte_Rn a
)
SELECT
Facility_no,
Purchase_Date,
Seller_id,
Sale_id,
-- [Skipped Sale],
[Skipped Sale for 0]
FROM cte_Rn_0 a
WHERE NOT EXISTS
(
select * from cte_Rn_0 b
where b.Sale_id = a.[Skipped Sale for 0]
and a.Facility_no = b.Facility_no
and a.Purchase_Date = b.Purchase_Date
and a.Seller_id = b.Seller_id
)
--ORDER BY Purchase_Date ASC

How to get the id of max count group in hive?

I have a table like this:
id , m_id , group_id
1 , a , 0
1 , b , 0
1 , c , 1
1 , d , 1
2 , e , 0
2 , f , 0
2 , g , 0
2 , h , 1
2 , i , 1
For each id, I would like to get the m_id which they belong to the group that has max number of m_id. If there is a tie, I will just take a random group of m_id. Hence the expected output will be like:
id , m_id
1 , a
1 , b
2 , e
2 , f
2 , g
Notice: the number from group_id is only an indicator of group identification under each id. i.e. group_id = 0 does not not mean the same thing between id=1, and id=2.
My original idea is to get the max(group_id) group by (id,m_id), and return the id,m_id which has the max(group_id). However, this approach wont help on the tie situation (id = 2 cases).
Really hope someone can help me on this!
Thanks!
Use row_number() and partition the group by id to get the max grouping.Then self join to get the max grouping for each id,group_id
CREATE TABLE test
(
id integer , m_id char(1) , group_id integer
);
INSERT INTO test (id,m_id,group_id) VALUES (1,'a',0);
INSERT INTO test (id,m_id,group_id) VALUES (1,'b',0);
INSERT INTO test (id,m_id,group_id) VALUES (1,'c',1);
INSERT INTO test (id,m_id,group_id) VALUES (1,'d',1);
INSERT INTO test (id,m_id,group_id) VALUES (2,'e',0);
INSERT INTO test (id,m_id,group_id) VALUES (2,'f',0);
INSERT INTO test (id,m_id,group_id) VALUES (2,'g',0);
INSERT INTO test (id,m_id,group_id) VALUES (2,'h',1);
INSERT INTO test (id,m_id,group_id) VALUES (2,'i',1);
select b.id,b.group_id,b.m_id
from (
select id,group_id,row_number() over(partition by id order by id,group_id,count(*) desc) as r_no
from test
group by id,group_id
) a
join test b on b.id=a.id and b.group_id=a.group_id
where a.r_no=1
Output
You can use row_number with aggregation to do this.
select t1.id,t1.group_id,t1.m_id
from (select id,group_id,row_number() over(partition by id order by count(*) desc) as rnum
from tbl
group by id,group_id
) t
join tbl t1 on t1.id=t.id and t1.group_id=t.group_id
where t.rnum=1

Convertion of tabular data to JSON in Redshift

I am unable to figure out how to convert tabular data to JSON format and store it in another table in Redshift. For example, I have a "DEMO" table with four columns: pid,stid,item_id,trans_id.
For each combination of pid,stid,item_id there exist many trans_ids.
pid stid item_id trans_id :
1 , AB , P1 , T1
1 , AB , P1 , T2
1 , AB , P1 , T3
1 , AB , P1 , T4
2 , ABC , P2 , T5
2 , ABC , P2 , T6
2 , ABC , P2 , T7
2 , ABC , P2 , T8
I want to store this data in another table called "SAMPLE" as:
pid stid item_id trans_id
1 , AB , P1 , {"key1":T1, "key2":"T2" "key2":"T3" "key2":"T4"}
2 , ABC , P2 , {"key1":T5, "key2":"T6" "key2":"T7" "key2":"T8"}
I am unable to figure out how to load the data from "DEMO" to "SAMPLE" in JSON format only for column "trans_id" using a SQL query in Redshift. I don't want to use any intermediate files.
There is LISTAGG aggregate function that allows you to concatenate text values within groups. It allows the effective construction of JSON objects:
SELECT
pid
,stid
,item_id
,'{'||listagg(
'"key'||row_number::varchar||'":'||trans_id::varchar
,',') within group (order by row_number)
||'}'
FROM (
SELECT *, row_number() over (partition by pid,stid,item_id order by trans_id)
FROM "DEMO"
)
GROUP BY 1,2,3;
As a side note, in this particular case an array of transaction IDs might work better, you'll be able to request the element of a specific order easily without using keyN key:
WITH tran_arrays as (
SELECT
pid
,stid
,item_id
,listagg(trans_id::varchar,',') within group (order by trans_id) as tran_array
FROM "DEMO"
GROUP BY 1,2,3
)
SELECT *
,split_part(tran_array,',',1) as first_element
FROM tran_arrays;
Very similar to the existing Answer however slightly different. This example is also run out of an Oracle Database. I put the work into it and felt like sharing in case it may help someone else out.
/* Oracle Example */
WITH demo_data AS
(
SELECT 1 AS pid, 'AB' AS stid, 'P1' AS item_id, 'T1' AS trans_id FROM dual UNION ALL
SELECT 1 AS pid, 'AB' AS stid, 'P1' AS item_id, 'T2' AS trans_id FROM dual UNION ALL
SELECT 1 AS pid, 'AB' AS stid, 'P1' AS item_id, 'T3' AS trans_id FROM dual UNION ALL
SELECT 1 AS pid, 'AB' AS stid, 'P1' AS item_id, 'T4' AS trans_id FROM dual UNION ALL
SELECT 2 AS pid, 'ABC' AS stid, 'P2' AS item_id, 'T5' AS trans_id FROM dual UNION ALL
SELECT 2 AS pid, 'ABC' AS stid, 'P2' AS item_id, 'T6' AS trans_id FROM dual UNION ALL
SELECT 2 AS pid, 'ABC' AS stid, 'P2' AS item_id, 'T7' AS trans_id FROM dual UNION ALL
SELECT 2 AS pid, 'ABC' AS stid, 'P2' AS item_id, 'T8' AS trans_id FROM dual
)
, transformData AS
(
SELECT pid, stid, item_id, trans_id, rownum AS keyNum FROM demo_data
)
SELECT pid, stid, item_id
, '{'||
LISTAGG(CHR(34)||'key'||keynum||CHR(34)||':'||CHR(34)||trans_id||CHR(34), ' ')
WITHIN GROUP (ORDER BY pid)
||'}' AS trans_id
FROM transformData
GROUP BY pid, stid, item_id
;
Output will look like this: