Summary for duration per month for each ID for TSQL / sql server 2016 - tsql

I have a table (##table) with an ID and the status for each day. I need a summary per month(!) how many days each ID was in which status. (##table_result). See below.
This was my approach. But it does not work. How can I summarize the days for each ID and per status for each month?
-- Asker's attempt (reported as not working): both LAG() calls are ordered by MONTH(s_date)
-- and have no PARTITION BY, so the "previous" row is taken across all items at once.
select item, Cur_Status, convert(varchar(7), s_date, 126) as YM_S_Date, lag(s_date) over(order by month(s_date) asc) as Start_date ,s_date , datediff (day, lag(s_date) over( order by month(s_date) asc), s_date) as duration from ##table order by item, Start_date
Data:
-- Demo input: one row per item and day, carrying the status the item had on that day.
-- The ## prefix makes this a global temporary table.
create table ##table (item nvarchar(30), S_date date, Cur_Status nvarchar(30));
insert into ##table values
('A','2022/01/01','AA'),
('A','2022/01/02','AA'),
('A','2022/01/03','AA'),
('A','2022/01/04','BB'),
('A','2022/01/05','BB'),
('A','2022/01/06','BB'),
('A','2022/01/07','AA'),
('A','2022/01/08','AA'),
('A','2022/01/09','AA'),
('A','2022/01/10','AA'),
('A','2022/01/11','AA'),
('A','2022/01/12','AA'),
('A','2022/01/13','CC'),
('A','2022/01/14','CC'),
('A','2022/01/15','AA'),
('A','2022/01/16','DD'),
('A','2022/01/17','DD'),
('A','2022/01/18','DD'),
('A','2022/01/19','EE'),
('A','2022/01/20','AA'),
('A','2022/01/21','BB'),
('A','2022/01/22','FF'),
('A','2022/01/23','FF'),
('A','2022/01/24','FF'),
('A','2022/01/25','FF'),
('A','2022/01/26','AA'),
('A','2022/01/27','AA'),
('A','2022/01/28','AA'),
('A','2022/01/29','AA'),
('A','2022/01/30','AA'),
('A','2022/01/31','AA'),
('A','2022/02/01','AA'),
('A','2022/02/02','AA'),
('A','2022/02/03','AA'),
('A','2022/02/04','AA'),
('A','2022/02/05','AA'),
('A','2022/02/06','BB'),
('A','2022/02/07','AA'),
('A','2022/02/08','AA'),
('A','2022/02/09','AA'),
('A','2022/02/10','AA'),
('A','2022/02/11','AA'),
('A','2022/02/12','AA'),
('A','2022/02/13','CC'),
('A','2022/02/14','CC'),
('A','2022/02/15','AA'),
('A','2022/02/16','DD'),
('A','2022/02/17','DD'),
('A','2022/02/18','DD'),
('A','2022/02/19','EE'),
('A','2022/02/20','AA'),
('A','2022/02/21','BB'),
('A','2022/02/22','AA'),
('A','2022/02/23','AA'),
('A','2022/02/24','AA'),
('A','2022/02/25','FF'),
('A','2022/02/26','AA'),
('A','2022/02/27','AA'),
('A','2022/02/28','AA'),
('A','2022/03/01','AA'),
('A','2022/03/02','AA'),
('A','2022/03/03','BB'),
('A','2022/03/04','AA'),
('B','2022/01/01','AA'),
('B','2022/01/02','AA'),
('B','2022/01/03','AA'),
('B','2022/01/04','BB'),
('B','2022/01/05','BB'),
('B','2022/01/06','BB'),
('B','2022/01/07','AA'),
('B','2022/01/08','AA'),
('B','2022/01/09','AA'),
('B','2022/01/10','AA'),
('B','2022/01/11','AA'),
('B','2022/01/12','AA'),
('B','2022/01/13','AA'),
('B','2022/01/14','AA'),
('B','2022/01/15','AA'),
('B','2022/01/16','AA'),
('B','2022/01/17','AA'),
('B','2022/01/18','AA'),
('B','2022/01/19','AA'),
('B','2022/01/20','AA'),
('B','2022/01/21','AA'),
('B','2022/01/22','AA'),
('B','2022/01/23','AA'),
('B','2022/01/24','AA'),
('B','2022/01/25','AA'),
('B','2022/01/26','AA'),
('B','2022/01/27','AA'),
('B','2022/01/28','AA'),
('B','2022/01/29','AA'),
('B','2022/01/30','AA'),
('B','2022/01/31','AA'),
('B','2022/02/01','AA'),
('B','2022/02/02','AA'),
('B','2022/02/03','AA'),
('B','2022/02/04','FF'),
('B','2022/02/05','FF'),
('B','2022/02/06','FF'),
('B','2022/02/07','AA'),
('B','2022/02/08','AA'),
('B','2022/02/09','AA'),
('B','2022/02/10','AA'),
('B','2022/02/11','AA'),
('B','2022/02/12','AA'),
('B','2022/02/13','CC'),
('B','2022/02/14','CC'),
('B','2022/02/15','AA'),
('B','2022/02/16','DD'),
('B','2022/02/17','DD'),
('B','2022/02/18','DD'),
('B','2022/02/19','EE'),
('B','2022/02/20','AA'),
('B','2022/02/21','AA'),
('B','2022/02/22','AA'),
('B','2022/02/23','AA'),
('B','2022/02/24','AA'),
('B','2022/02/25','FF'),
('B','2022/02/26','AA'),
('B','2022/02/27','AA'),
('B','2022/02/28','AA'),
('B','2022/03/01','BB'),
('B','2022/03/02','AA'),
('B','2022/03/03','AA'),
('B','2022/03/04','AA'),
('B','2022/03/05','AA'),
('B','2022/03/06','AA'),
('B','2022/03/07','AA'),
('B','2022/03/08','AA'),
('B','2022/03/09','BB'),
('B','2022/03/10','BB'),
('B','2022/03/11','BB'),
('B','2022/03/12','BB'),
('B','2022/03/13','BB'),
('B','2022/03/14','AA'),
('B','2022/03/15','AA'),
('B','2022/03/16','AA'),
('B','2022/03/17','AA'),
('B','2022/03/18','AA'),
('B','2022/03/19','DD'),
('B','2022/03/20','DD'),
('B','2022/03/21','AA'),
('B','2022/03/22','AA'),
('B','2022/03/23','AA'),
('B','2022/03/24','AA'),
('B','2022/03/25','BB'),
('B','2022/03/26','AA'),
('B','2022/03/27','AA'),
('B','2022/03/28','BB'),
('B','2022/03/30','AA'),
('B','2022/03/31','BB'),
('B','2022/04/01','BB'),
('B','2022/04/02','BB'),
('B','2022/04/04','BB'),
('C','2022/04/04','BB'),
('C','2022/04/05','BB'),
('C','2022/04/06','BB'),
('C','2022/04/07','AA'),
('C','2022/04/08','AA'),
('C','2022/04/09','AA'),
('C','2022/04/10','AA'),
('C','2022/04/11','AA'),
('C','2022/04/12','AA'),
('C','2022/04/13','CC'),
('C','2022/04/14','CC'),
('E','2022/04/15','AA'),
('E','2022/04/16','DD'),
('E','2022/04/17','DD'),
('E','2022/04/18','DD'),
('E','2022/04/19','EE'),
('E','2022/04/20','AA'),
('E','2022/04/21','BB'),
('E','2022/04/22','FF'),
('E','2022/04/23','FF'),
('E','2022/04/24','FF'),
('E','2022/04/25','FF'),
('E','2022/04/26','AA'),
('E','2022/04/27','AA'),
('E','2022/04/28','AA'),
('E','2022/04/29','AA'),
('E','2022/04/30','FF'),
('E','2022/05/01','FF'),
('E','2022/05/01','FF')
;
-- List the demo rows item by item in date order.
SELECT item, S_date, Cur_Status
FROM ##table
ORDER BY item, S_date
Expected result:
-- Shape of the expected output: one row per item, status and uninterrupted run of days,
-- with the run's first day, last day and length (Duration is an int column).
create table ##table_Result (item nvarchar(30), Start_date date, End_date date, Cur_Status nvarchar(30), Duration int);
insert into ##table_result values
('A','2022/01/01','2022/01/03','AA','3' ),
('A','2022/01/04','2022/01/06','BB','3' ),
('A','2022/01/07','2022/01/12','AA','4' ),
('A','2022/01/13','2022/01/14','CC','2' ),
('A','2022/01/15','2022/01/15','AA','2' ),
('A','2022/01/16','2022/01/18','DD','2' ),
('A','2022/01/19','2022/01/19','EE','1' ),
('A','2022/01/20','2022/01/20','AA','1' ),
('A','2022/01/21','2022/01/21','BB','1' ),
('A','2022/01/22','2022/01/25','FF','4' ),
('A','2022/01/26','2022/01/31','AA','6' ),
('A','2022/02/01','2022/02/05','AA','5' ),
('A','2022/02/06','2022/02/06','BB','5' ),
('A','2022/02/07','2022/02/12','AA','6' ),
('A','2022/02/13','2022/02/14','CC','2' ),
('A','2022/02/15','2022/02/15','AA','1' ),
('A','2022/02/16','2022/02/18','DD','3' ),
('A','2022/02/19','2022/02/19','EE','1' ),
('A','2022/02/20','2022/02/20','AA','1' ),
('A','2022/02/21','2022/02/21','BB','1' ),
('A','2022/02/22','2022/02/24','AA','1' ),
('A','2022/02/25','2022/02/25','FF','1' ),
('A','2022/02/26','2022/02/28','AA','3' ),
('A','2022/03/01','2022/03/02','AA','2' ),
('A','2022/03/03','2022/03/03','BB','2' ),
('A','2022/03/04','2022/03/04','AA','2' ),
('B','2022/01/01','2022/01/02','AA','2' ),
('B','2022/01/03','2022/01/03','AA','1' ),
('B','2022/01/04','2022/01/06','BB','2' ),
('B','2022/01/07','2022/01/31','AA','25'),
('B','2022/02/01','2022/01/03','AA','3')

Yikes, unless you're really super sure you need a global temporary table, you likely should not be using them at all.
Here's a good way to present your demo data and tables:
-- Local temporary table for the demo data: one row per item and day with that day's status.
-- DECLARE ... TABLE only accepts a table variable named with @; the INSERT and the queries
-- that follow all read from #table, so the demo table is created as a local temp table.
CREATE TABLE #table (item NVARCHAR(30), S_date DATE, Cur_Status NVARCHAR(30));
INSERT INTO #table (item, S_date, Cur_Status) VALUES
('A','2022/01/01','AA'), ('A','2022/01/02','AA'), ('A','2022/01/03','AA'), ('A','2022/01/04','BB'), ('A','2022/01/05','BB'), ('A','2022/01/06','BB'), ('A','2022/01/07','AA'), ('A','2022/01/08','AA'), ('A','2022/01/09','AA'), ('A','2022/01/10','AA'),
('A','2022/01/11','AA'), ('A','2022/01/12','AA'), ('A','2022/01/13','CC'), ('A','2022/01/14','CC'), ('A','2022/01/15','AA'), ('A','2022/01/16','DD'), ('A','2022/01/17','DD'), ('A','2022/01/18','DD'), ('A','2022/01/19','EE'), ('A','2022/01/20','AA'),
('A','2022/01/21','BB'), ('A','2022/01/22','FF'), ('A','2022/01/23','FF'), ('A','2022/01/24','FF'), ('A','2022/01/25','FF'), ('A','2022/01/26','AA'), ('A','2022/01/27','AA'), ('A','2022/01/28','AA'), ('A','2022/01/29','AA'), ('A','2022/01/30','AA'),
('A','2022/01/31','AA'), ('A','2022/02/01','AA'), ('A','2022/02/02','AA'), ('A','2022/02/03','AA'), ('A','2022/02/04','AA'), ('A','2022/02/05','AA'), ('A','2022/02/06','BB'), ('A','2022/02/07','AA'), ('A','2022/02/08','AA'), ('A','2022/02/09','AA'),
('A','2022/02/10','AA'), ('A','2022/02/11','AA'), ('A','2022/02/12','AA'), ('A','2022/02/13','CC'), ('A','2022/02/14','CC'), ('A','2022/02/15','AA'), ('A','2022/02/16','DD'), ('A','2022/02/17','DD'), ('A','2022/02/18','DD'), ('A','2022/02/19','EE'),
('A','2022/02/20','AA'), ('A','2022/02/21','BB'), ('A','2022/02/22','AA'), ('A','2022/02/23','AA'), ('A','2022/02/24','AA'), ('A','2022/02/25','FF'), ('A','2022/02/26','AA'), ('A','2022/02/27','AA'), ('A','2022/02/28','AA'), ('A','2022/03/01','AA'),
('A','2022/03/02','AA'), ('A','2022/03/03','BB'), ('A','2022/03/04','AA'), ('B','2022/01/01','AA'), ('B','2022/01/02','AA'), ('B','2022/01/03','AA'), ('B','2022/01/04','BB'), ('B','2022/01/05','BB'), ('B','2022/01/06','BB'), ('B','2022/01/07','AA'),
('B','2022/01/08','AA'), ('B','2022/01/09','AA'), ('B','2022/01/10','AA'), ('B','2022/01/11','AA'), ('B','2022/01/12','AA'), ('B','2022/01/13','AA'), ('B','2022/01/14','AA'), ('B','2022/01/15','AA'), ('B','2022/01/16','AA'), ('B','2022/01/17','AA'),
('B','2022/01/18','AA'), ('B','2022/01/19','AA'), ('B','2022/01/20','AA'), ('B','2022/01/21','AA'), ('B','2022/01/22','AA'), ('B','2022/01/23','AA'), ('B','2022/01/24','AA'), ('B','2022/01/25','AA'), ('B','2022/01/26','AA'), ('B','2022/01/27','AA'),
('B','2022/01/28','AA'), ('B','2022/01/29','AA'), ('B','2022/01/30','AA'), ('B','2022/01/31','AA'), ('B','2022/02/01','AA'), ('B','2022/02/02','AA'), ('B','2022/02/03','AA'), ('B','2022/02/04','FF'), ('B','2022/02/05','FF'), ('B','2022/02/06','FF'),
('B','2022/02/07','AA'), ('B','2022/02/08','AA'), ('B','2022/02/09','AA'), ('B','2022/02/10','AA'), ('B','2022/02/11','AA'), ('B','2022/02/12','AA'), ('B','2022/02/13','CC'), ('B','2022/02/14','CC'), ('B','2022/02/15','AA'), ('B','2022/02/16','DD'),
('B','2022/02/17','DD'), ('B','2022/02/18','DD'), ('B','2022/02/19','EE'), ('B','2022/02/20','AA'), ('B','2022/02/21','AA'), ('B','2022/02/22','AA'), ('B','2022/02/23','AA'), ('B','2022/02/24','AA'), ('B','2022/02/25','FF'), ('B','2022/02/26','AA'),
('B','2022/02/27','AA'), ('B','2022/02/28','AA'), ('B','2022/03/01','BB'), ('B','2022/03/02','AA'), ('B','2022/03/03','AA'), ('B','2022/03/04','AA'), ('B','2022/03/05','AA'), ('B','2022/03/06','AA'), ('B','2022/03/07','AA'), ('B','2022/03/08','AA'),
('B','2022/03/09','BB'), ('B','2022/03/10','BB'), ('B','2022/03/11','BB'), ('B','2022/03/12','BB'), ('B','2022/03/13','BB'), ('B','2022/03/14','AA'), ('B','2022/03/15','AA'), ('B','2022/03/16','AA'), ('B','2022/03/17','AA'), ('B','2022/03/18','AA'),
('B','2022/03/19','DD'), ('B','2022/03/20','DD'), ('B','2022/03/21','AA'), ('B','2022/03/22','AA'), ('B','2022/03/23','AA'), ('B','2022/03/24','AA'), ('B','2022/03/25','BB'), ('B','2022/03/26','AA'), ('B','2022/03/27','AA'), ('B','2022/03/28','BB'),
('B','2022/03/30','AA'), ('B','2022/03/31','BB'), ('B','2022/04/01','BB'), ('B','2022/04/02','BB'), ('B','2022/04/04','BB'), ('C','2022/04/04','BB'), ('C','2022/04/05','BB'), ('C','2022/04/06','BB'), ('C','2022/04/07','AA'), ('C','2022/04/08','AA'),
('C','2022/04/09','AA'), ('C','2022/04/10','AA'), ('C','2022/04/11','AA'), ('C','2022/04/12','AA'), ('C','2022/04/13','CC'), ('C','2022/04/14','CC'), ('E','2022/04/15','AA'), ('E','2022/04/16','DD'), ('E','2022/04/17','DD'), ('E','2022/04/18','DD'),
('E','2022/04/19','EE'), ('E','2022/04/20','AA'), ('E','2022/04/21','BB'), ('E','2022/04/22','FF'), ('E','2022/04/23','FF'), ('E','2022/04/24','FF'), ('E','2022/04/25','FF'), ('E','2022/04/26','AA'), ('E','2022/04/27','AA'), ('E','2022/04/28','AA'),
('E','2022/04/29','AA'), ('E','2022/04/30','FF'), ('E','2022/05/01','FF'), ('E','2022/05/01','FF') ;
Now on to the answer. This looks like an rCTE to me:
-- Collapse the daily rows of #table into islands: runs of consecutive days on which an item
-- keeps the same status, cut at month boundaries. Returns one row per island with its first
-- day, last day and length in days.
;WITH lagged AS (
    -- Attach the previous row (per item, in date order) to every daily row.
    SELECT item,
           S_date,
           Cur_Status,
           LAG(Cur_Status, 1) OVER (PARTITION BY item ORDER BY S_date) AS prev_Status,
           LAG(S_date, 1)     OVER (PARTITION BY item ORDER BY S_date) AS prev_date
    FROM #table
), base AS (
    -- Counter = 1 : the row continues the island of the previous row.
    -- Counter NULL: the row starts a new island.
    SELECT item,
           S_date,
           Cur_Status,
           prev_Status,
           CASE WHEN Cur_Status = prev_Status
                 AND DATEPART(MONTH, S_date) = DATEPART(MONTH, prev_date)
                 -- The previous row must be exactly the day before. Without this check a row
                 -- that follows a missing day (or repeats a date) is flagged as a continuation
                 -- but can never be reached by the day-by-day join below, so it would vanish
                 -- from the result.
                 AND DATEDIFF(DAY, prev_date, S_date) = 1
                THEN 1
           END AS Counter
    FROM lagged
), rCTE AS (
    -- Anchor: every island start, with a length of one day so far.
    SELECT item, S_date, S_date AS StartDate, Cur_Status, 1 AS Counter, S_date AS StopDate
    FROM base
    WHERE Counter IS NULL
    UNION ALL
    -- Step: extend an island by the next calendar day as long as that day is a continuation row.
    SELECT a.item, r.S_date, a.StartDate, a.Cur_Status, a.Counter + r.Counter, r.S_date AS StopDate
    FROM rCTE a
    INNER JOIN base r
        ON a.item = r.item
        AND a.Cur_Status = r.Cur_Status
        AND a.S_date = DATEADD(DAY, -1, r.S_date)
        AND r.Counter IS NOT NULL
)
-- Each island produced one row per day walked; keep only its furthest point.
SELECT item, rCTE.StartDate AS Start_date, MAX(rCTE.StopDate) AS End_Date, rCTE.Cur_Status, MAX(Counter) AS Duration
FROM rCTE
GROUP BY item, rCTE.StartDate, rCTE.Cur_Status
ORDER BY item, End_Date
-- Islands may be longer than the default recursion limit of 100 levels.
OPTION (MAXRECURSION 0)
Basically what we're doing here is iterating over all the rows to make groups where they don't naturally exist.
It looks like your expected data is off too, I found this line:
item Start_date End_date Cur_Status Duration
----------------------------------------------------
A 2022-01-07 2022-01-12 AA 4
Should that not be 6?
Here's some of the example output:
item Start_date End_Date Cur_Status Duration
----------------------------------------------------
A 2022-01-01 2022-01-03 AA 3
A 2022-01-04 2022-01-06 BB 3
A 2022-01-07 2022-01-12 AA 6
A 2022-01-13 2022-01-14 CC 2
A 2022-01-15 2022-01-15 AA 1
A 2022-01-16 2022-01-18 DD 3
A 2022-01-19 2022-01-19 EE 1
A 2022-01-20 2022-01-20 AA 1
Edit:
I modified the query to take into account the end of month.
The CASE expression handling the nullable counter is now:
CASE WHEN Cur_Status = LAG(Cur_Status,1) OVER (PARTITION BY item ORDER BY S_date) AND DATEPART(MONTH,S_date) = DATEPART(MONTH,LAG(S_date,1) OVER (PARTITION BY item ORDER BY S_date)) THEN 1 END
and an additional predicate was applied in the rCTE:
AND r.Counter IS NOT NULL
Example results are now:
item Start_date End_Date Cur_Status Duration
----------------------------------------------------
A 2022-01-20 2022-01-20 AA 1
A 2022-01-21 2022-01-21 BB 1
A 2022-01-22 2022-01-25 FF 4
**A 2022-01-26 2022-01-31 AA 6**
**A 2022-02-01 2022-02-05 AA 5**
A 2022-02-06 2022-02-06 BB 1

I am sure there is a more elegant way to do this, but it looks like a gaps-and-islands type of problem. So here is a different way to go about it:
-- Gaps-and-islands per item and calendar month: number the status changes inside each month,
-- then report every (month, change number, status) group with its first and last date.
-- Duration is the inclusive day span between those two dates.
;WITH tagged AS (
    -- Each daily row with its calendar month/year and the status of the item's previous row.
    SELECT item,
           S_date,
           Cur_Status,
           MONTH(S_date) AS MN,
           YEAR(S_date) AS YR,
           LAG(Cur_Status, 1) OVER (PARTITION BY item ORDER BY S_date) AS LG
    FROM #table
), islands AS (
    -- Running count of status changes, restarted for every item and month.
    SELECT item,
           S_date,
           Cur_Status,
           MN,
           YR,
           SUM(CASE WHEN Cur_Status <> LG THEN 1 ELSE 0 END)
               OVER (PARTITION BY item, YR, MN ORDER BY S_date) AS GRP
    FROM tagged
)
SELECT Item,
       MIN(S_date) AS Start_Date,
       MAX(S_date) AS End_Date,
       Cur_Status,
       DATEDIFF(DAY, MIN(S_date), MAX(S_date)) + 1 AS Duration
FROM islands
GROUP BY Item, GRP, YR, MN, Cur_Status
ORDER BY Item, Start_Date

First of all thanks for your reply. Yes, you're right, it has to be 6.
I've tried it, but the issue is that I need the duration for every month. That means that if a period (start-end) goes beyond the end of a month, the duration may only be counted up to the end of that month. The rest of the duration should be determined separately and added to the next month.
For example: A | 2022-01-26 | 2022-02-05 |AA
Result:
A | 2022-01-26 | 2022-01-31 | AA | 6
A | 2022-02-01 | 2022-02-05 | AA | 5

Related

Forward fill grouped timeseries using timestamps from other groups in postgresql

I have a table that looks the following way
time
group
sub_group
count
2022-01-01
A
True
3
2022-01-01
A
False
1
2022-01-01
B
True
2
2022-01-01
B
False
1
2022-01-02
A
False
2
2022-01-02
A
True
5
2022-01-02
B
False
3
2022-01-03
A
False
3
2022-01-03
B
False
4
2022-01-03
B
True
3
So an increasing count per group+sub_group per day, unless on a day when a count did not change for a group+subgroup, the row is missing.
in the example above missing rows would be:
...
| 2022-01-02 | B | True | 2 |
...
| 2022-01-03 | A | True | 5 |
...
For ease of data handling, I need a continuous timestamp per day for all groups+sub_groups. So the result would look like this:
time
group
sub_group
count
2022-01-01
A
True
3
2022-01-01
A
False
1
2022-01-01
B
True
2
2022-01-01
B
False
1
2022-01-02
A
False
2
2022-01-02
A
True
5
2022-01-02
B
False
3
2022-01-02
B
True
2
2022-01-03
A
False
3
2022-01-03
A
True
5
2022-01-03
B
False
4
2022-01-03
B
True
3
How could I achieve this? Probably some partition by ... over select construct, but I can't wrap my head around how to partition by timestamps from other groups in this case, as I don't have the NULL counts to forward fill for each group as an intermediate step.
update:
So far, I seem to have reached the intermediate state that fills the missing timestamps (basically just daily frequency is fine here) between groups like this:
-- Work-in-progress CTE chain from the question (no final SELECT is attached yet).
-- Goal: one row per day and (group, sub_group) pair, with count left NULL on days
-- for which the pair has no row in my_table.
with time_range as (
select min(time) as start_time, -- current_timestamp - interval '2 day'
max(time) as end_time
from my_table-- current_timestamp
),
-- Plain projection of the source rows.
interested_events as (
select e.group, e.sub_group, e.time, e.count
from my_table e
),
-- Every distinct (group, sub_group) pair that occurs in the data.
classes_having_events as (
select distinct group, sub_group
from interested_events
ORDER BY group, sub_group
),
-- One half-open window [period_start, period_end) per day of the series.
periods as (
select ts as period_start, ts + interval '1 day' as period_end
from generate_series(
(select start_time from time_range),
(select end_time from time_range) - interval '1 second',
interval '1 day') ts
-- Every day/pair combination, joined to the source rows that fall inside that day.
), resampled as (
SELECT period_start,
period_end,
classes_having_events.group,
classes_having_events.sub_group,
interested_events.count
FROM periods
CROSS JOIN classes_having_events
LEFT JOIN interested_events
ON time >= period_start AND time < period_end
AND interested_events.group = classes_having_events.group
AND interested_events.sub_group = classes_having_events.sub_group
ORDER BY period_start DESC
)
Okay, seems like I was pretty close and rubber duck debugging helped.
This seems to do what I wanted to have:
-- Produce one row per day and (group, sub_group) pair, carrying the last known count forward
-- on days for which the pair has no row in my_table.
WITH time_range AS (
    -- Span covered by the data (alternative bounds: current_timestamp - interval '2 day'
    -- and current_timestamp).
    SELECT MIN(time) AS start_time,
           MAX(time) AS end_time
    FROM my_table
),
interested_events AS (
    SELECT e."group", e.sub_group, e.time, e.count
    FROM my_table e
),
classes_having_events AS (
    -- Every distinct pair present in the data. "group" is a reserved word, so it has to be
    -- double-quoted wherever it is not prefixed by a table name.
    SELECT DISTINCT "group", sub_group
    FROM interested_events
),
periods AS (
    -- One half-open window [period_start, period_end) per day. The series runs up to and
    -- including end_time: stopping one second earlier loses the window that contains the
    -- latest row whenever the data spans a whole number of days.
    SELECT ts AS period_start, ts + INTERVAL '1 day' AS period_end
    FROM GENERATE_SERIES(
        (SELECT start_time FROM time_range),
        (SELECT end_time FROM time_range),
        INTERVAL '1 day') ts
),
resampled AS (
    -- Every day/pair combination; count stays NULL where the pair has no row on that day.
    SELECT periods.period_start,
           periods.period_end,
           classes_having_events."group",
           classes_having_events.sub_group,
           interested_events.count
    FROM periods
    CROSS JOIN classes_having_events
    LEFT JOIN interested_events
        ON interested_events.time >= periods.period_start
        AND interested_events.time < periods.period_end
        AND interested_events."group" = classes_having_events."group"
        AND interested_events.sub_group = classes_having_events.sub_group
)
-- The running MAX acts as a forward fill; this assumes count never decreases within a pair.
SELECT period_start AS time,
       "group",
       sub_group,
       MAX(count) OVER (PARTITION BY "group", sub_group ORDER BY period_start) AS count
FROM resampled
ORDER BY period_start DESC, "group", sub_group;

BigQuery SQL: Group rows with shared ID that occur within 7 days of each other, and return values from most recent occurrence

I have a table of datestamped events that I need to bundle into 7-day groups, starting with the earliest occurrence of each event_id.
The final output should return each bundle's start and end date and 'value' column of the most recent event from each bundle.
There is no predetermined start date, and the '7-day' windows are arbitrary, not 'week of the year'.
I've tried a ton of examples from other posts but none quite fit my needs or use things I'm not sure how to refactor for BigQuery
Sample Data;
Event_Id
Event_Date
Value
1
2022-01-01
010203
1
2022-01-02
040506
1
2022-01-03
070809
1
2022-01-20
101112
1
2022-01-23
131415
2
2022-01-02
161718
2
2022-01-08
192021
3
2022-02-12
212223
Expected output;
Event_Id
Start_Date
End_Date
Value
1
2022-01-01
2022-01-03
070809
1
2022-01-20
2022-01-23
131415
2
2022-01-02
2022-01-08
192021
3
2022-02-12
2022-02-12
212223
You might consider below.
-- cumsumbin(a): walks the day differences in a from first to last while keeping a running
-- total. Whenever adding the next difference would push the total above 6, a new bin is
-- opened and the total restarts at 0. Returns the number of bins opened, i.e. the zero-based
-- bin index of the last element.
CREATE TEMP FUNCTION cumsumbin(a ARRAY<INT64>) RETURNS INT64
LANGUAGE js AS """
  let bin = 0;
  let running = 0;
  for (const v of a) {
    // Number(null) is 0, so a NULL difference leaves the running total unchanged.
    const step = Number(v);
    if (running + step > 6) {
      bin += 1;
      running = 0;
    } else {
      running += step;
    }
  }
  return bin;
""";
-- Bundle each event_id's rows into bins using cumsumbin over the day gaps between
-- consecutive events, then return per bin its first date plus the date and value of its
-- latest event.
WITH sample_data AS (
  SELECT 1 AS event_id, DATE '2022-01-01' AS event_date, '010203' AS value UNION ALL
  SELECT 1, '2022-01-02', '040506' UNION ALL
  SELECT 1, '2022-01-03', '070809' UNION ALL
  SELECT 1, '2022-01-20', '101112' UNION ALL
  SELECT 1, '2022-01-23', '131415' UNION ALL
  SELECT 2, '2022-01-02', '161718' UNION ALL
  SELECT 2, '2022-01-08', '192021' UNION ALL
  SELECT 3, '2022-02-12', '212223'
),
gaps AS (
  -- Days since the previous event of the same event_id (NULL for the first event).
  SELECT *,
         DATE_DIFF(
           event_date,
           LAG(event_date) OVER (PARTITION BY event_id ORDER BY event_date),
           DAY
         ) AS diff
  FROM sample_data
),
binning AS (
  -- Feed all gaps up to and including the current row to cumsumbin to get the row's bin.
  SELECT *,
         cumsumbin(ARRAY_AGG(diff) OVER (PARTITION BY event_id ORDER BY event_date)) AS bin
  FROM gaps
)
SELECT
  event_id,
  MIN(event_date) AS start_date,
  -- Latest event of the bin, expanded into the end_date and value columns.
  ARRAY_AGG(
    STRUCT(event_date AS end_date, value) ORDER BY event_date DESC LIMIT 1
  )[OFFSET(0)].*
FROM binning
GROUP BY event_id, bin;

Gaps and Islands - get a list of dates unemployed over a date range with PostgreSQL

I have a table called Position, in this table, I have the following, dates are inclusive (yyyy-mm-dd), below is a simplified view of the employment dates
id, person_id, start_date, end_date , title
1 , 1 , 2001-12-01, 2002-01-31, 'admin'
2 , 1 , 2002-02-11, 2002-03-31, 'admin'
3 , 1 , 2002-02-15, 2002-05-31, 'sales'
4 , 1 , 2002-06-15, 2002-12-31, 'ops'
I'd like to be able to calculate the gaps in employment, assuming some of the dates overlap to produce the following output for the person with id=1
person_id, start_date, end_date , last_position_id, gap_in_days
1 , 2002-02-01, 2002-02-10, 1 , 10
1 , 2002-06-01, 2002-06-14, 3 , 14
I have looked at numerous solutions, UNIONS, Materialized views, tables with generated calendar date ranges, etc. I really am not sure what is the best way to do this. Is there a single query where I can get this done?
step-by-step demo:db<>fiddle
You just need the lead() window function. With this you are able to bring a value from the next row (start_date in this case) into the current row.
-- Employment gaps per person: compare each position's end_date with the start_date of the
-- person's next position (in start_date order) and keep the pairs with at least one
-- uncovered day in between.
-- NOTE(review): each row is compared only with the row that follows it, so a position lying
-- entirely inside an earlier, longer one would not be recognised as covered — confirm
-- against the real data.
WITH with_next AS (
    SELECT id,
           person_id,
           end_date,
           lead(start_date) OVER (PARTITION BY person_id ORDER BY start_date) AS next_start_date
    FROM positions
)
SELECT person_id,
       end_date + 1 AS start_date,                      -- first day of the gap
       next_start_date - 1 AS end_date,                 -- last day of the gap
       id AS last_position_id,
       next_start_date - (end_date + 1) AS gap_in_days
FROM with_next
WHERE next_start_date > end_date + 1
After getting the next start_date you are able to compare it with the current end_date. If they differ, you have a gap. These positive values can be filtered within the WHERE clause.
(if 2 positions overlap, the diff is negative. So it can be ignored.)
First you need to find which date ranges overlap (see: Determine Whether Two Date Ranges Overlap),
then merge those ranges into a single one and keep the last id,
and finally calculate the number of days between one end_date and the next start_date - 1.
SQL DEMO
-- Employment gaps per person in three steps: pair overlapping ranges, merge each pair into a
-- single range that keeps the later id, then measure the days between a merged range and the
-- next one.
WITH find_overlap AS (
    -- Each range together with every overlapping range of the same person that has a higher id
    -- (the t2_* columns stay NULL when there is none).
    SELECT earlier."id" AS t1_id, earlier."person_id", earlier."start_date", earlier."end_date",
           later."id" AS t2_id, later."start_date" AS t2_start_date, later."end_date" AS t2_end_date
    FROM Table1 AS earlier
    LEFT JOIN Table1 AS later
        ON later."person_id" = earlier."person_id"
        AND later."end_date" >= earlier."start_date"
        AND later."start_date" <= earlier."end_date"
        AND later.id > earlier.id
), merge_overlap AS (
    -- Drop the ranges that were absorbed as the later half of a pair; the remaining rows take
    -- the end date and id of their partner when they have one.
    SELECT fo.person_id,
           fo.start_date,
           COALESCE(fo.t2_end_date, fo.end_date) AS end_date,
           COALESCE(fo.t2_id, fo.t1_id) AS last_position_id
    FROM find_overlap AS fo
    WHERE NOT EXISTS (
        SELECT 1
        FROM find_overlap AS absorbed
        WHERE absorbed.t2_id = fo.t1_id
    )
), cte AS (
    SELECT person_id,
           start_date,
           end_date,
           last_position_id,
           LEAD(start_date) OVER (PARTITION BY person_id ORDER BY start_date) AS next_start
    FROM merge_overlap
)
SELECT person_id,
       start_date,
       end_date,
       last_position_id,
       next_start,
       DATE_PART('day',
           (next_start::timestamp - INTERVAL '1 DAY') - end_date::timestamp
       ) AS days
FROM cte
WHERE next_start IS NOT NULL
OUTPUT
| person_id | start_date | end_date | last_position_id | next_start | days |
|-----------|------------|------------|------------------|------------|------|
| 1 | 2001-12-01 | 2002-01-31 | 1 | 2002-02-11 | 10 |
| 1 | 2002-02-11 | 2002-05-31 | 3 | 2002-06-15 | 14 |

t-sql select max value between two columns, or col one when col two is null

This is not easy for me to describe in the title (please forgive me), but here is my problem:
Suppose you have the following table:
-- Demo table: one row per subscription period of a product; end_date is NULL while the
-- period is still open.
CREATE TABLE Subscriptions (product char(3), start_date datetime, end_date datetime);
-- The insert has to target the table created above; a #-prefixed name would refer to a
-- separate temporary table that is never created.
INSERT INTO Subscriptions (product, start_date, end_date)
VALUES('ABC', '2015-01-28 00:00:00', '2016-02-15 00:00:00'),
('ABC', '2016-02-04 12:08:00', NULL),
('DEF', '2013-04-15 00:00:00', '2013-06-10 00:00:00'),
('GHI', '2013-01-11 00:00:00', '2013-04-08 00:00:00');
Now I want to find out for how long a subscription has been either active or passive. I thus need to select the newest end_dates grouped by product, BUT if end_date is null, then I want start_date.
So - I have:
product start_date end_date
ABC 28-01-2015 00:00 15-02-2016 00:00
ABC 04-02-2016 12:08 NULL
DEF 15-04-2013 00:00 10-06-2013 00:00
GHI 11-01-2013 00:00 08-04-2013 00:00
What I want to find in my query:
product relevant_date
ABC 04-02-2016 12:08
DEF 10-06-2013 00:00
GHI 08-04-2013 00:00
I have tried using a union, and that seems to work, but it is very slow, and my question is: is there a more efficient way to solve this (I am using MS SQL Server 2012):
-- Asker's baseline: relevant_date per product, built from two disjoint branches.
-- Branch 1: products that have a row with no end_date -> latest start_date among those rows.
SELECT [product]
,MAX([start_date]) AS start_date
,NULL AS [end_date]
,MAX([start_date]) AS relevant_date
FROM Subscriptions
where end_date IS NULL
GROUP BY product
UNION
-- Branch 2: products that have no row without an end_date -> latest end_date.
SELECT [product]
,NULL
,MAX([end_date])
,MAX([end_date])
FROM Subscriptions
where end_date IS not NULL and product not in (SELECT product FROM Subscriptions
where end_date IS NULL)
GROUP BY product
(If you have a suggestion for another title for my question, I am also all ears!)
For version 2012 or higher you can use a combination of distinct, first_value and isnull, like this:
-- One row per product: rows without an end_date sort first (treated as 9999-12-31), then by
-- latest end_date; the value taken from the winning row is its end_date, or its start_date
-- when end_date is NULL.
SELECT DISTINCT
    s.product,
    FIRST_VALUE(COALESCE(s.end_date, s.start_date)) OVER (
        PARTITION BY s.product
        ORDER BY COALESCE(s.end_date, '9999-12-31') DESC
    ) AS EndDate
FROM Subscriptions AS s
Results:
product EndDate
ABC 04.02.2016 12:08:00
DEF 10.06.2013 00:00:00
GHI 08.04.2013 00:00:00
For versions between 2008 and 2012, you can use a cte with row_number to get the same effect:
-- Same result without FIRST_VALUE: rank each product's rows so that a row without an
-- end_date (treated as 9999-12-31) or else the latest end_date comes first, and keep rank 1.
SELECT ranked.product,
       ranked.relevant_date
FROM (
    SELECT s.product,
           COALESCE(s.end_date, s.start_date) AS relevant_date,
           ROW_NUMBER() OVER (
               PARTITION BY s.product
               ORDER BY COALESCE(s.end_date, '9999-12-31') DESC
           ) AS rn
    FROM Subscriptions AS s
) AS ranked
WHERE ranked.rn = 1
See a live demo on rextester.
If the second ABC row is showing the incorrect start_date then this query should work
-- Per product, the latest date found across all rows, where a row contributes its end_date,
-- or its start_date when end_date is NULL.
SELECT
    S.product,
    MAX(COALESCE(S.end_date, S.start_date)) AS relevant_date
FROM dbo.Subscriptions AS S
GROUP BY S.product
This should do it:
-- Per product: if any row has no end_date, return the latest start_date; otherwise return
-- the latest end_date. Column and table names follow the CREATE TABLE statement
-- (Subscriptions with start_date / end_date).
SELECT s1.product,
       MAX(CASE WHEN s2.useStartDate = 1 THEN s1.start_date ELSE s1.end_date END) AS SubscriptionDate
FROM Subscriptions AS s1
INNER JOIN (
    -- useStartDate = 1 when the product has at least one row without an end_date.
    SELECT s2s1.product,
           MAX(CASE WHEN s2s1.end_date IS NULL THEN 1 ELSE 0 END) AS useStartDate
    FROM Subscriptions AS s2s1
    GROUP BY s2s1.product
) AS s2
    ON s1.product = s2.product
GROUP BY s1.product

t-sql group by category and get top n values

Imagine I have this table:
Month | Person | Value
----------------------
Jan | P1 | 1
Jan | P2 | 2
Jan | P3 | 3
Feb | P1 | 5
Feb | P2 | 4
Feb | P3 | 3
Feb | P4 | 2
...
How can I build a t-sql query to get the top 2 value rows and a third with the sum of others?
Something like this:
RESULT:
Month | Person | Value
----------------------
Jan | P3 | 3
Jan | P2 | 2
Jan | Others | 1 -(sum of the bottom value - in this case (Jan, P1, 1))
Feb | P1 | 5
Feb | P2 | 4
Feb | Others | 5 -(sum of the bottom values - in this case (Feb, P3, 3) and (Feb, P4, 2))
Thanks
In the assumption you are using SQL Server 2005 or higher, using a CTE would do the trick.
Attach a ROW_NUMBER to each row, starting with the highest value, resetting for each month.
SELECT the top 2 rows for each month from this query (rownumber <= 2)
UNION with the remaining rows (rownumber > 2)
SQL Statement
-- Top 2 values per month plus one 'Others' row summing everything ranked below them.
;WITH Months (Month, Person, Value) AS (
    -- Inline sample data.
    SELECT 'Jan', 'P1', 1 UNION ALL
    SELECT 'Jan', 'P2', 2 UNION ALL
    SELECT 'Jan', 'P3', 3 UNION ALL
    SELECT 'Feb', 'P1', 5 UNION ALL
    SELECT 'Feb', 'P2', 4 UNION ALL
    SELECT 'Feb', 'P3', 3 UNION ALL
    SELECT 'Feb', 'P4', 2
),
ranked AS (
    -- Rank the rows of each month from the highest value down.
    SELECT Month,
           Person,
           Value,
           ROW_NUMBER() OVER (PARTITION BY Month ORDER BY Value DESC) AS RowNumber
    FROM Months
)
SELECT combined.Month,
       combined.Person,
       combined.Value
FROM (
    -- The two best rows of every month, unchanged.
    SELECT Month, Person, Value, RowNumber
    FROM ranked
    WHERE RowNumber <= 2
    UNION ALL
    -- Everything below rank 2, collapsed to one row per month; MAX(RowNumber) is only used
    -- to sort this row after the top two.
    SELECT Month, 'Others' AS Person, SUM(Value), MAX(RowNumber)
    FROM ranked
    WHERE RowNumber > 2
    GROUP BY Month
) AS combined
ORDER BY combined.Month DESC,
         combined.RowNumber
Kudos go to Andriy for teaching me some new tricks.
-- Top 2 values per month plus an 'Others' row, done with a single GROUP BY: ranks 1 and 2
-- keep their own group number, every lower rank is folded into group 3.
;WITH atable (Month, Person, Value) AS (
    -- Inline sample data.
    SELECT 'Jan', 'P1', 1 UNION ALL
    SELECT 'Jan', 'P2', 2 UNION ALL
    SELECT 'Jan', 'P3', 3 UNION ALL
    SELECT 'Feb', 'P1', 5 UNION ALL
    SELECT 'Feb', 'P2', 4 UNION ALL
    SELECT 'Feb', 'P3', 3 UNION ALL
    SELECT 'Feb', 'P4', 2
),
ranked AS (
    SELECT Month,
           Person,
           Value,
           ROW_NUMBER() OVER (PARTITION BY Month ORDER BY Value DESC) AS rownum
    FROM atable
),
bucketed AS (
    SELECT Month,
           Person,
           Value,
           CASE WHEN rownum >= 3 THEN 3 ELSE rownum END AS Grp
    FROM ranked
)
SELECT Month,
       -- Groups 1 and 2 hold exactly one row each, so MAX(Person) is that row's person.
       CASE WHEN Grp = 3 THEN 'Others' ELSE MAX(Person) END AS Person,
       SUM(Value) AS Value
FROM bucketed
GROUP BY Month, Grp
ORDER BY Month DESC, Grp
-- Top 2 values per month plus an 'Others' row: label each row with its own person when it
-- ranks 1 or 2 within the month and with 'Others' otherwise, then sum per month and label.
WITH NTable AS (
    SELECT [Month],
           Person,
           Value,
           ROW_NUMBER() OVER (PARTITION BY [Month] ORDER BY Value DESC) AS Rownumber
    FROM MyTable
),
labelled AS (
    SELECT [Month],
           CASE WHEN Rownumber IN (1, 2) THEN Person ELSE 'Others' END AS Person,
           Value
    FROM NTable
)
SELECT l.[Month],
       l.Person,
       SUM(l.Value) AS [Sum]
FROM labelled AS l
GROUP BY l.[Month], l.Person
ORDER BY l.[Month]