Count distinct values with OVER(PARTITION BY id)

Count distinct values with OVER(PARTITION BY id) - postgresql

Is it possible to count distinct values in conjunction with window functions like OVER(PARTITION BY id)? Currently my query is as follows:
SELECT congestion.date, congestion.week_nb, congestion.id_congestion,
congestion.id_element,
ROW_NUMBER() OVER(
PARTITION BY congestion.id_element
ORDER BY congestion.date),
COUNT(DISTINCT congestion.week_nb) OVER(
PARTITION BY congestion.id_element
) AS week_count
FROM congestion
WHERE congestion.date >= '2014.01.01'
AND congestion.date <= '2014.12.31'
ORDER BY id_element, date
However, when I try to execute the query I get the following error:
"COUNT(DISTINCT": "DISTINCT is not implemented for window functions"

No, as the error message states, DISTINCT is not implemented with windows functions. Aplying info from this link into your case you could use something like:
WITH uniques AS (
SELECT congestion.id_element, COUNT(DISTINCT congestion.week_nb) AS unique_references
FROM congestion
WHERE congestion.date >= '2014.01.01'
AND congestion.date <= '2014.12.31'
GROUP BY congestion.id_element
)
SELECT congestion.date, congestion.week_nb, congestion.id_congestion,
congestion.id_element,
ROW_NUMBER() OVER(
PARTITION BY congestion.id_element
ORDER BY congestion.date),
uniques.unique_references AS week_count
FROM congestion
JOIN uniques USING (id_element)
WHERE congestion.date >= '2014.01.01'
AND congestion.date <= '2014.12.31'
ORDER BY id_element, date
Depending on the situation you could also put a subquery straight into SELECT-list:
SELECT congestion.date, congestion.week_nb, congestion.id_congestion,
congestion.id_element,
ROW_NUMBER() OVER(
PARTITION BY congestion.id_element
ORDER BY congestion.date),
(SELECT COUNT(DISTINCT dist_con.week_nb)
FROM congestion AS dist_con
WHERE dist_con.date >= '2014.01.01'
AND dist_con.date <= '2014.12.31'
AND dist_con.id_element = congestion.id_element) AS week_count
FROM congestion
WHERE congestion.date >= '2014.01.01'
AND congestion.date <= '2014.12.31'
ORDER BY id_element, date

If you are counting distinct numbers, you can use other aggregate functions to acheive the same effect, like so.
select
initial.id,
initial.val,
joined.id,
array_length(uniq(sort(array_agg(joined.some_number) over (partition by initial.id))), 1) as distinct_count
from
(values (1,'a'), (2,'b'), (3,'c')) initial(id, val)
left join (values (1, 1),
(1, 1),
(1, 3),
(2, 2),
(2, 2),
(3, 3),
(3, 3),
(3, 3),
(3, 4)) joined(id, some_number) on joined.id = initial.id
;
id val id distinct_count
1 a 1 2
1 a 1 2
1 a 1 2
2 b 2 1
2 b 2 1
3 c 3 2
3 c 3 2
3 c 3 2
3 c 3 2

I find that the easiest way is to use a subquery/CTE and conditional aggregation:
SELECT
c.date,
c.week_nb,
c.id_congestion,
c.id_element,
ROW_NUMBER() OVER (PARTITION BY c.id_element ORDER BY c.date),
(
CASE WHEN seqnum = 1 THEN
1
ELSE
0
END) AS week_count
FROM (
SELECT
c.*,
ROW_NUMBER() OVER (PARTITION BY c.congestion.id_element, c.week_nb ORDER BY c.date) AS seqnum
FROM
congestion c) c
WHERE
c.date >= '2014.01.01'
AND c.date <= '2014.12.31'
ORDER BY
id_element,
date

Make partitioned set smaller, up to the point there is no duplicates over counted field :
SELECT congestion.date, congestion.week_nb, congestion.id_congestion,
congestion.id_element,
ROW_NUMBER() OVER(
PARTITION BY congestion.id_element
ORDER BY congestion.date),
COUNT(congestion.week_nb) -- remove distinct
OVER(
PARTITION BY congestion.id_element,
-- add new fields which will restart counter in case duplication
congestion.id_congestion
) AS week_count
FROM congestion
WHERE congestion.date >= '2014.01.01'
AND congestion.date <= '2014.12.31'
ORDER BY id_element, date

Since this is the first result that pops up from Google, I'll add this reproducible example, similar to Gordon's answer:
Let's first start with creating a sample table:
WITH test as
(
SELECT *
FROM (VALUES
(1, 'A'),
(1, 'A'),
(2, 'B'),
(2, 'B'),
(2, 'D'),
(3, 'C'),
(3, 'C'),
(3, 'C'),
(3, 'E'),
(3, 'F')) AS t (id_element, week_nb)
)
select * from test
This yields:
id_element week_nb
1 A
1 A
2 B
2 B
2 D
3 C
3 C
3 C
3 E
3 F
Then, doing something like:
select
id_element,
week_nb,
sum(first_row_in_sequence) over (partition by id_element) as distinct_week_nb_count
from
(
select
id_element,
week_nb,
case when row_number() over (partition by id_element, week_nb) = 1 then 1 else 0 end as first_row_in_sequence
from test
) as sub
yields
id_element week_nb distinct_week_nb_count
1 A 1
1 A 1
2 B 2
2 B 2
2 D 2
3 C 3
3 C 3
3 C 3
3 E 3
3 F 3

Related

How can I increment the numerical value in my WHERE clause using a loop?

I am currently using the UNION ALL workaround below to calculate old_eps_tfq regression slopes of each ticker based off its corresponding rownum value (see WHERE rownum < x). I am interested to know what the old_eps_tfq is when rownum < 4 then increment 4 by 1 to find out what old_eps_tfq is when rownum < 5, and so on (there are ~20 rownum)
Could I use PL/pgSQL for this?
SELECT * FROM(
WITH regression_slope AS(
SELECT
ROW_NUMBER() OVER ( PARTITION BY ticker ORDER BY earnings_growths_ped) AS rownum,
*
FROM "ANALYTICS"."vEARNINGS_GROWTHS"
--WHERE ticker = 'ACN'
ORDER BY ticker )
SELECT
ticker,
current_period_end_date,
max(earnings_growths_ped) AS max_earnings_growths_ped,
--max(rownum) AS max_rownum,
round(regr_slope(old_eps_tfq, rownum)::numeric, 2) AS slope,
round(regr_intercept(old_eps_tfq, rownum)::numeric, 2) AS y_intercept,
round(regr_r2(old_eps_tfq, rownum)::numeric, 3) AS r_squared
FROM regression_slope
WHERE rownum < 4
GROUP BY ticker, current_period_end_date
ORDER BY ticker asc ) q
UNION ALL
SELECT * FROM(
WITH regression_slope AS(
SELECT
ROW_NUMBER() OVER ( PARTITION BY ticker ORDER BY earnings_growths_ped) AS rownum,
*
FROM "ANALYTICS"."vEARNINGS_GROWTHS"
--WHERE ticker = 'ACN'
ORDER BY ticker )
SELECT
ticker,
current_period_end_date,
max(earnings_growths_ped) AS max_earnings_growths_ped,
--max(rownum) AS max_rownum,
round(regr_slope(old_eps_tfq, rownum)::numeric, 2) AS slope,
round(regr_intercept(old_eps_tfq, rownum)::numeric, 2) AS y_intercept,
round(regr_r2(old_eps_tfq, rownum)::numeric, 3) AS r_squared
FROM regression_slope
WHERE rownum < 5
GROUP BY ticker, current_period_end_date
ORDER BY ticker asc ) q
Here is my table

The top query SELECT * FROM (...) q sounds like useless.
Then you can try this :
WITH regression_slope AS(
SELECT
ROW_NUMBER() OVER ( PARTITION BY ticker ORDER BY earnings_growths_ped) AS rownum,
*
FROM "ANALYTICS"."vEARNINGS_GROWTHS"
--WHERE ticker = 'ACN'
ORDER BY ticker )
SELECT
max,
ticker,
current_period_end_date,
max(earnings_growths_ped) AS max_earnings_growths_ped,
--max(rownum) AS max_rownum,
round(regr_slope(old_eps_tfq, rownum)::numeric, 2) AS slope,
round(regr_intercept(old_eps_tfq, rownum)::numeric, 2) AS y_intercept,
round(regr_r2(old_eps_tfq, rownum)::numeric, 3) AS r_squared
FROM regression_slope
INNER JOIN generate_series(4, 24) AS max -- the range 4 to 24 can be adjusted to the need
ON rownum < max
GROUP BY max, ticker, current_period_end_date
ORDER BY max asc, ticker asc

postgres aggregate subset from group by rows

I'm trying to evaluate user loyalty bonuses balance when bonuses burns after half-year inactivity. I want my sum consist of ord's 4, 5 and 6 for user 1.
create table transactions (
user int,
ord int, -- transaction date replacement
amount int,
lag interval -- after previous transaction
);
insert into transactions values
(1, 1, 10, '1h'::interval),
(1, 2, 10, '.5y'::interval),
(1, 3, 10, '1h'::interval),
(1, 4, 10, '.5y'::interval),
(1, 5, 10, '.1h'::interval),
(1, 6, 10, '.1h'::interval),
(2, 1, 10, '1h'::interval),
(2, 2, 10, '.5y'::interval),
(2, 3, 10, '.1h'::interval),
(2, 4, 10, '.1h'::interval),
(3, 1, 10, '1h'::interval),
;
select user, sum(
amount -- but starting from last '.5y'::interval if any otherwise everything counts
) from transactions group by user
user | sum(amount)
--------------------
1 | 30 -- (4+5+6), not 50, not 60
2 | 30 -- (2+3+4), not 40
3 | 10

try this:
with cte as(
select *,
case when (lead(lag) over (partition by user_ order by ord)) >= interval '.5 year'
then 1 else 0 end "flag" from test
),
cte1 as (
select *,
case when flag=(lag(flag,1) over (partition by user_ order by ord)) then 0 else 1 end "flag1" from cte
)
select distinct on (user_) user_, sum(amount) over (partition by user_,grp order by ord) from (
select *, sum(flag1) over (partition by user_ order by ord) "grp" from cte1) t1
order by user_ , ord desc
DEMO
Though it is very complicated and slow but resolve your problem

Is this what you're looking for ?
with last_5y as(
select "user", max(ord) as ord
from transactions
where lag = '.5y'::interval group by "user"
) select t.user, sum(amount)
from transactions t, last_5y t2
where t.user = t2.user and t.ord >= t2.ord
group by t.user

Checking Slowly Changing Dimension 2

I have a table that looks like this:
A slowly changing dimension type 2, according to Kimball.
Key is just a surrogate key, a key to make rows unique.
As you can see there are three rows for product A.
Timelines for this product are ok. During time the description of the product changes.
From 1-1-2020 up until 4-1-2020 the description of this product was ProdA1.
From 5-1-2020 up until 12-2-2020 the description of this product was ProdA2 etc.
If you look at product B, you see there are gaps in the timeline.
We use DB2 V12 z/Os. How can I check if there are gaps in the timelines for each and every product?
Tried this, but doesn't work
with selectie (key, tel) as
(select product, count(*)
from PROD_TAB
group by product
having count(*) > 1)
Select * from
PROD_TAB A
inner join selectie B
on A.product = B.product
Where not exists
(SELECT 1 from PROD_TAB C
WHERE A.product = C.product
AND A.END_DATE + 1 DAY = C.START_DATE
)
Does anyone know the answer?

The following query returns all gaps for all products.
The idea is to enumerate (RN column) all periods inside each product by START_DATE and join each record with its next period record.
WITH
/*
MYTAB (PRODUCT, DESCRIPTION, START_DATE, END_DATE) AS
(
SELECT 'A', 'ProdA1', DATE('2020-01-01'), DATE('2020-01-04') FROM SYSIBM.SYSDUMMY1
UNION ALL SELECT 'A', 'ProdA2', DATE('2020-01-05'), DATE('2020-02-12') FROM SYSIBM.SYSDUMMY1
UNION ALL SELECT 'A', 'ProdA3', DATE('2020-02-13'), DATE('2020-12-31') FROM SYSIBM.SYSDUMMY1
UNION ALL SELECT 'B', 'ProdB1', DATE('2020-01-05'), DATE('2020-01-09') FROM SYSIBM.SYSDUMMY1
UNION ALL SELECT 'B', 'ProdB2', DATE('2020-01-12'), DATE('2020-03-14') FROM SYSIBM.SYSDUMMY1
UNION ALL SELECT 'B', 'ProdB3', DATE('2020-03-15'), DATE('2020-04-18') FROM SYSIBM.SYSDUMMY1
UNION ALL SELECT 'B', 'ProdB4', DATE('2020-04-16'), DATE('2020-05-03') FROM SYSIBM.SYSDUMMY1
)
,
*/
MYTAB_ENUM AS
(
SELECT
T.*
, ROWNUMBER() OVER (PARTITION BY PRODUCT ORDER BY START_DATE) RN
FROM MYTAB T
)
SELECT A.PRODUCT, A.END_DATE + 1 START_DT, B.START_DATE - 1 END_DT
FROM MYTAB_ENUM A
JOIN MYTAB_ENUM B ON B.PRODUCT = A.PRODUCT AND B.RN = A.RN + 1
WHERE A.END_DATE + 1 <> B.START_DATE
AND A.END_DATE < B.START_DATE;
The result is:
|PRODUCT|START_DT |END_DT |
|-------|----------|----------|
|B |2020-01-10|2020-01-11|
May be more efficient way:
WITH MYTAB2 AS
(
SELECT
T.*
, LAG(END_DATE) OVER (PARTITION BY PRODUCT ORDER BY START_DATE) END_DATE_PREV
FROM MYTAB T
)
SELECT PRODUCT, END_DATE_PREV + 1 START_DATE, START_DATE - 1 END_DATE
FROM MYTAB2
WHERE END_DATE_PREV + 1 <> START_DATE
AND END_DATE_PREV < START_DATE;

Thnx Mark, will try this one of these days.
Never heard of LAG in DB2 V12 for z/Os
Will read about it
Thnx

Postgresql dense ranking to start at 2 if there is an initial tie at 1

So i have a table and a query that ranks the cost of items and doesn't allows ties with position 1, if there is a tie at position 1 the ranking starts at 2.
Here is the schema with a sample data
CREATE TABLE applications
(id int, name char(10), cost int);
INSERT INTO applications
(id, name, cost)
VALUES
(1, 'nfhfjs', 10),
(2, 'oopdld', 20),
(3, 'Wedass', 14),
(4, 'djskck', 22),
(5, 'laookd', 25),
(6, 'mfjjf', 25),
(7, 'vfhgg', 28),
(8, 'nvopq', 29),
(9, 'nfhfj', 56),
(10, 'voapp', 56);
Here is the query
WITH start_tie AS (
SELECT
DENSE_RANK() OVER(ORDER BY cost DESC) cost_rank,
lead(cost,1) OVER (ORDER BY cost DESC) as next_app_cost
FROM
applications LIMIT 1
)
SELECT
*,
DENSE_RANK() OVER(ORDER BY cost DESC) cost_rank,
(CASE start_tie.cost_rank WHEN start_tie.next_app_cost THEN cost_rank+1 ELSE cost_rank END) AS right_cost_rank
FROM
applications;
my expected result is
id name cost cost_rank
10 voapp 56 2
9 nfhfj 56 2
8 nvopq 29 3
7 vfhgg 28 4
6 mfjjf 25 5
5 laookd 25 5
4 djskck 22 6
2 oopdld 20 7
3 Wedass 14 8
1 nfhfjs 10 9
Please modify the query to achieve the result.
SQL FIDDLE

All you need to do is to check if the highest cost is the same as the second-highest cost. And if that is the case, add 1 to all rank values:
with start_tie as (
select case
when cost = lead(cost) over (order by cost desc) then 1
else 0
end as tie_offset
from applications
order by cost desc
limit 1
)
select *,
dense_rank() over (order by cost desc) + (select tie_offset from start_tie) cost_rank
from applications;
Example: http://rextester.com/EKSLJK65530
If the number of ties defines the offset to be used for the "new" ranking, the offset could be calculated using this:
with start_tie as (
select count(*) - 1 as tie_offset
from applications a1
where cost = (select max(cost) from applications)
)
select *,
dense_rank() over(order by cost desc) + (select tie_offset from start_tie) cost_rank
from applications;

No tie at first, means more than one with rank 1
replace r.cost_rank+x.c-1 with r.cost_rank+1 if fixed start at 2 rank to regardless of how many are in tie ranks are
WITH r AS (
SELECT
*
,DENSE_RANK() OVER(ORDER BY cost DESC) cost_rank
FROM
applications
), x as (select count(*) as c from r where cost_rank=1)
SELECT
r.*, (CASE WHEN 1<x.c THEN r.cost_rank+x.c-1 ELSE r.cost_rank END) as fixed
FROM
r,x;

Cumulative time difference per group

I have a table structure like below
DECLARE #XTable TABLE
(
ColA Varchar(20),
ColB Varchar(20),
DateCol DATE
)
INSERT INTO #XTable
VALUES
('A', 'X1', '4/1/2015'), ('A', 'X2', '4/10/2015'), ('A', 'X3', '4/12/2015'),
('A', 'X4', '4/16/2015'), ('B', 'X1', '5/18/2015'), ('B', 'X2', '5/20/2015')
Expected output:
/*
ColA ColB DateCol Diff
A X1 4/1/2015 0
A X2 4/10/2015 9
A X3 4/12/2015 2
A X3 4/12/2015 11
A X4 4/16/2015 15
A X4 4/16/2015 5
A X4 4/16/2015 4
B X1 5/18/2015 0
B X2 5/20/2015 12
*/
for example:
A X4 will have a difference of date from A X1, A X2 and A X3
& A X3 will have a difference of date from A X1 & A X2
I am able to get difference from last row via below query
;WITH Dataf
AS (
SELECT
*,
ROW_NUMBER() OVER (ORDER BY ColA,ColB, DateCol) AS RowNum
FROM
#XTable
)
SELECT a.ColA, a.ColB, SUM(DATEDIFF(Dd,b.DateCol,a.DateCol)) as TotalTime
FROM
Dataf AS A
LEFT OUTER JOIN Dataf AS B
ON A.RowNum = B.RowNum + 1 and a.ColA = b.ColA
GROUP BY a.ColA, a.ColB
Thought of applying multiple CTE, here is on what I am working now
;WITH Dataf
AS (
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY ColA ORDER BY DateCol) AS RowNum
FROM
#XTable
),
CTE AS
(
SELECT ColA, ColB, DateCol, RowNum, NULL AS DateDifference
FROM Dataf WHERE RowNum = 1
UNION ALL
SELECT DF.ColA, DF.ColB, DF.DateCol, DF.RowNum ,
DATEDIFF(DD, CT.DateCol, DF.DateCol) AS DateDifference
FROM Dataf DF
JOIN CTE CT ON DF.ColA = CT.ColA AND DF.RowNum = CT.RowNum + 1
)
SELECT *
FROM CTE
ORDER BY ColA

You can instead use LEFT OUTER JOIN to the CTE you prepared. Here is how it can be done
;WITH DataForm
AS (
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY Cola ORDER BY DateCol) AS RowNum
FROM
#XTable
)
SELECT ColA, ColB, DateCol, 0
FROM DataForm WHERE RowNum = 1
UNION
SELECT T1.ColA, T1.ColB, T1.DateCol
, DATEDIFF(dd,T2.DateCol, T1.Datecol)
FROM DataForm T1
LEFT OUTER JOIN DataForm T2 ON T1.ColA = T2.ColA
AND T1.RowNum >= T2.RowNum
WHERE DATEDIFF(dd,T2.DateCol, T1.Datecol) > 0
SQL Fiddler Example