sql window function to detect change in column values - postgresql

I'd like to detect changes in column values in this (example) db
-- Sample input: one tuple per (id, row); event is the 0/1 flag being tracked.
WITH events (id, row, event) AS (
    VALUES
        (1, 1, 0),
        (1, 2, 0),
        (1, 3, 1),
        (1, 4, 0),
        (1, 5, 1),
        (2, 1, 0),
        (2, 2, 1),
        (3, 1, 0),
        (3, 2, 0)
)
-- The star is kept on purpose: "row" cannot be written as a bare column reference.
SELECT *
FROM events
What I am looking for is code for a new column 'code' which switches to 1 AFTER
the event column shows a 1. Within the same id the code then stays 1.
For this example the new column will look like this:
-- Desired output: code becomes 1 on the row AFTER the first event = 1
-- within an id, and remains 1 for the rest of that id.
WITH events2 (id, row, event, code) AS (
    VALUES
        (1, 1, 0, 0),
        (1, 2, 0, 0),
        (1, 3, 1, 0),
        (1, 4, 0, 1),  -- notice the switch here
        (1, 5, 1, 1),
        (2, 1, 0, 0),
        (2, 2, 1, 0),
        (3, 1, 0, 0),
        (3, 2, 0, 0)
)
SELECT *
FROM events2
I have a hunch that the answer will be related to the answer on this question : PostgreSQL window function: partition by comparison
Somehow I cannot figure this out myself..
Peter

COALESCE over a scalar subquery:
-- For every row, oevent is 1 when an EARLIER row (smaller zrow) of the same id
-- already had zevent > 0, and 0 otherwise.
WITH events(id, zrow, zevent) AS (
    VALUES
    (1,1, 0 ) ,(1,2, 0 ) ,(1,3, 1 ) ,(1,4, 0 ) ,(1,5, 1 )
    ,(2,1, 0 ) ,(2,2, 1 )
    ,(3,1, 0 ) ,(3,2, 0 )
)
SELECT ev.id, ev.zrow, ev.zevent
    , COALESCE(
        ( SELECT 1
          FROM events ex
          WHERE ex.id = ev.id
            AND ex.zrow < ev.zrow
            AND ex.zevent > 0
          -- Several earlier rows may qualify; a scalar subquery must return at
          -- most one row, otherwise the statement fails at run time.
          LIMIT 1
        )
        , 0   -- no earlier event: the subquery yields NULL
      ) AS oevent
FROM events ev
;
Or, avoid the COALESCE() by typecasting the boolean EXISTS() to INTEGER:
-- Same result as the COALESCE variant: the boolean produced by EXISTS is
-- converted to 0/1, so no NULL handling is required.
WITH events (id, zrow, event) AS (
    VALUES
        (1, 1, 0), (1, 2, 0), (1, 3, 1), (1, 4, 0), (1, 5, 1),
        (2, 1, 0), (2, 2, 1),
        (3, 1, 0), (3, 2, 0)
)
SELECT
    cur.id,
    cur.zrow,
    cur.event,
    CAST(
        EXISTS (
            SELECT 1
            FROM events AS prev
            WHERE prev.id = cur.id
              AND prev.zrow < cur.zrow
              AND prev.event > 0
        ) AS integer
    ) AS oevent
FROM events AS cur
;
Find the MAX() value over the previous records within the same group (frame):
-- Window variant: take the largest event value among the strictly preceding
-- rows of the same id; the first row of each id has no such rows and gets NULL,
-- which the outer query turns into 0.
WITH events (id, zrow, event) AS (
    VALUES
        (1, 1, 0), (1, 2, 0), (1, 3, 1), (1, 4, 0), (1, 5, 1),
        (2, 1, 0), (2, 2, 1),
        (3, 1, 0), (3, 2, 0)
)
, prior_peak AS (
    SELECT
        src.id,
        src.zrow,
        src.event,
        MAX(src.event) OVER earlier_rows AS lagged
    FROM events AS src
    WINDOW earlier_rows AS (
        PARTITION BY src.id
        ORDER BY src.zrow
        ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    )
)
SELECT
    pp.id,
    pp.zrow,
    pp.event,
    COALESCE(pp.lagged, 0) AS oevent
FROM prior_peak AS pp
;
The same without the extra CTE:
-- Single-query form of the window approach: the NULL produced for the first
-- row of each id is replaced by 0 right where the window aggregate is computed.
WITH events (id, zrow, event) AS (
    VALUES
        (1, 1, 0), (1, 2, 0), (1, 3, 1), (1, 4, 0), (1, 5, 1),
        (2, 1, 0), (2, 2, 1),
        (3, 1, 0), (3, 2, 0)
)
SELECT
    src.id,
    src.zrow,
    src.event,
    COALESCE(MAX(src.event) OVER earlier_rows, 0) AS lagged
FROM events AS src
WINDOW earlier_rows AS (
    PARTITION BY src.id
    ORDER BY src.zrow
    ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
)
;
Another way to perform the self-join would be to use a recursive query.

Related

Max(case when) to expand indicator from single row to group of rows

I need to expand the indicator (currently on daily basis) to a larger group (groups multiple consecutive days into one grp). I have following type of data:
id date grp new_ind traditional_ind
--------------------------------------------------
1 02-01-2021 1 1 0
1 02-02-2021 1 0 1
1 02-03-2021 1 0 0
1 02-04-2021 1 null null
1 02-06-2021 2 0 1
1 02-07-2021 2 0 0
2 02-01-2021 1 null null
where new_ind and traditional_ind are mutually exclusive. With this, I am trying to create new indicator that expands the indicators that are currently on daily level to grp level, that will look like:
id date grp new_ind traditional_ind final_type
----------------------------------------------------------------
1 02-01-2021 1 1 0 new
1 02-02-2021 1 0 1 new
1 02-03-2021 1 0 0 new
1 02-04-2021 1 null null new
1 02-06-2021 2 0 1 traditional
1 02-07-2021 2 0 0 traditional
2 02-01-2021 1 null null none
basically,
if new_ind was ever 1, I want to flag entire grp as 'new'
if new_ind=0 and if traditional_ind is ever 1, flag entire grp as 'traditional'
if both indicators were null, then flag entire grp as 'none'
so that each id and grp can have single value of final_type.
I've tried:
-- Labels every row, then keeps the greatest label per (id, grp).
-- NOTE(review): MAX over text compares the strings themselves, and 'None'
-- sorts after 'New', so any group containing a row that falls into the ELSE
-- branch ends up as 'None' rather than 'New' -- consistent with the output
-- shown below. 'Traditional' sorts after 'None' and therefore still wins.
max(case when new_ind = 1 then 'New'
when traditional_ind = 1 then 'Traditional'
else 'None' end) over (partition by id, grp) as final_type
but this wouldn't recognize when new_ind=1 then 'New' and flag all of new_ind = 1 as 'None' (but show traditional correctly):
id date grp new_ind traditional_ind final_type
----------------------------------------------------------------
1 02-01-2021 1 1 0 none
1 02-02-2021 1 0 1 none
1 02-03-2021 1 0 0 none
1 02-04-2021 1 null null none
1 02-06-2021 2 0 1 traditional
1 02-07-2021 2 0 0 traditional
2 02-01-2021 1 null null none
But if I remove else statement and only run:
-- Without an ELSE branch, rows matching neither WHEN yield NULL, which MAX
-- ignores; a group in which no row matches therefore produces NULL overall.
max(case when new_ind = 1 then 'New'
when traditional_ind = 1 then 'Traditional'
end) over (partition by id, grp) as final_type
then this does accurately expand indicator as I hope, just returns null values (which I need to show as 'None' instead of nulls):
id date grp new_ind traditional_ind final_type
----------------------------------------------------------------
1 02-01-2021 1 1 0 new
1 02-02-2021 1 0 1 new
1 02-03-2021 1 0 0 new
1 02-04-2021 1 null null new
1 02-06-2021 2 0 1 traditional
1 02-07-2021 2 0 0 traditional
2 02-01-2021 1 null null null
Can anyone help identify issue with my max case when statement?
I think something like this should work:
-- Step 1: derive a single label per (id, grp).
--   'New'          if any row of the group has new_ind = 1
--   'Traditional'  otherwise, if any row has traditional_ind = 1
--   'None'         otherwise (including groups whose indicators are all NULL)
WITH final_types AS (
    SELECT
        src.id,
        src.grp,
        CASE
            WHEN bool_or(src.new_ind = 1)         THEN 'New'
            WHEN bool_or(src.traditional_ind = 1) THEN 'Traditional'
            ELSE 'None'
        END AS final_type
    FROM your_table AS src
    GROUP BY
        src.id,
        src.grp
)
-- Step 2: attach the group label to every original row of that group.
SELECT
    detail.*,
    label.final_type
FROM your_table AS detail
INNER JOIN final_types AS label
    ON  label.id  = detail.id
    AND label.grp = detail.grp

Improve performance on CTE with sub-queries

I have a table with this structure:
WorkerID Value GroupID Sequence Validity
1 '20%' 1 1 2018-01-01
1 '10%' 1 1 2017-06-01
1 'Yes' 1 2 2017-06-01
1 '2018-01-01' 2 1 2017-06-01
1 '17.2' 2 2 2017-06-01
2 '10%' 1 1 2017-06-01
2 'No' 1 2 2017-06-01
2 '2016-03-01' 2 1 2017-06-01
2 '15.9' 2 2 2017-06-01
This structure was created so that the client can create customized data for a worker. For example Group 1 can be something like "Salary" and Sequence is one value that belongs to that Group like "Overtime Compensation". The column Value is a VARCHAR(150) field and the correct validation and conversion is done in another part of the application.
The Validity column exist mainly for historical reasons.
Now I would like to show, for the different workers, the information in a grid where each row should be one worker (displaying the one with the most recent Validity):
Worker 1_1 1_2 2_1 2_2
1 20% Yes 2018-01-01 17.2
2 10% No 2016-03-01 15.9
To accomplish this I created a CTE that looks like this:
-- Builds one output column per (GroupID, Sequence) pair, each taking the Value
-- with the most recent Validity via its own TOP 1 subquery.
-- NOTE(review): as shown, the outer SELECT has no FROM clause and no statement
-- follows the CTE before GO, so this excerpt looks incomplete -- confirm
-- against the real query.
-- NOTE(review): none of the subqueries filters on the worker, so each would
-- return the same value for every worker; presumably a
-- "w.WorkerID = <outer worker>" predicate was omitted -- verify.
-- NOTE(review): aliases such as 1_1 start with a digit and likely need
-- bracket quoting ([1_1]) in T-SQL.
WITH CTE_worker_grid
AS
(
SELECT
worker,
/* 1 */
(
SELECT top 1 w.Value
FROM worker_values AS w
WHERE w.GroupID = 1
AND w.Sequence = 1
ORDER BY w.Validity DESC
) AS 1_1,
(
SELECT top 1 w.Value
FROM worker_values AS w
WHERE w.GroupID = 1
AND w.Sequence = 2
ORDER BY w.Validity DESC
) AS 1_2,
/* 2 */
(
SELECT top 1 w.Value
FROM worker_values AS w
WHERE w.GroupID = 2
AND w.Sequence = 1
ORDER BY w.Validity DESC
) AS 2_1,
(
SELECT top 1 w.Value
FROM worker_values AS w
WHERE w.GroupID = 2
AND w.Sequence = 2
ORDER BY w.Validity DESC
) AS 2_2
)
GO
This produces the correct result, but it's very slow because it creates this grid for over 18,000 workers with almost 30 Groups and up to 20 Sequences in each Group.
How could one speed up the process of a CTE of this magnitude? Should CTE even be used? Can the sub-queries be changed or re-factored out to speed up the execution?
Use a PIVOT!
+----------+---------+---------+------------+---------+
| WorkerId | 001_001 | 001_002 | 002_001 | 002_002 |
+----------+---------+---------+------------+---------+
| 1 | 20% | Yes | 2018-01-01 | 17.2 |
| 2 | 10% | No | 2016-03-01 | 15.9 |
+----------+---------+---------+------------+---------+
SQL Fiddle: http://sqlfiddle.com/#!18/6e768/1
-- Demo schema for the PIVOT example: one row per worker / group / sequence /
-- validity date, holding a free-form text value.
CREATE TABLE WorkerAttributes
(
      WorkerID   INT          NOT NULL
    -- The question states the source column is VARCHAR(150); a narrower
    -- column here would reject or truncate longer values.
    , [Value]    VARCHAR(150) NOT NULL
    , GroupID    INT          NOT NULL
    , [Sequence] INT          NOT NULL
    , Validity   DATE         NOT NULL
);

-- Sample rows taken from the question. Worker 1 has two versions of
-- attribute (1, 1); only the one with the latest Validity should be shown.
INSERT INTO WorkerAttributes
    (WorkerID, [Value], GroupID, [Sequence], Validity)
VALUES
      (1, '20%',        1, 1, '2018-01-01')
    , (1, '10%',        1, 1, '2017-06-01')
    , (1, 'Yes',        1, 2, '2017-06-01')
    , (1, '2018-01-01', 2, 1, '2017-06-01')
    , (1, '17.2',       2, 2, '2017-06-01')
    , (2, '10%',        1, 1, '2017-06-01')
    , (2, 'No',         1, 2, '2017-06-01')
    , (2, '2016-03-01', 2, 1, '2017-06-01')
    , (2, '15.9',       2, 2, '2017-06-01');
-- Latest value per worker / group / sequence, pivoted into one row per worker.
-- SMART_KEY is the zero-padded "GGG_SSS" label that becomes the column name.
;WITH LatestAttribute
AS
(
    SELECT
          Ranked.WorkerID
        , RIGHT('000' + CAST(Ranked.GroupID AS VARCHAR(3)), 3)
            + '_'
            + RIGHT('000' + CAST(Ranked.[Sequence] AS VARCHAR(3)), 3) AS SMART_KEY
        , Ranked.[Value]
    FROM
    (
        -- Number the versions of each attribute, newest Validity first.
        SELECT
              Src.WorkerID
            , Src.GroupID
            , Src.[Sequence]
            , Src.[Value]
            , ROW_NUMBER() OVER (
                  PARTITION BY Src.WorkerID, Src.GroupID, Src.[Sequence]
                  ORDER BY Src.Validity DESC
              ) AS VersionNumber
        FROM
            WorkerAttributes AS Src
    ) AS Ranked
    WHERE
        Ranked.VersionNumber = 1   -- keep only the most recent version
)
SELECT
      WorkerId
    , [001_001]
    , [001_002]
    , [002_001]
    , [002_002]
FROM
    LatestAttribute
PIVOT
(
    -- Exactly one row remains per key, so MAX merely unwraps that value.
    MAX([Value])
    FOR SMART_KEY IN ([001_001], [001_002], [002_001], [002_002])
) AS PVT

Optimize SQL statement for this Query?

I have two tables T1 and T2.
T1 have ID,F1,F2,F3,F4,F5,F6,F7,F8
T2 have ID,T1_ID,F1,F2,F3,F4,F5,F6,F7,F8,SUM
Examples Data for T1 and T2
T1
ID,F1,F2,F3,F4,F5,F6,F7,F8
1, 1, 2, 3, 0, 0, 5, 0, 0
2, 0, 0, 0, 1, 0, 4, 5, 0
3, 4, 1, 3, 2, 0, 0, 0, 5
4, 1 ,3, 4, 0, 0 ,0, 0, 0
5, 7, 2, 1, 3, 0, 0, 0, 0
.
.
.
T2
ID,T1_ID,F1,F2,F3,F4,F5,F6,F7,F8,SUM
1, 1, 2, 3, 5, 0, 0, 3, 0, 0,100
2, 5, 9, 8, 8, 1, 0, 0, 0, 0,200
3, 2, 0, 0, 0, 5, 0, 6, 6, 0,300
4, 1 ,3, 4, 2, 0 ,0, 3, 0, 0,255
5, 4, 8, 8, 8, 0, 0, 0, 0, 0,155
.
.
Select * from T2 where T1.F1....T1.F8 have (1 and 2 and 3)
query must return records 1,2,4
1, 1, 2, 3, 5, 0, 0, 3, 0, 0,100
2, 5, 9, 8, 8, 1, 0, 0, 0, 0,200
4, 1 ,3, 4, 2, 0 ,0, 3, 0, 0,255
I create this query
-- Asker's attempt: enumerate every placement of the values 1, 2 and 3 across
-- the T1 columns as OR-ed groups of three CASE predicates (elided with "." lines).
-- NOTE(review): "T2.F4.T2.F5" in the select list has a period where the other
-- columns are separated by commas -- presumably a typo for "T2.F4,T2.F5".
-- NOTE(review): T1_ID is used as a qualifier (T1_ID.F1), but in the join it
-- appears as a column of T2 and no table or alias of that name is declared;
-- presumably T1.F1 etc. was intended -- confirm.
Select T2.ID,T2.F1,T2.F2,T2.F3,T2.F4.T2.F5,T2.F6,T2.F7,T2.F8,T2.SUM,T1.ID
from T2
join T1 on T1.ID = T2.T1_ID
where
(CASE WHEN ( T1_ID.F1 = 1 ) THEN T2.F1 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F1 = 2 ) THEN T2.F1 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F1 = 3 ) THEN T2.F1 between 0 and 1000 end)
or
(CASE WHEN ( T1_ID.F1 = 1 ) THEN T2.F1 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F2 = 2 ) THEN T2.F2 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F1 = 3 ) THEN T2.F1 between 0 and 1000 end)
or
(CASE WHEN ( T1_ID.F1 = 1 ) THEN T2.F1 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F3 = 2 ) THEN T2.F3 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F1 = 3 ) THEN T2.F1 between 0 and 1000 end)
or
(CASE WHEN ( T1_ID.F1 = 1 ) THEN T2.F1 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F4 = 2 ) THEN T2.F4 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F1 = 3 ) THEN T2.F1 between 0 and 1000 end)
.
.
.
or
(CASE WHEN ( T1_ID.F2 = 1 ) THEN T2.F2 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F1 = 2 ) THEN T2.F1 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F1 = 3 ) THEN T2.F1 between 0 and 1000 end)
or
(CASE WHEN ( T1_ID.F2 = 1 ) THEN T2.F2 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F2 = 2 ) THEN T2.F2 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F1 = 3 ) THEN T2.F1 between 0 and 1000 end)
or
(CASE WHEN ( T1_ID.F2 = 1 ) THEN T2.F2 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F3 = 2 ) THEN T2.F3 between 0 and 1000 end)
and(CASE WHEN ( T1_ID.F1 = 3 ) THEN T2.F1 between 0 and 1000 end)
.
.
.
This statement is far too big.
How can I optimize it?
-- Return the T2 rows whose parent T1 row contains ALL of the values 1, 2 and 3
-- somewhere in its columns F1..F8.
SELECT T2.*
FROM T2
WHERE EXISTS (
    SELECT N.ID
    FROM (
        -- Unpivot T1: one (ID, F) pair per column, so the position of a value
        -- no longer matters. The branches are joined by UNION ALL only BETWEEN
        -- them; a trailing UNION ALL after the last branch is a syntax error.
                  SELECT T1.ID, T1.F1 AS F FROM T1
        UNION ALL SELECT T1.ID, T1.F2 AS F FROM T1
        UNION ALL SELECT T1.ID, T1.F3 AS F FROM T1
        UNION ALL SELECT T1.ID, T1.F4 AS F FROM T1
        UNION ALL SELECT T1.ID, T1.F5 AS F FROM T1
        UNION ALL SELECT T1.ID, T1.F6 AS F FROM T1
        UNION ALL SELECT T1.ID, T1.F7 AS F FROM T1
        UNION ALL SELECT T1.ID, T1.F8 AS F FROM T1
    ) N
    WHERE N.F IN ( 1, 2, 3 )
      AND N.ID = T2.T1_ID
    GROUP BY N.ID
    -- DISTINCT so that a value repeated in several columns counts only once;
    -- 3 = number of values being searched for.
    HAVING COUNT(DISTINCT N.F) = 3
);

Sum Booleans and then display only over a certain count

I've been banging my head against the wall a while.
I have a table that has DwgID (integer) and Current (boolean) as part of the table.
What I want to do is filter just the DwgID's that have more than 1 Current selected.
This code gives me the count correctly:
`SELECT "DwgID", SUM( CASE WHEN "Current" = 1 THEN 1 ELSE 0 END ) "Current" FROM "tblRev" GROUP BY "DwgID"`
But if I try to add a criteria where the Current Count is > 1, it fails with a data content could not be loaded.
`SELECT "DwgID", SUM( CASE WHEN "Current" = 1 THEN 1 ELSE 0 END ) "Current" FROM "tblRev" GROUP BY "DwgID" HAVING ( ( SUM( CASE WHEN "Current" = 1 THEN 1 ELSE 0 END ) > 1 ) )`
What am I doing wrong?
You need to choose a different alias because Current is the column name. Let's call it NumCurrent instead.
-- Count the rows flagged Current per drawing and keep drawings with more than one.
-- The count is exposed as "NumCurrent" so the alias does not clash with the
-- "Current" column it is computed from.
SELECT
    "DwgID",
    SUM( CASE WHEN "Current" = 1 THEN 1 ELSE 0 END ) "NumCurrent"
FROM "tblRev"
GROUP BY "DwgID"
HAVING SUM( CASE WHEN "Current" = 1 THEN 1 ELSE 0 END ) > 1

How to average 3 values in Sql Server?

I have three variables :-
@ScoreA DECIMAL(10,7)
@ScoreB DECIMAL(10,7)
@ScoreC DECIMAL(10,7)
@FinalScore DECIMAL(10, 7)
I wish to get the average of the three scores. BUT 1, 2 or all 3 values might be zero.
Eg. scenarios:
A = 1.4, B=3.5, C=5.0; FinalScore = 3.3
A = 0.0, B=0.0, C=0.0; FinalScore = 0.0
A = 1.1, B=0.0, C=0.0; FinalScore = 1.1
A = 0.0, B=2.0, C=4.8; FinalScore = 3.4
Cheers!
-- Average of the non-zero scores @A, @B, @C.
-- T-SQL local variables carry a leading @ (a leading # denotes a temp table).
-- The IF guard prevents a division by zero when no score is positive.
IF @A > 0 OR @B > 0 OR @C > 0
    SELECT ((@A + @B + @C) /
            (0 +
             -- each non-zero score contributes 1 to the divisor
             CASE WHEN @A = 0 THEN 0 ELSE 1 END +
             CASE WHEN @B = 0 THEN 0 ELSE 1 END +
             CASE WHEN @C = 0 THEN 0 ELSE 1 END ))
ELSE
    SELECT 0.0
EDIT
Modified the query so that it now handles divide-by-zero scenarios.
EDIT2
Here is "the trick with the AVG(..) function" :) with Common Table Expression
-- Average of the positive scores using AVG over an inline three-row set.
-- UNION ALL is required: plain UNION removes duplicates, so two equal scores
-- would be counted once and skew the average (e.g. 2, 2, 5 -> 3.5, not 3.0).
-- T-SQL local variables carry a leading @ (a leading # denotes a temp table).
WITH T(I) AS (
    SELECT @A
    UNION ALL
    SELECT @B
    UNION ALL
    SELECT @C
)
-- AVG over zero rows is NULL; report 0 when every score is filtered out.
SELECT COALESCE(AVG(I), 0)
FROM T
WHERE I > 0
-- Average of the non-zero scores in a single expression.
-- T-SQL local variables carry a leading @ (a leading # denotes a temp table).
SELECT ((@A + @B + @C) /
        ( -- when all three are zero, force the divisor to 1 to avoid dividing by zero
          CASE WHEN (@A = 0.0 AND @B = 0.0 AND @C = 0.0) THEN 1 ELSE 0 END
          -- each non-zero score contributes 1 to the divisor
        + CASE WHEN @A = 0 THEN 0 ELSE 1 END
        + CASE WHEN @B = 0 THEN 0 ELSE 1 END
        + CASE WHEN @C = 0 THEN 0 ELSE 1 END
        )
       )
For me this is easier to read and understand:
-- Average of the positive scores via a table variable.
-- T-SQL local and table variables carry a leading @ (a leading # denotes a
-- temp table and is not valid in DECLARE).
DECLARE
    @ScoreA DECIMAL(10,7),
    @ScoreB DECIMAL(10,7),
    @ScoreC DECIMAL(10,7),
    @FinalScore DECIMAL(10, 7)

SET @ScoreA = 1.4
SET @ScoreB = 3.5
SET @ScoreC = 5.0

DECLARE
    @AVG TABLE (value DECIMAL(10,7))

-- Collect only the positive scores. UNION ALL keeps equal scores as separate
-- rows; plain UNION would merge them and distort the average.
INSERT INTO @AVG
SELECT @ScoreA WHERE @ScoreA > 0
UNION ALL
SELECT @ScoreB WHERE @ScoreB > 0
UNION ALL
SELECT @ScoreC WHERE @ScoreC > 0

-- AVG over an empty table is NULL; report 0 in that case.
SELECT COALESCE(AVG(value), 0) FROM @AVG