Average after fixed interval and Group By in SQL - group-by

Is it possible to average over every fixed interval and group by one column in MS SQL Server?
Suppose I have a table A as under:
NAME Interval Data1 Data2
1 0.01 1 4
1 0.05 4 2
1 0.09 7 6
1 0.11 1 2
1 0.15 7 6
1 0.18 3 1
1 0.19 2 5
2 0.209 9 0
I want the output grouped by Name, with the averages computed over every fixed interval of 10 counts.
So for example:
Name - 1
Interval Start - 0
Interval End - 10
Data 1 Avg - 4 [(1 + 4 + 7) / 3]
Data 2 Avg - 4 [(4 + 2 + 6) / 3]
AND
Name - 1
Interval Start - 10
Interval End - 20
Data 1 Avg - 3.25 [(1 + 7 + 3 + 2) / 4]
Data 2 Avg - 3.50 [(2 + 6 + 1 + 5) / 4]
So I want the output as below. The interval range per "Name" can be different.
Name Interval-Start Interval-End DataAvg1 DataAvg2
1 0 10 4 4
1 10 20 3.25 3.50
2 0 10 0 0
2 10 20 0 0
2 20 30 9 0
I used the query below, but I can't figure out the per-interval logic.
SELECT Name, Interval, AVG(Data1) AS Data1Avg, AVG(Data2) AS Data2Avg
FROM TableA
GROUP BY Name;
Can someone please help me with it?

Using a cursor and temp tables:
--drop table dbo.#result
--drop table dbo.#steps
CREATE TABLE dbo.#result
(
[Name] varchar(50),
[Interval-Start] float,
[Interval-End] float,
[DataAvg1] float,
[DataAvg2] float
)
CREATE TABLE dbo.#steps
(
[IntervalStart] float,
[IntervalEnd] float
)
declare @min int, @max int, @step float
DECLARE @Name varchar(50), @IntervalStart float, @IntervalEnd float;
set @min = 0
set @max = 1
set @step = 0.1
insert into #steps
select @min + Number * @step IntervalStart, @min + Number * @step + @step IntervalEnd
from master..spt_values
where type = 'P' and number between 0 and (@max - @min) / @step
DECLARE _cursor CURSOR FOR
SELECT [Name], [IntervalStart], [IntervalEnd] FROM
(select [Name] from [TableA] Group by [Name]) t
INNER JOIN #steps on 1=1
OPEN _cursor;
FETCH NEXT FROM _cursor
INTO @Name, @IntervalStart, @IntervalEnd;
WHILE @@FETCH_STATUS = 0
BEGIN
insert into dbo.#result
select @Name, @IntervalStart, @IntervalEnd, AVG(CAST(Data1 as FLOAT)), AVG(CAST(Data2 as FLOAT))
FROM [TableA]
where [NAME] = @Name and Interval between @IntervalStart and @IntervalEnd
FETCH NEXT FROM _cursor
INTO @Name, @IntervalStart, @IntervalEnd;
END
CLOSE _cursor;
DEALLOCATE _cursor;
select * from dbo.#result
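
For comparison, here is a set-based sketch that avoids the cursor by bucketing the Interval column with FLOOR. It assumes the same TableA and the same 0.1 bucket width as above; treat it as an untested alternative, not part of the original answer.
-- Set-based sketch: bucket Interval into 0.1-wide half-open ranges per Name.
SELECT
    [Name],
    FLOOR(Interval / 0.1) * 0.1        AS [Interval-Start],
    FLOOR(Interval / 0.1) * 0.1 + 0.1  AS [Interval-End],
    AVG(CAST(Data1 AS FLOAT))          AS DataAvg1,
    AVG(CAST(Data2 AS FLOAT))          AS DataAvg2
FROM [TableA]
GROUP BY [Name], FLOOR(Interval / 0.1)
ORDER BY [Name], [Interval-Start]
Unlike the cursor version, this only emits buckets that actually contain rows; to also get the empty buckets shown in the expected output you would still cross join a step table like #steps above.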

Related

Get count of values in different subgroups

I need to delete some rows from the dataset where the speed equals zero and that zero-speed run lasts more than N consecutive times (let's assume N is 2).
The structure of the table demo looks like:
id car speed time
1 foo 0 1
2 foo 0 2
3 foo 0 3
4 foo 1 4
5 foo 1 5
6 foo 0 6
7 bar 0 1
8 bar 0 2
9 bar 5 3
10 bar 5 4
11 bar 5 5
12 bar 5 6
Then I hope to generate a table like the one below by using window functions:
id car speed time lasting
1 foo 0 1 3
2 foo 0 2 3
3 foo 0 3 3
4 foo 1 4 2
5 foo 1 5 2
6 foo 0 6 1
7 bar 0 1 2
8 bar 0 2 2
9 bar 5 3 4
10 bar 5 4 4
11 bar 5 5 4
12 bar 5 6 4
Then I can easily exclude those rows by using WHERE NOT (speed = 0 AND lasting > 2)
Here is the code I tried, but it didn't return the values I expected, and I guess nesting FROM (SELECT ... FROM (SELECT ... might not be the best way to solve the problem:
SELECT g3.*, count(id) OVER (PARTITION BY car, cumsum ORDER BY id) as num
FROM (SELECT g2.*, sum(grp2) OVER (PARTITION BY car ORDER BY id) AS cumsum
FROM (SELECT g1.*, (CASE ne0 WHEN 0 THEN 0 ELSE 1 END) AS grp2
FROM (SELECT g.*, speed - lag(speed, 1, 0) OVER (PARTITION BY car) AS ne0
FROM (SELECT *, row_number() OVER (PARTITION BY car) AS grp FROM demo) g ) g1 ) g2 ) g3
ORDER BY id;
You can use the LAG() window function to check the previous speed value for each row and the SUM() window function to build the groups of consecutive equal values.
Then with the COUNT() window function you can count the number of rows in each group, so that you can filter out the rows with 0 speed in the groups that have more than 2 rows:
SELECT id, car, speed, time
FROM (
SELECT *, COUNT(*) OVER (PARTITION BY car, grp) counter
FROM (
SELECT *, SUM(flag::int) OVER (PARTITION BY car ORDER BY time) grp
FROM (
SELECT *, speed <> LAG(speed, 1, speed - 1) OVER (PARTITION BY car ORDER BY time) flag
FROM demo
) t
) t
) t
WHERE speed <> 0 OR counter <= 2
ORDER BY id;
See the demo.
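Since the stated goal is to delete those rows rather than just select the survivors, the same grouping logic can also drive a DELETE. This is only a sketch, assuming id uniquely identifies a row in demo:
-- Sketch: delete rows with speed = 0 that belong to a zero-speed run longer than 2 rows.
DELETE FROM demo
WHERE id IN (
    SELECT id
    FROM (
        SELECT *, COUNT(*) OVER (PARTITION BY car, grp) counter
        FROM (
            SELECT *, SUM(flag::int) OVER (PARTITION BY car ORDER BY time) grp
            FROM (
                SELECT *, speed <> LAG(speed, 1, speed - 1) OVER (PARTITION BY car ORDER BY time) flag
                FROM demo
            ) t
        ) t
    ) t
    WHERE speed = 0 AND counter > 2
);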

Recursive Cumulative Sum up to a certain value Postgres

I have my data that looks like this:
user_id touchpoint_number days_difference
1 1 5
1 2 20
1 3 25
1 4 10
2 1 2
2 2 30
2 3 4
I would like to add one more column with a cumulative sum of days_difference, partitioned by user_id, that resets to 0 whenever the running value reaches 30 and starts counting again. I have been trying to do it, but I couldn't figure out how to do it in PostgreSQL, because it has to be recursive.
The outcome I would like to have would be something like:
user_id touchpoint_number days_difference cum_sum_upto30
1 1 5 5
1 2 20 25
1 3 25 0 --- new count all over again
1 4 10 10
2 1 2 2
2 2 30 0 --- new count all over again
2 3 4 4
Do you have any cool ideas how this could be done?
This should do what you want:
with recursive cte as (
select t.a, t.b, t.c, t.c as sumc
from t
where b = 1
union all
select t.a, t.b, t.c,
(case when t.c + cte.sumc > 30 then 0 else t.c + cte.sumc end)
from t join
cte
on t.b = cte.b + 1 and t.a = cte.a
)
select *
from cte
order by a, b;
Here is a rextester.
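The CTE above uses generic names (table t with columns a, b, c). Rewritten against the question's column names it would look roughly like this; a sketch only, assuming the table is called touchpoints (the question does not name it) and that touchpoint_number starts at 1 with no gaps per user_id:
-- "touchpoints" is a placeholder table name; a/b/c above map to
-- user_id/touchpoint_number/days_difference.
with recursive cte as (
    select user_id, touchpoint_number, days_difference,
           days_difference as cum_sum_upto30
    from touchpoints
    where touchpoint_number = 1
    union all
    select t.user_id, t.touchpoint_number, t.days_difference,
           case when t.days_difference + cte.cum_sum_upto30 > 30
                then 0
                else t.days_difference + cte.cum_sum_upto30 end
    from touchpoints t
    join cte
      on t.touchpoint_number = cte.touchpoint_number + 1
     and t.user_id = cte.user_id
)
select *
from cte
order by user_id, touchpoint_number;
This reproduces the expected output above: user 1 resets at touchpoint 3 (25 + 25 > 30) and user 2 resets at touchpoint 2 (2 + 30 > 30).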

PostgreSQL: averaging non-null values

Seems simple, but... I have data like so:
pid score1 score2 score3
1   1      3      2
2   3      null   1
3   4      null   null
I want to compute an average score using only the non-null values. Something like sum(score1 + score2 + score3) / 3, but the denominator essentially needs to be the count of non-null values for the given row: 3 for pid 1, 2 for pid 2, and 1 for pid 3.
Is there a simple thing I'm missing here?
with t(pid, score1, score2, score3) as (
values (1,1,3,2), (2,3,null,1), (3,4,null,null)
)
select
(sum(score1) + sum(score2) + sum(score3))::numeric /
(count(score1) + count(score2) + count(score3))
as avg,
avg(coalesce(score1, 0) + coalesce(score2, 0) + coalesce(score3, 0))
as avg2
from t;
avg | avg2
--------------------+--------------------
2.3333333333333333 | 4.6666666666666667
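Note that the query above produces a single aggregate over the whole table. If what you want is a per-row average over the non-null scores, one sketch (not from the original answer) divides the row sum by a per-row count, with NULLIF guarding against a row where all three scores are null:
-- Per-row average of the non-null scores (sketch), using the same t as above.
select pid,
       (coalesce(score1, 0) + coalesce(score2, 0) + coalesce(score3, 0))::numeric
       / nullif( (score1 is not null)::int
               + (score2 is not null)::int
               + (score3 is not null)::int, 0) as row_avg
from t;
For the sample data this gives 2 for pid 1, 2 for pid 2, and 4 for pid 3.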

Select data for 15 minute windows - PostgreSQL

Right so I have a table such as this in PostgreSQL:
timestamp duration
2013-04-03 15:44:58 4
2013-04-03 15:56:12 2
2013-04-03 16:13:17 9
2013-04-03 16:16:30 3
2013-04-03 16:29:52 1
2013-04-03 16:38:25 1
2013-04-03 16:41:37 9
2013-04-03 16:44:49 1
2013-04-03 17:01:07 9
2013-04-03 17:07:48 1
2013-04-03 17:11:00 2
2013-04-03 17:11:16 2
2013-04-03 17:15:17 1
2013-04-03 17:16:53 4
2013-04-03 17:20:37 9
2013-04-03 17:20:53 3
2013-04-03 17:25:48 3
2013-04-03 17:29:26 1
2013-04-03 17:32:38 9
2013-04-03 17:36:55 4
And I would like to get the following output:
timestampwindowstart = 2013-04-03 15:44:58
duration count
1 0
2 1
3 0
4 1
9 0
timestampwindowstart = 2013-04-03 15:59:58
duration count
1 0
2 0
3 0
4 0
9 1
timestampwindowstart = 2013-04-03 16:14:58
duration count
1 1
2 0
3 1
4 0
9 0
timestampwindowstart = 2013-04-03 16:29:58
duration count
1 2
2 0
3 0
4 0
9 1
etc...
So basically it cycles through the timestamps in 15-minute windows and outputs the distinct duration values along with their frequency (count). The timestampwindowstart value is the earliest timestamp for the window (i.e. timestampwindowfinish = timestampwindowstart + 15 minutes).
This is so I can then plot the 15 minute interval histograms...
I have tried reading up but it is a bit complicated for me to get my head around and I don't have much time...
Thanks for any help!
Quick and dirty way: http://sqlfiddle.com/#!1/bd2f6/21 (I named my column tstamp instead of your timestamp.)
with t as (
select
generate_series(mitstamp,matstamp,'15 minutes') as int,
duration
from
(select min(tstamp) mitstamp, max(tstamp) as matstamp from tmp) a,
(select duration from tmp group by duration) b
)
select
int as timestampwindowstart,
t.duration,
count(tmp.duration)
from
t
left join tmp on
(tmp.tstamp >= t.int and
tmp.tstamp < (t.int + interval '15 minutes') and
t.duration = tmp.duration)
group by
int,
t.duration
order by
int,
t.duration
Brief explanation:
Calculate the minimum and maximum timestamp.
Generate 15-minute intervals between the minimum and maximum.
Cross join the results with the unique values of duration.
Left join the original data (the left join is important, because it keeps every possible combination in the output, with null where a duration does not exist for a given interval).
Aggregate the data; count(null) = 0.
In case you have more tables and the algorithm should be applied to their union: suppose we have three tables tmp1, tmp2, tmp3, all with columns tstamp and duration. Then we can extend the previous solution:
with
tmpout as (
select * from tmp1 union all
select * from tmp2 union all
select * from tmp3
)
,t as (
select
generate_series(mitstamp,matstamp,'15 minutes') as int,
duration
from
(select min(tstamp) mitstamp, max(tstamp) as matstamp from tmpout) a,
(select duration from tmpout group by duration) b
)
select
int as timestampwindowstart,
t.duration,
count(tmp.duration)
from
t
left join tmpout tmp on
(tmp.tstamp >= t.int and
tmp.tstamp < (t.int + interval '15 minutes') and
t.duration = tmp.duration)
group by
int,
t.duration
order by
int,
t.duration
You should really get to know the WITH clause in PostgreSQL; it is an invaluable concept for any data analysis in PostgreSQL.
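A more compact way to form the 15-minute buckets, if you are happy anchoring them to the earliest timestamp instead of enumerating every window, is plain epoch arithmetic. This is a sketch, not the answer above (and on PostgreSQL 14+ date_bin() can do the bucketing natively):
-- Sketch: bucket each row into a 15-minute window anchored at min(tstamp),
-- then count rows per (window, duration).
select
    mn + interval '15 minutes'
       * floor(extract(epoch from (tstamp - mn)) / 900) as timestampwindowstart,
    duration,
    count(*) as cnt
from tmp
cross join (select min(tstamp) as mn from tmp) m
group by 1, duration
order by 1, duration;
Unlike the left-join version, this does not emit zero counts for duration values missing from a window, which matters if you want complete histograms.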

How to generate larger sets of lottery numbers efficiently

I am a beginner with SQL and I was looking for more experience, so I decided to design a procedure to generate X random lotto picks. The lottery here in my area lets you pick 5 numbers from 1-47 and 1 "mega" number from 1-27. The trick is that the "mega" number can repeat one of the 5 numbers picked before it, e.g. 1, 2, 3, 4, 5, mega 1.
I created the following procedure to generate 10 million lottery picks, and it took 12 hours and 57 minutes to finish, while my friends tested the same thing in Java and it took seconds. I was wondering whether there are any improvements I can make to the code, or any mistakes I've made? I'm new at this and trying to learn better approaches, so all comments are welcome.
USE lotto
DECLARE
@counter INT,
@counter1 INT,
@pm SMALLINT,
@i1 SMALLINT,
@i2 SMALLINT,
@i3 SMALLINT,
@i4 SMALLINT,
@i5 SMALLINT,
@sort int
SET @counter1 = 0
TRUNCATE TABLE picks
WHILE @counter1 < 10000000
BEGIN
TRUNCATE TABLE sort
SET @counter = 1
WHILE @counter < 6
BEGIN
INSERT INTO sort (pick)
SELECT CAST(((47+ 1) - 0) * RAND() + 1 AS TINYINT)
IF (SELECT count(distinct pick) FROM sort) < @counter
BEGIN
TRUNCATE TABLE sort
SET @counter = 1
END
ELSE IF (SELECT COUNT(DISTINCT pick) FROM sort) = @counter
BEGIN
SET @counter = @counter + 1
END
END
SET @sort = 0
WHILE @sort < 5
BEGIN
UPDATE sort
SET sort = @sort
WHERE pick = (SELECT min(pick) FROM sort WHERE sort is null)
SET @sort = @sort + 1
END
SET @i1 = (SELECT pick FROM sort WHERE sort = 0)
SET @i2 = (SELECT pick FROM sort WHERE sort = 1)
SET @i3 = (SELECT pick FROM sort WHERE sort = 2)
SET @i4 = (SELECT pick FROM sort WHERE sort = 3)
SET @i5 = (SELECT pick FROM sort WHERE sort = 4)
SET @pm = (CAST(((27+ 1) - 0) * RAND() + 1 AS TINYINT))
INSERT INTO picks(
First,
Second,
Third,
Fourth,
Fifth,
Mega,
Sequence
)
Values(
@i1,
@i2,
@i3,
@i4,
@i5,
@pm,
@counter1
)
SET @counter1 = @counter1 + 1
END
I generated 10,000 rows in 0 sec. I did it in another way; hope this will help you.
;WITH Nbrs ( n ) AS (
SELECT 1 UNION ALL
SELECT 1 + n FROM Nbrs WHERE n < 10000 )
SELECT
(ABS(CHECKSUM(NewId())) % 47 + 1) AS First,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Second,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Third,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Fourth,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Fifth,
(ABS(CHECKSUM(NewId())) % 27 + 1) AS Mega,
Nbrs.n AS Sequence
FROM
Nbrs
OPTION ( MAXRECURSION 0 )
10000 rows 0 sec
100000 rows 1 sec
1000000 rows 13 sec
10000000 rows 02 min 21 sec
Or with cross joins
WITH E00(N) AS (SELECT 1 UNION ALL SELECT 1),
E02(N) AS (SELECT 1 FROM E00 a, E00 b),
E04(N) AS (SELECT 1 FROM E02 a, E02 b),
E08(N) AS (SELECT 1 FROM E04 a, E04 b),
E16(N) AS (SELECT 1 FROM E08 a, E08 b),
E32(N) AS (SELECT 1 FROM E16 a, E16 b),
Nbrs(N) AS (SELECT ROW_NUMBER() OVER (ORDER BY N) FROM E32)
SELECT
(ABS(CHECKSUM(NewId())) % 47 + 1) AS First,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Second,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Third,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Fourth,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Fifth,
(ABS(CHECKSUM(NewId())) % 27 + 1) AS Mega,
Nbrs.n AS Sequence
FROM Nbrs
WHERE N <= 10000000;
10000 rows 0 sec
100000 rows 1 sec
1000000 rows 14 sec
10000000 rows 03 min 29 sec
I should also mention that the reason I am using
(ABS(CHECKSUM(NewId())) % 47 + 1)
is that it returns a new random number per row. The solution with
CAST(((47+ 1) - 0) * RAND() + 1 AS TINYINT)
returns the same random number for every row if you select them in one go. To test this, run this example:
;WITH Nbrs ( n ) AS (
SELECT 1 UNION ALL
SELECT 1 + n FROM Nbrs WHERE n < 5 )
SELECT
CAST(((47+ 1) - 0) * RAND() + 1 AS TINYINT) AS Random,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS RadomCheckSum,
Nbrs.n AS Sequence
FROM Nbrs
OK, so I did see your comment, and I have a solution for that as well. If you really want the numbers ordered, the complexity of the algorithm goes up and so does the running time, but I still think it is doable, just not in the same neat way.
--Yeah declaring a temp table for just the random order number
DECLARE @tbl TABLE(value int)
--The same function but with the number of the random numbers
;WITH Nbrs ( n ) AS (
SELECT 1 UNION ALL
SELECT 1 + n FROM Nbrs WHERE n < 5 )
INSERT INTO @tbl
(
value
)
SELECT
Nbrs.n AS Sequence
FROM Nbrs
;WITH Nbrs ( n ) AS (
SELECT CAST(1 as BIGINT) UNION ALL
SELECT 1 + n FROM Nbrs WHERE n < 100000 )
SELECT
tblOrderRandomNumbers.[1] AS First,
tblOrderRandomNumbers.[2] AS Second,
tblOrderRandomNumbers.[3] AS Third,
tblOrderRandomNumbers.[4] AS Fourth,
tblOrderRandomNumbers.[5] AS Fifth,
(ABS(CHECKSUM(NewId())) % 27 + 1) AS Mega,
Nbrs.n AS Sequence
FROM
Nbrs
--This cross join. Joins with the declared table
CROSS JOIN
(
SELECT
[1], [2], [3], [4], [5]
FROM
(
SELECT
Random,
ROW_NUMBER() OVER(ORDER BY tblRandom.Random ASC) AS RowNumber
FROM
(
SELECT
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Random
FROM
@tbl AS tblNumbers
) AS tblRandom
)AS tblSortedRadom
--A pivot makes the rows to columns. Using the row index over order of the random number
PIVOT
(
AVG(Random)
FOR RowNumber IN ([1], [2], [3], [4],[5])
) as pivottable
) AS tblOrderRandomNumbers
OPTION ( MAXRECURSION 0 )
But I still managed to do it in a reasonable amount of time:
10000 Rows : 0 sec
100000 Rows : 4 sec
1000000 Rows : 43 sec
10000000 Rows : 7 min 9 sec
I hope this helps.
I wrote this script just out of curiosity. It should do better than your script, but I can't tell for sure.
Beware that I use a declared table variable; if you use a real table, performance should be better when generating larger numbers of rows.
I generated 10,000 rows in about 13 seconds, which works out to roughly 3.5 hours for 10,000,000 rows. Still far worse than the Java case you described.
set nocount on
go
declare @i int = 1
declare @t table(nr1 int, nr2 int, nr3 int, nr4 int, nr5 int, mega int, seq int)
while @i <= 10000
begin
;with numbers(nr)
as
(
select 1
union all
select nr+1
from numbers
where nr < 47
)
,mega(nr)
as
(
select 1
union all
select nr+1
from mega
where nr < 27
)
,selectednumbers(nr)
as
(
select top 5 nr
from numbers
order by newid()
)
,selectedmega(mega)
as
(
select top 1 nr
from mega
order by newid()
)
,tmp
as
(
select *
,row_number() over(order by nr) as rownr
from selectednumbers
)
insert into @t
select max(nr1) as nr1
,max(nr2) as nr2
,max(nr3) as nr3
,max(nr4) as nr4
,max(nr5) as nr5
,(select mega from selectedmega) as mega
,@i as seq
from (
select case when rownr = 1 then nr else 0 end as nr1
,case when rownr = 2 then nr else 0 end as nr2
,case when rownr = 3 then nr else 0 end as nr3
,case when rownr = 4 then nr else 0 end as nr4
,case when rownr = 5 then nr else 0 end as nr5
from tmp
) x
set @i = @i + 1
end
select * from @t
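
For what it's worth, the loop in this last script can also be eliminated entirely with CROSS APPLY, drawing 5 distinct, sorted numbers per row in one set-based statement. This is only a sketch (the sequence size and the use of master..spt_values as a numbers source are my choices, not from the post); the seq.n reference inside the APPLY keeps it correlated, so the TOP 5 ... ORDER BY NEWID() shuffle is re-evaluated for every outer row instead of being cached once:
-- Set-based sketch: one outer row per pick, five distinct numbers 1-47 chosen by
-- shuffling a tally table inside CROSS APPLY, then pivoted into sorted columns.
;WITH seq(n) AS (
    SELECT TOP (10000) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
    FROM sys.all_objects a CROSS JOIN sys.all_objects b
)
SELECT seq.n AS Sequence,
       p.[1] AS First, p.[2] AS Second, p.[3] AS Third,
       p.[4] AS Fourth, p.[5] AS Fifth,
       ABS(CHECKSUM(NEWID())) % 27 + 1 AS Mega
FROM seq
CROSS APPLY (
    SELECT [1], [2], [3], [4], [5]
    FROM (
        SELECT number AS nr, ROW_NUMBER() OVER (ORDER BY number) AS rn
        FROM (
            SELECT TOP (5) number
            FROM master..spt_values
            WHERE type = 'P' AND number BETWEEN 1 AND 47 AND seq.n > 0  -- correlation
            ORDER BY NEWID()
        ) picked
    ) ranked
    PIVOT (MAX(nr) FOR rn IN ([1], [2], [3], [4], [5])) pvt
) p;
Because TOP (5) samples without replacement, the five numbers in each row are guaranteed distinct, which the plain CHECKSUM(NEWID()) columns above do not guarantee.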