Summing arrays in conjunction with GROUP BY - PostgreSQL

I've got some periodic counter data (like once a second) from different objects that I wish to combine into an hourly total.
If I do it with separate column names, it's pretty straightforward:
CREATE TABLE ts1 (
id INTEGER,
ts TIMESTAMP,
count0 integer,
count1 integer,
count2 integer
);
INSERT INTO ts1 VALUES
(1, '2017-12-07 10:37:48', 10, 20, 50),
(2, '2017-12-07 10:37:48', 13, 7, 88),
(1, '2017-12-07 10:37:49', 12, 23, 34),
(2, '2017-12-07 10:37:49', 11, 13, 46),
(1, '2017-12-07 10:37:50', 8, 33, 80),
(2, '2017-12-07 10:37:50', 9, 3, 47),
(1, '2017-12-07 10:37:51', 17, 99, 7),
(2, '2017-12-07 10:37:51', 9, 23, 96);
SELECT id, date_trunc('hour', ts + '1 hour') nts,
sum(count0), sum(count1), sum(count2)
FROM ts1 GROUP BY id, nts;
 id |         nts         | sum | sum | sum
----+---------------------+-----+-----+-----
  1 | 2017-12-07 11:00:00 |  47 | 175 | 171
  2 | 2017-12-07 11:00:00 |  42 |  46 | 277
(2 rows)
The problem is that different objects have different numbers of counts (though each particular object's rows -- ones sharing the same ID -- all have the same number of counts). Hence I want to use an array.
The corresponding table looks like this:
CREATE TABLE ts2 (
id INTEGER,
ts TIMESTAMP,
counts INTEGER[]
);
INSERT INTO ts2 VALUES
(1, '2017-12-07 10:37:48', ARRAY[10, 20, 50]),
(2, '2017-12-07 10:37:48', ARRAY[13, 7, 88]),
(1, '2017-12-07 10:37:49', ARRAY[12, 23, 34]),
(2, '2017-12-07 10:37:49', ARRAY[11, 13, 46]),
(1, '2017-12-07 10:37:50', ARRAY[8, 33, 80]),
(2, '2017-12-07 10:37:50', ARRAY[9, 3, 47]),
(1, '2017-12-07 10:37:51', ARRAY[17, 99, 7]),
(2, '2017-12-07 10:37:51', ARRAY[9, 23, 96]);
I have looked at this answer https://stackoverflow.com/a/24997565/1076479 and I get the general gist of it, but I cannot figure out how to get the correct rows summed together when I try to combine it with the grouping by id and timestamp.
For example, with this I get all the rows, not just the ones with matching id and timestamp:
SELECT id, date_trunc('hour', ts + '1 hour') nts, ARRAY(
SELECT sum(elem) FROM ts2 t, unnest(t.counts)
WITH ORDINALITY x(elem, rn) GROUP BY rn ORDER BY rn
) FROM ts2 GROUP BY id, nts;
 id |         nts         |    array
----+---------------------+--------------
  1 | 2017-12-07 11:00:00 | {89,221,448}
  2 | 2017-12-07 11:00:00 | {89,221,448}
(2 rows)
FWIW, I'm using PostgreSQL 9.6.

The problem with your original query is that you're summing all elements, because the GROUP BY id, nts is executed in the outer query. Combining a CTE with a LATERAL join does the trick:
WITH tmp AS (
    SELECT
        id,
        date_trunc('hour', ts + '1 hour') AS nts,
        rn,
        sum(elem) AS counts
    FROM ts2
    LEFT JOIN LATERAL unnest(counts) WITH ORDINALITY x(elem, rn) ON TRUE
    GROUP BY id, nts, rn
)
SELECT id, nts, array_agg(counts ORDER BY rn)  -- ORDER BY rn keeps the elements in positional order
FROM tmp
GROUP BY id, nts;
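With the sample data this should produce the same per-hour sums as the separate-column version above:
 id |         nts         |  array_agg
----+---------------------+--------------
  1 | 2017-12-07 11:00:00 | {47,175,171}
  2 | 2017-12-07 11:00:00 | {42,46,277}
(2 rows)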

Related

How to get task_id in group by

I have a table task_activity
create table task_activity(
id serial,
task_date timestamp,
task_id int
);
which has data regarding tasks completed:
insert into task_activity (id, task_date, task_id) values
(1, '2020-01-30 01:00:00', 1),
(2, '2020-01-29 01:00:00', 1),
(3, '2020-01-15 01:00:00', 1),
(4, '2020-01-14 01:00:00', 1),
(5, '2020-01-13 01:00:00', 1),
(6, '2020-01-30 01:00:00', 2),
(7, '2020-01-16 01:00:00', 2),
(8, '2020-01-15 01:00:00', 2),
(9, '2020-01-14 01:00:00', 2),
(10, '2020-01-13 01:00:00', 2);
I run the following query
WITH
groups(date, dateMinusRow) AS (
SELECT
date_trunc('day', task_date) date,
date_trunc('day', task_date) - INTERVAL '1' DAY * DENSE_RANK() OVER (ORDER BY date_trunc('day', task_date)) dateMinusRow
FROM task_activity
GROUP BY date_trunc('day', task_date)
HAVING COUNT(*) >= 1
)
SELECT
COUNT(*) AS streak,
MIN(date) AS startDate,
MAX(date) AS endDate
FROM groups
GROUP BY dateMinusRow
ORDER BY endDate DESC
to get this data
streak  startdate             enddate
2       2020-01-29T00:00:00Z  2020-01-30T00:00:00Z
4       2020-01-13T00:00:00Z  2020-01-16T00:00:00Z
How do I include the column task_id in this data too?
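One way to do it (a sketch, not from the original thread): partition the DENSE_RANK by task_id and add task_id to both GROUP BY clauses, so streaks are computed per task:
WITH groups(task_id, date, dateMinusRow) AS (
    SELECT
        task_id,
        date_trunc('day', task_date) AS date,
        date_trunc('day', task_date)
            - INTERVAL '1' DAY * DENSE_RANK() OVER (PARTITION BY task_id
                                                    ORDER BY date_trunc('day', task_date)) AS dateMinusRow
    FROM task_activity
    GROUP BY task_id, date_trunc('day', task_date)
)
SELECT
    task_id,
    COUNT(*) AS streak,
    MIN(date) AS startDate,
    MAX(date) AS endDate
FROM groups
GROUP BY task_id, dateMinusRow
ORDER BY endDate DESC;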

How can I remove the null values and reduce it to 10 rows in PostgreSQL?

I am new to PostgreSQL. I have a table called 'sales'.
create table sales
(
cust varchar(20),
prod varchar(20),
day integer,
month integer,
year integer,
state char(2),
quant integer
);
insert into sales values ('Bloom', 'Pepsi', 2, 12, 2001, 'NY', 4232);
insert into sales values ('Knuth', 'Bread', 23, 5, 2005, 'PA', 4167);
insert into sales values ('Emily', 'Pepsi', 22, 1, 2006, 'CT', 4404);
insert into sales values ('Emily', 'Fruits', 11, 1, 2000, 'NJ', 4369);
insert into sales values ('Helen', 'Milk', 7, 11, 2006, 'CT', 210);
insert into sales values ('Emily', 'Soap', 2, 4, 2002, 'CT', 2549);
(The original post showed the full table as an image; it contains more rows, covering 10 distinct products.)
Now I want to find the “most favorable” month (when the largest amount of the product was sold) and the “least favorable” month (when the least amount of the product was sold) for each product.
The result (shown as an image in the original post) should have one row per product with its most and least favorable months.
I entered
SELECT
prod product,
MAX(CASE WHEN rn2 = 1 THEN month END) MOST_FAV_MO,
MAX(CASE WHEN rn1 = 1 THEN month END) LEAST_FAV_MO
FROM (
SELECT
*,
ROW_NUMBER() OVER(PARTITION BY prod ORDER BY quant ) rn1,
ROW_NUMBER() OVER(PARTITION BY prod ORDER BY quant DESC) rn2
FROM sales
) x
WHERE rn1 = 1 or rn2 = 1
GROUP BY prod,quant;
Then there are null values for each product, and there are 20 rows in total (two per product).
So how can I remove the null values in these rows and bring the total number of rows down to 10 (there are 10 distinct products in total)?
I would say that the GROUP BY clause should be
GROUP BY prod
Otherwise you get one line per distinct quant, which is not what you want.
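For clarity, the complete query with only that change:
SELECT
    prod AS product,
    MAX(CASE WHEN rn2 = 1 THEN month END) AS most_fav_mo,
    MAX(CASE WHEN rn1 = 1 THEN month END) AS least_fav_mo
FROM (
    SELECT *,
        ROW_NUMBER() OVER (PARTITION BY prod ORDER BY quant) AS rn1,
        ROW_NUMBER() OVER (PARTITION BY prod ORDER BY quant DESC) AS rn2
    FROM sales
) x
WHERE rn1 = 1 OR rn2 = 1
GROUP BY prod;
Each product then collapses into a single row: the rn2 = 1 row supplies the most favorable month and the rn1 = 1 row the least favorable, so no NULLs remain and you get exactly one row per distinct product.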

Records based on time difference

I have a very strange request. I'm trying to create an SQL statement to do this. I know I can create a cursor, but I'm trying to see if it can be done in SQL.
Here is my source data.
1 - 1:00 PM
2 - 1:02 PM
3 - 1:03 PM
4 - 1:05 PM
5 - 1:06 PM
6 - 1:09 PM
7 - 1:10 PM
8 - 1:12 PM
9 - 1:13 PM
10 - 1:15 PM
I'm trying to create a function such that if I pass in an interval it will return the resulting data set.
For example, if I pass in 5 minutes, then the records I would want back are records 1, 4, 7, & 10.
Is there a way to do this in SQL? Note: if record 4 (1:05 PM) wasn't in the data set, I would expect to see 1, 5, & 8. I would see 5 because it is the next record with a time greater than 5 minutes from record 1, and record 8 because it is the next record with a time greater than 5 minutes from record 5.
Here is a create script that you should have provided:
declare @Table1 TABLE
    ([id] int, [time] time)
;
INSERT INTO @Table1
    ([id], [time])
VALUES
    (1, '1:00 PM'),
    (2, '1:02 PM'),
    (3, '1:03 PM'),
    (4, '1:05 PM'),
    (5, '1:06 PM'),
    (6, '1:09 PM'),
    (7, '1:10 PM'),
    (8, '1:12 PM'),
    (9, '1:13 PM'),
    (10, '1:15 PM')
;
I would do this with this query:
declare @interval int
set @interval = 5
;with next_times as(
    select id, [time],
           (select min([time]) from @Table1 t2
            where t2.[time] >= dateadd(minute, @interval, t1.[time])) as next_time
    from @Table1 t1
),
t as(
    select id, [time], next_time
    from next_times t1 where id = 1
    union all
    select t3.id, t3.[time], t3.next_time
    from t inner join next_times t3
        on t.next_time = t3.[time]
)
select id, [time] from t order by 1
-- results:
id time
----------- ----------------
1 13:00:00.0000000
4 13:05:00.0000000
7 13:10:00.0000000
10 13:15:00.0000000
(4 row(s) affected)
It works even in situations where a record at an exact interval boundary is missing:
-- delete the 1:05 PM record
delete from @Table1 where id = 4;
;with next_times as(
    select id, [time],
           (select min([time]) from @Table1 t2
            where t2.[time] >= dateadd(minute, @interval, t1.[time])) as next_time
    from @Table1 t1
),
t as(
    select id, [time], next_time
    from next_times t1 where id = 1
    union all
    select t3.id, t3.[time], t3.next_time
    from t inner join next_times t3
        on t.next_time = t3.[time]
)
select id, [time] from t order by 1;
-- results:
id time
----------- ----------------
1 13:00:00.0000000
5 13:06:00.0000000
8 13:12:00.0000000
(3 row(s) affected)
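Since the surrounding questions are about PostgreSQL, here is a rough Postgres translation of the same recursive-CTE approach (a sketch, assuming a table tbl(id int, t time) holding the same data; like the original, it seeds the recursion from id = 1):
WITH RECURSIVE next_times AS (
    SELECT id, t,
           (SELECT min(t2.t)
            FROM tbl t2
            WHERE t2.t >= t1.t + interval '5 minutes') AS next_time
    FROM tbl t1
),
picked AS (
    SELECT id, t, next_time FROM next_times WHERE id = 1
    UNION ALL
    SELECT n.id, n.t, n.next_time
    FROM picked p
    JOIN next_times n ON n.t = p.next_time
)
SELECT id, t FROM picked ORDER BY id;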

PostgreSQL, mixing SUM horizontally and vertically

I have a temporary table, the result of previously combining some heavy data, from which I have to create an HTML document.
In short, this table illustrates the situation:
DROP TABLE IF EXISTS temp11;
CREATE TABLE temp11 (
    t_idx int PRIMARY KEY,
    mydate text,
    myclass int,
    mypercent double precision,
    valpercent double precision,
    valclass double precision
);
INSERT INTO temp11
(t_idx, mydate, myclass, mypercent, valpercent, valclass) VALUES
(1, '01.01.2014', 1, 10, 10, 1),
(2, '01.01.2014', 2, 20, 20, 4),
(3, '01.01.2014', 2, 20, 50, 10),
(4, '01.01.2014', 1, 10, 17, 1.7),
(5, '02.01.2014', 2, 20, 40, 8),
(6, '02.01.2014', 1, 10, 18, 1.8),
(7, '02.01.2014', 2, 20, 50, 10),
(8, '03.01.2014', 1, 10, 10, 1),
(9, '03.01.2014', 2, 20, 40, 8),
(10, '03.01.2014', 1, 10, 20, 2),
(11, '03.01.2014', 2, 20, 30, 6);
Now I have a query that groups and sums this by date and class:
SELECT mydate, myclass, mypercent,
SUM(valpercent) AS sumvalpercent,
SUM(valclass) AS sumvalclass,
SUM(valpercent+valclass) AS sum_row
FROM temp11
GROUP BY mydate, myclass, mypercent
ORDER BY mydate;
The result of this query is as expected:
"01.01.2014" 2 20 70 14.0 84.0
"01.01.2014" 1 10 27 2.7 29.7
"02.01.2014" 1 10 18 1.8 19.8
"02.01.2014" 2 20 90 18.0 108.0
"03.01.2014" 2 20 70 14.0 84.0
"03.01.2014" 1 10 30 3.0 33.0
But my needs go a bit further.
Is it possible in PostgreSQL, within the same query, to also get after every date a vertical SUM of the data inside that date and, at the very end, a SUM of the data from all dates, so the result looks like this:
"01.01.2014" 2 20   70 14.0  84.0
"01.01.2014" 1 10   27  2.7  29.7
                    97 16.7 113.7
"02.01.2014" 1 10   18  1.8  19.8
"02.01.2014" 2 20   90 18.0 108.0
                   108 19.8 127.8
"03.01.2014" 2 20   70 14.0  84.0
"03.01.2014" 1 10   30  3.0  33.0
                   100 17.0 117.0
                   305 53.5 358.5
If this (or something similar) is possible, what should the query look like for the data shown?
The simplest way I can think of is to use UNION ALL to get all the desired output at once.
If you overlook the fact that the subtotal rows show the date (needed for the ORDER BY clause), this query gives the requested output:
SELECT mydate, myclass, mypercent,
SUM(valpercent) AS sumvalpercent,
SUM(valclass) AS sumvalclass,
SUM(valpercent+valclass) AS sum_row
FROM temp11
GROUP BY mydate, myclass, mypercent
UNION ALL
SELECT mydate || ' total', null, null,
SUM(valpercent) AS sumvalpercent,
SUM(valclass) AS sumvalclass,
SUM(valpercent+valclass) AS sum_row
FROM temp11
GROUP BY mydate
UNION ALL
SELECT 'Total', null, null,
SUM(valpercent) AS sumvalpercent,
SUM(valclass) AS sumvalclass,
SUM(valpercent+valclass) AS sum_row
FROM temp11
ORDER BY mydate;
Perhaps it can be rewritten more elegantly using WITH.
EDIT:
This version is more efficient because it traverses the temp11 table just once. The additional totals are then computed from the CTE temp100, which has far fewer rows (the subtotal queries return no more than one row per day, plus one grand total). The UNIONs remain and the logic is the same.
WITH temp100 (mydate,myclass,mypercent, sumvalpercent,sumvalclass,sum_row) as (
SELECT mydate, myclass, mypercent,
SUM(valpercent) AS sumvalpercent,
SUM(valclass) AS sumvalclass,
SUM(valpercent+valclass) AS sum_row
FROM temp11
GROUP BY mydate, myclass, mypercent
)
SELECT mydate,myclass,mypercent, sumvalpercent,sumvalclass,sum_row
FROM temp100
UNION ALL
SELECT mydate || ' total' as mydate, null, null, SUM(sumvalpercent), SUM(sumvalclass), SUM(sum_row)
FROM temp100
GROUP BY mydate
UNION ALL
SELECT 'Total' as mydate, null, null, SUM(sumvalpercent), SUM(sumvalclass), SUM(sum_row)
FROM temp100
ORDER BY mydate;
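As an aside (not part of the original answer): on PostgreSQL 9.5 and later, GROUPING SETS produces all three levels of totals in a single pass over the table, with NULLs in place of the ' total' labels:
SELECT mydate, myclass, mypercent,
       SUM(valpercent) AS sumvalpercent,
       SUM(valclass) AS sumvalclass,
       SUM(valpercent + valclass) AS sum_row
FROM temp11
GROUP BY GROUPING SETS ((mydate, myclass, mypercent), (mydate), ())
ORDER BY mydate, myclass;
NULLs sort last under the default ascending order, so each date's subtotal lands after its detail rows and the grand total comes last.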

SQL finding average value of n rows where n is a sum of a field

I have data that looks like this.
SoldToRetailer
OrderDate | Customer | Product | price | units
-------------------------------------------------
1-jun-2011 | customer1 | Product1 | $10 | 5
2-jun-2011 | customer1 | Product1 | $5 | 3
3-jun-2011 | customer1 | Product2 | $10 | 4
4-jun-2011 | customer1 | Product1 | $4 | 4
5-jun-2011 | customer2 | Product3 | $10 | 1
SalesByRetailers
Customer | Product | units
-----------------------------
customer1 | Product1 | 5
customer2 | Product3 | 1
Here's what I need.
Sales(average price)
Customer | Product | units | Average Price
--------------------------------------------
customer1 | Product1 | 5 | $3.44
customer2 | Product3 | 1 | $10
Average Price is defined as the average price of the most recent SoldToRetailer Prices that add up to the units.
So in the first case, I grab the orders from June 4th and June 2nd. I don't need (and actually don't want) the orders from June 1st to be included.
EDIT: Hopefully a better explanation.
I'm attempting to determine the correct (most recent) price at which an item was sold to a retailer. It's LIFO order for the prices. The price is determined by averaging the price sold over the last n orders, where n = total retail sales for a particular product and customer.
In SQL pseudocode it would look like this:
Select s1.Customer, s1.product, average(s2.price)
from SalesByRetailers s1
join SoldToRetailer s2
on s1.customer=s2.customer
and s1.product=s2.product
and ( select top (count of records where s2.units = s1.units) from s2 order by OrderDate desc)
What I need to return is the number of records from SoldToRetailer where the sum of units is >= SalesByRetailer Units.
It looks like it could be solved by RANK or ROW_NUMBER() OVER (PARTITION BY ...), but I'm at a loss.
The SoldToRetailer table is ginormous so performance is at a premium.
Running on SQL 2008R2
Thanks for helping
So I used 3 techniques. First I created a table with an OVER clause to give me a sorted list of products and prices, then I updated the table to add in the running average. An OUTER APPLY subselect fixed my final problem. Hopefully the code will help someone else with a similar problem.
A shout-out to Jeff Moden of SQLServerCentral.com fame for the running-average help.
SELECT d.ProductKey,
       d.ActualDollars,
       d.Units,
       ROW_NUMBER() OVER(PARTITION BY ProductKey ORDER BY d.OrderDateKey DESC) AS RowNumber,
       NULL AS RunningTotal,
       CONVERT(DECIMAL(10, 4), 0) AS RunningDollarsSum,
       CONVERT(DECIMAL(10, 4), 0) AS RunningAverage
INTO #CustomerOrdersDetails
FROM dbo.orders d
WHERE customerKey = @CustomerToSelect

-- DB EDIT... Google "quirky update" on SQLServerCentral: Jeff Moden's version of a
-- running total. Holy crap it's faster; tried triangular joins before.
CREATE CLUSTERED INDEX [Index1]
ON #CustomerOrdersDetails ( ProductKey ASC, RowNumber ASC )

DECLARE @RunningTotal INT
DECLARE @PrevProductKey INT
DECLARE @RunningDollarsSum DECIMAL(10, 4)

UPDATE #CustomerOrdersDetails
SET @RunningTotal = RunningTotal = CASE
        WHEN ProductKey = @PrevProductKey THEN c.Units + ISNULL(@RunningTotal, 0)
        ELSE c.Units
    END,
    @RunningDollarsSum = RunningDollarsSum = CASE
        WHEN ProductKey = @PrevProductKey THEN c.ActualDollars + ISNULL(@RunningDollarsSum, 0)
        ELSE c.ActualDollars
    END,
    @PrevProductKey = ProductKey,
    RunningAverage = @RunningDollarsSum / NULLIF(@RunningTotal, 0)
FROM #CustomerOrdersDetails c WITH (TABLOCKX)
OPTION (MAXDOP 1)

-- =============================================
-- Update Cost fields with average price calculation
-- =============================================
UPDATE d
SET DolSoldCostUSD = COALESCE(d.DolSoldCostUSD,
                              d.UnitsSold * a.RunningAverage)
FROM dbo.inbound d
OUTER APPLY (SELECT TOP 1 *
             FROM #CustomerOrdersDetails ap
             WHERE ap.ProductKey = d.ProductKey
               AND d.UnitsSold + d.UnitsOnHand + d.UnitsOnOrder + d.UnitsReceived + d.UnitsReturned >= RunningTotal
             ORDER BY RunningTotal) AS a
declare @table table (customer varchar(15), product varchar(15), qty int, price decimal(6,2))
insert into @table (customer, product, qty, price)
values
('customer1', 'product1', 5, 3),
('customer1', 'product1', 4, 4),
('customer1', 'product1', 3, 2),
('customer1', 'product1', 2, 13),
('customer1', 'product1', 3, 3),
('customer1', 'product2', 5, 1),
('customer1', 'product2', 4, 7),
('customer1', 'product2', 2, 5),
('customer1', 'product2', 6, 23),
('customer1', 'product2', 2, 1),
('customer2', 'product1', 2, 1),
('customer2', 'product1', 4, 4),
('customer2', 'product1', 7, 3),
('customer2', 'product1', 1, 12),
('customer2', 'product1', 2, 3),
('customer2', 'product2', 3, 2),
('customer2', 'product2', 6, 5),
('customer2', 'product2', 8, 4),
('customer2', 'product2', 2, 11),
('customer2', 'product2', 1, 2)

select customer, product, sum(qty) as units, (sum(qty * price)) / SUM(qty) as 'Average Price'
from @table
group by customer, product