Ignore the first and last row in a query result - postgresql

I'm trying to write a query that ignores both the first and the last row of the result set.
The query retrieves the sum of all mediums over the last hour, grouped into 5-minute buckets.
To ignore the first record I'm using OFFSET 1, and to ignore the last one I was trying to exclude the row with the latest timestamp via a LIMIT 1 subquery on my id field, ordered by timestamp DESC.
My query:
ws_controller_hist=>
SELECT to_timestamp(floor((extract('epoch' FROM TIMESTAMP) / 300)) * 300) AS timestamp_min,
       TYPE,
       floor(sum(medium[1]))
FROM default_dataset
WHERE TYPE LIKE 'ap_clients.wlan0'
  AND TIMESTAMP > CURRENT_TIMESTAMP - interval '85 minutes'
  AND organization_id = '9fc02db4-c3df-4890-93ac-8dd575ca5638'
  AND id NOT IN (SELECT id
                 FROM default_dataset
                 ORDER BY TIMESTAMP DESC
                 LIMIT 1)
GROUP BY timestamp_min, TYPE
ORDER BY timestamp_min ASC
OFFSET 1;
timestamp_min | type | floor
------------------------+------------------+-------
2017-12-19 14:20:00+00 | ap_clients.wlan0 | 38
2017-12-19 14:25:00+00 | ap_clients.wlan0 | 37
2017-12-19 14:30:00+00 | ap_clients.wlan0 | 39
2017-12-19 14:35:00+00 | ap_clients.wlan0 | 42
2017-12-19 14:40:00+00 | ap_clients.wlan0 | 43
2017-12-19 14:45:00+00 | ap_clients.wlan0 | 44
2017-12-19 14:50:00+00 | ap_clients.wlan0 | 45
2017-12-19 14:55:00+00 | ap_clients.wlan0 | 45
2017-12-19 15:00:00+00 | ap_clients.wlan0 | 43
2017-12-19 15:05:00+00 | ap_clients.wlan0 | 43
2017-12-19 15:10:00+00 | ap_clients.wlan0 | 50
2017-12-19 15:15:00+00 | ap_clients.wlan0 | 52
2017-12-19 15:20:00+00 | ap_clients.wlan0 | 50
2017-12-19 15:25:00+00 | ap_clients.wlan0 | 53
2017-12-19 15:30:00+00 | ap_clients.wlan0 | 49
2017-12-19 15:35:00+00 | ap_clients.wlan0 | 39
2017-12-19 15:40:00+00 | ap_clients.wlan0 | 16
This is not ignoring the last record: I get exactly the same rows whether or not I include the subquery "and id not in (select id from default_dataset order by timestamp desc limit 1)".

Wrap your query in an outer query and use lag plus OFFSET to do the trick. lag shifts every row down by one position, so the values of the last input row never appear in the output; OFFSET 2 then skips the all-NULL first row together with the first real bucket, dropping both ends:
SELECT lag(timestamp_min) OVER (ORDER BY timestamp_min) AS timestamp_min,
       lag(type) OVER (ORDER BY timestamp_min) AS type,
       lag(sum_first_medium) OVER (ORDER BY timestamp_min) AS sum_first_medium
FROM (SELECT to_timestamp(floor((extract('epoch' FROM TIMESTAMP) / 300)) * 300) AS timestamp_min,
             type,
             floor(sum(medium[1])) AS sum_first_medium
      FROM default_dataset
      WHERE type = 'ap_clients.wlan0'
        AND timestamp > current_timestamp - INTERVAL '85 minutes'
        AND organization_id = '9fc02db4-c3df-4890-93ac-8dd575ca5638'
      GROUP BY timestamp_min, type) lagme
OFFSET 2;

This is probably a bit long, but it will do exactly what you requested:
SELECT z.*
FROM (SELECT y.*, min(row_number) OVER (), max(row_number) OVER ()
      FROM (SELECT x.*, row_number() OVER (ORDER BY timestamp_min)
            FROM (SELECT to_timestamp(floor((extract('epoch' FROM TIMESTAMP) / 300)) * 300) AS timestamp_min,
                         TYPE,
                         floor(sum(medium[1]))
                  FROM default_dataset
                  WHERE TYPE LIKE 'ap_clients.wlan0'
                    AND TIMESTAMP > CURRENT_TIMESTAMP - interval '85 minutes'
                    AND organization_id = '9fc02db4-c3df-4890-93ac-8dd575ca5638'
                    AND id NOT IN (SELECT id
                                   FROM default_dataset
                                   ORDER BY TIMESTAMP DESC
                                   LIMIT 1)
                  GROUP BY timestamp_min, TYPE
                  ORDER BY timestamp_min ASC) AS x) AS y) AS z
WHERE row_number NOT IN (min, max)
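The same pattern can be written a little more compactly with a single count(*) window instead of min/max. A minimal sketch against a hypothetical table events(ts timestamptz, val int), just to show the shape of the trick:

-- Hypothetical table events(ts timestamptz, val int).
-- Number the rows once, count them once, then keep only the rows
-- strictly between the first and the last.
SELECT ts, val
FROM (SELECT ts,
             val,
             row_number() OVER (ORDER BY ts) AS rn,
             count(*) OVER () AS total
      FROM events) numbered
WHERE rn > 1
  AND rn < total;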

Related

How to detect streaks of continuous activity in Postgres?

I want to aggregate data based on streaks of continuous activity.
DDL:
CREATE TABLE t_series (t date, data int);

INSERT INTO t_series VALUES
(date '2018-03-01', 12),
(date '2018-03-02', 43),
(date '2018-03-03', 9),
(date '2018-03-04', 13),
(date '2018-03-09', 23),
(date '2018-03-10', 26),
(date '2018-03-11', 28),
(date '2018-03-14', 21),
(date '2018-03-15', 15);
I want an intermediate output as:
     t      | data | period
------------+------+--------
 2018-03-01 |   12 |      1
 2018-03-02 |   43 |      1
 2018-03-03 |    9 |      1
 2018-03-04 |   13 |      1
 2018-03-09 |   23 |      2
 2018-03-10 |   26 |      2
 2018-03-11 |   28 |      2
 2018-03-14 |   21 |      3
 2018-03-15 |   15 |      3
And the final output as:
period | sum
--------+-----
      1 | 77
      2 | 77
      3 | 36
I have tried using below but doesn't seem to work:
SELECT *, SUM(CASE WHEN diff IS NULL
                     OR diff <2 THEN 1 ELSE NULL END) OVER (ORDER BY t) AS period
       FROM (SELECT *, t - lag(t, 1) OVER (ORDER BY t) AS diff
             FROM t_series
       ) AS x;
Could anyone please suggest a fix.
Thanks in advance.
I came up with this solution:
SELECT period, SUM(data) AS sum
FROM (
    SELECT t, data, SUM(groups) OVER (ORDER BY t) AS period
    FROM (
        SELECT t, data,
               CASE
                   WHEN diff IS NULL OR diff = 1 THEN 0
                   ELSE 1
               END AS groups
        FROM (
            SELECT t, data, t - LAG(t) OVER (ORDER BY t) AS diff
            FROM t_series
        ) d
    ) g -- your intermediate output
) p
GROUP BY period
ORDER BY period;
Result:
 period | sum
--------+-----
      0 |  77
      1 |  77
      2 |  36
The only difference is that my period numbering starts at 0 instead of 1, but I think that's OK.
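If the 1-based numbering matters, the same idea fits in one level less of nesting: make the CASE yield 1 at the start of every streak, including the very first row (where diff IS NULL), so the running sum opens at 1. A sketch against the same t_series table:

SELECT period, SUM(data) AS sum
FROM (SELECT t, data,
             -- diff = 1 continues a streak; anything else (including
             -- the NULL diff of the first row) starts a new one
             SUM(CASE WHEN diff = 1 THEN 0 ELSE 1 END) OVER (ORDER BY t) AS period
      FROM (SELECT t, data, t - LAG(t) OVER (ORDER BY t) AS diff
            FROM t_series) d) p
GROUP BY period
ORDER BY period;

This returns the same sums (77, 77, 36) with the periods numbered 1, 2, 3.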

Calculate duration of time ranges without overlap in PostgreSQL

I'm on Postgres 13 and have a table like this
| key | from | to
-------------------------------------------
| A | 2022-11-27T08:00 | 2022-11-27T09:00
| B | 2022-11-27T09:00 | 2022-11-27T10:00
| C | 2022-11-27T08:30 | 2022-11-27T10:30
I want to calculate the duration of each record, but without overlaps. So the desired result would be
| key | from | to | duration
----------------------------------------------------------
| A | 2022-11-27T08:00 | 2022-11-27T09:00 | '1 hour'
| B | 2022-11-27T09:00 | 2022-11-27T09:45 | '45 minutes'
| C | 2022-11-27T08:30 | 2022-11-27T10:00 | '15 minutes'
I guess I need a subquery and to subtract the overlap somehow, but how would I factor in multiple overlaps? In the example above C overlaps A and B, so I must subtract 30 minutes from A and then 45 minutes from B... But I'm stuck here:
SELECT key, (("to" - "from")::interval - s.overlap) as duration
FROM time_entries, (
SELECT (???) as overlap
) s
select key,
       fromDT,
       toDT,
       (toDT - fromDT)::interval
       - COALESCE((SELECT SUM(LEAST(te2.toDT, te1.toDT) - GREATEST(te2.fromDT, te1.fromDT))::interval
                   FROM time_entries te2
                   WHERE te2.fromDT < te1.toDT
                     AND te2.toDT > te1.fromDT
                     AND te2.key < te1.key), '0 minutes') as duration
from time_entries te1;
output:
 key |       fromdt        |        todt         | duration
-----+---------------------+---------------------+----------
 A   | 2022-11-27 08:00:00 | 2022-11-27 09:00:00 | 01:00:00
 B   | 2022-11-27 09:00:00 | 2022-11-27 10:00:00 | 01:00:00
 C   | 2022-11-27 08:30:00 | 2022-11-27 10:30:00 | 00:30:00
I renamed the columns from and to to fromDT and toDT to avoid using reserved words.
A step-by-step explanation is in the DBFIDDLE.
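For anyone who wants to reproduce this, a minimal setup matching the sample data (the column types are an assumption; the question only shows the values):

-- Assumed DDL; the types are guesses based on the sample values.
CREATE TABLE time_entries (key text, fromDT timestamp, toDT timestamp);

INSERT INTO time_entries VALUES
('A', '2022-11-27 08:00', '2022-11-27 09:00'),
('B', '2022-11-27 09:00', '2022-11-27 10:00'),
('C', '2022-11-27 08:30', '2022-11-27 10:30');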
Another approach.
WITH DATA AS
  (SELECT KEY,
          FROMDT,
          TODT,
          MIN(FROMDT) OVER (PARTITION BY FROMDT::DATE ORDER BY KEY) AS START_DATE,
          MAX(TODT) OVER (PARTITION BY FROMDT::DATE ORDER BY KEY) AS END_DATE
   FROM TIME_ENTRIES
   ORDER BY KEY),
STAGING_DATA AS
  (SELECT KEY,
          FROMDT,
          TODT,
          COALESCE(LAG(START_DATE) OVER (PARTITION BY FROMDT::DATE ORDER BY KEY), FROMDT) AS T1_DATE,
          COALESCE(LAG(END_DATE) OVER (PARTITION BY FROMDT::DATE ORDER BY KEY), TODT) AS T2_DATE
   FROM DATA)
SELECT KEY,
       FROMDT,
       TODT,
       CASE
         WHEN FROMDT = T1_DATE AND TODT = T2_DATE THEN (TODT - FROMDT)::interval
         WHEN T2_DATE < TODT THEN (TODT - T2_DATE)::interval
         ELSE (T2_DATE - TODT)::interval
       END
FROM STAGING_DATA;

Run a SQL query against ten-minutes time intervals

I have a postgresql table with this schema:
id SERIAL PRIMARY KEY,
traveltime INT,
departuredate TIMESTAMPTZ,
departurehour TIMETZ
Here is a bit of data (edited):
id | traveltime | departuredate | departurehour
----+------------+------------------------+---------------
1 | 73 | 2019-12-24 00:00:03+01 | 00:00:03+01
2 | 73 | 2019-12-24 00:12:16+01 | 00:12:16+01
53 | 115 | 2019-12-24 07:53:44+01 | 07:53:44+01
54 | 116 | 2019-12-24 07:58:45+01 | 07:58:45+01
55 | 119 | 2019-12-24 08:03:46+01 | 08:03:46+01
56 | 120 | 2019-12-24 08:08:47+01 | 08:08:47+01
57 | 121 | 2019-12-24 08:13:48+01 | 08:13:48+01
58 | 121 | 2019-12-24 08:18:48+01 | 08:18:48+01
542 | 112 | 2019-12-26 07:52:41+01 | 07:52:41+01
543 | 114 | 2019-12-26 07:57:42+01 | 07:57:42+01
544 | 116 | 2019-12-26 08:02:43+01 | 08:02:43+01
545 | 116 | 2019-12-26 08:07:44+01 | 08:07:44+01
546 | 117 | 2019-12-26 08:12:45+01 | 08:12:45+01
547 | 118 | 2019-12-26 08:17:46+01 | 08:17:46+01
548 | 118 | 2019-12-26 08:22:48+01 | 08:22:48+01
1031 | 80 | 2019-12-28 07:50:33+01 | 07:50:33+01
1032 | 81 | 2019-12-28 07:55:34+01 | 07:55:34+01
1033 | 81 | 2019-12-28 08:00:35+01 | 08:00:35+01
1034 | 82 | 2019-12-28 08:05:36+01 | 08:05:36+01
1035 | 82 | 2019-12-28 08:10:37+01 | 08:10:37+01
1036 | 83 | 2019-12-28 08:15:38+01 | 08:15:38+01
1037 | 83 | 2019-12-28 08:20:39+01 | 08:20:39+01
I'd like to get the average for all the values collected for traveltime for each 10 minutes interval for several weeks.
Expected result for the data sample: for the 10-minutes interval between 8h00 and 8h10, the rows that will be included in the avg are with id 55, 56, 544, 545, 1033 and 1034
and so on.
I can get the average for a specific interval:
select avg(traveltime) from belt where departurehour >= '10:40:00+01' and departurehour < '10:50:00+01';
To avoid creating a query for each interval, I used this query to get all the 10-minutes intervals for the complete period encoded:
select i from generate_series('2019-11-23', '2020-01-18', '10 minutes'::interval) i;
What I miss is a way to apply my AVG query to each of these generated intervals. Any direction would be helpful!
It turns out that generate_series does not actually apply here, regardless of the date range. The critical part is the 144 ten-minute intervals per day. Unfortunately, Postgres does not provide an interval type for minutes (perhaps creating one would be a useful exercise). But all is not lost: you can simulate the same thing with BETWEEN; you just need to play with the ending of the range.
The following builds this simulation using a recursive CTE, then joins it to your table as before.
set timezone to '+1'; -- necessary to keep my local offset from affecting results
-- create table and insert data here
-- (additional data was added outside of the date range, so it should not be included)
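-- For reference, the assumed table from the question (a sketch of the
-- "create table and insert data" step above, using the schema shown in
-- the question); load the question's sample rows to reproduce the
-- id-based test further down.
CREATE TABLE belt (
    id SERIAL PRIMARY KEY,
    traveltime INT,
    departuredate TIMESTAMPTZ,
    departurehour TIMETZ
);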
with recursive min_intervals as
(select '00:00:00'::timetz start_10Min -- start of 1st 10Min interval
, '00:09:59.999999'::timetz end_10Min -- last microsecond in 10Min interval
, 1 interval_no
union all
select start_10Min + interval '10 min'
, end_10Min + interval '10 min'
, interval_no + 1
from Min_intervals
where interval_no < 144 -- 6 10Min intervals/hr * 24 Hr/day = No of 10Min intervals in any day
) -- select * from min_intervals;
select start_10Min, end_10Min, avg(traveltime) average_travel_time
from min_intervals
join belt
on departuredate::time between start_10Min and end_10Min
where departuredate::date between date '2019-11-23' and date '2020-01-18'
group by start_10Min, end_10Min
order by start_10Min;
-- test result for the 'specified' interval. Note: the added rows fall within the time frame 08:00 to 08:10,
-- but they should be excluded, so the avg for that period should be the same for both queries.
select avg(traveltime) from belt where id in (55, 56, 544, 545, 1033, 1034);
My issue with the above is that the date range is essentially hard coded (yes, substitution parameters are available, but they must be supplied manually); that is OK for psql or an IDE, but not good for a production environment. If this is to be used in such an environment, I'd use the following function to return a virtual table of the same results.
create or replace function travel_average_per_10Min_interval(
start_date_in date
, end_date_in date
)
returns table (Start_10Min timetz
,end_10Min timetz
,avg_travel_time numeric
)
language sql
as $$
with recursive min_intervals as
(select '00:00:00'::timetz start_10Min -- start of 1st 10Min interval
, '00:09:59.999999'::timetz end_10Min -- last microsecond in 10Min interval
, 1 interval_no
union all
select start_10Min + interval '10 min'
, end_10Min + interval '10 min'
, interval_no + 1
from Min_intervals
where interval_no < 144 -- 6 10Min intervals/hr * 24 Hr/day = No of 10Min intervals in any day
) -- select * from min_intervals;
select start_10Min, end_10Min, avg(traveltime) average_travel_time
from min_intervals
join belt
on departuredate::time between start_10Min and end_10Min
where departuredate::date between start_date_in and end_date_in
group by start_10Min, end_10Min
order by start_10Min;
$$;
-- test
select * from travel_average_per_10Min_interval(date '2019-11-23', date '2020-01-18');
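For comparison, the epoch-bucketing trick from the first question above can produce the same aggregation without generating the intervals at all. A sketch assuming the same belt table (integer-dividing the seconds since midnight by 600 collapses each departure onto the start of its 10-minute slot):

-- Average travel time per 10-minute slot of the day, computed by
-- arithmetic on the time-of-day instead of joining to an interval list.
SELECT floor(extract(epoch FROM departuredate::time) / 600)::int * interval '10 minutes' AS slot_start,
       avg(traveltime) AS average_travel_time
FROM belt
WHERE departuredate::date BETWEEN date '2019-11-23' AND date '2020-01-18'
GROUP BY slot_start
ORDER BY slot_start;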

Showing only TOP 1 value result from join duplicates

I have 3 tables like below. You will see how they are joined.
Orders Table
+---------+------------+
| Orderid | LocationId |
+---------+------------+
| 36 | 14 |
| 38 | 13 |
+---------+------------+
OrdersDetails Table
+-----------+------------+
| Detailsid | OrderId |
+-----------+------------+
| 38 | 36 |
| 39 | 36 |
| 40 | 38 |
+-----------+------------+
OrderLocations
+------------+------------+
| Locationid | DistanceKM |
+------------+------------+
| 13 | 550 |
| 14 | 245 |
+------------+------------+
When doing an inner join of the 3 tables we get:
+---------+-----------+------------+
| Orderid | Detailsid | DistanceKM |
+---------+-----------+------------+
| 36      | 38        | 245        |
| 36      | 39        | 245        |
| 38      | 40        | 550        |
+---------+-----------+------------+
I don't want to have a duplicate DistanceKM, e.g. 245. I would like a 0 instead for line item 2, like this:
+---------+-----------+------------+
| Orderid | Detailsid | DistanceKM |
+---------+-----------+------------+
| 36      | 38        | 245        |
| 36      | 39        | 0          |
| 38      | 40        | 550        |
+---------+-----------+------------+
Here is my solution:
Creating tables:
CREATE TABLE #Orders
(
    Orderid INT,
    LocationId INT
);
INSERT INTO #Orders
VALUES (36, 14),
       (38, 13);

CREATE TABLE #OrdersDetails
(
    Detailsid INT,
    OrderId INT
);
INSERT INTO #OrdersDetails
VALUES (38, 36),
       (39, 36),
       (40, 38);

CREATE TABLE #OrderLocations
(
    Locationid INT,
    DistanceKM INT
);
INSERT INTO #OrderLocations
VALUES (13, 550),
       (14, 245);
The actual query:
;WITH cte AS
(
    SELECT o.Orderid,
           d.Detailsid,
           l.DistanceKM,
           ROW_NUMBER() OVER (PARTITION BY l.DistanceKM ORDER BY o.Orderid) AS rn
    FROM #Orders AS o
    INNER JOIN #OrdersDetails AS d ON o.Orderid = d.OrderId
    INNER JOIN #OrderLocations AS l ON o.LocationId = l.Locationid
)
SELECT cte.Orderid,
       cte.Detailsid,
       CASE
           WHEN cte.rn > 1 THEN 0
           ELSE cte.DistanceKM
       END AS DistanceKM
FROM cte;
And here are the results:
+---------+-----------+------------+
| Orderid | Detailsid | DistanceKM |
+---------+-----------+------------+
| 36      | 38        | 245        |
| 36      | 39        | 0          |
| 38      | 40        | 550        |
+---------+-----------+------------+

Select limited set of fields from inner query with preserved order

I've got a SQL query which involves one-to-many relationships and an ORDER BY clause:
SELECT s0_.id,
       s0_.created_at,
       s5_.sort_order
FROM surveys_submits s0_
INNER JOIN surveys_answers s3_ ON s0_.id = s3_.submit_id
INNER JOIN surveys_questions s4_ ON s3_.question_id = s4_.id
INNER JOIN surveys_questions_references s5_ ON s4_.id = s5_.question_id
ORDER BY s0_.created_at DESC,
         s5_.sort_order ASC
This query returns following results:
id | created_at | sort_order
----+---------------------+-----------
218 | 2014-03-18 12:21:09 | 1
218 | 2014-03-18 12:21:09 | 2
218 | 2014-03-18 12:21:09 | 3
218 | 2014-03-18 12:21:09 | 4
218 | 2014-03-18 12:21:09 | 5
217 | 2014-03-18 12:20:57 | 1
217 | 2014-03-18 12:20:57 | 2
217 | 2014-03-18 12:20:57 | 3
...
214 | 2014-03-18 12:18:01 | 4
214 | 2014-03-18 12:18:01 | 5
213 | 2014-03-18 12:17:48 | 1
213 | 2014-03-18 12:17:48 | 2
213 | 2014-03-18 12:17:48 | 3
213 | 2014-03-18 12:17:48 | 4
213 | 2014-03-18 12:17:48 | 5
Now I need to modify this query in a way that returns the first 25 distinct ids from the beginning, with the order preserved.
I've tried something like this:
SELECT DISTINCT id
FROM (
SELECT ... ORDER BY ...
) inner_query
ORDER BY created_at DESC, sort_order ASC
LIMIT 25 OFFSET 0;
But obviously it doesn't work:
ERROR: for SELECT DISTINCT, ORDER BY expressions must appear in select list
LINE 16: created_at DESC,
^
...and I can't add the created_at and sort_order columns to the SELECT clause because that would result in duplicated ids, just like the first query.
select *
from (SELECT DISTINCT ON (s0_.id)
             s0_.id,
             s0_.created_at,
             s5_.sort_order
      FROM surveys_submits s0_
      INNER JOIN surveys_answers s3_ ON s0_.id = s3_.submit_id
      INNER JOIN surveys_questions s4_ ON s3_.question_id = s4_.id
      INNER JOIN surveys_questions_references s5_ ON s4_.id = s5_.question_id
      ORDER BY s0_.id,
               s0_.created_at DESC,
               s5_.sort_order ASC) s
order by created_at desc,
         sort_order ASC
limit 25
From the manual:
SELECT DISTINCT ON ( expression [, ...] ) keeps only the first row of each set of rows where the given expressions evaluate to equal. The DISTINCT ON expressions are interpreted using the same rules as for ORDER BY (see above). Note that the "first row" of each set is unpredictable unless ORDER BY is used to ensure that the desired row appears first.
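If you'd rather avoid the Postgres-specific DISTINCT ON, the same "one row per id, display order preserved" result can be sketched with row_number() over the same joins. Per id this keeps the row with the latest created_at and lowest sort_order, exactly like the DISTINCT ON version:

-- One row per submit id, then sort and limit as in the answer above.
SELECT id, created_at, sort_order
FROM (SELECT s0_.id,
             s0_.created_at,
             s5_.sort_order,
             row_number() OVER (PARTITION BY s0_.id
                                ORDER BY s0_.created_at DESC, s5_.sort_order ASC) AS rn
      FROM surveys_submits s0_
      INNER JOIN surveys_answers s3_ ON s0_.id = s3_.submit_id
      INNER JOIN surveys_questions s4_ ON s3_.question_id = s4_.id
      INNER JOIN surveys_questions_references s5_ ON s4_.id = s5_.question_id) ranked
WHERE rn = 1
ORDER BY created_at DESC, sort_order ASC
LIMIT 25;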