PostgreSQL Median without using percentile function - postgresql

To calculate the median for even number of rows, I know I have to divide the RowNum by 2. I don't understand why I need the "+1" and "+2" inside "WHERE RowNum IN ((RowCnt + 1) / 2, (RowCnt + 2) / 2)". Can't I just write "WHERE RowNum IN ((RowCnt) / 2, (RowCnt) / 2)" instead?
-- Median via row numbering: RowNum ranks rows by hourly_pay, RowCnt is the total count.
with t1 as
(select *,
row_number() over(order by hourly_pay) as RowNum,
count(*) over() as RowCnt
from employee_data)
select *
from t1
-- Integer division makes the +1/+2 necessary:
--   odd  RowCnt (e.g. 5): (5+1)/2 = 3 and (5+2)/2 = 3  -> the single middle row
--   even RowCnt (e.g. 4): (4+1)/2 = 2 and (4+2)/2 = 3  -> the two middle rows
-- With plain RowCnt/2 both expressions collapse to the same value (2 for RowCnt=4,
-- which is only one of the two middle rows, and the wrong row for odd counts).
WHERE RowNum IN ((RowCnt + 1) / 2, (RowCnt + 2) / 2)
Video of me explaining my confusion: https://streamable.com/mv2vj

Related

Is there a smarter method to create series with different intervals for counting within a query?

I want to create different intervals:
0 to 10 steps 1
10 to 100 steps 10
100 to 1.000 steps 100
1.000 to 10.000 steps 1.000
to query a table and count the items within each interval.
-- Build the bucket lower bounds: one generate_series per decade.
with "series" as (
(SELECT generate_series(0, 10, 1) AS r_from)
union
-- NOTE(review): plain UNION (not UNION ALL) is load-bearing here — the first series
-- ends at 10 and this one starts at 10, so the duplicate boundary is deduplicated.
(select generate_series(10, 90, 10) as r_from)
union
(select generate_series(100, 900, 100) as r_from)
union
(select generate_series(1000, 9000, 1000) as r_from)
order by r_from
)
-- Derive each bucket's upper bound from its lower bound (step size per decade).
, "range" as ( select r_from
, case
when r_from < 10 then r_from + 1
when r_from < 100 then r_from + 10
when r_from < 1000 then r_from + 100
else r_from + 1000
end as r_to
from series)
-- NOTE(review): BETWEEN is inclusive on both ends, so adjacent buckets share a
-- boundary value (e.g. 10 falls in both 9..10 and 10..20) and such items are
-- counted twice — confirm whether half-open ranges were intended.
select r_from, r_to,(SELECT count(*) FROM "my_table" WHERE "my_value" BETWEEN r_from AND r_to) as "Anz."
FROM "range";
I think generate_series is the right way; alternatively, we can use simple math to calculate the numbers.
-- Generate the buckets arithmetically: first the special bucket 0..1, then
-- v*10^steps .. (v+1)*10^steps for v in 1..9 and steps in 0..3 (i.e. up to 10000).
SELECT 0 as r_from,1 as r_to
UNION ALL
SELECT power(10, steps ) * v ,
power(10, steps ) * v + power(10, steps )
FROM generate_series(1, 9, 1) v
CROSS JOIN generate_series(0, 3, 1) steps
so the full query might look as below:
-- Same arithmetic bucket generator as above, wrapped in a CTE and joined to the data.
with "range" as
(
SELECT 0 as r_from,1 as r_to
UNION ALL
-- Bucket bounds: v*10^steps .. (v+1)*10^steps for v in 1..9, steps in 0..3.
SELECT power(10, steps) * v ,
power(10, steps) * v + power(10, steps)
FROM generate_series(1, 9, 1) v
CROSS JOIN generate_series(0, 3, 1) steps
)
-- NOTE(review): BETWEEN is inclusive on both ends, so values equal to a bucket
-- boundary are counted in two adjacent buckets — confirm intent.
select r_from, r_to,(SELECT count(*) FROM "my_table" WHERE "my_value" BETWEEN r_from AND r_to) as "Anz."
FROM "range";
sqlfiddle
Rather than generate_series you could create defined integer range types (int4range), then test whether your value is included within the range (see Range/Multirange Functions and Operators. So
-- Define half-open integer ranges [lo, hi) and count the rows falling in each.
-- The 3rd int4range argument '[)' makes the lower bound inclusive, upper exclusive,
-- so adjacent ranges never double-count boundary values.
with ranges (range_set) as
( values ( int4range(0,10,'[)') )
, ( int4range(10,100,'[)') )
, ( int4range(100,1000,'[)') )
, ( int4range(1000,10000,'[)') )
) --select * from ranges;
select lower(range_set) range_start
, upper(range_set) - 1 range_end -- upper bound is exclusive; -1 shows the last included value
, count(my_value) cnt            -- count(col) ignores the NULLs produced by unmatched left-join rows
from ranges r
left join my_table mt
on (mt.my_value <@ r.range_set)  -- <@ : "element is contained in range" ("<#" is not a Postgres operator)
group by r.range_set
order by lower(r.range_set);
Note the 3rd parameter in creating the ranges.
Creating a CTE as above is good if your ranges are static; however, if dynamic ranges are required you can put the ranges into a table. Changing the ranges then becomes a matter of managing the table. Not simpler, but it does not require code updates. The query then reduces to just the main part of the above:
-- Same counting query, but the ranges live in a table (range_tab) so they can be
-- changed without editing code.
select lower(range_set) range_start
, upper(range_set) - 1 range_end -- upper bound is exclusive; -1 shows the last included value
, count(my_value) cnt            -- count(col) ignores NULLs from unmatched left-join rows
from range_tab r
left join my_table mt
on (mt.my_value <@ r.range_set)  -- <@ : "element is contained in range" ("<#" is not a Postgres operator)
group by r.range_set
order by lower(r.range_set);
See demo for both here.

How can I increment the numerical value in my WHERE clause using a loop?

I am currently using the UNION ALL workaround below to calculate old_eps_tfq regression slopes of each ticker based off its corresponding rownum value (see WHERE rownum < x). I am interested to know what the old_eps_tfq is when rownum < 4 then increment 4 by 1 to find out what old_eps_tfq is when rownum < 5, and so on (there are ~20 rownum)
Could I use PL/pgSQL for this?
-- Asker's workaround: the same aggregate query repeated via UNION ALL, with only
-- the "rownum < N" threshold changing (4, then 5, ... ~20 copies in total).
SELECT * FROM(
WITH regression_slope AS(
SELECT
-- rownum orders each ticker's rows by period end date, giving the x-axis for regr_*.
ROW_NUMBER() OVER ( PARTITION BY ticker ORDER BY earnings_growths_ped) AS rownum,
*
FROM "ANALYTICS"."vEARNINGS_GROWTHS"
--WHERE ticker = 'ACN'
ORDER BY ticker )
SELECT
ticker,
current_period_end_date,
max(earnings_growths_ped) AS max_earnings_growths_ped,
--max(rownum) AS max_rownum,
-- Linear regression of old_eps_tfq over the first (rownum < 4) periods.
round(regr_slope(old_eps_tfq, rownum)::numeric, 2) AS slope,
round(regr_intercept(old_eps_tfq, rownum)::numeric, 2) AS y_intercept,
round(regr_r2(old_eps_tfq, rownum)::numeric, 3) AS r_squared
FROM regression_slope
WHERE rownum < 4
GROUP BY ticker, current_period_end_date
ORDER BY ticker asc ) q
UNION ALL
-- Identical query, threshold incremented to 5.
SELECT * FROM(
WITH regression_slope AS(
SELECT
ROW_NUMBER() OVER ( PARTITION BY ticker ORDER BY earnings_growths_ped) AS rownum,
*
FROM "ANALYTICS"."vEARNINGS_GROWTHS"
--WHERE ticker = 'ACN'
ORDER BY ticker )
SELECT
ticker,
current_period_end_date,
max(earnings_growths_ped) AS max_earnings_growths_ped,
--max(rownum) AS max_rownum,
round(regr_slope(old_eps_tfq, rownum)::numeric, 2) AS slope,
round(regr_intercept(old_eps_tfq, rownum)::numeric, 2) AS y_intercept,
round(regr_r2(old_eps_tfq, rownum)::numeric, 3) AS r_squared
FROM regression_slope
WHERE rownum < 5
GROUP BY ticker, current_period_end_date
ORDER BY ticker asc ) q
Here is my table
The outer query SELECT * FROM (...) q seems unnecessary.
Then you can try this :
-- Answer: replace the ~20 UNION ALL copies with a join against a series of
-- thresholds — each row is regressed once per threshold it satisfies.
WITH regression_slope AS(
SELECT
ROW_NUMBER() OVER ( PARTITION BY ticker ORDER BY earnings_growths_ped) AS rownum,
*
FROM "ANALYTICS"."vEARNINGS_GROWTHS"
--WHERE ticker = 'ACN'
ORDER BY ticker )
SELECT
max,
ticker,
current_period_end_date,
max(earnings_growths_ped) AS max_earnings_growths_ped,
--max(rownum) AS max_rownum,
round(regr_slope(old_eps_tfq, rownum)::numeric, 2) AS slope,
round(regr_intercept(old_eps_tfq, rownum)::numeric, 2) AS y_intercept,
round(regr_r2(old_eps_tfq, rownum)::numeric, 3) AS r_squared
FROM regression_slope
-- NOTE(review): "max" as a table/column alias works but shadows the aggregate
-- function name used above — a name like "threshold" would be clearer.
INNER JOIN generate_series(4, 24) AS max -- the range 4 to 24 can be adjusted to the need
ON rownum < max
GROUP BY max, ticker, current_period_end_date
ORDER BY max asc, ticker asc

Create a column of increasing number in Postgresql

I want to create a CTE which only contains a single column by Postgresql(Redshift)- increasing number by 1, like 1,2,3,4,..until 1000.
Here's one that goes to 1024. Add "TOP 1000" if you only want 1000.
-- Generate 1..1024 without generate_series (Redshift-safe): cross-join ten
-- two-row derived tables, each contributing one binary digit weighted by 2^N.
-- 10 digits cover 0..1023; the leading "1 +" shifts the result to 1..1024.
SELECT
1 + p0.n
+ p1.n*2
+ p2.n * POWER(2,2)
+ p3.n * POWER(2,3)
+ p4.n * POWER(2,4)
+ p5.n * POWER(2,5)
+ p6.n * POWER(2,6)
+ p7.n * POWER(2,7)
+ p8.n * POWER(2,8)
+ p9.n * POWER(2,9)
as number
FROM
(SELECT 0 as n UNION SELECT 1) p0,
(SELECT 0 as n UNION SELECT 1) p1,
(SELECT 0 as n UNION SELECT 1) p2,
(SELECT 0 as n UNION SELECT 1) p3,
(SELECT 0 as n UNION SELECT 1) p4,
(SELECT 0 as n UNION SELECT 1) p5,
(SELECT 0 as n UNION SELECT 1) p6,
(SELECT 0 as n UNION SELECT 1) p7,
(SELECT 0 as n UNION SELECT 1) p8,
(SELECT 0 as n UNION SELECT 1) p9
Order by 1
Yes, this is possible with a CTE:
-- Recursive CTE: seed the series with 1, then add one per iteration until 1000.
with recursive numbers (nr) as (
    select 1
    union all
    select n.nr + 1
    from numbers n
    where n.nr < 1000
)
select nr
from numbers;
Online example

How to simplify this UNION query (remove the UNION)?

I have a table with players, let's call it player.
Let's say they have 3 columns: userId (UUID in a varchar(255)), levelNumber (integer) and a column through a one-to-one relation with FetchType.Lazy, let's say facebookProfile.
I need to retrieve the rankings "around" the player, so 9 players above the given player and 9 players below the given player, to have a total of 19 players (with my player in the middle).
Some time ago I just came up with this idea:
-- Up to 9 players ranked directly above the target level (nearest first) ...
(select * from player where current_level >= :levelNumber + 1 and (not userid = :userIdToIgnore) order by current_level asc limit 9)
-- ... combined with up to 9 players directly below (nearest first).
-- NOTE(review): the two branches are disjoint by construction, so UNION ALL
-- would give the same result without the implicit dedup sort.
union
(select * from player where current_level <= :levelNumber - 1 and (not userid = :userIdToIgnore) order by current_level desc limit 9)
You get the idea.
Is there any way to simplify this so it doesn't use the UNION?
I'm asking cause I need to convert that to a JPQL query, so it won't be a nativeQuery.
This is all because nativeQueries lead to the N+1 problem and I have troubles with lazy-loading (facebookProfile column) and multiple selects later. That's why I need to simplify that algorithm to be able to use JPQL.
I think you can do this with window functions and conditional expressions:
-- Single-scan alternative: number the players above and below the target level
-- separately, then keep the 9 nearest on each side.
select *
from (
select p.*,
-- rn1: 1..N over players strictly ABOVE the level, nearest first.
-- Partitioning by the side predicate restarts numbering per side; without it,
-- ROW_NUMBER counts across BOTH sides and "between 1 and 9" picks wrong rows.
case when current_level >= :levelNumber + 1 then row_number() over(partition by current_level >= :levelNumber + 1 order by current_level asc) end rn1,
-- rn2: 1..N over players strictly BELOW the level, nearest first.
-- (Was aliased rn_desc, which the outer filter referenced as rn2 — fixed.)
case when current_level <= :levelNumber - 1 then row_number() over(partition by current_level >= :levelNumber + 1 order by current_level desc) end rn2
from player p
where userid <> :userIdToIgnore and (current_level >= :levelNumber + 1 or current_level <= :levelNumber - 1)
) t
where rn1 between 1 and 9 or rn2 between 1 and 9

How can you generate a date list from a range in Amazon Redshift?

Getting date list in a range in PostgreSQL shows how to get a date range in PostgreSQL. However, Redshift does not support generate_series():
ans=> select (generate_series('2012-06-29', '2012-07-03', '1 day'::interval))::date;
ERROR: function generate_series("unknown", "unknown", interval) does not exist
HINT: No function matches the given name and argument types. You may need to add explicit type casts.
Is there way to replicate what generate_series() does in Redshift?
a hack, but works:
use a table with many many rows, and a window function to generate the series
this works as long as you are generating a series that is smaller than the number of rows in the table you're using to generate the series
-- Redshift hack: cross-join a one-row seed date with any large table and use a
-- running COUNT(*) window as the day offset (0, 1, 2, ...). Works as long as the
-- driver table ("users") has at least LIMIT rows.
WITH x(dt) AS (SELECT '2016-01-01'::date)
SELECT
dateadd(
day,
COUNT(*) over(rows between unbounded preceding and current row) - 1,
dt)
FROM users, x
LIMIT 100
the initial date 2016-01-01 controls the start date, and the limit controls the number of days in the generated series.
Update: * Will only run on the leader node
Redshift has partial support for the generate_series function but unfortunately does not mention it in their documentation.
This will work and is the shortest & most legible way of generating a series of dates as of this date (2018-01-29):
-- NOTE(review): x runs 1..100, so the first date produced is '2016-01-02';
-- use generate_series(0, 99, 1) if the series should include the seed date itself.
SELECT ('2016-01-01'::date + x)::date
FROM generate_series(1, 100, 1) x
One option if you don't want to rely on any existing tables is to pre-generate a series table filled with a range of numbers, one for each row.
-- Materialize integers 0 .. 2^11-1 (2047): each of the eleven cross-joined
-- two-row tables p0..p10 contributes one binary digit weighted by 2^N.
create table numbers as (
select
p0.n
+ p1.n*2
+ p2.n * power(2,2)
+ p3.n * power(2,3)
+ p4.n * power(2,4)
+ p5.n * power(2,5)
+ p6.n * power(2,6)
+ p7.n * power(2,7)
+ p8.n * power(2,8)
+ p9.n * power(2,9)
+ p10.n * power(2,10)
as number
from
(select 0 as n union select 1) p0,
(select 0 as n union select 1) p1,
(select 0 as n union select 1) p2,
(select 0 as n union select 1) p3,
(select 0 as n union select 1) p4,
(select 0 as n union select 1) p5,
(select 0 as n union select 1) p6,
(select 0 as n union select 1) p7,
(select 0 as n union select 1) p8,
(select 0 as n union select 1) p9,
(select 0 as n union select 1) p10
order by 1
);
This will create a table with numbers from 0 to 2^11 - 1 (2047); if you need more numbers, just add more clauses :D
Once you have this table, you can join to it as a substitute for generate_series
-- Expand an inclusive date range by joining the numbers table on day offsets
-- 0 .. datediff(day, start, end).
with date_range as (select
'2012-06-29'::timestamp as start_date ,
'2012-07-03'::timestamp as end_date
)
select
dateadd(day, number::int, start_date)
from date_range
inner join numbers on number <= datediff(day, start_date, end_date)
#michael_erasmus It's interesting, and I make a change for maybe better performance.
-- Bit-shift variant of the numbers generator: OR-ing shifted 0/1 digits keeps the
-- arithmetic in integers (power() returns floating point).
-- NOTE(review): despite the view name, ten digits produce 0..1023; 1024 itself
-- is not included.
CREATE OR REPLACE VIEW v_series_0_to_1024 AS SELECT
p0.n
| (p1.n << 1)
| (p2.n << 2)
| (p3.n << 3)
| (p4.n << 4)
| (p5.n << 5)
| (p6.n << 6)
| (p7.n << 7)
| (p8.n << 8)
| (p9.n << 9)
as number
from
(select 0 as n union select 1) p0,
(select 0 as n union select 1) p1,
(select 0 as n union select 1) p2,
(select 0 as n union select 1) p3,
(select 0 as n union select 1) p4,
(select 0 as n union select 1) p5,
(select 0 as n union select 1) p6,
(select 0 as n union select 1) p7,
(select 0 as n union select 1) p8,
(select 0 as n union select 1) p9
order by number
Last 30 days date series:
select dateadd(day, -number, current_date) as dt from v_series_0_to_1024 where number < 30