Subquery containing aggregate function in HiveQL

Subquery containing aggregate function in HiveQL - hiveql

Why doesn't this work in Hiveql?
-- Sample history table: one row per (id, state, create_date) observation.
CREATE TABLE hist (
    id          INT,
    state       INT,
    create_date STRING
);

-- Three monthly rows for state 1 followed by three for state 2, all for id 1.
INSERT INTO hist VALUES
    (1, 1, '2017-01-01'),
    (1, 1, '2017-02-01'),
    (1, 1, '2017-03-01'),
    (1, 2, '2017-02-01'),
    (1, 2, '2017-03-01'),
    (1, 2, '2017-04-01');
-- Query from the question: Hive rejects it with a ParseException at the
-- scalar subquery on the right-hand side of "<=" (see the error quoted below).
-- MonthEnd is a single-row CTE holding the first day of the previous month.
-- The parenthesised subquery is correlated on a.id and on me.MonthEndDate.
WITH MonthEnd as
(select to_date(trunc(add_months(current_date, -1), 'MM'))
MonthEndDate)
SELECT date_add(MAX(a.create_date), 0)
FROM hist a, monthend me
where a.create_date <=
(select max(b.create_date) from hist b
where a.id=b.id and b.state=2
and b.create_date < me.MonthEndDate) x
and a.state=1 and a.create_date < me.MonthEndDate;
I would expect an answer of '2017-03-01', but I get this instead:
Error while compiling statement: FAILED: ParseException line 20:24 cannot recognize input near 'select' 'max' '(' in expression specification
Side note: I'm highlighting the bottom select statement in Hue before executing it. The select statement is in lines 13-17. There is other commented code after the select query. Therefore, I have no idea why it's saying the compilation error is on Line 20.
The only StackOverflow question I found close to this recommended I name the subquery. That's why I added the 'x' after it. It didn't help. Same error.
The subquery by itself works:
-- The inner aggregate on its own: latest state-2 create_date before the cutoff.
-- MonthEnd yields exactly one row, so joining it does not multiply hist rows.
WITH MonthEnd AS (
    SELECT to_date(trunc(add_months(current_date, -1), 'MM')) AS MonthEndDate
)
SELECT MAX(h.create_date)
FROM hist h
CROSS JOIN MonthEnd cutoff
WHERE h.create_date < cutoff.MonthEndDate
  AND h.state = 2;
2017-04-01
Thanks in advance for any insight anyone can give here!

Related

PostgreSQL: replace generate_series with an array

I have a working SQL code that creates geometries according to numbers generated from a generate_series function:
-- Build one concave-hull polygon per driving-distance limit.
-- alpha_factor and vertex_id are not defined in this snippet; presumably they
-- are supplied by the enclosing function or script (TODO confirm).
CREATE TEMPORARY TABLE catchments ON COMMIT DROP AS (
    SELECT
        lims,
        ST_ConcaveHull(the_geom, alpha_factor) AS the_geom_overlap
    FROM (
        -- Collect the geometry of every node returned for each limit.
        SELECT
            lims,
            ST_MakeValid(ST_Collect(n.the_geom)) AS the_geom
        FROM generate_series(1, 10, 2) AS lims
        CROSS JOIN pgr_drivingDistance(
            'SELECT id, source, target, cost, reverse_cost FROM edges',
            vertex_id, lims, true
        ) AS a
        INNER JOIN nodes AS n
            ON a.node = n.vid
        GROUP BY lims
    ) AS conv_hull
    ORDER BY lims DESC
);
Now I need to replace the fixed interval series by an array with varying intervals, e.g. [1, 2, 5, 7, 8].
Is there a simple way to "convert" the generate_series by an array with the same logic? I would like to avoid using a for loop if possible.

FROM unnest(ARRAY[1,2,5,7,8]) AS lims
should do it.

Divide table raw into chunks in Postgres with st_dwithin limit

I got a table with linestrings that I want to divide into chunks that have a list of id not higher than provided number for each and store only lines that are within certain distance.
For example, I got a table with 14 rows
-- Demo table: each row is one line segment; consecutive segments share an endpoint.
CREATE TABLE lines (
    id   integer PRIMARY KEY,
    geom geometry(linestring)
);

INSERT INTO lines (id, geom)
VALUES
    (1,  'LINESTRING(0 0, 0 1)'),
    (2,  'LINESTRING(0 1, 1 1)'),
    (3,  'LINESTRING(1 1, 1 2)'),
    (4,  'LINESTRING(1 2, 2 2)'),
    (11, 'LINESTRING(2 2, 2 3)'),
    (12, 'LINESTRING(2 3, 3 3)'),
    (13, 'LINESTRING(3 3, 3 4)'),
    (14, 'LINESTRING(3 4, 4 4)');

-- GiST index on the geometry column.
CREATE INDEX lines_gix ON lines USING gist (geom);
I want to split it into chunks with 3 ids for each chunk with lines that are within 2 meters from each other or the first one.
The result I am trying to get from this example is:
| Chunk No.| Id chunk list |
|----------|----------------|
| 1 | 1, 2, 3 |
| 2 | 4, 5, 6 |
| 3 | 7, 8, 9 |
| 4 | 10, 11, 12 |
| 5 | 13, 14 |
I tried to use st_clusterwithin but when lines are close to each other it will return all of them not split into chunks.
I also tried to use some with recursive magic like the one from the answer provided by Paul Ramsey here. But I don't know how to modify the query to return limited grouped id list.

I am not sure if it is the best possible answer, so if anyone has a better method or knows how to improve the provided answer, feel free to update it. With a little modification of Paul's answer, I've managed to create the following queries that do what I asked for.
-- find_connected(start_id, max_distance, max_chunk_size, excluded_ids)
--   $1  id of the line the chunk starts from
--   $2  distance passed to ST_DWithin when looking for neighbouring lines
--   $3  upper bound on the number of ids in the returned array
--   $4  ids that must not be added while growing the chunk
-- Returns the longest id array (at most $3 elements) that starts with $1 and
-- is grown one neighbouring line at a time; ties between equally long arrays
-- are not ordered.
CREATE OR REPLACE FUNCTION find_connected(integer, double precision, integer, integer[])
RETURNS integer[] AS
$$
WITH RECURSIVE lines_r AS (
    -- Anchor row: the starting line on its own.
    SELECT ARRAY[id] AS idlist,
           geom,
           id
    FROM lines
    WHERE id = $1
    UNION ALL
    -- Recursive step: extend each id list by one line that is close enough.
    SELECT array_append(lines_r.idlist, lines.id) AS idlist, -- append the new id
           lines.geom AS geom,                               -- geometry of the newly added line
           lines.id AS id                                    -- id of the newly added line
    -- "@>" is the array "contains" operator: drop lines whose id is in $4.
    FROM (SELECT * FROM lines WHERE NOT $4 @> ARRAY[id]) lines, lines_r
    WHERE ST_DWithin(lines.geom, lines_r.geom, $2)   -- within the requested distance
      AND NOT lines_r.idlist @> ARRAY[lines.id]      -- not already in this id list
      AND array_length(lines_r.idlist, 1) <= $3      -- stop growing once the list is long enough
)
SELECT idlist
FROM lines_r
WHERE array_length(idlist, 1) <= $3   -- discard lists that overshot the limit by one
ORDER BY array_length(idlist, 1) DESC
LIMIT 1;
$$
LANGUAGE sql;
-- Create id chunks: start from line 1, then repeatedly pick a line that is not
-- yet in the accumulated idlist and build the next chunk from it.
--   idlist    every id placed in any chunk so far
--   grouplist the ids of the chunk produced by this step
WITH RECURSIVE groups_r AS (
    (
        SELECT
            find_connected(id, 2, 3, ARRAY[id]) AS idlist,
            find_connected(id, 2, 3, ARRAY[id]) AS grouplist,
            id
        FROM lines
        WHERE id = 1
    )
    UNION ALL
    (
        SELECT
            array_cat(groups_r.idlist, find_connected(lines.id, 2, 3, groups_r.idlist)) AS idlist,
            find_connected(lines.id, 2, 3, groups_r.idlist) AS grouplist,
            lines.id
        FROM lines,
             groups_r
        -- "@>" is the array "contains" operator: skip lines already assigned.
        WHERE NOT groups_r.idlist @> ARRAY[lines.id]
        LIMIT 1   -- one new chunk per recursion step
    )
)
SELECT
    -- (SELECT array_agg(DISTINCT x) FROM unnest(idlist) t (x)) idlist, -- left for better understanding what is happening
    row_number() OVER () chunk_id,
    (SELECT array_agg(DISTINCT x) FROM unnest(grouplist) t (x)) grouplist,
    id input_line_id
FROM groups_r;
The only problem is that performance is quite poor when the number of ids in the chunk increases. For a table with 300 rows and 20 ids per chunk, execution time is around 15 min, even with indexes on geometry and id columns.

PostgreSQL, Variables

I am using PostgreSQL through Npgsql driver for windows/NET and I see that it is possible to use PL/pgSQL language through it.
So that way I can make use of variables for my calculation scripts which may look like in this example:
-- Anonymous PL/pgSQL block that creates and fills two temporary tables
-- (both declared ON COMMIT DROP).
-- NOTE(review): the declared text variables tlist/tcontent do not appear to be
-- read as values below; the CREATE/INSERT statements use tlist and tcontent as
-- table names. Confirm the declarations are intentional.
DO $$
DECLARE
tlist text='mylistofbills';
tcontent text='mycontentofbills';
BEGIN
CREATE TEMP TABLE tlist
(billno integer, bdate timestamp, rebate double precision)
ON COMMIT DROP;
INSERT INTO tlist
VALUES (1, '10.01.2017. 10:14:56', 10),
(2, '10.01.2017. 11:02:13', 5),
(3, '10.01.2017. 11:45:22', 0),
(4, '10.01.2017. 12:01:01', 6);
CREATE TEMP TABLE tcontent
(billno integer, rowno integer, price double precision, tax double precision)
ON COMMIT DROP;
INSERT INTO tcontent
VALUES (1, 1, 100, 19),
(1, 2, 30, 0),
(2, 1, 20, 19),
(3, 1, 18, 19),
(4, 1, 43, 0);
END $$;
-- Per content row: rebate amount, price without rebate, tax on that price, and
-- final price. The rebate and price-without-rebate expressions are repeated
-- inline in every column that needs them.
SELECT s.price,
l.rebate,
s.price/100*l.rebate AS valrebate,
s.price-(s.price/100*l.rebate) AS worebate,
((s.price-(s.price/100*l.rebate))/100)*s.tax AS valtax,
s.price-(s.price/100*l.rebate)+(((s.price-(s.price/100*l.rebate))/100)*s.tax) AS finalprice
FROM tlist l, tcontent s
WHERE l.billno=s.billno;
Example is simplified (from real situation) and is suitable for pasting into PgAdmin's SQL editor.
So now the question is: can I somehow, in the body of this code and without adding new functions to the server, use formulas to write more elegant and readable code?
If I would be able to add simple formulas like:
rebatec=s.price/100*l.rebate
priceworebate=s.price-rebatec
Then my code may look more readable and less error prone.
Like this:
-- Desired form (illustration only): rebatec and priceworebate are not defined
-- anywhere in this statement.
SELECT s.price,
l.rebate,
rebatec AS valrebate,
priceworebate AS worebate,
(priceworebate/100)*s.tax AS valtax,
priceworebate+((priceworebate/100)*s.tax) AS finalprice
FROM tlist l, tcontent s
WHERE l.billno=s.billno;
If that may be possible where and how to put this formulas so it can be used in my last SELECT code?
SOLUTION:
Based on @Clodoaldo's answer, which taught me something new, I found a solution that I am able to understand:
-- Compute the two intermediate values once in a LATERAL subquery, then reuse
-- them by name in the select list.
SELECT
    s.price,
    l.rebate,
    calc.rebatec AS valrebate,
    calc.priceworebate AS worebate,
    calc.priceworebate / 100 * s.tax AS valtax,
    calc.priceworebate + calc.priceworebate / 100 * s.tax AS finalprice
FROM tlist AS l
INNER JOIN tcontent AS s
    ON l.billno = s.billno
CROSS JOIN LATERAL (
    SELECT
        s.price / 100 * l.rebate AS rebatec,
        s.price - s.price / 100 * l.rebate AS priceworebate
) AS calc;
It works and I hope it is technically correct.

Use lateral:
The LATERAL key word can precede a sub-SELECT FROM item. This allows the sub-SELECT to refer to columns of FROM items that appear before it in the FROM list.
-- Join bills to their content rows, then derive the rebate values per row in a
-- lateral subquery so the select list can reference them by name.
SELECT
    s.price,
    l.rebate,
    derived.rebatec AS valrebate,
    derived.priceworebate AS worebate,
    derived.priceworebate / 100 * s.tax AS valtax,
    derived.priceworebate + derived.priceworebate / 100 * s.tax AS finalprice
FROM tlist AS l
INNER JOIN tcontent AS s USING (billno)
CROSS JOIN LATERAL (
    SELECT
        s.price / 100 * l.rebate AS rebatec,
        s.price - s.price / 100 * l.rebate AS priceworebate
) AS derived
Use the modern join syntax.

You could use a subquery to define those variables:
-- Derive var1 in an inner query so the outer query can use it by name.
SELECT derived.var1 * derived.col3
FROM (
    SELECT
        col1 / col2 AS var1,
        *
    FROM YourTable
) derived
Or alternatively a common table expression:
-- Same idea with a common table expression instead of a derived table.
WITH with_var AS (
    SELECT
        col1 / col2 AS var1,
        *
    FROM YourTable
)
SELECT var1 * col3
FROM with_var

How to rewrite SQL joins into window functions?

Database is HP Vertica 7 or PostgreSQL 9.
-- Card transactions used by the examples below.
CREATE TABLE test (
    id      INT,
    card_id INT,
    tran_dt DATE,
    amount  INT
);

-- Card 1 has four transactions between April and July 2017; card 2 has one.
INSERT INTO test (id, card_id, tran_dt, amount) VALUES (1, 1, '2017-07-06', 10);
INSERT INTO test (id, card_id, tran_dt, amount) VALUES (2, 1, '2017-06-01', 20);
INSERT INTO test (id, card_id, tran_dt, amount) VALUES (3, 1, '2017-05-01', 30);
INSERT INTO test (id, card_id, tran_dt, amount) VALUES (4, 1, '2017-04-01', 40);
INSERT INTO test (id, card_id, tran_dt, amount) VALUES (5, 2, '2017-07-04', 10);
Of the payment cards used in the last 1 day, what is the maximum amount charged on that card in the last 90 days.
-- For each card with a transaction on or after 2017-07-06, the largest amount
-- among that card's transactions dated on or after 2017-04-06.
SELECT
    recent.card_id,
    MAX(history.amount) AS max
FROM test AS recent
INNER JOIN test AS history
    ON history.card_id = recent.card_id
    AND history.tran_dt >= '2017-04-06'
WHERE recent.tran_dt >= '2017-07-06'
GROUP BY recent.card_id
ORDER BY recent.card_id;
Results are correct
card_id max
------- ---
1 30
I want to rewrite the query into sql window functions.
-- Attempted window-function rewrite from the question.
-- NOTE(review): the frame below is '60 days' while the stated requirement is
-- 90 days, and a window function yields one row per input row rather than one
-- row per card.
select card_id, max(amount) over(partition by card_id order by tran_dt range between '60 days' preceding and current row) max
from test
where card_id in (select card_id from test where tran_dt>='2017-07-06')
order by card_id;
But result set does not match, how can this be done?
Test data here:
http://sqlfiddle.com/#!17/db317/1

I can't try PostgreSQL, but in Vertica, you can apply the ANSI standard OLAP window function.
But you'll need to nest two queries: The window function only returns sensible results if it has all rows that need to be evaluated in the result set.
But you only want the row from '2017-07-06' to be displayed.
So you'll have to filter for that date in an outer query:
-- Step 1: for every transaction, the largest amount on the same card in the
-- 90 days up to and including that transaction's date.
WITH windowed AS (
    SELECT
        card_id,
        tran_dt,
        MAX(amount) OVER (
            PARTITION BY card_id
            ORDER BY tran_dt
            RANGE BETWEEN '90 DAYS' PRECEDING AND CURRENT ROW
        ) AS the_max
    FROM test
)
-- Step 2: keep only the rows for the date of interest.
SELECT
    card_id,
    the_max
FROM windowed
WHERE tran_dt = '2017-07-06';
card_id|the_max
1| 30

As far as I know, PostgreSQL Window function doesn't support bounded range preceding thus range between '90 days' preceding won't work. It does support bounded rows preceding such as rows between 90 preceding, but then you would need to assemble a time-series query similar to the following for the Window function to operate on the time-based rows:
-- Pair every card with every calendar day from 2017-04-06 through 2017-07-06;
-- amount is NULL on days without a matching transaction for that card.
SELECT
    cards.card_id,
    t.amount,
    days.d AS d_series
FROM generate_series(
    '2017-04-06'::timestamp,
    '2017-07-06'::timestamp,
    '1 day'::interval
) AS days (d)
CROSS JOIN (
    SELECT DISTINCT card_id
    FROM test
) AS cards
LEFT JOIN test AS t
    ON t.card_id = cards.card_id
    AND t.tran_dt = days.d
ORDER BY cards.card_id, d_series
For what you need (based on your question description), I would stick to using group by.

T-SQL grouping question

Every once in a while I have a scenario like this, and can never come up with the most efficient query to pull in the information:
Let's say we have a table with three columns (A int, B int, C int). My query needs to answer a question like this: "Tell me what the value of column C is for the largest value of column B where A = 5." A real world scenario for something like this would be 'A' is your users, 'B' is the date something happened, and 'C' is the value, where you want the most recent entry for a specific user.
I always end up with a query like this:
-- Value of C on the row(s) holding the largest B among rows where A = 5.
SELECT C
FROM MyTable
WHERE A = 5
  AND B = (
      SELECT MAX(B)
      FROM MyTable
      WHERE A = 5
  )
What am I missing to do this in a single query (opposed to nesting them)? Some sort of 'Having' clause?

BoSchatzberg's answer works when you only care about the 1 result where A=5. But I suspect this question is the result of a more general case. What if you want to list the top record for each distinct value of A?
-- Top row(s) for each distinct A: join every row to its group's maximum B.
SELECT t1.*
FROM MyTable t1
INNER JOIN
(
    -- The aggregate needs an alias: the join condition below reads t2.B, and a
    -- derived-table column without a name cannot be referenced.
    SELECT A, MAX(B) AS B
    FROM MyTable
    GROUP BY A
) t2 ON t1.A = t2.A AND t1.B = t2.B

-- begin reusable section: each row joined to the maximum B of its A group
SELECT C
FROM MyTable
INNER JOIN (
    SELECT A, MAX(B) AS MAX_B
    FROM MyTable
    GROUP BY A
) AS X
    ON MyTable.A = X.A
    AND MyTable.B = X.MAX_B
-- end reusable section
WHERE MyTable.A = 5
In this case the first section (between the comments) can also easily be moved into a view for modularity or reuse.

You can do this:
-- Sort the A = 5 rows by B, largest first, and keep only the first row's C.
SELECT TOP (1) C
FROM MyTable
WHERE A = 5
ORDER BY B DESC

I think you are close (and what you have would work). You could use something like the following:
-- NOTE(review): GROUP BY C yields one row per distinct C (each with that C's
-- largest B) among rows where A = 5, not only the C that belongs to the
-- overall largest B.
select C
, max(B)
from MyTable
where A = 5
group by C

After a little bit of testing, I don't think that this can be done without doing it the way you're already doing it (i.e. a subquery). Since you need the max of B and you can't get the value of C without also including that in a GROUP BY or HAVING clause, a subquery seems to be the best way.
/* Scratch temp table used to try three single-query variants; each of the
three SELECTs below is expected to fail with the error quoted above it. */
create table #tempints (
a int,
b int,
c int
)
/* Sample rows: for a = 5 the largest b (8) occurs on two rows, with c = 10 and c = 15. */
insert into #tempints values (1, 8, 10)
insert into #tempints values (1, 8, 10)
insert into #tempints values (2, 4, 10)
insert into #tempints values (5, 8, 10)
insert into #tempints values (5, 3, 10)
insert into #tempints values (5, 7, 10)
insert into #tempints values (5, 8, 15)
/* this errors out with "Column '#tempints.c' is invalid in the select list because it is not contained in either an
aggregate function or the GROUP BY clause." */
select t1.c, max(t1.b)
from #tempints t1
where t1.a=5
/* this errors with "An aggregate may not appear in the WHERE clause unless it is in a subquery contained in a HAVING
clause or a select list, and the column being aggregated is an outer reference." */
select t1.c, max(t1.b)
from #tempints t1, #tempints t2
where t1.a=5 and t2.b=max(t1.b)
/* errors with "Column '#tempints.a' is invalid in the HAVING clause because it is not contained in either an aggregate
function or the GROUP BY clause." */
select c
from #tempints
group by b, c
having a=5 and b=max(b)
/* clean up the scratch table */
drop table #tempints

We Keep Coding

iphone swift flutter scala powershell matlab mongodb postgresql perl eclipse

Subquery containing aggregate function in HiveQL - hiveql

Related

PostgreSQL: replace generate_series with an array

Divide table raw into chunks in Postgres with st_dwithin limit

PostgreSQL, Variables

How to rewrite SQL joins into window functions?

T-SQL grouping question

Categories

Resources