Getting pyspark.sql.utils.ParseException: missing ')' at 'in' in pyspark sql

Getting pyspark.sql.utils.ParseException: missing ')' at 'in' in pyspark sql - pyspark

Hi I am running below query on pyspark sql but getting error. Please help me where I am missing ')'.
Query -
`with cte1 as (select `Project Number`, indication,rank() over (partition by `Project Number`,REGEXP_REPLACE(indication,'[^a-zA-Z0-9]+', '') order by `Project Number`,indication) as rnk from (select distinct `Project Number`, indication from vw_onco_pharma onco_pharma union select distinct `Project Number`, indication from vw_onco_cell_gene cell_gene union select distinct `Project Number`, indication from vw_non_onco_cell_gene onco_cell_gene union select distinct `Project Number`, indication from vw_non_onco_pharma non_onco_pharma union select distinct `Project Number`, indication from vw_plasma_protein plasma_protein)),y as (select max(cast(project_id as integer)) as max_prj_id from vw_project_id) select nvl(max_prj_id,0)+ROW_NUMBER () OVER (ORDER BY `Project Number`,indication) as project_id,`Project Number`,indication,date_format(current_timestamp(),'yyyy-MM-dd hh:mm:ss') as HTA_INSERT_DT from (select cte1.`Project Number`, cte1.indication,max_prj_id from cte1 left join vw_project_id prj on cte1.`Project Number` = prj.`Project Number` and REGEXP_REPLACE(cte1.indicatio,'[^a-zA-Z0-9]+', '') = REGEXP_REPLACE(prj.indication,'[^a-zA-Z0-9]+', '') left join y on 1 = 1 where rnk = 1 and prj.project_id is null and cte1.`project number` in (select `project number` from cte1 group by `project number` having count(*) > 1) union select cte1.`Project Number`, null as indication,max_prj_id from cte1 left join vw_project_id prj on cte1.`Project Number` = prj.`Project Number` left join y on 1 = 1 where rnk = 1 and prj.project_id is null and cte1.`project number` in (select `project number` from cte1 groupby `project number` having count(*) = 1))`
Error-
pyspark.sql.utils.ParseException:
missing ')' at 'in'(line 1, pos 1575)

The last groupby should be group by.
Also try formatting your query, so it can be readable:
with cte1 as (
select
` Project Number `,
indication,
rank() over (
partition by ` Project Number `,
REGEXP_REPLACE(indication, '[^a-zA-Z0-9]+', '')
order by
` Project Number `,
indication
) as rnk
from
(
select
distinct ` Project Number `,
indication
from
vw_onco_pharma onco_pharma
union
select
distinct ` Project Number `,
indication
from
vw_onco_cell_gene cell_gene
union
select
distinct ` Project Number `,
indication
from
vw_non_onco_cell_gene onco_cell_gene
union
select
distinct ` Project Number `,
indication
from
vw_non_onco_pharma non_onco_pharma
union
select
distinct ` Project Number `,
indication
from
vw_plasma_protein plasma_protein
)
),
y as (
select
max(cast(project_id as integer)) as max_prj_id
from
vw_project_id
)
select
nvl(max_prj_id, 0) + ROW_NUMBER () OVER (
ORDER BY
` Project Number `,
indication
) as project_id,
` Project Number `,
indication,
date_format(current_timestamp(), 'yyyy-MM-dd hh:mm:ss') as HTA_INSERT_DT
from
(
select
cte1.` Project Number `,
cte1.indication,
max_prj_id
from
cte1
left join vw_project_id prj on cte1.` Project Number ` = prj.` Project Number `
and REGEXP_REPLACE(cte1.indicatio, '[^a-zA-Z0-9]+', '') = REGEXP_REPLACE(prj.indication, '[^a-zA-Z0-9]+', '')
left join y on 1 = 1
where
rnk = 1
and prj.project_id is null
and cte1.` project number ` in (
select
` project number `
from
cte1
group by
` project number `
having
count(*) > 1
)
union
select
cte1.` Project Number `,
null as indication,
max_prj_id
from
cte1
left join vw_project_id prj on cte1.` Project Number ` = prj.` Project Number `
left join y on 1 = 1
where
rnk = 1
and prj.project_id is null
and cte1.` project number ` in (
select
` project number `
from
cte1 group by ` project number `
having
count(*) = 1
)
)

Related

Join 2 Alias Table from Union Select Table with same numbers of row

I hava 2 table from alias table result of select and union with have same number of row, how or can i make table 2 in right side of table 1? There are dont have same record
Thank you
First query:
SELECT * FROM (
SELECT COUNT(*) “DATA 220” FROM istros_sls_store.sales_store_220)—CEK
UNION ALL
SELECT COUNT(*) FROM istros_sls_item_sales_item_220)—CEK
UNION ALL
SELECT COUNT(*) FROM istros_sls_scat_sales_small_cat_220—CEK
UNION ALL
SELECT COUNT(*) FROM istros_inventory_hstr.inventory_hstr_dtl_220)—CEK
UNION ALL
SELECT COUNT(*) FROM istros_sos.stock_out_supplier_220—CEK
) a
Output from first query:
DATA 220
41
236633
11509
187174
1132
Second query:
SELECT * FROM (
SELECT COUNT(*) “DATA 226” FROM istros_sls_store.sales_store_226—CEK
UNION ALL
SELECT COUNT(*) FROM istros_sls_item_sales_item_226—CEK
UNION ALL
SELECT COUNT(*) FROM istros_sls_scat_sales_small_cat_226—CEK
UNION ALL
SELECT COUNT(*) FROM istros_inventory_hstr.inventory_hstr_dtl_226—CEK
UNION ALL
SELECT COUNT(*) FROM istros_sos.stock_out_supplier_226—CEK
) b
Output from second query:
DATA 226
41
243053
11437
193549
960
The desired output combines these two columns:
DATA 220 | DATA 226
41 | 41
236633 | 243053
11509 | 11437
187174 | 193549
1132 | 960

You may try simple selecting both counts in the union query, so that each one appears as a separate column in the output. Note that below I introduce a computed column, pos, which keeps track of which count queries should appear first in the result set.
SELECT "DATA 220", "DATA 226"
FROM
(
SELECT
1 AS pos,
(SELECT COUNT(*) FROM istros_sls_store.sales_store_220) AS “DATA 220”,
(SELECT COUNT(*) FROM istros_sls_store.sales_store_226) AS “DATA 226”
UNION ALL
SELECT
2,
(SELECT COUNT(*) FROM istros_sls_item_sales_item_220),
(SELECT COUNT(*) FROM istros_sls_item_sales_item_226)
UNION ALL
SELECT
3,
(SELECT COUNT(*) FROM istros_sls_scat_sales_small_cat_220),
(SELECT COUNT(*) FROM istros_sls_scat_sales_small_cat_226)
UNION ALL
SELECT
4,
(SELECT COUNT(*) FROM istros_inventory_hstr.inventory_hstr_dtl_220),
(SELECT COUNT(*) FROM istros_inventory_hstr.inventory_hstr_dtl_226)
UNION ALL
SELECT
5,
(SELECT COUNT(*) FROM istros_sos.stock_out_supplier_220),
(SELECT COUNT(*) FROM istros_sos.stock_out_supplier_226)
) t
ORDER BY
pos;

postgresql combining several periods into one

I'm trying to combine range.
WITH a AS (
select '2017-09-16 07:12:57' as begat,'2017-09-16 11:30:22' as endat
union
select '2017-09-18 17:05:21' ,'2017-09-19 13:18:01'
union
select '2017-09-19 15:34:40' ,'2017-09-22 13:29:37'
union
select '2017-09-22 12:24:16' ,'2017-09-22 13:18:29'
union
select '2017-09-28 09:48:54' ,'2017-09-28 13:39:13'
union
select '2017-09-20 13:52:43' ,'2017-09-20 14:14:43'
), b AS (
SELECT *, lag(endat) OVER (ORDER BY begat) < begat OR NULL AS step
FROM a
)
, c AS (
SELECT *, count(step) OVER (ORDER BY begat) AS grp
FROM b
)
SELECT min(begat), coalesce( max(endat), 'infinity' ) AS range
FROM c
GROUP BY grp
ORDER BY 1
Result
1 "2017-09-16 07:12:57";"2017-09-16 11:30:22"
2 "2017-09-18 17:05:21";"2017-09-19 13:18:01"
3 "2017-09-19 15:34:40";"2017-09-22 13:29:37"
4 "2017-09-22 12:24:16";"2017-09-22 13:18:29"
5 "2017-09-28 09:48:54";"2017-09-28 13:39:13"
positions 3,4 intersect (endata> next begat)
How do I make the union of all the intersections into one large interval
I need result
1 "2017-09-16 07:12:57";"2017-09-16 11:30:22"
2 "2017-09-18 17:05:21";"2017-09-19 13:18:01"
3 "2017-09-19 15:34:40";"2017-09-22 13:29:37"
4 "2017-09-28 09:48:54";"2017-09-28 13:39:13"

Hey I would suggest using the following process :
1- Identify when a row is new, so you give a value of 1 to values that do not overlap (CTE b)
2- Sequence together the rows that have overlaps with others. This way you can see have a common identifier that will allow you to MAX and MIN begat and endat (CTE c)
3- For each sequence, give the MIN of begat and the MAX of endat so you will have your final values
WITH a AS (
select '2017-09-16 07:12:57' as begat,'2017-09-16 11:30:22' as endat
union
select '2017-09-18 17:05:21' ,'2017-09-19 13:18:01'
union
select '2017-09-19 15:34:40' ,'2017-09-22 13:29:37'
union
select '2017-09-22 12:24:16' ,'2017-09-22 13:18:29'
union
select '2017-09-28 09:48:54' ,'2017-09-28 13:39:13'
union
select '2017-09-20 13:52:43' ,'2017-09-20 14:14:43'
)
, b AS (
SELECT
begat
, endat
, (begat > MAX(endat) OVER w IS TRUE)::INT is_new
FROM a
WINDOW w AS (ORDER BY begat ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING)
)
, c AS (
SELECT
begat
, endat
, SUM((is_new)) OVER (ORDER BY begat) seq
FROM b
)
SELECT
MIN(begat) beg_at
, MAX(endat) end_at
FROM c
GROUP BY seq

EDITED
If you need speed you can use a psql function:
create or replace function append_ranges_in_a() returns setof a
language plpgsql
as
$BODY$
declare
v_current a%rowtype;
v_new a%rowtype;
v_first boolean:=true;
begin
for v_current in select begat, endat from a order by begat, endat
loop
if v_first then
v_first := false;
v_new.begat := v_current.begat;
elsif v_new.endat < v_current.begat then
return next v_new;
v_new.begat := v_current.begat;
end if;
v_new.endat := greatest(v_current.endat,v_new.endat);
end loop;
return next v_new;
return;
end;
$BODY$;
select * from append_ranges_in_a()
I test it with ~ 400000 rows:
delete from a;
insert into a (begat, endat)
select time::text, (time+'1 day'::interval)::text
from (select t+(round(random()*23.0)||' hours')::interval as time
from generate_series('1401-01-01'::timestamp,'2018-08-21'::timestamp,'1 day'::interval) t
) t;
select count(*) from a;
select * from append_ranges_in_a() offset 100000 limit 10
and it is twice fast as O(n^2) pure SQL version.
OLD slow solution:
You can use a recursive WITH query https://www.postgresql.org/docs/current/static/queries-with.html to construct the result row by row.
I create the table
The first row is the candidate first row (ending where ending), but the row is not "ready"
Then I look at the next row (step) and if it is not intersecting I add a ready row,
Also I add a not ready row with the current (last) observed range
When I do not have more rows I calculate the last row
I retain ready rows and the last row
Here is the code
CREATE TABLE a as
select '2017-09-16 07:12:57' as begat,'2017-09-16 11:30:22' as endat
union
select '2017-09-18 17:05:21' ,'2017-09-19 13:18:01'
union
select '2017-09-19 15:34:40' ,'2017-09-22 13:29:37'
union
select '2017-09-22 12:24:16' ,'2017-09-22 13:18:29'
union
select '2017-09-28 09:48:54' ,'2017-09-28 13:39:13'
union
select '2017-09-20 13:52:43' ,'2017-09-20 14:14:43';
WITH RECURSIVE t(begat, endat, ready, step) AS (
select * from (
select *,false,1 from a order by begat, endat limit 1) a
UNION ALL
SELECT new_rows.*
FROM (SELECT * FROM t ORDER BY begat DESC limit 1) t,
lateral (SELECT * FROM a ORDER BY begat, endat OFFSET step LIMIT 1) a,
lateral (
SELECT t.begat, t.endat, true as ready, step WHERE t.endat < a.begat
UNION SELECT CASE WHEN t.endat < a.begat THEN a.begat ELSE t.begat END, greatest(a.endat, t.endat), false, step+1
) new_rows
)
select begat, endat
from (
select begat, endat, ready, row_number() over (order by begat desc, endat desc)=1 is_last
from t
order by begat, endat) t
where ready or is_last;

i using range type
https://www.postgresql.org/docs/9.3/static/rangetypes.html
WITH tmp AS (
-- preparation range type
select begat, coalesce( endat, 'infinity' ) as endAt, tsrange( begat, coalesce( endat, 'infinity' ) ) as rg
from (
select '2017-09-11 17:13:03'::timestamp as begat ,'2017-09-12 12:24:09'::timestamp as endat union
select '2017-09-19 15:34:40','2017-09-20 11:04:45' union
select '2017-09-20 08:32:00','2017-09-22 13:28:37' union
select '2017-09-20 13:52:43','2017-09-20 14:14:43' union
select '2017-09-21 12:24:16','2017-09-21 13:28:29' union
select '2017-09-22 12:24:16','2017-09-22 13:28:29' union
select '2017-09-22 12:34:16','2017-09-23 13:28:29' union
select '2017-09-22 12:25:16','2017-09-24 13:28:29' union
select '2017-09-28 09:48:54','2017-09-28 13:39:13' union
select '2017-09-28 14:22:16','2017-09-28 15:52:15' union
select '2017-10-05 12:17:45','2017-10-06 12:35:38' union
select '2017-10-06 16:20:44','2017-10-07 10:11:09' union
select '2017-10-07 20:38:32','2017-10-09 14:42:29' union
select '2017-10-12 18:22:14','2017-10-12 20:52:45'
) a
),a as (
-- group intersecting range
select l.*
from tmp l left join tmp r on l.begAt > r.begAt and r.rg #> l.rg
where r.begAt is null
),
b AS (
SELECT *, lag(endat) OVER (ORDER BY begat) < begat OR NULL AS step
FROM a
)
, c AS (
SELECT *, count(step) OVER (ORDER BY begat) AS grp
FROM b
)
SELECT min(begat), coalesce( max(endat), 'infinity' ) AS range
FROM c
GROUP BY grp
ORDER BY 1

Postgresql nested select max(sum())

My Query:
select table.attribute, count(table.attribute) AS cnt from table
group by table.attribute
order by cnt desc;
The output is something like:
attribute | cnt
-----------+-----
A | 2
B | 2
G | 1
F | 1
But i only want the max values (A & B).

You can do this with a single level of nesting:
select attribute,
cnt
from (
select attribute,
count(*) AS cnt,
max(count(*)) over () as max_cnt
from t
group by attribute
) t
where cnt = max_cnt;

You can use the power of CTE to achieve this:
WITH count_values
AS (SELECT table.attribute,
Count(table.attribute) AS cnt
FROM table
GROUP BY table.attribute),
max_values
AS (SELECT Max(cnt) AS max_cnt
FROM (SELECT cnt
FROM count_values) sub)
SELECT *
FROM count_values cv
JOIN max_values mv
ON mv.max_cnt = cv.cnt;

you can use rank as below
with cte as (
select *, Rank() over(order by cnt desc) as rnk from yourattribute
) select * from cte where rnk = 1

Selecting SUM of COUNT statements on 2 different tables

I want to SUM the COUNT values from 2 COUNT statements on different tables.
I tried:
SELECT(
SELECT COUNT(*) FROM articlegroups
UNION
SELECT COUNT(*) FROM emails
)
as t
I tried:
SELECT(
SELECT COUNT(*) FROM articlegroups
+
SELECT COUNT(*) FROM emails
)
as t
I tried:
SELECT SUM(
SELECT COUNT(*) FROM articlegroups
+
SELECT COUNT(*) FROM emails
)
as t
I don't know what else to try...

You didn't try:
SELECT SUM(cnt) as cnt FROM
(
SELECT COUNT(*) as cnt FROM articlegroups
UNION ALL
SELECT COUNT(*) as cnt FROM emails
) t
Or:
SELECT (SELECT COUNT(*) FROM articlegroups) +
(SELECT COUNT(*) FROM emails) as cnt

You can use:
SELECT (SELECT COUNT(*) FROM articlegroups) +
(SELECT COUNT(*) FROM emails) AS cnt
If either articlegroups or emails can be empty, then you should also use COALESCE:
SELECT COALESCE((SELECT COUNT(*) FROM articlegroups),0) +
COALESCE((SELECT COUNT(*) FROM emails),0) AS cnt

partition over two columns

I'm wanting to partition by two columns (PROJECT_ID, AND CATEGORY_NAME) and I'm having trouble writing the correct syntax. My query below is functional but when I attempt to add an additional over clause it doesn't work correctly. The recursive query was used to concatenate rows partitioning over project_id, creating a list of admins combining and concatenating name_last and name_first to make a list. I need to use an additional over clause to include the CATEGORY_NAME due to admins in the list that work in different categories ('INVISION' AND 'INSIGHT') but are under the same project_id. The first subquery
SELECT
RowNumber() over (PARTITION BY F13.DIM_PROJECT_ID, F13.CATEGORY_NAME ORDER BY F13.PROJECT_NAME),
F13.DIM_PROJECT_ID.....etc.
extracts the correct data, I'm just unsure of how to pull that correct data out partitioning by both project and category. I'm using db2.
with
t1(rowNum, PROJECT_ID, NAME_LAST, NAME_FIRST, POINT_OF_CONTACT, PROJECT_NAME, BUSINESS_NAME) as
(
SELECT
RowNumber() over (PARTITION BY F13.DIM_PROJECT_ID, F13.CATEGORY_NAME ORDER BY F13.PROJECT_NAME),
F13.DIM_PROJECT_ID,
F2P.NAME_LAST,
F2P.NAME_FIRST,
REPLACE(F2P.POINT_OF_CONTACT, ',', ' |') AS POINT_OF_CONTACT,
F13.PROJECT_NAME,
F2H.CATEGORY_NAME,
FROM FACT_TABLE AS F13
INNER JOIN ADMIN AS F2P ON F13.DIM_PROJECT_ID = F2P.DIM_PROJECT_ID
LEFT JOIN HOURS AS F2H ON F13.DIM_PROJECT_ID = F2H.DIM_PROJECT_ID
WHERE F2H.CATEGORY_NAME = ('INVISION')
group by
F13.DIM_PROJECT_ID,
F13.PROJECT_NAME,
F2P.NAME_LAST,
F2P.NAME_FIRST,
F2P.POINT_OF_CONTACT,
F2H.CATEGORY_NAME
) ,
t2(PROJECT_ID, LIST, POINT_OF_CONTACT, PROJECT_NAME, BUSINESS_NAME, cnt) AS
( SELECT PROJECT_ID,
VARCHAR(NAME_FIRST CONCAT ' ' CONCAT NAME_LAST, 6000),
POINT_OF_CONTACT,
PROJECT_NAME,
CATEGORY_NAME,
1
FROM t1
WHERE rowNum = 1
UNION ALL
SELECT t2.PROJECT_ID,
t2.list || ' | ' || t1.NAME_FIRST CONCAT ' ' CONCAT t1.NAME_LAST,
t1.POINT_OF_CONTACT,
t1.PROJECT_NAME,
t1.CATEGORY_NAME
FROM t2, t1
WHERE t2.project_id = t1.project_id
AND t2.cnt + 1 = t1.rowNum )
SELECT PROJECT_ID,
PROJECT_NAME,
POINT_OF_CONTACT,
CATEGORY_NAME
list
FROM t2
WHERE ( PROJECT_ID, cnt ) IN (
SELECT PROJECT_ID, MAX(rowNum)
FROM t1
GROUP BY PROJECT_ID )
The results that I'm getting are producing duplicates but only when the second column (category_name is included in the partition clause. Current results:
Desired results:

I figured it out. I added an ID for category and partitioned by category_id and project_id.
with
t1(rowNum, PROJECT_ID, NAME_LAST, NAME_FIRST, POINT_OF_CONTACT, PROJECT_NAME, CATEGORY_ID, CATEGORY_NAME) as
(
SELECT
RowNumber() over (PARTITION BY F13.DIM_PROJECT_ID, F13.CATEGORY_ID ORDER BY F13.PROJECT_NAME, F13.CATEGORY_NAME),
F13.DIM_PROJECT_ID,
F2P.NAME_LAST,
F2P.NAME_FIRST,
REPLACE(F2P.POINT_OF_CONTACT, ',', ' |') AS POINT_OF_CONTACT,
F13.PROJECT_NAME,
F13.CATEGORY_ID
F13.CATEGORY_NAME,
FROM FACT_TABLE AS F13
INNER JOIN ADMIN AS F2P ON F13.DIM_PROJECT_ID = F2P.DIM_PROJECT_ID
LEFT JOIN HOURS AS F2H ON F13.DIM_PROJECT_ID = F2H.DIM_PROJECT_ID
WHERE F13.CATEGORY_NAME = ('INVISION')
group by
F13.DIM_PROJECT_ID,
F13.PROJECT_NAME,
F2P.NAME_LAST,
F2P.NAME_FIRST,
F2P.POINT_OF_CONTACT,
F13.CATEGORY_ID
F13.CATEGORY_NAME
) ,
t2(PROJECT_ID, LIST, POINT_OF_CONTACT, PROJECT_NAME, CATEGORY_ID, CATEGORY_NAME, cnt) AS
( SELECT PROJECT_ID,
VARCHAR(NAME_FIRST CONCAT ' ' CONCAT NAME_LAST, 6000),
POINT_OF_CONTACT,
PROJECT_NAME,
CATEGORY_ID,
CATEGORY_NAME,
1
FROM t1
WHERE rowNum = 1
UNION ALL
SELECT t2.PROJECT_ID,
t2.list || ' | ' || t1.NAME_FIRST CONCAT ' ' CONCAT t1.NAME_LAST,
t1.POINT_OF_CONTACT,
t1.PROJECT_NAME,
t1.CATEGORY_ID,
t1.CATEGORY_NAME
FROM t2, t1
WHERE t2.project_id = t1.project_id
AND t2.category_id = t1.category_id
AND t2.cnt + 1 = t1.rowNum )
SELECT PROJECT_ID,
PROJECT_NAME,
POINT_OF_CONTACT,
CATEGORY_ID,
CATEGORY_NAME
list
FROM t2
WHERE ( PROJECT_ID, CATEGORY_ID, cnt ) IN (
SELECT PROJECT_ID, CATEGORY_ID, MAX(rowNum)
FROM t1
GROUP BY PROJECT_NAME )

We Keep Coding

iphone swift flutter scala powershell matlab mongodb postgresql perl eclipse

Getting pyspark.sql.utils.ParseException: missing ')' at 'in' in pyspark sql - pyspark

Related

Join 2 Alias Table from Union Select Table with same numbers of row

postgresql combining several periods into one

Postgresql nested select max(sum())

Selecting SUM of COUNT statements on 2 different tables

partition over two columns

Categories

Resources