How do I avoid joining multiple times when using union all statement? - postgresql

I was working on a query where I hit a point:
-- Two labelled counts over tap.deving, stacked with UNION ALL. Both branches
-- keep only rows whose timestamp is the latest one of its week, whose site
-- country is 'USA', and whose date is yesterday.
-- Branch 1 ('Mon'): rows with aggregated = TRUE.
SELECT
    tpd.timestamp::date,
    'Mon' AS Label,
    count(tpd.aggregated)
FROM tap.deving AS tpd
INNER JOIN (
    -- latest timestamp per week
    SELECT
        date_trunc('week', timestamp),
        max(timestamp) AS max_timestamp
    FROM tap.deving
    GROUP BY date_trunc('week', timestamp)
) AS b
    ON tpd.timestamp = b.max_timestamp
LEFT JOIN ca.hardware AS ch
    ON tpd.dev = ch.name
LEFT JOIN ca.sites AS css
    ON css.id = ch.id
WHERE tpd.aggregated = TRUE
  AND css.country = 'USA'
  AND tpd.timestamp::date = now()::date - interval '1 day'
GROUP BY tpd.timestamp
UNION ALL
-- Branch 2 ('Tap but not'): tapped rows that still need aggregation.
SELECT
    tpd.timestamp::date,
    'Tap but not' AS Label,
    count(tpd.tap)
FROM tap.deving AS tpd
INNER JOIN (
    -- latest timestamp per week (same derived table as in branch 1)
    SELECT
        date_trunc('week', timestamp),
        max(timestamp) AS max_timestamp
    FROM tap.deving
    GROUP BY date_trunc('week', timestamp)
) AS b
    ON tpd.timestamp = b.max_timestamp
LEFT JOIN ca.hardware AS ch
    ON tpd.dev = ch.name
LEFT JOIN ca.sites AS css
    ON css.id = ch.id
WHERE tpd.tap = TRUE
  AND tpd.aggregated = FALSE
  AND tpd.needs_to_be = TRUE
  AND css.country = 'USA'
  AND tpd.timestamp::date = now()::date - interval '1 day'
GROUP BY tpd.timestamp
I wrote this query with the help of many SO posts, and it has become quite messy and very slow. I could not get my head around how to optimize it.

Can you please try this query.
-- Single-pass replacement for the two UNION ALL branches: the joins run once
-- and the label is derived from tpd.aggregated.
SELECT
    tpd.timestamp::date,
    CASE tpd.aggregated
        WHEN false THEN 'Tap but not'
        WHEN true THEN 'Mon'
    END AS Label,                      -- the CASE expression must be closed with END
    count(tpd.aggregated)
FROM tap.deving AS tpd
INNER JOIN (
    -- latest timestamp per week
    SELECT
        date_trunc('week', timestamp),
        max(timestamp) AS max_timestamp
    FROM tap.deving
    GROUP BY date_trunc('week', timestamp)
) AS b
    ON tpd.timestamp = b.max_timestamp
LEFT JOIN ca.hardware AS ch
    ON tpd.dev = ch.name
LEFT JOIN ca.sites AS css
    ON css.id = ch.id
-- Either branch of the original UNION ALL qualifies a row.
WHERE (
        tpd.aggregated = TRUE
        OR (tpd.tap = TRUE AND tpd.aggregated = FALSE AND tpd.needs_to_be = TRUE)
      )
  AND css.country = 'USA'
  AND tpd.timestamp::date = now()::date - interval '1 day'
-- tpd.aggregated drives the label, so it has to be a grouping column as well;
-- this yields one row per (timestamp, label), like the UNION ALL did.
GROUP BY tpd.timestamp, tpd.aggregated;

Related

pySpark error Expression Referencing the outer Query

I want to recreate this query in spark sql
-- T-SQL query to be ported to Spark SQL.
-- CTE t: first and last [Date] for every (Id, Group, Name) in recordTable.
WITH t AS (
    SELECT
        [Id],
        [Group],
        [Name],
        min([Date]) as MinDate,
        max([Date]) as MaxDate
    FROM recordTable
    GROUP BY [Id],[Group],[Name]
)
-- One output row per calendar day between MinDate and MaxDate; ScoreCount is
-- taken from the most recent recordTable row on or before that day for the
-- same Group and Name.
SELECT
    t.Id,
    t.[Group],
    t.[Name],
    c.[Date],   -- NOTE(review): the join and subquery use c.[Days]; confirm calendar also has a [Date] column
    (SELECT top 1 ScoreCount
     from recordTable x
     where x.[Date] <= c.[Days]
       and x.[Group] = t.[Group]
       and x.[Name] = t.[Name]
     order by x.[Date] desc
    ) ScoreCount
FROM t
LEFT JOIN calendar c ON c.[Days] BETWEEN t.MinDate AND t.MaxDate
so I have
# Spark SQL port of the T-SQL query: CTE t holds the min/max Date per
# (Id, Group, Name); the outer SELECT joins calendar days in that range and
# looks up ScoreCount with a correlated scalar subquery.
# NOTE(review): the subquery references the outer aliases c and t and also uses
# ORDER BY ... LIMIT 1; per the reported error this is the construct Spark rejects.
df = spark.sql("""
WITH t as (
SELECT
Id,
Group,
Name,
min(Date) as MinDate,
max(Date) as MaxDate
FROM recordTable
GROUP BY Id,Group,Name
)
SELECT
t.Id,
t.Group,
t.Name,
c.Date,
(SELECT ScoreCount
from recordTable x
where x.Date <= c.Days
and x.Group = t.Group
and x.Name = t.Name
order by x.Date desc LIMIT 1
) ScoreCount
FROM t
LEFT JOIN calendar c ON c.Days BETWEEN t.MinDate AND t.MaxDate
""")
But I'm getting an error when trying to limit 1 and using an order by clause. Any alternatives?
"Expressions referencing the outer query are not supported outside of where/having clauses"

How to divide a period in columns

I am trying to create a query where the first column shows the list of the companies and the other 3 columns their revenues per month. This is what I do:
-- Question query (kept as posted): three single-value CTEs hold the start of
-- the current month, of last month and of two months ago; the main SELECT
-- tries to sum table2.amount per company into one column per period.
WITH time_frame AS
(SELECT date_trunc('month',NOW())-interval '0 week'),
time_frame1 AS
(SELECT date_trunc('month',NOW())-interval '1 month'),
time_frame2 AS
(SELECT date_trunc('month',NOW())-interval '2 month')
select table1.company_name,
-- NOTE(review): date_of_transaction is compared outside the aggregate while the
-- query groups only by column 1; this is what the reported error points at.
(CASE
WHEN table2.date_of_transaction = (SELECT * FROM time_frame2) THEN sum(table2.amount)
ELSE NULL
END) AS "current week - 2",
(CASE
WHEN table2.date_of_transaction = (SELECT * FROM time_frame1) THEN sum(table2.amount)
ELSE NULL
END) AS "current week - 1",
-- NOTE(review): this third column repeats time_frame2 and the alias
-- "current week - 2"; time_frame is never referenced. It presumably should
-- use time_frame for the current period - confirm.
(CASE
WHEN table2.date_of_transaction = (SELECT * FROM time_frame2) THEN
sum(table2.amount)
ELSE NULL
END) AS "current week - 2"
from table1
-- NOTE(review): no relation named "table" appears in FROM; presumably table1.id is meant.
join table2 on table2.table1_id = table.id
where table1.company_joined >= '04-20-2019'
group by 1
When I execute the table this comes out: Error running query: column "table2.date_of_transaction" must appear in the GROUP BY clause or be used in an aggregate function LINE 15: WHEN table2.date_of_transaction = (SELECT * FROM time_frame) TH... ^
Do you have any ideas on how to solve it? Thank you.
company name
month1
month2
name 1
£233
£343
name 2
£243
£34
name 3
£133
£43
You can simplify the statement by using the FILTER clause on the aggregates:
-- One row per company; each sum only counts table2 rows whose
-- date_of_transaction equals the start of the given month
-- (two months ago, last month, current month).
SELECT
    c.company_name,
    sum(tx.amount) FILTER (
        WHERE tx.date_of_transaction = date_trunc('month', now()) - interval '2 month'
    ),
    sum(tx.amount) FILTER (
        WHERE tx.date_of_transaction = date_trunc('month', now()) - interval '1 month'
    ),
    sum(tx.amount) FILTER (
        WHERE tx.date_of_transaction = date_trunc('month', now())
    )
FROM table1 AS c
INNER JOIN table2 AS tx
    ON tx.table1_id = c.id
WHERE c.company_joined >= DATE '2019-04-20'
GROUP BY c.company_name;
If you really want to put the date ranges into a CTE, you only need one:
-- Same result as the plain FILTER version, with the three month starts
-- computed once in a single-row CTE (r1 = two months ago, r2 = last month,
-- r3 = current month).
WITH dates (r1, r2, r3) AS (
    SELECT
        date_trunc('month', now()) - interval '2 month',
        date_trunc('month', now()) - interval '1 month',
        date_trunc('month', now())
)
SELECT
    c.company_name,
    sum(tx.amount) FILTER (WHERE tx.date_of_transaction = d.r1),
    sum(tx.amount) FILTER (WHERE tx.date_of_transaction = d.r2),
    sum(tx.amount) FILTER (WHERE tx.date_of_transaction = d.r3)
FROM table1 AS c
-- dates has exactly one row, so the cross join only attaches r1..r3 to every row
CROSS JOIN dates AS d
INNER JOIN table2 AS tx
    ON tx.table1_id = c.id
WHERE c.company_joined >= DATE '2019-04-20'
GROUP BY c.company_name
;
The CTE dates returns a single row with three columns and thus the cross join doesn't change the resulting number of rows.

postgres JOIN with left table null

my query is:
-- Question query (kept as posted): per-group message counts for this week,
-- ranked within each group, joined to the group's reference data and to the
-- user's row in users.
SELECT main.group_id, s_ref.title, s_ref.username, main.m_per_group, main.pos, u.lang
FROM (
-- messages per (group_id, user_id) since the start of the current week,
-- with pos = rank of the user inside the group by that count
SELECT user_id, group_id, COUNT(user_id) AS m_per_group,
ROW_NUMBER() OVER (
PARTITION BY group_id
ORDER BY COUNT(group_id) DESC
) AS pos
FROM messages
WHERE message_date > date_trunc('week', now())
GROUP BY group_id, user_id
) AS main
LEFT OUTER JOIN supergroups_ref AS s_ref
USING (group_id)
-- RIGHT JOIN keeps every users row even when main has no match for it
RIGHT JOIN users AS u
ON u.user_id = main.user_id
-- NOTE(review): filtering on main.user_id rejects the NULL-extended rows the
-- RIGHT JOIN produces, so a user without messages yields no row at all.
-- %s is presumably a placeholder bound by the client driver.
WHERE main.user_id = %s
ORDER BY m_per_group DESC
The problem is that when main returns 0 rows, I don't even get the user's language from the users join; I get exactly [].
Instead, I would like to get [(None, None, None, None, 'en')], which is why I used a right join. How can I get the result I want?
Move this condition:
WHERE main.user_id = %s  -- the filter as it currently stands in the outer query
To the main subquery:
-- Inside the subquery the alias "main" is not in scope yet, so the filter
-- must reference the messages column user_id directly.
WHERE message_date > date_trunc('week', now()) and user_id = %s
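As written, the outer WHERE on main.user_id discards the NULL-extended rows that the RIGHT JOIN produces, which turns the outer join into an inner join. To still return only the requested user, filter the outer query on u.user_id = %s instead of on main.user_id.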

Postgres - ERROR: aggregate functions are not allowed in GROUP BY Position: 305

Having a little trouble with a query provided by Periscope. Can you help point me in the right direction?
Error is - ERROR: aggregate functions are not allowed in GROUP BY Position: 305
-- Question query (kept as posted).
-- monthly_activity: distinct (month, user_id) pairs from oauth_refresh_tokens.
with monthly_activity as (
select distinct
date_trunc('month', created_at) as month,
user_id
from oauth_refresh_tokens
),
-- first_activity: date of each user's earliest token.
first_activity as (
select user_id, date(min(created_at)) as month
from oauth_refresh_tokens
-- NOTE(review): position 2 is date(min(created_at)), an aggregate; this is the
-- GROUP BY the reported error refers to. Grouping by user_id (position 1) is
-- presumably what was intended.
group by 2
)
-- Per month: users active in that month who have no monthly_activity row for
-- the month before (left join + "is null") and whose first_activity.month
-- differs from that month.
select
this_month.month,
-- NOTE(review): user_id is unqualified while all three joined relations expose
-- a user_id column; it likely needs a qualifier such as this_month.user_id.
count(distinct user_id)
from monthly_activity this_month
left join monthly_activity last_month
on this_month.user_id = last_month.user_id
and this_month.month = last_month.month + interval '1 month'
join first_activity
on this_month.user_id = first_activity.user_id
and first_activity.month != this_month.month
where last_month.user_id is null
group by 1

Faster left join with last non-empty

Table1:
Shop
Manager
Date
Table2:
Shop
Date
Sales
I need to get Table2 with Manager field from Table1. I did the following trick:
-- For every t1 row, attach the Manager from the t2 row of the same Shop whose
-- Date is the latest one strictly before the t1 Date (NULL when there is none).
SELECT
    t1.[Shop],
    t1.[Date],
    t1.[Sum],
    t2.[Manager]
FROM t1
LEFT JOIN t2
    ON t2.[Shop] = t1.[Shop]
   AND t2.[Date] = (
        -- latest earlier t2 date for this shop
        SELECT MAX(prev.[Date])
        FROM t2 AS prev
        WHERE prev.[Shop] = t1.[Shop]
          AND prev.[Date] < t1.[Date]
   )
It works, but the subquery is very slow, so I wonder if there is a more elegant and faster way to do this.
Some sample data to play around: http://pastebin.com/uLN6x5JE
may seem like a round about way but join on a single condition is typically faster
-- Part 1: t1 rows that have at least one earlier t2 row for the same shop;
-- rn = 1 picks the most recent of those t2 rows for each t1 row.
select t12.[Shop], t12.[Date], t12.[Sum]
     , t12.[Manager]
from
     ( select t1.[Shop], t1.[Date], t1.[Sum]
            , t2.[Manager]
              -- number the candidates per t1 row, not per shop, so every t1 row keeps its own latest manager
            , row_number() over (partition by t1.[Shop], t1.[Date], t1.[Sum]
                                 order by t2.[Date] desc) as rn
         from t1
         join t2
           on t2.[Shop] = t1.[Shop]
          and t2.[Date] < t1.[Date]     -- compare the t2 date with the t1 date
     ) as t12
where t12.rn = 1
-- the two parts are disjoint (matched vs. unmatched t1 rows), so no de-duplication is needed
union all
-- Part 2: t1 rows without any earlier t2 row get a NULL manager.
select t1.[Shop], t1.[Date], t1.[Sum]
     , null as [Manager]
  from t1
  left join t2
    on t2.[Shop] = t1.[Shop]
   and t2.[Date] < t1.[Date]
 group by t1.[Shop], t1.[Date], t1.[Sum]
-- count(t2.[Shop]) ignores the NULL-extended row, so 0 means no match at all
having count(t2.[Shop]) = 0
You may get much better performance by adding a covering index on t2 if you don't already have one:
-- Index keyed on (Shop, Date) that also carries Manager in its leaf rows.
CREATE INDEX T2ShopDate
    ON t2 ([Shop], [Date])
    INCLUDE ([Manager])
Here is a version that uses a CTE to find all maximum manager dates first and then join back to t2 to get the manager:
-- MaxDates: for every t1 row, the latest t2 date of the same shop that lies
-- strictly before the t1 date (NULL when the shop has no earlier t2 row).
;with MaxDates ([Shop], [Date], [Sum], [MaxMgrDate]) as
(
    select
         t1.[Shop]
        ,t1.[Date]
        ,t1.[Sum]
        ,max(t2.[Date])
    from t1
    left join t2
        on  t2.[Shop] = t1.[Shop]
        and t2.[Date] < t1.[Date]
    group by
         t1.[Shop]
        ,t1.[Date]
        ,t1.[Sum]
)
select
     MaxDates.[Shop]
    ,MaxDates.[Date]
    ,MaxDates.[Sum]
    ,t2.[Manager]
from MaxDates
-- left join keeps t1 rows that have no earlier manager (MaxMgrDate is NULL)
left join t2
    -- match on the shop as well; the date alone would also pick up other shops' rows
    on  t2.[Shop] = MaxDates.[Shop]
    and t2.[Date] = MaxDates.[MaxMgrDate]
You might be able to remove the second join back to t2 by using row_number():
-- MaxDates: every t1 row paired with all earlier t2 rows of the same shop
-- (or one NULL-extended row when there are none), numbered newest-first.
;with MaxDates ([Shop], [Date], [Sum], [Manager], [RowNum]) as
(
    select
         t1.[Shop]
        ,t1.[Date]
        ,t1.[Sum]
        ,t2.[Manager]
         -- restart the numbering for each t1 row; partitioning by shop alone
         -- would leave only one row per shop after the RowNum = 1 filter
        ,row_number() over (partition by t1.[Shop], t1.[Date], t1.[Sum]
                            order by t2.[Date] desc)
    from t1
    left join t2
        on  t2.[Shop] = t1.[Shop]
        and t2.[Date] < t1.[Date]
)
-- explicit column list: same four columns as the question's query, without the helper RowNum
select
     MaxDates.[Shop]
    ,MaxDates.[Date]
    ,MaxDates.[Sum]
    ,MaxDates.[Manager]
from MaxDates
where MaxDates.[RowNum] = 1