pySpark error Expression Referencing the outer Query - pyspark

I want to recreate this query in spark sql
SELECT
[Id],
[Group],
[Name],
min([Date]) as MinDate,
max([Date]) as MaxDate
FROM recordTable
GROUP BY [Id],[Group],[Name]
)
SELECT
t.Id,
t.[Group],
t.[Name],
c.[Date],
(SELECT top 1 ScoreCount
from recordTable x
where x.[Date] <= c.[Days]
and x.[Group] = t.[Group]
and x.[Name] = t.[Name]
order by x.[Date] desc
) ScoreCount
FROM t
LEFT JOIN calendar c ON c.[Days] BETWEEN t.MinDate AND t.MaxDate
so I have
df = spark.sql("""
WITH t as (
SELECT
Id,
Group,
Name,
min(Date) as MinDate,
max(Date) as MaxDate
FROM recordTable
GROUP BY Id,Group,Name
)
SELECT
t.Id,
t.Group,
t.Name,
c.Date,
(SELECT ScoreCount
from recordTable x
where x.Date <= c.Days
and x.Group = t.Group
and x.Name = t.Name
order by x.Date desc LIMIT 1
) ScoreCount
FROM t
LEFT JOIN calendar c ON c.Days BETWEEN t.MinDate AND t.MaxDate
""")
But I'm getting an error when trying to limit 1 and using an order by clause. Any alternatives?
"Expressions referencing the outer query are not supported outside of where/having clauses"

Related

How to PIVOT this query and display only TOP 10 records filtered by SUM(NetWrittenPremium) DESC

In this query I cant understand what would be the proper syntax to PIVOT it by month and also display just top 10 records based on SUM(NetWrittenPremium).
;with cte_TopClasses
AS (
select
b.YearNum,
b.MonthNum,
REPLACE(ClassCode,'+','') + ' - '+ QLL.Description as Description,
SUM( Premium) as NetWrittenPremium
FROM tblCalendar b
LEFT JOIN ProductionReportMetrics prm ON b.MonthNum=Month(prm.EffectiveDate) AND b.YearNum = YEAR(EffectiveDate)
AND prm.EffectiveDate >=DateAdd(yy, -1, DATEADD(d, 1, EOMONTH(GETDATE()))) AND prm.EffectiveDate <= EOMONTH(GETDATE()) AND CompanyLine = 'Ironshore Insurance Company'
LEFT JOIN NetRate_Quote_Insur_Quote Q ON prm.NetRate_QuoteID = Q.QuoteID
LEFT JOIN NetRate_Quote_Insur_Quote_Locat QL ON Q.QuoteID = QL.QuoteID
LEFT JOIN (SELECT * FROM NetRate_Quote_Insur_Quote_Locat_Liabi nqI
JOIN ( SELECT LocationID as LocID, MAX(ClassCode) as ClCode
FROM NetRate_Quote_Insur_Quote_Locat_Liabi GROUP BY LocationID ) nqA
ON nqA.LocID = nqI.LocationID AND nqA.ClCode = nqI.ClassCode ) QLL
ON QLL.LocationID = QL.LocationID
WHERE ( b.YearNum = YEAR(GETDATE())-1 and b.MonthNum >= MONTH(GETDATE())+1 ) OR
( b.YearNum = YEAR(GETDATE()) and b.MonthNum <= MONTH(GETDATE()) )
GROUP BY b.YearNum,b.MonthNum,ClassCode, QLL.Description
)
SELECT
--TOP 10
RANK() OVER (ORDER BY NetWrittenPremium DESC) AS Rank, *
FROM cte_TopClasses
WHERE Description IS NOT NULL
ORDER BY NetWrittenPremium DESC,YearNum,MonthNum
The result should look something like that:
If I use the query below and then using matrics in SSRS to PIVOT it - then after grouping by Description it only displays me 2 Description.
;with cte_TopClasses
AS (
select
b.YearNum,
b.MonthNum,
REPLACE(ClassCode,'+','') + ' - '+ QLL.Description as Description,
SUM( Premium) as NetWrittenPremium
FROM tblCalendar b
LEFT JOIN ProductionReportMetrics prm ON b.MonthNum=Month(prm.EffectiveDate) AND b.YearNum = YEAR(EffectiveDate)
AND prm.EffectiveDate >=DateAdd(yy, -1, DATEADD(d, 1, EOMONTH(GETDATE()))) AND prm.EffectiveDate <= EOMONTH(GETDATE()) AND CompanyLine = 'Ironshore Insurance Company'
LEFT JOIN NetRate_Quote_Insur_Quote Q ON prm.NetRate_QuoteID = Q.QuoteID
LEFT JOIN NetRate_Quote_Insur_Quote_Locat QL ON Q.QuoteID = QL.QuoteID
LEFT JOIN (SELECT * FROM NetRate_Quote_Insur_Quote_Locat_Liabi nqI
JOIN ( SELECT LocationID as LocID, MAX(ClassCode) as ClCode
FROM NetRate_Quote_Insur_Quote_Locat_Liabi GROUP BY LocationID ) nqA
ON nqA.LocID = nqI.LocationID AND nqA.ClCode = nqI.ClassCode ) QLL
ON QLL.LocationID = QL.LocationID
WHERE ( b.YearNum = YEAR(GETDATE())-1 and b.MonthNum >= MONTH(GETDATE())+1 ) OR
( b.YearNum = YEAR(GETDATE()) and b.MonthNum <= MONTH(GETDATE()) )
GROUP BY b.YearNum,b.MonthNum,ClassCode, QLL.Description
)
SELECT *
FROM (SELECT RANK() OVER (ORDER BY NetWrittenPremium DESC) AS Rank, *
FROM cte_TopClasses
WHERE Description IS NOT NULL) AA
WHERE AA.Rank <= 10
ORDER BY AA.NetWrittenPremium DESC, AA.YearNum, AA.MonthNum
And the result of it in SSRS matrics :
You could try something like this at the end of the query, rather than what is there now:
SELECT *
FROM (SELECT RANK() OVER (ORDER BY [Description] DESC) AS Rank, *
FROM cte_TopClasses
WHERE Description IN (SELECT [Description]
FROM (SELECT RANK() OVER (ORDER BY SUM(NetWrittenPremium) DESC) AS [Rank], [Description], SUM(NetWrittenPremium) AS total
FROM cte_TopClasses
WHERE [Description] IS NOT NULL
GROUP BY [Description]) BB
WHERE [Rank] <= 10)) AA
ORDER BY YearNum, MonthNum
This wraps the query in a SELECT, and filters the ranked results to the 10 you want.
Then use a matrix in the report to pivot the results.

Faster left join with last non-empty

Table1:
Shop
Manager
Date
Table2:
Shop
Date
Sales
I need to get Table2 with Manager field from Table1. I did the following trick:
select
t1.[Shop]
,t1.[Date]
,t1.[Sum]
,t2.[Manager]
from t1
left join t2
on t1.[Shop] = t2.[Shop]
and t2.[Date] = (select max(t2.[Date]) from t2
where t2.[Shop] = t1.[Shop]
and t2.[Date] < t1.[Date])
It works, but subquerying is very slow, so I wonder if there is more elegant and fast way to do so?
Some sample data to play around: http://pastebin.com/uLN6x5JE
may seem like a round about way but join on a single condition is typically faster
select t12.[Shop], t12.[Date], t12.[Sum]
, t12.[Manager]
from
( select t1.[Shop], t1.[Date], t1.[Sum]
, t2.[Manager]
, row_number() over (partition by t2.[Shop] order by t2.[Date] desc) as rn
from t1
join t2
on t2.[Shop] = t1.[Shop]
and t1.[Date] < t1.[Date]
) as t12
where t12.rn = 1
union
select t1.[Shop], t1.[Date], t1.[Sum]
, null as [Manager]
from t1
left join t2
on t2.[Shop] = t1.[Shop]
and t1.[Date] < t1.[Date]
group by t1.[Shop], t1.[Date], t1.[Sum]
having count(*) = 1
You may get much better performance by adding a covering index on t2 if you don't already have one:
create index T2ShopDate on t2 ([Shop], [Date]) include ([Manager])
Here is a version that uses a CTE to find all maximum manager dates first and then join back to t2 to get the manager:
;with MaxDates ([Shop], [Date], [Sum], [MaxMgrDate]) as
(
select
t1.[Shop]
,t1.[Date]
,t1.[Sum]
,max(t2.[Date])
from t1
left join t2
on t2.[Shop] = t1.[Shop]
and t2.[Date] < t1.[Date]
group by
t1.[Shop]
,t1.[Date]
,t1.[Sum]
)
select
MaxDates.[Shop]
,MaxDates.[Date]
,MaxDates.[Sum]
,t2.[Manager]
from MaxDates
inner join t2
on t2.[Date] = MaxDates.[MaxMgrDate]
You might be able to remove the second join back to t2 by using row_number():
;with MaxDates ([Shop], [Date], [Sum], [Manager], [RowNum]) as
(
select
t1.[Shop]
,t1.[Date]
,t1.[Sum]
,t2.[Manager]
,row_number() over (partition by (t1.[Shop]) order by t2.[Date] desc)
from t1
left join t2
on t2.[Shop] = t1.[Shop]
and t2.[Date] < t1.[Date]
)
select *
from MaxDates
where RowNum = 1

Use an Alias in Where Clause Subquery in Oracle

i need to show some field from another table in oracle here is my query
SELECT
ANGGARAN.SIMPEG_PEGAWAI.ID_PEGAWAI AS KODE,
ANGGARAN.SIMPEG_PEGAWAI.NAMA,
ANGGARAN.SIMPEG_PEGAWAI.NIP,
ANGGARAN.SIMPEG_ESELON_JABATAN.JABATAN,
ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT.GOLONGAN,
ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT.PANGKAT,
(SELECT *
FROM (SELECT CONCAT(TO_CHAR(abs(sysdate - TO_DATE(TMT_JABATAN))/360,'9,999,999.9'),' TAHUN')
FROM SIMPEG_JABATAN where ID_PEGAWAI=KODE ORDER BY TMT_JABATAN desc)
WHERE ROWNUM = 1) AS MASA_KERJA
FROM
ANGGARAN.SIMPEG_PEGAWAI
INNER JOIN ANGGARAN.SIMPEG_ESELON_JABATAN
ON ANGGARAN.SIMPEG_PEGAWAI.ESELON_JABATAN = ANGGARAN.SIMPEG_ESELON_JABATAN.ID_ESELON_JABATAN
INNER JOIN ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT
ON ANGGARAN.SIMPEG_PEGAWAI.PANGKAT = ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT.ID_GOLONGAN_PANGKAT
WHERE
ANGGARAN.SIMPEG_PEGAWAI.ST_AKTIF = 1 AND
ANGGARAN.SIMPEG_PEGAWAI.ESELON2 <> 1 AND
ANGGARAN.SIMPEG_PEGAWAI.PANGKAT >= 12 AND
ANGGARAN.SIMPEG_ESELON_JABATAN.STATUS = 1 AND
ANGGARAN.SIMPEG_ESELON_JABATAN.ID_ESELON2=2
ORDER BY
ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT.SORT DESC
result i got
[Err] ORA-00904: "KODE": invalid identifier
the KODE come from query ANGGARAN.SIMPEG_PEGAWAI.ID_PEGAWAI AS KODE, and used for this query
(SELECT *
FROM (SELECT CONCAT(TO_CHAR(abs(sysdate - TO_DATE(TMT_JABATAN))/360,'9,999,999.9'),' TAHUN')
FROM SIMPEG_JABATAN where ID_PEGAWAI=KODE ORDER BY TMT_JABATAN desc)
WHERE ROWNUM = 1) AS MASA_KERJA
that i miss something ? or that could be worogn using an alias in subquery where clause in oracle database ?
You can use an identifier defined in an external query in only one level deep queries. You have to rethink your strategy. My suggestion is to remove the subquery from the select list and put it in the FROM clause. And add another rownumber column like this:
(SELECT
ID_PEGAWAI,
CONCAT(TO_CHAR(abs(sysdate - TO_DATE(TMT_JABATAN))/360,'9,999,999.9'),' TAHUN') MASA_KERJA,
ROW_NUMBER() OVER (PARTITION BY ID_PEGAWAI ORDER BY TMT_JABATAN DESC) rownumber
FROM SIMPEG_JABATAN) xxx
And join like:
ON ANGGARAN.SIMPEG_PEGAWAI = xxx.ID_PEGAWAI
Then in the where clause you can do simply:
WHERE
....
AND xxx.rownumber = 1
Complete query:
SELECT
ANGGARAN.SIMPEG_PEGAWAI.ID_PEGAWAI AS KODE,
ANGGARAN.SIMPEG_PEGAWAI.NAMA,
ANGGARAN.SIMPEG_PEGAWAI.NIP,
ANGGARAN.SIMPEG_ESELON_JABATAN.JABATAN,
ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT.GOLONGAN,
ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT.PANGKAT
FROM
ANGGARAN.SIMPEG_PEGAWAI
INNER JOIN ANGGARAN.SIMPEG_ESELON_JABATAN
ON ANGGARAN.SIMPEG_PEGAWAI.ESELON_JABATAN = ANGGARAN.SIMPEG_ESELON_JABATAN.ID_ESELON_JABATAN
INNER JOIN ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT
ON ANGGARAN.SIMPEG_PEGAWAI.PANGKAT = ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT.ID_GOLONGAN_PANGKAT
INNER JOIN (
SELECT
ID_PEGAWAI,
CONCAT(TO_CHAR(abs(sysdate - TO_DATE(TMT_JABATAN))/360,'9,999,999.9'),' TAHUN') MASA_KERJA,
ROW_NUMBER() OVER (PARTITION BY ID_PEGAWAI ORDER BY TMT_JABATAN DESC) rownumber
FROM SIMPEG_JABATAN
) xxx
ON ANGGARAN.SIMPEG_PEGAWAI.ID_PEGAWAI = xxx.ID_PEGAWAI
WHERE
ANGGARAN.SIMPEG_PEGAWAI.ST_AKTIF = 1 AND
ANGGARAN.SIMPEG_PEGAWAI.ESELON2 <> 1 AND
ANGGARAN.SIMPEG_PEGAWAI.PANGKAT >= 12 AND
ANGGARAN.SIMPEG_ESELON_JABATAN.STATUS = 1 AND
ANGGARAN.SIMPEG_ESELON_JABATAN.ID_ESELON2=2 AND
xxx.rownumber = 1
ORDER BY ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT.SORT DESC
Oracle does not support columns aliases in WHERE clauses (or similar situations like here). You have to name the column again (by its original name).
select dummy as kode from dual where kode = 'X'
> ORA-00904: "KODE": invalid identifier
You need to assign an alias in the level below to use it in a query (I haven't checked the syntax and workability of your query, just changed the part which is essential to answer your question):
SELECT
TMP.KODE,
TMP.NAMA,
TMP.NIP,
ANGGARAN.SIMPEG_ESELON_JABATAN.JABATAN,
ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT.GOLONGAN,
ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT.PANGKAT,
(SELECT *
FROM (SELECT CONCAT(TO_CHAR(abs(sysdate - TO_DATE(TMT_JABATAN))/360,'9,999,999.9'),' TAHUN')
FROM SIMPEG_JABATAN where ID_PEGAWAI=TMP.KODE ORDER BY TMT_JABATAN desc)
WHERE ROWNUM = 1) AS MASA_KERJA
FROM
(SELECT ANGGARAN.SIMPEG_PEGAWAI.ID_PEGAWAI AS KODE, ANGGARAN.SIMPEG_PEGAWAI.* FROM ANGGARAN.SIMPEG_PEGAWAI) TMP
INNER JOIN ANGGARAN.SIMPEG_ESELON_JABATAN
ON TMP.ESELON_JABATAN = ANGGARAN.SIMPEG_ESELON_JABATAN.ID_ESELON_JABATAN
INNER JOIN ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT
ON TMP.PANGKAT = ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT.ID_GOLONGAN_PANGKAT
WHERE
TMP.ST_AKTIF = 1 AND
TMP.ESELON2 <> 1 AND
TMP.PANGKAT >= 12 AND
ANGGARAN.SIMPEG_ESELON_JABATAN.STATUS = 1 AND
ANGGARAN.SIMPEG_ESELON_JABATAN.ID_ESELON2=2
ORDER BY
ANGGARAN.SIMPEG_KODE_GOLONGAN_PANGKAT.SORT DESC

How to display rollup data in new column?

I have the following query which returns the number of android questions per each day on StackOverflow in the year of 2011. I want to get the sum of all the questions asked during the year 2011. For this I am using ROLLUP.
select
year(p.CreationDate) as [Year],
month(p.CreationDate) as [Month],
day(p.CreationDate) as [Day],
count(*) as [QuestionsAskedToday]
from Posts p
inner join PostTags pt on p.id = pt.postid
inner join Tags t on t.id = pt.tagid
where
t.tagname = 'android' and
p.CreationDate > '2011-01-01 00:00:00'
group by year(p.CreationDate), month(p.CreationDate),day(p.CreationDate)
​with rollup
order by year(p.CreationDate), month(p.CreationDate) desc,day(p.CreationDate) desc​
This is the output:
The sum of all questions asked on each day in 2011 is being displayed in the QuestionsAskedToday column itself.
Is there a way to display the rollup in a new column with an alias?
Link to the query
To show this as a column rather than a row you can use SUM(COUNT(*)) OVER () instead of ROLLUP. (Online Demo)
SELECT YEAR(p.CreationDate) AS [Year],
MONTH(p.CreationDate) AS [Month],
DAY(p.CreationDate) AS [Day],
COUNT(*) AS [QuestionsAskedToday],
SUM(COUNT(*)) OVER () AS [Total]
FROM Posts p
INNER JOIN PostTags pt
ON p.id = pt.postid
INNER JOIN Tags t
ON t.id = pt.tagid
WHERE t.tagname = 'android'
AND p.CreationDate > '2011-01-01 00:00:00'
GROUP BY YEAR(p.CreationDate),
MONTH(p.CreationDate),
DAY(p.CreationDate)
ORDER BY YEAR(p.CreationDate),
MONTH(p.CreationDate) DESC,
DAY(p.CreationDate) DESC
You could take an approach like this: Example
SELECT
YEAR(p.CreationDate) AS 'Year'
, CASE
WHEN GROUPING(MONTH(p.CreationDate)) = 0
THEN CAST(MONTH(p.CreationDate) AS VARCHAR(2))
ELSE 'Totals:'
END AS 'Month'
, CASE
WHEN GROUPING(DAY(p.CreationDate)) = 0
THEN CAST(DAY(p.CreationDate) AS VARCHAR(2))
ELSE 'Totals:'
END AS [DAY]
, CASE
WHEN GROUPING(MONTH(p.CreationDate)) = 0
AND GROUPING(DAY(p.CreationDate)) = 0
THEN COUNT(1)
END AS 'QuestionsAskedToday'
, CASE
WHEN GROUPING(MONTH(p.CreationDate)) = 1
OR GROUPING(DAY(p.CreationDate)) = 1
THEN COUNT(1)
END AS 'Totals'
FROM Posts AS p
INNER JOIN PostTags AS pt ON p.id = pt.postid
INNER JOIN Tags AS t ON t.id = pt.tagid
WHERE t.tagname = 'android'
AND p.CreationDate >= '2011-01-01'
GROUP BY ROLLUP(YEAR(p.CreationDate)
, MONTH(p.CreationDate)
, DAY(p.CreationDate))
ORDER BY YEAR(p.CreationDate)
, MONTH(p.CreationDate) DESC
, DAY(p.CreationDate) DESC​​​​​​​
If this is what you wanted, the same technique can be applied to Years as well to total them in the new column, or their own column, if you want to query for multiple years and aggregate them.

How to select a date interval in postgresql query?

I have query like this:
select d.id from days as d JOIN tv_programs AS t ON (d.tv_program_id = t.id) WHERE d.date=? AND t.type=0 ORDER BY d.date, t.date DESC OFFSET 1;
it returns data for 1 selected date (type Date), but i want to get values from startDate to endDate, for example, i want to do it using one query which do something like that:
select d.id from days as d JOIN tv_programs AS t ON (d.tv_program_id = t.id) WHERE d.date='2011-09-01' AND t.type=0 ORDER BY d.date, t.date DESC OFFSET 1;
select d.id from days as d JOIN tv_programs AS t ON (d.tv_program_id = t.id) WHERE d.date='2011-09-02' AND t.type=0 ORDER BY d.date, t.date DESC OFFSET 1;
select d.id from days as d JOIN tv_programs AS t ON (d.tv_program_id = t.id) WHERE d.date='2011-09-03' AND t.type=0 ORDER BY d.date, t.date DESC OFFSET 1;
It's possible?
... where d.date between '2011-09-01' and '2011-09-04' should work.