Using the null value over a date value - tsql

I have to select a group of records by Max(StartDate) and where there are multiple records with the same StartDate but Different EndDate I want to choose the record with the NULL EndDate over the value with an actual date.
SELECT UPC, DocumentNumber, MAX(StartDate) AS 'StartDate'
FROM #tbDupRecs
--WHERE EndDate = CASE EndDate WHEN NULL THEN NULL ELSE EndDate END
GROUP BY UPC, DocumentNumber
Order By UPC, DocumentNumber, StartDate
I get errors when I try to include something like EndDate as I cannot have it in the Select statement or the Group By, etc... Everything I tried (as above) picks the record with the date....

I think this should work... it gives a RN to each group of records you grouped by and gets the top one.
with cte as(
SELECT
UPC,
DocumentNumber,
StartDate,
row_number over (partition by UPC, DocumentNumber order by case when StartDate is null then '12/31/2999' else StartDate end desc) as rn
FROM #tbDupRecs)
select * from cte where rn = 1
OR
SELECT
UPC,
DocumentNumber,
StartDate
FROM #tbDupRecs
WHERE EXISTS (SELECT UPC, DocumentNumber, max(isnull(StartDate,'12/31/2999')) FROM #tbDupRecs group by UPC, DocumentNumber)
OR
SELECT
r.UPC,
r.DocumentNumber,
r.StartDate
FROM #tbDupRecs r
INNER JOIN
(SELECT UPC, DocumentNumber, max(isnull(StartDate,'12/31/2999'))
FROM #tbDupRecs
group by UPC, DocumentNumber) r2 on r.UPC = r2.UPC and r.DocumentNumber = r2.DocumentNumber and isnull(r.StartDate,'12/31/2999') = isnull(r2.StartDate,'12/31/2999')

Related

pySpark error Expression Referencing the outer Query

I want to recreate this query in spark sql
SELECT
[Id],
[Group],
[Name],
min([Date]) as MinDate,
max([Date]) as MaxDate
FROM recordTable
GROUP BY [Id],[Group],[Name]
)
SELECT
t.Id,
t.[Group],
t.[Name],
c.[Date],
(SELECT top 1 ScoreCount
from recordTable x
where x.[Date] <= c.[Days]
and x.[Group] = t.[Group]
and x.[Name] = t.[Name]
order by x.[Date] desc
) ScoreCount
FROM t
LEFT JOIN calendar c ON c.[Days] BETWEEN t.MinDate AND t.MaxDate
so I have
df = spark.sql("""
WITH t as (
SELECT
Id,
Group,
Name,
min(Date) as MinDate,
max(Date) as MaxDate
FROM recordTable
GROUP BY Id,Group,Name
)
SELECT
t.Id,
t.Group,
t.Name,
c.Date,
(SELECT ScoreCount
from recordTable x
where x.Date <= c.Days
and x.Group = t.Group
and x.Name = t.Name
order by x.Date desc LIMIT 1
) ScoreCount
FROM t
LEFT JOIN calendar c ON c.Days BETWEEN t.MinDate AND t.MaxDate
""")
But I'm getting an error when trying to limit 1 and using an order by clause. Any alternatives?
"Expressions referencing the outer query are not supported outside of where/having clauses"

How to use alias column in lag function

How can i use alias column sisa to lag function in case statement
select i.item_name , i.item_code, DATE(ic.transaction_date), doc_no, qty,
case when count(item_name) over(partition by item_name) = 1
then sum(qty) over(partition by doc_no)
else
case when row_number() over(partition by item_code) = 1
then first_value(qty) over(partition by item_code)
else
qty + lag(sisa)over (partition by item_code)
end
end as sisa
from item_card_today ic
join item i on i.item_id = ic.item_id
where transaction_date
between '2019-03-31 00:11:42'::timestamp and '2019-04-02 08:17:35'::timestamp
and i.item_code = 'CLDXMZM3NTDGSHCKPRFHBHTM'
order by item_code asc, item_name asc
How i can use sisa in the code?

TSQL - Replace Cursor

I found in our database a cursor statement and I would like to replace it.
Declare #max_date datetime
Select #max_date = max(finished) From Payments
Declare #begin_date datetime = '2015-02-01'
Declare #end_of_last_month datetime
While #begin_date <= #max_date
Begin
SELECT #end_of_last_month = CAST(DATEADD(DAY, -1 , DATEFROMPARTS(YEAR(#begin_date),MONTH(#begin_date),1)) AS DATE) --AS end_of_last_month
Insert Into #table(Customer, ArticleTypeID, ArticleType, end_of_month, month, year)
Select Count(distinct (customerId)), prod.ArticleTypeID, at.ArticleType, #end_of_last_month, datepart(month, #end_of_last_month), datepart(year, #end_of_last_month)
From Customer cust
Inner join Payments pay ON pay.member_id = m.member_id
Inner Join Products prod ON prod.product_id = pay.product_id
Inner Join ArticleType at ON at.ArticleTypeID = prod.ArticleTypeID
Where #end_of_last_month between begin_date and expire_date
and completed = 1
Group by prod.ArticleTypeID, at.ArticleType
order by prod.ArticleTypeID, at.ArticleType
Set #begin_date = DATEADD(month, 1, #begin_date)
End
It groups all User per Month where the begin- and expire date in the actual Cursormonth.
Notes:
The user has different payment types, for e.g. 1 Month, 6 Month and so on.
Is it possible to rewrite the code - my problem is only the identification at the where clause (#end_of_last_month between begin_date and expire_date)
How can I handle this with joins or cte's?
What you need first, if not already is a numbers table
Using said Numbers table you can create a dynamic list of dates for "end_of_Last_Month" like so
;WITH ctexAllDates
AS (
SELECT end_of_last_month = DATEADD(DAY, -1, DATEADD(MONTH, N.N -1, #begin_date))
FROM
dbo.Numbers N
WHERE
N.N <= DATEDIFF(MONTH, #begin_date, #max_date) + 1
)
select * FROM ctexAllDates
Then combine with your query like so
;WITH ctexAllDates
AS (
SELECT end_of_last_month = DATEADD(DAY, -1, DATEADD(MONTH, N.N -1, #begin_date))
FROM
dbo.Numbers N
WHERE
N.N <= DATEDIFF(MONTH, #begin_date, #max_date) + 1
)
INSERT INTO #table
(
Customer
, ArticleTypeID
, ArticleType
, end_of_month
, month
, year
)
SELECT
COUNT(DISTINCT (customerId))
, prod.ArticleTypeID
, at.ArticleType
, A.end_of_last_month
, DATEPART(MONTH, A.end_of_last_month)
, DATEPART(YEAR, A.end_of_last_month)
FROM
Customer cust
INNER JOIN Payments pay ON pay.member_id = m.member_id
INNER JOIN Products prod ON prod.product_id = pay.product_id
INNER JOIN ArticleType at ON at.ArticleTypeID = prod.ArticleTypeID
LEFT JOIN ctexAllDates A ON A.end_of_last_month BETWEEN begin_date AND expire_date
WHERE completed = 1
GROUP BY
prod.ArticleTypeID
, at.ArticleType
, A.end_of_last_month
ORDER BY
prod.ArticleTypeID
, at.ArticleType;

Redshift PostgreSQL Distinct ON Operator

I have a data set that I want to parse for to see multi-touch attribution. The data set is made up by leads who responded to a marketing campaign and their marketing source.
Each lead can respond to multiple campaigns and I want to get their first marketing source and their last marketing source in the same table.
I was thinking I could create two tables and use a select statement from both.
The first table would attempt to create a table with the most recent marketing source from every person (using email as their unique ID).
create table temp.multitouch1 as (
select distinct on (email) email, date, market_source as last_source
from sf.campaignmember
where date >= '1/1/2016' ORDER BY DATE DESC);
Then I would create a table with deduped emails but this time for the first source.
create table temp.multitouch2 as (
select distinct on (email) email, date, market_source as first_source
from sf.campaignmember
where date >= '1/1/2016' ORDER BY DATE ASC);
Finally I wanted to simply select the email and join the first and last market sources to it each in their own column.
select a.email, a.last_source, b.first_source, a.date
from temp.multitouch1 a
left join temp.multitouch b on b.email = a.email
Since distinct on doesn't work on redshift's postgresql version I was hoping someone had an idea to solve this issue in another way.
EDIT 2/22: For more context I'm dealing with people and campaigns they've responded to. Each record is a "campaign response" and every person can have more than one campaign response with multiple sources. I'm trying make a select statement which would dedupe by person and then have columns for the first campaign/marketing source they've responded to and the last campaign/marketing source they've responded to respectively.
EDIT 2/24: Ideal output is a table with 4 columns: email, last_source, first_source, date.
The first and last source columns would be the same for people with only 1 campaign member record and different for everyone who has more than 1 campaign member record.
I believe you could use row_number() inside case expressions like this:
SELECT
email
, MIN(first_source) AS first_source
, MIN(date) first_date
, MAX(last_source) AS last_source
, MAX(date) AS last_date
FROM (
SELECT
email
, date
, CASE
WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date ASC) = 1 THEN market_source
ELSE NULL
END AS first_source
, CASE
WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date DESC) = 1 THEN market_source
ELSE NULL
END AS last_source
FROM sf.campaignmember
WHERE date >= '2016-01-01'
) s
WHERE first_source IS NOT NULL
OR last_source IS NOT NULL
GROUP BY
email
tested here: SQL Fiddle
PostgreSQL 9.3 Schema Setup:
CREATE TABLE campaignmember
(email varchar(3), date timestamp, market_source varchar(1))
;
INSERT INTO campaignmember
(email, date, market_source)
VALUES
('a#a', '2016-01-02 00:00:00', 'x'),
('a#a', '2016-01-03 00:00:00', 'y'),
('a#a', '2016-01-04 00:00:00', 'z'),
('b#b', '2016-01-02 00:00:00', 'x')
;
Query 1:
SELECT
email
, MIN(first_source) AS first_source
, MIN(date) first_date
, MAX(last_source) AS last_source
, MAX(date) AS last_date
FROM (
SELECT
email
, date
, CASE
WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date ASC) = 1 THEN market_source
ELSE NULL
END AS first_source
, CASE
WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date DESC) = 1 THEN market_source
ELSE NULL
END AS last_source
FROM campaignmember
WHERE date >= '2016-01-01'
) s
WHERE first_source IS NOT NULL
OR last_source IS NOT NULL
GROUP BY
email
Results:
| email | first_source | first_date | last_source | last_date |
|-------|--------------|---------------------------|-------------|---------------------------|
| a#a | x | January, 02 2016 00:00:00 | z | January, 04 2016 00:00:00 |
| b#b | x | January, 02 2016 00:00:00 | x | January, 02 2016 00:00:00 |
& a small extension to the request, count the number of contact points.
SELECT
email
, MIN(first_source) AS first_source
, MIN(date) first_date
, MAX(last_source) AS last_source
, MAX(date) AS last_date
, MAX(numof) AS Numberof_Contacts
FROM (
SELECT
email
, date
, CASE
WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date ASC) = 1 THEN market_source
ELSE NULL
END AS first_source
, CASE
WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date DESC) = 1 THEN market_source
ELSE NULL
END AS last_source
, COUNT(*) OVER (PARTITION BY email) as numof
FROM campaignmember
WHERE date >= '2016-01-01'
) s
WHERE first_source IS NOT NULL
OR last_source IS NOT NULL
GROUP BY
email
You can use the good old left join groupwise maximum.
SELECT DISTINCT c1.email, c1.date, c1.market_source
FROM sf.campaignmember c1
LEFT JOIN sf.campaignmember c2
ON c1.email = c2.email AND c1.date > c2.date AND c1.id > c2.id
LEFT JOIN sf.campaignmember c3
ON c1.email = c3.email AND c1.date < c3.date AND c1.id > c3.id
WHERE c1.date >= '1/1/2016' AND c2.date >= '1/1/2016'
AND (c2.email IS NULL OR c3.email IS NULL)
This assumes you have an unique id column, if (date, email) is unique id is not needed.

Select last value in a month for all given IDs

I have 2 tables, one containing meter IDs, and another containing measurements for some of the meters in the first table. This is the table structure:
MeterConfig:
MeterID (int)
MeterNumber (char[16])
Type (char[25])
Readings:
MeterID (int)
Date (datetime)
Value (numeric(18,6))
I need to get the last reading (and its date) from a given period for each meter, as well as the meter number. I managed to do this in T-SQL, although I'm not particularly pleased with the way I did it using this query:
select distinct
cfg.MeterNumber,
(select top 1 r.Date from Readings as r where r.Date between #startdate and #endDate and r.MeterID = cfg.MeterID order by r.Date desc) as Date,
(select top 1 r.Value from Readings as r where r.Date between #startdate and #endDate and r.MeterID = cfg.MeterID order by r.Date desc) as Value
from
MeterConfig cfg, Readings r1
where cfg.MeterID = r1.MeterID and r1.Date between #startdate and #endDate;
How can I do this more efficiently?
WITH CTE AS (
SELECT mc.MeterID, Date, Value, ROW_NUMBER() OVER (PARTITION BY mc.MeterID ORDER BY Date DESC) as Rank
FROM MeterConfig mc
INNER JOIN Readings rd
ON mc.MeterID = rd.MeterID
WHERE rd.Date BETWEEN #startdate AND #endDate)
SELECT * FROM CTE WHERE Rank = 1
Assuming the dates in Readings are unique (ic include a timestamp), following should be equivalent to your query.
SELECT DISTINCT cfg.MeterNumber
, r1.Date
, r1.Value
FROM MeterConfig cfg
INNER JOIN Readings r1 ON cfg.MeterID = r1.MeterID
INNER JOIN (
SELECT date = MAX(r.Date)
FROM Readings r
WHERE r.Date BETWEEN #StartDate AND #EndDate
) r2 On r2.date = r1.date