How to generate larger sets of lottery numbers efficiently - tsql

I am a beginner with SQL and wanted more practice, so I decided to write a procedure that generates X random lotto picks. The lottery in my area has you pick 5 numbers from 1-47 and 1 "mega" number from 1-27. The catch is that the "mega" number may duplicate one of the 5 main numbers, e.g. 1, 2, 3, 4, 5, mega 1.
I created the following procedure to generate 10 million lottery picks, and it took 12 hours and 57 minutes to finish, while a friend tested the same thing in Java and it took seconds. Is there any improvement I can make to the code, or any mistake I've made? I'm new at this and trying to learn better approaches, so all comments are welcome.
USE lotto
DECLARE
    @counter INT,
    @counter1 INT,
    @pm SMALLINT,
    @i1 SMALLINT,
    @i2 SMALLINT,
    @i3 SMALLINT,
    @i4 SMALLINT,
    @i5 SMALLINT,
    @sort INT
SET @counter1 = 0
TRUNCATE TABLE picks
WHILE @counter1 < 10000000
BEGIN
    -- draw 5 distinct numbers, starting over whenever a duplicate appears
    TRUNCATE TABLE sort
    SET @counter = 1
    WHILE @counter < 6
    BEGIN
        INSERT INTO sort (pick)
        SELECT CAST(((47+ 1) - 0) * RAND() + 1 AS TINYINT)
        IF (SELECT COUNT(DISTINCT pick) FROM sort) < @counter
        BEGIN
            TRUNCATE TABLE sort
            SET @counter = 1
        END
        ELSE IF (SELECT COUNT(DISTINCT pick) FROM sort) = @counter
        BEGIN
            SET @counter = @counter + 1
        END
    END
    -- rank the 5 picks in ascending order
    SET @sort = 0
    WHILE @sort < 5
    BEGIN
        UPDATE sort
        SET sort = @sort
        WHERE pick = (SELECT MIN(pick) FROM sort WHERE sort IS NULL)
        SET @sort = @sort + 1
    END
    SET @i1 = (SELECT pick FROM sort WHERE sort = 0)
    SET @i2 = (SELECT pick FROM sort WHERE sort = 1)
    SET @i3 = (SELECT pick FROM sort WHERE sort = 2)
    SET @i4 = (SELECT pick FROM sort WHERE sort = 3)
    SET @i5 = (SELECT pick FROM sort WHERE sort = 4)
    SET @pm = CAST(((27+ 1) - 0) * RAND() + 1 AS TINYINT)
    INSERT INTO picks (First, Second, Third, Fourth, Fifth, Mega, Sequence)
    VALUES (@i1, @i2, @i3, @i4, @i5, @pm, @counter1)
    SET @counter1 = @counter1 + 1
END

I generated 10,000 rows in 0 seconds doing it another way. Hope this helps:
;WITH Nbrs ( n ) AS (
SELECT 1 UNION ALL
SELECT 1 + n FROM Nbrs WHERE n < 10000 )
SELECT
(ABS(CHECKSUM(NewId())) % 47 + 1) AS First,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Second,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Third,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Fourth,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Fifth,
(ABS(CHECKSUM(NewId())) % 27 + 1) AS Mega,
Nbrs.n AS Sequence
FROM
Nbrs
OPTION ( MAXRECURSION 0 )
10000 rows 0 sec
100000 rows 1 sec
1000000 rows 13 sec
10000000 rows 02 min 21 sec
Or with cross joins
WITH E00(N) AS (SELECT 1 UNION ALL SELECT 1),
E02(N) AS (SELECT 1 FROM E00 a, E00 b),
E04(N) AS (SELECT 1 FROM E02 a, E02 b),
E08(N) AS (SELECT 1 FROM E04 a, E04 b),
E16(N) AS (SELECT 1 FROM E08 a, E08 b),
E32(N) AS (SELECT 1 FROM E16 a, E16 b),
Nbrs(N) AS (SELECT ROW_NUMBER() OVER (ORDER BY N) FROM E32)
SELECT
(ABS(CHECKSUM(NewId())) % 47 + 1) AS First,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Second,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Third,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Fourth,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Fifth,
(ABS(CHECKSUM(NewId())) % 27 + 1) AS Mega,
Nbrs.n AS Sequence
FROM Nbrs
WHERE N <= 10000000;
10000 rows 0 sec
100000 rows 1 sec
1000000 rows 14 sec
10000000 rows 03 min 29 sec
I should also mention that the reason I am using
(ABS(CHECKSUM(NewId())) % 47 + 1)
is that it returns a new random number per row. The expression
CAST(((47+ 1) - 0) * RAND() + 1 AS TINYINT)
returns the same random number for every row if you select them in one go. To test this, run this example:
;WITH Nbrs ( n ) AS (
SELECT 1 UNION ALL
SELECT 1 + n FROM Nbrs WHERE n < 5 )
SELECT
CAST(((47+ 1) - 0) * RAND() + 1 AS TINYINT) AS Random,
(ABS(CHECKSUM(NewId())) % 47 + 1) AS RandomCheckSum,
Nbrs.n AS Sequence
FROM Nbrs
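For what it's worth, seeding RAND per row is another common workaround: RAND(CHECKSUM(NEWID())) gets a fresh seed, and therefore a fresh value, on every row. A minimal sketch along the lines of the test above:
;WITH Nbrs ( n ) AS (
SELECT 1 UNION ALL
SELECT 1 + n FROM Nbrs WHERE n < 5 )
SELECT
CAST(RAND(CHECKSUM(NEWID())) * 47 + 1 AS TINYINT) AS SeededRand, -- 1 to 47, new value per row
Nbrs.n AS Sequence
FROM Nbrs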
OK, I saw your comment, and I have a solution for that as well. If you really want the numbers ordered, the complexity of the algorithm goes up, and with it the runtime, but it is still doable, just not in the same neat way.
--Declare a table variable just to hold the positions of the 5 random numbers
DECLARE @tbl TABLE(value int)
--The same numbers CTE, but only counting to the number of random picks
;WITH Nbrs ( n ) AS (
SELECT 1 UNION ALL
SELECT 1 + n FROM Nbrs WHERE n < 5 )
INSERT INTO @tbl
(
value
)
SELECT
Nbrs.n AS Sequence
FROM Nbrs
;WITH Nbrs ( n ) AS (
SELECT CAST(1 as BIGINT) UNION ALL
SELECT 1 + n FROM Nbrs WHERE n < 100000 )
SELECT
tblOrderRandomNumbers.[1] AS First,
tblOrderRandomNumbers.[2] AS Second,
tblOrderRandomNumbers.[3] AS Third,
tblOrderRandomNumbers.[4] AS Fourth,
tblOrderRandomNumbers.[5] AS Fifth,
(ABS(CHECKSUM(NewId())) % 27 + 1) AS Mega,
Nbrs.n AS Sequence
FROM
Nbrs
--This CROSS JOIN joins with the declared table variable
CROSS JOIN
(
SELECT
[1], [2], [3], [4], [5]
FROM
(
SELECT
Random,
ROW_NUMBER() OVER(ORDER BY tblRandom.Random ASC) AS RowNumber
FROM
(
SELECT
(ABS(CHECKSUM(NewId())) % 47 + 1) AS Random
FROM
@tbl AS tblNumbers
) AS tblRandom
) AS tblSortedRandom
--The PIVOT turns the rows into columns, keyed on the row number assigned in order of the random values
PIVOT
(
AVG(Random)
FOR RowNumber IN ([1], [2], [3], [4],[5])
) as pivottable
) AS tblOrderRandomNumbers
OPTION ( MAXRECURSION 0 )
But I still managed to keep the time down:
10000 Rows : 0 sec
100000 Rows : 4 sec
1000000 Rows : 43 sec
10000000 Rows : 7 min 9 sec
I hope this helps.

I wrote this script just out of curiosity. It should do better than your script, but I can't tell for sure.
Beware that I use a declared table variable; with a real table, performance should be better when generating larger numbers of rows (see the variant sketched after the code).
I generated 10,000 rows in about 13 seconds, which works out to about 3.5 hours for 10,000,000 rows. Still far worse than the Java case you described.
set nocount on
go
declare @i int = 1
declare @t table(nr1 int, nr2 int, nr3 int, nr4 int, nr5 int, mega int, seq int)
while @i <= 10000
begin
;with numbers(nr)
as
(
select 1
union all
select nr+1
from numbers
where nr < 47
)
,mega(nr)
as
(
select 1
union all
select nr+1
from mega
where nr < 27
)
,selectednumbers(nr)
as
(
select top 5 nr
from numbers
order by newid()
)
,selectedmega(mega)
as
(
select top 1 nr
from mega
order by newid()
)
,tmp
as
(
select *
,row_number() over(order by nr) as rownr
from selectednumbers
)
insert into @t
select max(nr1) as nr1
,max(nr2) as nr2
,max(nr3) as nr3
,max(nr4) as nr4
,max(nr5) as nr5
,(select mega from selectedmega) as mega
,@i as seq
from (
select case when rownr = 1 then nr else 0 end as nr1
,case when rownr = 2 then nr else 0 end as nr2
,case when rownr = 3 then nr else 0 end as nr3
,case when rownr = 4 then nr else 0 end as nr4
,case when rownr = 5 then nr else 0 end as nr5
from tmp
) x
set @i = @i + 1
end
select * from @t
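As noted above, swapping the table variable for a real temp table usually scales better for large row counts, since temp tables get statistics. A minimal, hypothetical variant; only the declaration, the insert target, and the cleanup change:
-- hypothetical temp-table variant of the declared table above
create table #t (nr1 int, nr2 int, nr3 int, nr4 int, nr5 int, mega int, seq int)
-- ... run the same loop as above, but "insert into #t" instead of "insert into @t" ...
select * from #t
drop table #t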

Related

Unable to calculate compound interest in PostgreSQL

I have a table table1 which contains the details of any depositor like
Depositor | Deposit_Amount | Deposit_Date | Maturity_Date | Tenure | Rate
A         | 25000          | 2021-08-10   | 2022-08-10    | 12     | 10%
I have another table table2 which contains the interest due date as:
Interest_Due_Date
2021-09-30
2021-12-31
2022-03-31
2022-06-30
2022-08-10
My Code is:
with recursive recur (n, start_bal, days, principle, interest, end_bal) as
(
    select sno, deposit_amount, rate, days,
           deposit_amount * (((rate::decimal(18,2))/100)/365) * days as interest,
           deposit_amount + (deposit_amount * (((rate::decimal(18,2))/100)/365) * days) as end_bal
    from (
        select sno,
               coalesce(date_part('day', deposit_date::timestamp - lag(deposit_date::timestamp) over
                   (order by sno asc rows between unbounded preceding and current row)), 0) as days,
               deposit_date, deposit_amount, rate
        from (
            select row_number() over (order by deposit_date) as sno,
                   deposit_date, deposit_amount, rate
            from (
                select t1.deposit_date, t1.deposit_amount, t1.rate from table1 t1
                union all
                select t2.interest_due_date as idate, 0 as depo_amount, 0 as rate
                from table2 t2
                order by deposit_date
            ) dep
        ) calc
    ) b
    where sno = 1
    union all
    select b.sno, b.end_bal, b.days, b.prin_bal,
           (coalesce(a.end_bal, 0)) * (((b.rate)/100)/365) * b.days as interest_NEW,
           coalesce(a.end_bal, 0) + ((a.end_bal) * (((calc.rate)/100)/365) * calc.days) as end_bal_NEW
    from b, recur as a
    where calc.sno = a.n + 1
)
select * from recur
"Every time when i try to execute the query its showing an error 'relation 'b' does not exist"
...
The result table should be
Deposit Amount | Date       | Days | Interest | Total Amount
25000          | 2021-08-10 | 0    | 0        | 25000
0              | 2021-09-30 | 51   | 349.32   | 25349.32
0              | 2021-12-31 | 92   | 638.94   | 25988.26
0              | 2022-03-31 | 90   | 640.81   | 26629.06
0              | 2022-06-30 | 91   | 663.90   | 27292.97
0              | 2022-08-10 | 41   | 306.58   | 27599.54
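The error itself is easy to pin down: within a recursive CTE, each branch can only reference real relations plus CTE names from the WITH list. The aliases b and calc live only inside the first branch's subqueries, so the second branch cannot see them; the recursive branch has to read from the CTE's own name (recur). For illustration only, here is a hedged sketch that follows that rule, assuming rate is stored as a plain number such as 10 and a single deposit row; it reproduces the schedule above up to penny-level rounding differences:
with recursive ordered as (
    -- one row per event date: the deposit itself plus each interest due date
    select row_number() over (order by d) as sno, d, amt
    from (
        select deposit_date as d, deposit_amount as amt from table1
        union all
        select interest_due_date, 0 from table2
    ) u
),
recur as (
    select o.sno, o.d, 0 as days, 0::numeric as interest, o.amt::numeric as total
    from ordered o
    where o.sno = 1
    union all
    -- the recursive branch references recur (the CTE name), not an inner alias
    select o.sno, o.d, o.d - r.d,
           round(r.total * (select rate::numeric / 100 from table1) / 365 * (o.d - r.d), 2),
           round(r.total * (1 + (select rate::numeric / 100 from table1) / 365 * (o.d - r.d)), 2)
    from ordered o
    join recur r on o.sno = r.sno + 1
)
select * from recur;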

Taking N-samples from each group in PostgreSQL

I have a table containing data that has a column named id that looks like below:
id  | value 1 | value 2 | value 3
1   | 244     | 550     | 1000
1   | 251     | 551     | 700
1   | 540     | 60      | 1200
... | ...     | ...     | ...
2   | 19      | 744     | 2000
2   | 10      | 903     | 100
2   | 44      | 231     | 600
2   | 120     | 910     | 1100
... | ...     | ...     | ...
I want to take 50 sample rows per id that exists but if less than 50 exist for the group to simply take the entire set of data points.
For example I would like a maximum 50 data points randomly selected from id = 1, id = 2 etc...
I cannot find any previous questions similar to this, but I have at least taken a stab at logically working through a solution where I iterate over the ids, UNION ALL the queries, and limit each one to 50:
SELECT * FROM (SELECT * FROM schema.table AS tbl WHERE tbl.id = X LIMIT 50) UNION ALL;
But it's obvious that this type of solution cannot work as written: UNION ALL requires spelling out a branch for each id, and I do not have a list of id values to use in place of X in tbl.id = X.
Is there a way to accomplish this by gathering that list of unique id values and union all results or is there a more optimal way this could be done?
If you want to select a random sample for each id, then you need to randomize the rows somehow. Here is a way to do it:
select * from (
select *, row_number() over (partition by id order by random()) as u
from schema.table
) as a
where u <= 50;
Example (limiting to 3, with a row number within each id so you can see the selection randomness):
setup
DROP TABLE IF EXISTS foo;
CREATE TABLE foo
(
id int,
value1 int,
idrow int
);
INSERT INTO foo
select 1 as id, (1000*random())::int as value1, generate_series(1, 100) as idrow
union all
select 2 as id, (1000*random())::int as value1, generate_series(1, 100) as idrow
union all
select 3 as id, (1000*random())::int as value1, generate_series(1, 100) as idrow;
Selection
select * from (
select *, row_number() over (partition by id order by random()) as u
from foo
) as a
where u <= 3;
Output:
id | value1 | idrow | u
1  | 542    | 6     | 1
1  | 24     | 86    | 2
1  | 155    | 74    | 3
2  | 505    | 95    | 1
2  | 100    | 46    | 2
2  | 422    | 33    | 3
3  | 966    | 88    | 1
3  | 747    | 89    | 2
3  | 664    | 19    | 3
In case you are looking to get 50 (or fewer) rows from each group of ids, you can use windowing.
From the question: "I want to take 50 sample rows per id that exists but if less than 50 exist for the group to simply take the entire set of data points."
Query -
with data as (
    select row_number() over (partition by id order by random()) as rn, *
    from table_name
)
select * from data where rn <= 50 order by id;
Fiddle.
Your description of trying to get the UNION ALL without specifying all the branches ahead of time is aiming for a LATERAL join. And that is one way to solve the problem. But unless you have a table of all distinct ids, you would have to compute one on the fly. For example (using the same fiddle as Pankaj used):
with uniq as (select distinct id from test)
select foo.* from uniq cross join lateral
(select * from test where test.id=uniq.id order by random() limit 3) foo
This could be either slower or faster than the Window Function method, depending on your system and your data and your indexes. In my hands, it was quite a bit faster even with the need to dynamically compute the list of distinct ids.

Moving grouped MEDIAN / Get the MEDIAN of specific months from the past IN T-SQL

Let's say I have a table:
DATE    | ID  | VALUE
01.2010 | 1   | 100
02.2010 | 1   | 200
...     | ... | ...
12.2010 | 1   | 300
01.2011 | 1   | 150
02.2011 | 1   | 250
...     | ... | ...
12.2011 | 1   | 350
01.2012 | 1   | 200
02.2012 | 1   | 300
...     | ... | ...
12.2012 | 1   | 400
I want to get a median of VALUE grouped by months i.e. get something like
DATE    | ID  | VALUE | MEDIAN
01.2010 | 1   | 100   | 100
02.2010 | 1   | 200   | 200
...     | ... | ...   | ...
12.2010 | 1   | 300   | 300
01.2011 | 1   | 150   | 125 = (100+150)/2
02.2011 | 1   | 250   | 225 = (200+250)/2
...     | ... | ...   | ...
12.2011 | 1   | 350   | 325 = (300+350)/2
01.2012 | 1   | 200   | 150
02.2012 | 1   | 300   | 250
...     | ... | ...   | ...
12.2012 | 1   | 400   | 350
I have more IDs in the table, so I would like to get this result for every ID.
I have tried doing
SELECT PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY VALUE) OVER (PARTITION BY Id, MONTH(Date) ORDER BY Date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
but I get "The function 'PERCENTILE_CONT' may not have a window frame.
I've also tried the following (but also without any results):
SELECT PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY VALUE)
OVER (PARTITION BY Id, MONTH(Date))
FROM tab1 LEFT JOIN tab2
ON tab1.key = tab2.key
WHERE tab1.Date BETWEEN Min(Date) AND tab2.Date
EDIT
So far I have resolved it with
SELECT (CASE WHEN Date = 2010 THEN PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY CASE WHEN Date = 2010 THEN VALUE ELSE NULL END) OVER (PARTITION BY Id, MONTH(Date)) ELSE 0 END) +
(CASE WHEN Date = 2011 THEN PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY CASE WHEN Date <= 2011 THEN VALUE ELSE NULL END) OVER (PARTITION BY Id, MONTH(Date)) ELSE 0 END) +
(CASE WHEN Date = 2012 THEN PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY CASE WHEN Date <= 2012 THEN VALUE ELSE NULL END) OVER (PARTITION BY Id, MONTH(Date)) ELSE 0 END)
FROM tab1
But to be honest, I would like a solution that doesn't assume a priori knowledge of the dates. I've thought about a WHILE loop that updates the column while @MinYear <= @MaxYear, incrementing @MinYear by 1 in every iteration, but in that case I would have to create temporary tables, which I'm trying to avoid.
My idea is to use (Value1+Value2)/2 as the median, since your requirement is a little complicated.
CREATE TABLE MedianData
(
[Date] VARCHAR(100)
,ID INT
,[Value] INT
)
INSERT INTO MedianData VALUES ('01.2010', 1, 100)
,('02.2010', 1, 200)
,('12.2010', 1, 300)
,('01.2011', 1, 150)
,('02.2011', 1, 250)
,('12.2011', 1, 350)
,('01.2012', 1, 200)
,('02.2012', 1, 300)
,('12.2012', 1, 400)
SELECT *
,ROW_NUMBER() OVER ( PARTITION BY Substring([Date],1,2 ) ORDER BY [Date] ) AS [row]
,Substring([Date],1,2 ) as [MONTH]
INTO #Temp_tbl2
FROM MedianData
SELECT
A.Date
,A.ID
,A.[Value]
--Logic is applied here. I used (Value1+value2)/2 as median
,CASE WHEN A.[row] = 3 THEN ( A.[Value] + ( SELECT T.[Value] FROM #Temp_tbl2
T where T.[MONTH] = Substring(A.[Date],1,2 ) AND T.[row] = 1 ) )/2
WHEN A.[row] != 1 THEN (A.total/2)
ELSE A.total END as [Median]
INTO #Temp_table
FROM
(
SELECT *
,ROW_NUMBER() OVER ( PARTITION BY Substring([Date],1,2 ) ORDER BY [Date] ) AS [row]
,SUM ([Value] ) OVER ( PARTITION BY Substring([Date],1,2 ) ORDER BY [Date] ) AS [total]
FROM MedianData
) AS A
--join back to the base table to present the rows in their original order
SELECT MedianData.*, #Temp_table.Median
FROM MedianData
INNER JOIN #Temp_table
ON MedianData.[Date] = #Temp_table.[Date]
drop table #Temp_table
drop table #Temp_tbl2
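For completeness, here is a hedged, year-agnostic sketch (an alternative to the approach above, not a drop-in for it): correlate each row with all rows of the same ID and calendar month up to its own year via OUTER APPLY, and let PERCENTILE_CONT compute the real median. It assumes the MM.YYYY varchar format of the sample data; with a real DATE column you would use MONTH() and YEAR() instead of SUBSTRING:
SELECT t.[Date], t.ID, t.[Value], m.Median
FROM MedianData AS t
OUTER APPLY (
    -- PERCENTILE_CONT is window-only in T-SQL, so compute it over the
    -- correlated subset and collapse the identical per-row results
    SELECT DISTINCT PERCENTILE_CONT(0.5)
           WITHIN GROUP (ORDER BY t2.[Value]) OVER () AS Median
    FROM MedianData AS t2
    WHERE t2.ID = t.ID
      AND SUBSTRING(t2.[Date], 1, 2) = SUBSTRING(t.[Date], 1, 2)  -- same month
      AND SUBSTRING(t2.[Date], 4, 4) <= SUBSTRING(t.[Date], 4, 4) -- up to this row's year
) AS m;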

Postgres - Update running count whenever row meets a certain condition

I have a table with the following entries in them
id | price | quantity
1  | 10    | 75
2  | 10    | 75
3  | 10    | -150
4  | 10    | 75
5  | 10    | -75
What I need to do is to update each row with a sequence number that increments each time the running total returns to 0. In the above example, the cumulative totals would be
id | cum_total
1  | 750
2  | 1500
3  | 0
4  | 750
5  | 0
Desired result
id | price | quantity | seq
1  | 10    | 75       | 1
2  | 10    | 75       | 1
3  | 10    | -150     | 1
4  | 10    | 75       | 2
5  | 10    | -75      | 2
I'm now lost in a spiral of CTEs and window functions and figured I'd ask the experts.
Thanks in advance :-)
Here is one option using analytic functions:
WITH cte AS (
SELECT *, CASE WHEN SUM(price*quantity) OVER (ORDER BY id) = 0 THEN 1 ELSE 0 END AS price_sum
FROM yourTable
),
cte2 AS (
SELECT *, LAG(price_sum, 1, 0) OVER (ORDER BY id) price_sum_lag
FROM cte
)
SELECT id, price, quantity, 1 + SUM(price_sum_lag) OVER (ORDER BY id) cumulative_total
FROM cte2
ORDER BY id;
Demo
You may try running each CTE in succession to see how the logic is working.
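Tracing the sample rows through the two CTEs shows why the lag is needed (values derived from the data above):
id | cum_total | price_sum | price_sum_lag | 1 + SUM(price_sum_lag)
1  | 750       | 0         | 0             | 1
2  | 1500      | 0         | 0             | 1
3  | 0         | 1         | 0             | 1
4  | 750       | 0         | 1             | 2
5  | 0         | 1         | 0             | 2
Shifting the zero-flag down one row means the batch number increments on the row after the running total hits zero, not on the zero row itself.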
With window functions:
SELECT id, price, quantity,
coalesce(
sum(CASE WHEN iszero THEN 1 ELSE 0 END)
OVER (ORDER BY id
ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING),
0
) + 1 AS batch
FROM (SELECT id, price, quantity,
sum(price * quantity) OVER (ORDER BY id) = 0 AS iszero
FROM mytable) AS subq;

T-SQL A problem with SELECT TOP (case [...])

I have a query like this (as you can see, I'd like to retrieve 50% of the total rows, or the first 100 rows, etc.):
-- @AllRowsSelectType is INT
SELECT TOP (
    CASE @AllRowsSelectType
        WHEN 1 THEN 100 PERCENT
        WHEN 2 THEN 50 PERCENT
        WHEN 3 THEN 25 PERCENT
        WHEN 4 THEN 33 PERCENT
        WHEN 5 THEN 50
        WHEN 6 THEN 100
        WHEN 7 THEN 200
    END
) ROW_NUMBER() OVER(ORDER BY [id]) AS row_num, a, b, c, etc
Why do I get the error "Incorrect syntax near the keyword 'PERCENT'." on the "when 1 [...]" line?
The syntax for TOP is:
TOP (expression) [PERCENT]
[ WITH TIES ]
The reserved keyword PERCENT cannot be included in the expression. Instead you can run two different queries: one for when you want PERCENT and another for when you don't.
If you need this to be one query you can run both queries and use UNION ALL to combine the results:
SELECT TOP (
    CASE @AllRowsSelectType
        WHEN 1 THEN 100
        WHEN 2 THEN 50
        WHEN 3 THEN 25
        WHEN 4 THEN 33
        ELSE 0
    END) PERCENT
    ROW_NUMBER() OVER(ORDER BY [id]) AS row_num, a, b, c, ...
UNION ALL
SELECT TOP (
    CASE @AllRowsSelectType
        WHEN 5 THEN 50
        WHEN 6 THEN 100
        WHEN 7 THEN 200
        ELSE 0
    END)
    ROW_NUMBER() OVER(ORDER BY [id]) AS row_num, a, b, c, ...
You're also mixing two different kinds of limit (a percentage versus an absolute row count). Another way to handle that is:
DECLARE @ROW_LIMIT int
IF @AllRowsSelectType < 5
    SELECT @ROW_LIMIT = COUNT(*)/@AllRowsSelectType FROM myTable -- 100%, 50%, 33%, 25%
ELSE
    SELECT @ROW_LIMIT = 50 * POWER(2, @AllRowsSelectType - 5) -- 50, 100, 200...
WITH OrderedMyTable AS
(
    SELECT *, ROW_NUMBER() OVER (ORDER BY id) AS rowNum
    FROM myTable
)
SELECT * FROM OrderedMyTable
WHERE rowNum <= @ROW_LIMIT
You could do:
select top (CASE @FilterType WHEN 2 THEN 50 WHEN 3 THEN 25 WHEN 4 THEN 33 ELSE 100 END) percent * from
(select top (CASE @FilterType WHEN 5 THEN 50 WHEN 6 THEN 100 WHEN 7 THEN 200 ELSE 2147483647 END) * from
    ( <your query here> ) t
) t
Which may be easier to read. (2147483647 is the maximum int, so when a percentage option is chosen the inner TOP effectively passes all rows through.)