Randomly pick N distinct winners with weights for a raffle - postgresql

I've been trying to find a solution to this problem for a day now.
So, I have a table (raffle_tickets) from which I want to pick N distinct users as the winners of a raffle, with each user's probability of being picked weighted by the sum of the tickets they bought, and insert those winners into raffle_winners.
Now, I've found a solution on SO for picking 1 winner, but not N. (It also has a slight issue: if there is, say, exactly 1 entry, it is totally random whether it gets picked or not, which is obviously not acceptable.)
In that same answer (and in answers to other questions) I saw cross join used with generate_series, but from the looks of it that picks with replacement (i.e. with duplicates, not distinct), which is not what I want.
I'm using Postgres/PSQL 14.5.
Here's some of the table structure:
/* Table with raffle tickets. Each user might have multiple entries in it for the same raffle */
CREATE TABLE IF NOT EXISTS raffle_tickets (
    id SERIAL PRIMARY KEY,
    raffle_id BIGINT REFERENCES raffles(id),
    user_id BIGINT NOT NULL,
    num_tickets INT NOT NULL,
    date TIMESTAMP NOT NULL DEFAULT NOW()
);
/* Winners of raffles. Selected based on distinct users and weights from `raffle_tickets` */
CREATE TABLE IF NOT EXISTS raffle_winners (
    id SERIAL PRIMARY KEY,
    raffle_id BIGINT REFERENCES raffles(id),
    user_id BIGINT NOT NULL,
    probability FLOAT NOT NULL,
    CONSTRAINT user_winner_once_per_raffle UNIQUE(raffle_id, user_id) /* A user may not be picked more than once as a winner of the same raffle */
);
/* Simplified table, in reality it has more fields */
CREATE TABLE IF NOT EXISTS raffles (
    id SERIAL PRIMARY KEY,
    num_max_winners INT NOT NULL
);
The code I wrote (below) is based on this answer, if anyone is interested.
WITH users_and_weights AS (
    SELECT
        user_id,
        SUM(num_tickets) AS weight
    FROM raffle_tickets
    WHERE raffle_id = $1
    GROUP BY user_id
), p AS ( /* probability */
    SELECT *,
        (weight / SUM(weight) OVER ()) AS probability
    FROM users_and_weights
), cp AS ( /* cumulative probability */
    SELECT *,
        SUM(p.probability) OVER (
            ORDER BY probability DESC
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS cum_probability
    FROM p
), fp AS ( /* final probability */
    SELECT *,
        cum_probability - probability AS start_probability,
        cum_probability AS end_probability
    FROM cp
)
INSERT INTO raffle_winners (user_id, raffle_id, probability)
SELECT
    user_id,
    $1 AS raffle_id,
    probability
FROM fp
WHERE random() BETWEEN start_probability AND end_probability
LIMIT (SELECT num_max_winners FROM raffles WHERE id = $1)

You are making this more complicated than necessary.
This is simplified for a single raffle:
with gen_tickets as (
    -- Use `generate_series()` to create a row for each ticket
    select user_id
    from raffle_tickets
    cross join lateral generate_series(1, num_tickets)
), shuffle as (
    select user_id, row_number() over (order by random()) as rn
    from gen_tickets
), min_row as (
    -- Limit to one win per user
    select user_id, min(rn) as rn
    from shuffle
    group by user_id
), winner_order as (
    select user_id, row_number() over (order by rn) as rn
    from min_row
)
select *
from winner_order
where rn <= <num_max_winners>
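If you want this to feed straight into raffle_winners for a single raffle, here's a minimal sketch of how it could be wired up (my adaptation, not part of the answer above; it assumes $1 is the raffle id bind parameter and records each user's share of the tickets as probability):
with gen_tickets as (
    -- one row per ticket, for this raffle only
    select user_id
    from raffle_tickets
    cross join lateral generate_series(1, num_tickets)
    where raffle_id = $1
), shuffle as (
    select user_id, row_number() over (order by random()) as rn
    from gen_tickets
), min_row as (
    -- limit to one win per user
    select user_id, min(rn) as rn
    from shuffle
    group by user_id
), weights as (
    -- each user's ticket share, recorded as `probability` (an assumption on my part)
    select user_id,
        sum(num_tickets)::float / sum(sum(num_tickets)) over () as probability
    from raffle_tickets
    where raffle_id = $1
    group by user_id
)
insert into raffle_winners (user_id, raffle_id, probability)
select m.user_id, $1, w.probability
from min_row m
join weights w using (user_id)
order by m.rn
limit (select num_max_winners from raffles where id = $1);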

To those interested, here's what I ended up using.
(It's not perfect, but I already spent way too much time on it, so if anyone feels like fixing it up, please do.)
(As a side-note, is there a good way to pass query parameters to a function block like this? I'm using asyncpg (Python); see the sketch after the block below.)
DO
$do$
DECLARE
    num_uniq_participants INTEGER;
    num_max_winners_to_select INTEGER;
BEGIN
    num_max_winners_to_select := (
        SELECT num_max_winners
        FROM raffles
        WHERE id = {raffle_id}
    );
    num_uniq_participants := (
        SELECT COUNT(*)
        FROM (
            SELECT DISTINCT user_id
            FROM raffle_tickets
            WHERE raffle_id = {raffle_id}
        ) AS q
    );
    IF (num_max_winners_to_select >= num_uniq_participants) THEN
        /* There are fewer participants than the required number of winners, so everyone is a winner */
        INSERT INTO raffle_winners (user_id, raffle_id, probability)
        SELECT DISTINCT
            user_id,
            {raffle_id} AS raffle_id,
            1 AS probability
        FROM raffle_tickets
        WHERE raffle_id = {raffle_id};
    ELSE
        /**
         * Pick winners.
         * Each iteration the winners are excluded from the
         * newly pickable participant list.
         *
         * TODO:
         * Right now this isn't super efficient, as we always re-calculate
         * the weight of each participant in each iteration.
         * For now it's okay, but something to keep in mind for the future.
         * (Though, unless there are hundreds of thousands of participants, it shouldn't be too bad.)
         **/
        FOR i IN 1..LEAST(num_max_winners_to_select, num_uniq_participants) LOOP
            WITH users_and_weights AS (
                SELECT
                    user_id,
                    SUM(num_tickets) AS weight
                FROM raffle_tickets rt
                WHERE NOT EXISTS ( /* Don't re-pick winners */
                    SELECT 1
                    FROM raffle_winners rw
                    WHERE rw.user_id = rt.user_id AND rw.raffle_id = rt.raffle_id
                ) AND raffle_id = {raffle_id}
                GROUP BY user_id
            ), p AS ( /* probability */
                SELECT *,
                    (weight / SUM(weight) OVER ()) AS probability
                FROM users_and_weights
            ), cp AS ( /* cumulative probability */
                SELECT *,
                    SUM(p.probability) OVER (
                        ORDER BY probability DESC
                        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
                    ) AS cum_probability
                FROM p
            ), fp AS ( /* final probability */
                SELECT *,
                    cum_probability - probability AS start_probability,
                    cum_probability AS end_probability
                FROM cp
            ), const_rnd AS (
                /* Must put this into a CTE, otherwise it would be re-evaluated
                 * for each row and might cause no entry to be selected at all. */
                SELECT RANDOM() AS rnd
            )
            INSERT INTO raffle_winners (user_id, raffle_id, probability)
            SELECT
                user_id,
                {raffle_id} AS raffle_id,
                probability
            FROM fp
            WHERE (SELECT rnd FROM const_rnd) BETWEEN start_probability AND end_probability
            LIMIT 1; /* Pick 1 winner per iteration */
        END LOOP;
    END IF;
END
$do$;
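On the side-note about parameters: a DO block cannot take bind parameters at all, which is what forces the {raffle_id} string formatting above. A common workaround is to move the body into a plpgsql function, which asyncpg can then call with a real parameter. A minimal sketch (the function name and the one-line body are mine, just to show the mechanism):
CREATE OR REPLACE FUNCTION pick_raffle_winners(p_raffle_id BIGINT)
RETURNS void AS $func$
BEGIN
    /* Paste the body of the DO block here, replacing every
     * {raffle_id} placeholder with the p_raffle_id parameter. */
    PERFORM 1 FROM raffles WHERE id = p_raffle_id; /* stand-in for the real body */
END
$func$ LANGUAGE plpgsql;
From asyncpg this becomes await conn.execute('SELECT pick_raffle_winners($1)', raffle_id), with no string interpolation involved.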

Related

Weighted Random Selection

I have two tables with the most common first and last names. Each table has basically two fields:
Tables
CREATE TABLE "common_first_name" (
"first_name" text PRIMARY KEY, --The text representing the name
"ratio" numeric NOT NULL, -- the % of how many times it occurs compared to the other names.
"inserted_at" timestamp WITH time zone DEFAULT timezone('utc'::text, now()) NOT NULL,
"updated_at" timestamp WITH time zone DEFAULT timezone('utc'::text, now()) NOT NULL
);
CREATE TABLE "common_last_name" (
"last_name" text PRIMARY KEY, --The text representing the name
"ratio" numeric NOT NULL, -- the % of how many times it occurs compared to the other names.
"inserted_at" timestamp WITH time zone DEFAULT timezone('utc'::text, now()) NOT NULL,
"updated_at" timestamp WITH time zone DEFAULT timezone('utc'::text, now()) NOT NULL
);
P.S: The TOP 1 name occurs only ~ 1.8% of the time. The tables have 1000 rows each.
Function (Pseudo, not READY)
CREATE OR REPLACE FUNCTION create_sample_data(p_number_of_records INT)
RETURNS VOID
AS $$
DECLARE
    SUM_OF_WEIGHTS CONSTANT INT := 100;
BEGIN
    FOR i IN 1..coalesce(p_number_of_records, 0) LOOP
        -- Get a random first and last name, taking their probability (ratio)
        -- into consideration, e.g. via round(random() * SUM_OF_WEIGHTS);
        -- create_person(random_first_name || ' ' || random_last_name);
        NULL; -- placeholder so the loop body parses
    END LOOP;
END
$$
LANGUAGE plpgsql VOLATILE;
P.S.: The sum of all ratios for each name (per table) sums up to 100%.
I want to run a function N times and get a name and a surname to create sample data... both tables have 1000 rows each.
The sample size can be anywhere from 1000 full names to 1000000 names, so if there is a "fast" way of doing this random weighted function, even better.
Any suggestion of how to do it in PL/PGSQL?
I am using PG 13.3 on SUPABASE.IO.
Thanks
Given the small input dataset, it's straightforward to do this in pure SQL. Use CTEs to build lower & upper bound columns for each row in each of the common_FOO_name tables, then use generate_series() to generate sets of random numbers. Join everything together, and use the random value between the bounds as the WHERE clause.
with first_names_weighted as (
    select first_name,
        sum(ratio) over (order by first_name) - ratio as lower_bound,
        sum(ratio) over (order by first_name) as upper_bound
    from common_first_name
),
last_names_weighted as (
    select last_name,
        sum(ratio) over (order by last_name) - ratio as lower_bound,
        sum(ratio) over (order by last_name) as upper_bound
    from common_last_name
),
randoms as (
    select random() * (select sum(ratio) from common_first_name) as f_random,
        random() * (select sum(ratio) from common_last_name) as l_random
    from generate_series(1, 32)
)
select r, first_name, last_name
from randoms r
cross join first_names_weighted f
cross join last_names_weighted l
where f.lower_bound <= r.f_random and r.f_random <= f.upper_bound
  and l.lower_bound <= r.l_random and r.l_random <= l.upper_bound;
Change the value passed to generate_series() to control how many names to generate. If it's important that it be a function, you can just use a LANGUAGE SQL function definition to parameterize that number:
https://www.db-fiddle.com/f/mmGQRhCP2W1yfhZTm1yXu5/3
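In case the fiddle link rots, here is a sketch of that wrapper (the function and output column names are mine; the body is the query above with 32 swapped for the parameter):
CREATE OR REPLACE FUNCTION generate_random_names(p_count int)
RETURNS TABLE (fname text, lname text) AS $$
    with first_names_weighted as (
        select first_name,
            sum(ratio) over (order by first_name) - ratio as lower_bound,
            sum(ratio) over (order by first_name) as upper_bound
        from common_first_name
    ),
    last_names_weighted as (
        select last_name,
            sum(ratio) over (order by last_name) - ratio as lower_bound,
            sum(ratio) over (order by last_name) as upper_bound
        from common_last_name
    ),
    randoms as (
        select random() * (select sum(ratio) from common_first_name) as f_random,
            random() * (select sum(ratio) from common_last_name) as l_random
        from generate_series(1, p_count)
    )
    select f.first_name, l.last_name
    from randoms r
    cross join first_names_weighted f
    cross join last_names_weighted l
    where f.lower_bound <= r.f_random and r.f_random <= f.upper_bound
      and l.lower_bound <= r.l_random and r.l_random <= l.upper_bound;
$$ LANGUAGE sql VOLATILE;
-- e.g. 1000 full names: select fname || ' ' || lname from generate_random_names(1000);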

Generate data with at least one occurrence

I have three tables:
create table genres
(
    genre_id serial primary key,
    genre_name varchar NOT NULL UNIQUE
);
create table movies
(
    movie_id serial primary key,
    movie_name varchar NOT NULL
);
create table movie_genres
(
    movie_id integer references movies NOT NULL,
    genre_id integer references genres NOT NULL,
    PRIMARY KEY(movie_id, genre_id)
);
Tables genres and movies are full of data and I want to generate some random data for table movie_genres, so that every movie has at least one genre.
I tried it this way, but then it is possible for a movie to be without any genre. Can anyone help me with that, please?
insert into movie_genres
select movie_id, genre_id
from genres cross join movies
where random() < 0.15;
Hmm, you can try to join a derived table in which you first select one random genre and then UNION some more randomly.
INSERT INTO movie_genres (movie_id, genre_id)
SELECT m.movie_id,
       rg.genre_id
FROM movies m
CROSS JOIN ((SELECT g.genre_id
             FROM genres g
             ORDER BY random()
             LIMIT 1)
            UNION
            (SELECT g.genre_id
             FROM genres g
             WHERE random() < 0.15)) rg;
That however means that every movie has that one genre selected first. To overcome this and have the first genre be random per movie, a lateral join can be used. (Remark: You need to use some column from the outer table in the derived table as otherwise the optimizer seems to optimize the LATERAL away.)
INSERT INTO movie_genres (movie_id, genre_id)
SELECT rg.movie_id,
       rg.genre_id
FROM movies m
CROSS JOIN LATERAL ((SELECT g.genre_id,
                            m.movie_id -- that's just here to force the optimizer to keep the join lateral
                     FROM genres g
                     ORDER BY random()
                     LIMIT 1)
                    UNION
                    (SELECT g.genre_id,
                            m.movie_id
                     FROM genres g
                     WHERE random() < 0.15)) rg;
db<>fiddle
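As a quick sanity check afterwards (my addition, not part of the answer), the following should return zero rows if every movie got at least one genre:
select m.movie_id
from movies m
left join movie_genres mg on mg.movie_id = m.movie_id
where mg.movie_id is null;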

PostgreSQL Transaction to Use Results from Query to Insert and Query another Table then Return Original Query Results

I am writing an application that stores data on file samples and YARA signatures. Essentially, in a single transaction, I need to execute a query, reference those results in an insert and another query, then return the original results. I have three tables that are relevant to this discussion:
samples - this is the table that stores information on files that need to be scanned with the associated YARA signatures.
yararules - the table that stores information on the YARA rules.
yaratracker - a table that tracks the sample/rule pairs that have been processed thus far.
In a single transaction, the application needs to:
Get a batch of unique sample/rule pairs that have not yet been processed. Preferably, this query will get all non-processed rules associated with a single sample (i.e. if I'm going to run the YARA rules on a sample, I want to run all of the YARA rules not yet processed on that sample so that I only have to load the sample into memory once).
Get a unique list of id,sha256 from the batch found in step 1.
Insert the batch from step 1 into the yaratracker table with the matchcount column equal to 0 and the complete column set to false.
I can accomplish Step 1 with the query below, but I don't know how to reference those results to accomplish step 2. I've tried looking into variables, but apparently there isn't one that can hold multiple rows. I've looked into using a cursor, but I can't seem to use the cursor with a subsequent command and then return the cursor.
SELECT s.id, r.id
FROM sample s CROSS JOIN yararules r
WHERE r.status = 'Disabled' AND NOT EXISTS(
    SELECT 1 FROM yaratracker q
    WHERE q.sample_id = s.id AND q.rule_id = r.id
)
ORDER BY s.id
LIMIT 1000;
The relevant database schema looks like this.
CREATE TYPE samplelist AS ENUM ('Whitelist', 'Blacklist', 'Greylist', 'Unknown');
CREATE TABLE samples (
    id SERIAL PRIMARY KEY,
    md5 CHAR(32) NOT NULL,
    sha1 CHAR(40) NOT NULL,
    sha256 CHAR(64) NOT NULL,
    total INT NOT NULL,
    positives INT NOT NULL,
    list SAMPLELIST NOT NULL,
    filetype VARCHAR(16) NOT NULL,
    submitted TIMESTAMP WITH TIME ZONE NOT NULL,
    user_id SERIAL REFERENCES users
);
CREATE UNIQUE INDEX md5_idx ON samples (md5);
CREATE UNIQUE INDEX sha1_idx ON samples (sha1);
CREATE UNIQUE INDEX sha256_idx ON samples (sha256);
CREATE TYPE rulestatus AS ENUM ('Enabled', 'Disabled');
CREATE TABLE yararules (
    id SERIAL PRIMARY KEY,
    name VARCHAR(32) NOT NULL UNIQUE,
    description TEXT NOT NULL,
    rules TEXT NOT NULL,
    lastmodified TIMESTAMP WITH TIME ZONE NOT NULL,
    status rulestatus NOT NULL,
    user_id SERIAL REFERENCES users ON DELETE CASCADE
);
CREATE TABLE yaratracker (
    id SERIAL PRIMARY KEY,
    rule_id SERIAL REFERENCES yararules ON DELETE CASCADE,
    sample_id SERIAL REFERENCES samples ON DELETE CASCADE,
    matchcount INT NOT NULL,
    complete BOOL NOT NULL
);
CREATE INDEX composite_idx ON yaratracker (rule_id, sample_id);
CREATE INDEX complete_idx ON yaratracker (complete);
INSERT INTO target_table(a, b, c, ...)
SELECT sid, rid, sha, ...
FROM (
    SELECT s.id AS sid
        , r.id AS rid
        , s.sha256 AS sha
        , ...
        , ROW_NUMBER() OVER (PARTITION BY s.id) AS rn -- <<<--- HERE
    FROM sample s CROSS JOIN yararules r
    WHERE r.status = 'Disabled' AND NOT EXISTS(
        SELECT 1 FROM yaratracker q
        WHERE q.sample_id = s.id
          AND q.rule_id = r.id
    )
    ORDER BY s.id
    LIMIT 1000
) src
WHERE src.rn = 1; -- <<<--- HERE
The WHERE src.rn = 1 will restrict the cross-join to deliver only one tuple per sample.id (both id and sha256 are unique in the sample table, so picking a unique id has the same effect as picking a unique sha256)
The complete cross-join result will never be generated; the optimiser is smart enough to push down the WHERE rn=1 condition into the subquery.
Note: the LIMIT 1000 should probably be removed (or pulled up to a higher level)
If you REALLY need to save the results from the CROSS JOIN, you could use a chain of CTEs (expect a performance degradation ...)
WITH big AS (
    SELECT s.id AS sample_id
        , r.id AS rule_id
        , s.sha256
        -- , ...
        , ROW_NUMBER() OVER (PARTITION BY s.id) AS rn -- <<<--- HERE
    FROM sample s
    CROSS JOIN yararules r
    WHERE r.status = 'Disabled' AND NOT EXISTS(
        SELECT 1 FROM yaratracker q
        WHERE q.sample_id = s.id AND q.rule_id = r.id
    )
)
, ins AS (
    INSERT INTO target_table(a, b, c, ...)
    SELECT b.sample_id, b.rule_id, b.sha256, ...
    FROM big b
    WHERE b.rn = 1 -- <<<--- HERE
    RETURNING *
)
INSERT INTO yaratracker (rule_id, sample_id, matchcount, complete)
SELECT b.rule_id, b.sample_id, 0, False
FROM big b
-- LEFT JOIN ins i ON i.a = b.sample_id AND i.b = b.rule_id
;
NOTE: the yaratracker(rule_id, sample_id) columns should not be serials but plain integers, referencing yararules(id) and samples(id).
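For example, the tracking table could be declared like this instead (a sketch matching the note above):
CREATE TABLE yaratracker (
    id SERIAL PRIMARY KEY,
    rule_id INTEGER NOT NULL REFERENCES yararules(id) ON DELETE CASCADE,
    sample_id INTEGER NOT NULL REFERENCES samples(id) ON DELETE CASCADE,
    matchcount INT NOT NULL,
    complete BOOL NOT NULL
);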

PL/pgSQL function to randomly select an id

Goal:
pre-populate a table with a list of sequential ids, e.g. from 1 to 1,000,000. The table has an additional column that is nullable. NULL values are marked as unassigned and non-NULL values are marked as assigned.
have a function I can call that asks for x number of randomly chosen ids from the table which have not been assigned.
This is for something quite specific and while I understand there are different ways of doing this, I'd like to know if there's a solution to the flaw in this particular implementation.
I have something that partially works, but I'm wondering where the flaw in the function is.
Here's the table:
CREATE SEQUENCE accounts_seq MINVALUE 700000000001 NO MAXVALUE;
CREATE TABLE accounts (
    id BIGINT PRIMARY KEY default nextval('accounts_seq'),
    client VARCHAR(25),
    UNIQUE(id, client)
);
This function gen_account_ids is just a one-time setup to pre-populate the table with a fixed number of rows, all marked as unassigned.
/*
 This function will insert new rows into the accounts table with ids being
 generated by a sequence, and client being NULL. A NULL client indicates
 the account has not yet been assigned.
*/
CREATE OR REPLACE FUNCTION gen_account_ids(bigint)
RETURNS INT AS $gen_account_ids$
DECLARE
    -- count is the number of new accounts you want generated
    count ALIAS FOR $1;
    -- rowcount is returned as the number of rows inserted
    rowcount int;
BEGIN
    INSERT INTO accounts(client) SELECT NULL FROM generate_series(1, count);
    GET DIAGNOSTICS rowcount = ROW_COUNT;
    RETURN rowcount;
END;
$gen_account_ids$ LANGUAGE plpgsql;
So, I use this to pre-populate the table with, say 1000 records:
SELECT gen_account_ids(1000);
The next function assign is meant to randomly select an unassigned id (unassigned means client column is null), and update it with a client value so it becomes assigned. It returns the number of rows affected.
It works sometimes, but I do believe there are collisions occurring -- which is why I tried for DISTINCT, but it often returns fewer than the desired number of rows. For example, if I select assign(100, 'foo'); it might return 95 rows instead of the desired 100.
How can I modify this to make it always return the exact desired rows?
/*
 This will assign ids to a client randomly
 #param int is the number of account numbers to generate
 #param varchar(10) is a string descriptor for the client
 #returns the number of rows affected -- should be the same as the input int
 Call it like this: `SELECT * FROM assign(100, 'FOO')`
*/
CREATE OR REPLACE FUNCTION assign(INT, VARCHAR(10))
RETURNS INT AS $$
DECLARE
    total ALIAS FOR $1;
    clientname ALIAS FOR $2;
    rowcount int;
BEGIN
    UPDATE accounts SET client = clientname WHERE id IN (
        SELECT DISTINCT trunc(random() * (
            (SELECT max(id) FROM accounts WHERE client IS NULL) -
            (SELECT min(id) FROM accounts WHERE client IS NULL)) +
            (SELECT min(id) FROM accounts WHERE client IS NULL))
        FROM generate_series(1, total));
    GET DIAGNOSTICS rowcount = ROW_COUNT;
    RETURN rowcount;
END;
$$ LANGUAGE plpgsql;
This is loosely based on this where you can do something like SELECT trunc(random() * (100 - 1) + 1) FROM generate_series(1,5); which will select 5 random numbers between 1 and 100.
My goal is to do something similar where I select a random id between the min and max unassigned rows, and mark it for update.
This isn't the best answer b/c it does involve full table scans, but in my situation I'm not concerned about performance, and it works. This is based off @CraigRinger's reference to the blog post on getting random tuples.
I'd be generally interested in hearing about other (perhaps better) solutions -- and am specifically curious about why the original solution falls short, and about what @klin devised.
So, here's my brute-force random-order solution:
-- generate a million unassigned rows with null client column
insert into accounts(client) select null from generate_series(1, 1000000);
-- assign 1000 random rows to client 'foo'
update accounts set client = 'foo' where id in
(select id from accounts where client is null order by random() limit 1000);
Because the ids of a random subset of rows are not consecutive, select a random row_number() instead of a random id.
with nulls as ( -- base query
    select id
    from accounts
    where client is null
),
randoms as ( -- calculate random int in range 1..count(nulls.*)
    select trunc(random() * (count(*) - 1) + 1)::int random_value
    from nulls
),
row_numbers as ( -- add row numbers to nulls
    select id, row_number() over (order by id) rn
    from nulls
)
select id
from row_numbers, randoms
where rn = random_value; -- random row number
A function is not necessary here, but you can easily place the query in a function body if needed.
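For example, a minimal wrapper could look like this (the function name is mine; the body is the query above unchanged):
CREATE OR REPLACE FUNCTION random_unassigned_id()
RETURNS bigint AS $$
    with nulls as ( -- base query
        select id
        from accounts
        where client is null
    ),
    randoms as ( -- random int in range 1..count(nulls.*)
        select trunc(random() * (count(*) - 1) + 1)::int random_value
        from nulls
    ),
    row_numbers as ( -- add row numbers to nulls
        select id, row_number() over (order by id) rn
        from nulls
    )
    select id
    from row_numbers, randoms
    where rn = random_value;
$$ LANGUAGE sql VOLATILE;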
The following query updates 5 random rows with null client.
update accounts
set client = 'new value' -- <-- clientname
where id in (
    with nulls as ( -- base query
        select id
        from accounts
        where client is null
    ),
    randoms as ( -- calculate random int in range 1..count(nulls.*)
        select i, trunc(random() * (count(*) - 1) + 1)::int random_value
        from nulls
        cross join generate_series(1, 5) i -- <-- total
        group by 1
    ),
    row_numbers as ( -- add row numbers to nulls in order by id
        select id, row_number() over (order by id) rn
        from nulls
    )
    select id
    from row_numbers, randoms
    where rn = random_value -- random row number
)
However, there is no certainty that the query will update exactly 5 rows, because
select trunc(random()* (max_value - 1) + 1)::int
from generate_series(1, n)
is not a correct way to generate n different random values. The probability of repetitions increases with the quotient n / max_value.
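If exactly n distinct values are required, one simple alternative (my suggestion, in the spirit of the brute-force answer above) is to shuffle the candidate range and take the first n:
-- 5 distinct random integers from 1..1000, no repetitions possible
select i
from generate_series(1, 1000) i
order by random()
limit 5;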

How to work around the "Recursive CTE member can refer itself only in FROM clause" requirement?

I'm trying to run a graph search to find all nodes accessible from a starting point, like so:
with recursive
nodes_traversed as (
    select START_NODE ID
    from START_POSITION
    union all
    select ed.DST_NODE
    from EDGES ed
    join nodes_traversed NT
        on (NT.ID = ed.START_NODE)
        and (ed.DST_NODE not in (select ID from nodes_traversed))
)
select distinct * from nodes_traversed
Unfortunately, when I try to run that, I get an error:
Recursive CTE member (nodes_traversed) can refer itself only in FROM clause.
That "not in select" clause is important to the recursive expression, though, as it provides the ending point. (Without it, you get infinite recursion.) Using generation counting, like in the accepted answer to this question, would not help, since this is a highly cyclic graph.
Is there any way to work around this without having to create a stored proc that does it iteratively?
Here is my solution; it uses a global temporary table, and I have limited the recursion by level and by the nodes already collected in the temporary table.
I am not sure how it will perform on a large set of data.
create procedure get_nodes (
    START_NODE integer)
returns (
    NODE_ID integer)
as
declare variable C1 integer;
declare variable C2 integer;
begin
    /**
    create global temporary table id_list(
        id integer
    );
    create index id_list_idx1 ON id_list (id);
    */
    delete from id_list;
    while (1 = 1) do
    begin
        select count(distinct id) from id_list into :c1;
        insert into id_list
        select id from
        (
            with recursive nodes_traversed as (
                select :START_NODE AS ID, 0 as Lv
                from RDB$DATABASE
                union all
                select ed.DST_NODE, Lv + 1
                from edges ed
                join nodes_traversed NT
                    on (NT.ID = ed.START_NODE)
                    and nt.Lv < 5 -- Max recursion level
                    and nt.id not in (select id from id_list)
            )
            select distinct id from nodes_traversed
        );
        select count(distinct id) from id_list into :c2;
        if (c1 = c2) then break;
    end
    for select distinct id from id_list into :node_id do
    begin
        suspend;
    end
end
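For what it's worth, the RDB$DATABASE reference and SUSPEND suggest this is Firebird rather than PostgreSQL; assuming so, the selectable procedure is queried like a table:
select node_id from get_nodes(1); -- 1 = example starting node id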