Tracking payments on invoices - postgresql

I am trying to track payments made by customers using the following tables:
CREATE TABLE IF NOT EXISTS invoices
(
    id integer NOT NULL DEFAULT nextval('invoices_id_seq'::regclass),
    customer_id integer NOT NULL
);

CREATE TABLE IF NOT EXISTS invoice_entries
(
    id integer NOT NULL DEFAULT nextval('invoice_entries_id_seq'::regclass),
    price numeric,
    invoice_id integer NOT NULL
);

CREATE TABLE IF NOT EXISTS customer_transactions
(
    id integer NOT NULL DEFAULT nextval('customer_transactions_id_seq'::regclass),
    total_amount numeric NOT NULL,
    invoice_id integer,
    customer_id integer NOT NULL
);
My problem is that a payment can be made either to the customer's account or to a specific invoice.
My goal is to generate aggregated columns that show the running balance, as well as the amount paid on any given invoice. If the payment was made directly to the customer's account, it will apply to any remaining balances on each invoice.
This is what I have so far:
WITH RECURSIVE CTE_R AS (
    SELECT
        ROW_NUMBER() OVER (PARTITION BY ct.customer_id) AS row_id,
        pb.invoice_balance,
        pb.invoice_id,
        ct.total_amount
    FROM customer_invoice_balances() AS pb
    LEFT JOIN customer_transactions AS ct ON ct.customer_id = pb.customer_id
),
CTE AS (
    SELECT
        row_id,
        invoice_balance,
        invoice_id,
        total_amount,
        invoice_balance AS remaining_balance,
        0::numeric AS amount_paid
    FROM CTE_R
    WHERE row_id = 1
    UNION ALL
    SELECT
        cr.row_id,
        cr.invoice_balance,
        cr.invoice_id,
        cr.total_amount,
        CASE
            WHEN cr.invoice_id = c.invoice_id THEN remaining_balance - cr.total_amount
            ELSE remaining_balance
        END,
        CASE
            WHEN cr.invoice_id = c.invoice_id THEN amount_paid + cr.total_amount
            ELSE amount_paid
        END
    FROM CTE c, CTE_R cr
    WHERE c.row_id = cr.row_id - 1
)
SELECT * FROM CTE;
At this point I am stumped, because the remaining balance will go negative and not move to the next invoice that has a balance. Am I going about this the right way?
Edit: The other problem I am having is that the recursive CTE is very inefficient, and the query takes far too long.
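One way to avoid the recursion altogether is to allocate the account-level credit with window functions. Below is a non-recursive sketch, assuming that transactions with a NULL invoice_id are account-level payments, that account credit is applied to invoices in ascending id order, and that invoice totals come from invoice_entries (rather than the customer_invoice_balances() helper, whose definition isn't shown):
WITH inv AS (          -- total billed per invoice
    SELECT i.id AS invoice_id, i.customer_id,
           COALESCE(SUM(ie.price), 0) AS invoice_total
    FROM invoices i
    LEFT JOIN invoice_entries ie ON ie.invoice_id = i.id
    GROUP BY i.id, i.customer_id
), direct AS (         -- payments made against a specific invoice
    SELECT invoice_id, SUM(total_amount) AS paid_direct
    FROM customer_transactions
    WHERE invoice_id IS NOT NULL
    GROUP BY invoice_id
), acct AS (           -- payments made to the account as a whole
    SELECT customer_id, SUM(total_amount) AS account_credit
    FROM customer_transactions
    WHERE invoice_id IS NULL
    GROUP BY customer_id
), bal AS (            -- open balance per invoice, plus the open balance of all earlier invoices
    SELECT inv.invoice_id, inv.customer_id, inv.invoice_total,
           COALESCE(d.paid_direct, 0) AS paid_direct,
           GREATEST(inv.invoice_total - COALESCE(d.paid_direct, 0), 0) AS open_balance,
           COALESCE(SUM(GREATEST(inv.invoice_total - COALESCE(d.paid_direct, 0), 0))
                        OVER (PARTITION BY inv.customer_id
                              ORDER BY inv.invoice_id
                              ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING),
                    0) AS prior_open
    FROM inv
    LEFT JOIN direct d USING (invoice_id)
)
SELECT b.customer_id, b.invoice_id, b.invoice_total,
       b.paid_direct
         + LEAST(b.open_balance,
                 GREATEST(COALESCE(a.account_credit, 0) - b.prior_open, 0)) AS amount_paid,
       b.open_balance
         - LEAST(b.open_balance,
                 GREATEST(COALESCE(a.account_credit, 0) - b.prior_open, 0)) AS remaining_balance
FROM bal b
LEFT JOIN acct a ON a.customer_id = b.customer_id
ORDER BY b.customer_id, b.invoice_id;
Clamping each invoice's slice of the account credit with LEAST/GREATEST keeps the allocation from going negative and lets the remainder spill over to the next invoice with an open balance, without any row-by-row recursion.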

Related

Is it possible to find duplicating records in two columns simultaneously in PostgreSQL?

I have the following database schema (oversimplified):
create sequence partners_partner_id_seq;
create table partners
(
partner_id integer default nextval('partners_partner_id_seq'::regclass) not null primary key,
name varchar(255) default NULL::character varying,
company_id varchar(20) default NULL::character varying,
vat_id varchar(50) default NULL::character varying,
is_deleted boolean default false not null
);
INSERT INTO partners(name, company_id, vat_id) VALUES('test1','1010109191191', 'BG1010109191192');
INSERT INTO partners(name, company_id, vat_id) VALUES('test2','1010109191191', 'BG1010109191192');
INSERT INTO partners(name, company_id, vat_id) VALUES('test3','3214567890102', 'BG1010109191192');
INSERT INTO partners(name, company_id, vat_id) VALUES('test4','9999999999999', 'GE9999999999999');
I am trying to figure out how to return test1, test2 (because the company_id column value duplicates vertically) and test3 (because the vat_id column value duplicates vertically as well).
To put it in other words - I need to find duplicating company_id and vat_id records and group them together, so that test1, test2 and test3 would be together, because they duplicate by company_id and vat_id.
So far I have the following query:
SELECT *
FROM (
SELECT *, LEAD(row, 1) OVER () AS nextrow
FROM (
SELECT *, ROW_NUMBER() OVER (w) AS row
FROM partners
WHERE is_deleted = false
AND ((company_id != '' AND company_id IS NOT null) OR (vat_id != '' AND vat_id IS NOT NULL))
WINDOW w AS (PARTITION BY company_id, vat_id ORDER BY partner_id DESC)
) x
) y
WHERE (row > 1 OR nextrow > 1)
AND is_deleted = false
This successfully shows all company_id duplicates, but does not appear to show vat_id ones - test3 row is missing. Is this possible to be done within one query?
Here is a db-fiddle with the schema, data and predefined query reproducing my result.
You can do this with recursion, but depending on the size of your data you may want to iterate instead.
The trick is to make the name just another match key instead of treating it differently from the company_id and vat_id:
create table partners (
partner_id integer generated always as identity primary key,
name text,
company_id text,
vat_id text,
is_deleted boolean not null default false
);
insert into partners (name, company_id, vat_id) values
('test1','1010109191191', 'BG1010109191192'),
('test2','1010109191191', 'BG1010109191192'),
('test3','3214567890102', 'BG1010109191192'),
('test4','9999999999999', 'GE9999999999999'),
('test5','3214567890102', 'BG8888888888888'),
('test6','2983489023408', 'BG8888888888888')
;
I added a couple of test cases and left in the lone partner.
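One note before the query: the sort() function used on the matchlist below comes from the intarray extension, so it may need to be enabled first:
create extension if not exists intarray;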
with recursive keys as (
select partner_id,
array['n_'||name, 'c_'||company_id, 'v_'||vat_id] as matcher,
array[partner_id] as matchlist,
1 as size
from partners
), matchers as (
select *
from keys
union all
select p.partner_id, c.matcher,
p.matchlist||c.partner_id as matchlist,
p.size + 1
from matchers p
join keys c
on c.matcher && p.matcher
and not p.matchlist @> array[c.partner_id]
), largest as (
select distinct sort(matchlist) as matchlist
from matchers m
where not exists (select 1
from matchers
where matchlist @> m.matchlist
and size > m.size)
-- and size > 1
)
select *
from largest
;
  matchlist
-------------
 {1,2,3,5,6}
 {4}
fiddle
UPDATE: Since the recursive query did not perform well, here is an iterative example in plpgsql that uses a temporary table:
create temporary table match1 (
partner_id int not null,
group_id int not null,
matchkey uuid not null
);
create index on match1 (matchkey);
create index on match1 (group_id);
insert into match1
select partner_id, partner_id, md5('n_'||name)::uuid from partners
union all
select partner_id, partner_id, md5('c_'||company_id)::uuid from partners
union all
select partner_id, partner_id, md5('v_'||vat_id)::uuid from partners;
do $$
declare _cnt bigint;
begin
loop
with consolidate as (
select group_id,
min(group_id) over (partition by matchkey) as new_group_id
from match1
), minimize as (
select group_id, min(new_group_id) as new_group_id
from consolidate
group by group_id
), doupdate as (
update match1
set group_id = m.new_group_id
from minimize m
where m.group_id = match1.group_id
and m.new_group_id != match1.group_id
returning *
)
select count(*) into _cnt from doupdate;
if _cnt = 0 then
exit;
end if;
end loop;
end;
$$;
updated fiddle
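Once the loop exits, the consolidated groups can be read back out of match1 with a plain aggregate. A sketch (drop the HAVING clause if lone partners should be listed too):
select group_id,
       array_agg(distinct partner_id order by partner_id) as partners
from match1
group by group_id
having count(distinct partner_id) > 1;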

PostgreSQL count other values of ID that have the same value of other column

Let's say we have the following table that stores the id of an observation and its address_id. You can create the table with the following code:
drop table if exists schema.pl_address_cnt;
create table schema.pl_address_cnt (
id serial,
address_id int);
insert into schema.pl_address_cnt(address_id) values
(100), (101), (100), (101), (100), (125), (128), (200), (200), (100);
My task is to count for each id how many other ids (thus -1) have the same address_id. I've come up with a solution that turns out to be quite expensive (explain) on the original dataset. I wonder whether my solution can be somehow optimised.
with tmp_table as (select address_id
, count(distinct id) as id_count
from schema.pl_address_cnt
group by address_id
)
select id
, id_count - 1
from schema.pl_address_cnt as pac
left join tmp_table as tt on tt.address_id=pac.address_id;
You can try omitting the CTE and doing a self LEFT JOIN on the same address_id but a different id, then aggregating:
SELECT pac1.id,
count(pac2.id)
FROM pl_address_cnt pac1
LEFT JOIN pl_address_cnt pac2
ON pac1.address_id = pac2.address_id
AND pac1.id <> pac2.id
GROUP BY pac1.id
ORDER BY pac1.id;
For performance you can try indexes on (address_id, id) and (id).
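Another option (a sketch, assuming each row's id is unique) is a window function, which avoids both the CTE and the self join:
SELECT id,
       COUNT(*) OVER (PARTITION BY address_id) - 1 AS id_count
FROM schema.pl_address_cnt
ORDER BY id;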

PostgreSQL Transaction to Use Results from Query to Insert and Query another Table then Return Original Query Results

I am writing an application that stores data on file samples and YARA signatures. Essentially, in a single transaction, I need to execute a query, reference those results in an insert and another query, then return the original results. I have three tables that are relevant to this discussion:
samples - this is the table that stores information on files that need to be scanned with the associated YARA signatures.
yararules - the table that stores information on the YARA rules.
yaratracker - a table that tracks the sample/rule pairs that have been processed thus far.
In a single transaction, the application needs to:
Get a batch of unique sample/rule pairs that have not yet been processed. Preferably, this query will get all non-processed rules associated with a single sample (i.e. if I'm going to run the YARA rules on a sample, I want to run all of the YARA rules not yet processed on that sample so that I only have to load the sample into memory once).
Get a unique list of id,sha256 from the batch found in step 1.
Insert the batch from step 1 into the yaratracker table with the matchcount column equal to 0 and the complete column set to false.
I can accomplish Step 1 with the query below, but I don't know how to reference those results to accomplish step 2. I've tried looking into variables, but apparently there isn't one that can hold multiple rows. I've looked into using a cursor, but I can't seem to use the cursor with a subsequent command and then return the cursor.
SELECT s.id,r.id
FROM sample s CROSS JOIN yararules r
WHERE r.status = 'Disabled' AND NOT EXISTS(
SELECT 1 FROM yaratracker q
WHERE q.sample_id = s.id AND q.rule_id = r.id
)
ORDER BY s.id
LIMIT 1000;
The relevant database schema looks like this.
CREATE TYPE samplelist AS ENUM ('Whitelist', 'Blacklist', 'Greylist', 'Unknown');
CREATE TABLE samples (
id SERIAL PRIMARY KEY,
md5 CHAR(32) NOT NULL,
sha1 CHAR(40) NOT NULL,
sha256 CHAR(64) NOT NULL,
total INT NOT NULL,
positives INT NOT NULL,
list SAMPLELIST NOT NULL,
filetype VARCHAR(16) NOT NULL,
submitted TIMESTAMP WITH TIME ZONE NOT NULL,
user_id SERIAL REFERENCES users
);
CREATE UNIQUE INDEX md5_idx ON {0} (md5);
CREATE UNIQUE INDEX sha1_idx ON {0} (sha1);
CREATE UNIQUE INDEX sha256_idx ON {0} (sha256);
CREATE TYPE rulestatus AS ENUM ('Enabled', 'Disabled');
CREATE TABLE yararules (
id SERIAL PRIMARY KEY,
name VARCHAR(32) NOT NULL UNIQUE,
description TEXT NOT NULL,
rules TEXT NOT NULL,
lastmodified TIMESTAMP WITH TIME ZONE NOT NULL,
status rulestatus NOT NULL,
user_id SERIAL REFERENCES users ON DELETE CASCADE
);
CREATE TABLE yaratracker (
id SERIAL PRIMARY KEY,
rule_id SERIAL REFERENCES yararules ON DELETE CASCADE,
sample_id SERIAL REFERENCES sample ON DELETE CASCADE,
matchcount INT NOT NULL,
complete BOOL NOT NULL
);
CREATE INDEX composite_idx ON yaratracker (rule_id, sample_id);
CREATE INDEX complete_idx ON yaratracker (complete);
INSERT INTO target_table(a,b,c,...)
SELECT sid, rid, sha, ...
FROM (
SELECT s.id AS sid
,r.id AS rid
, s.sha256 AS sha
, ...
, ROW_NUMBER() OVER (PARTITION BY s.id) AS rn -- <<<--- HERE
FROM sample s CROSS JOIN yararules r
WHERE r.status = 'Disabled' AND NOT EXISTS(
SELECT 1 FROM yaratracker q
WHERE q.sample_id = s.id
AND q.rule_id = r.id
)
ORDER BY s.id
LIMIT 1000
) src
WHERE src.rn = 1; -- <<<--- HERE
The WHERE src.rn = 1 will restrict the cross join to deliver only one tuple per sample.id (both id and sha256 are unique in the sample table, so picking a unique id has the same effect as picking a unique sha256).
The complete cross-join result will never be generated; the optimiser is smart enough to push the WHERE rn = 1 condition down into the subquery.
Note: the LIMIT 1000 should probably be removed (or pulled up to a higher level).
If you REALLY need to save the results from the CROSS JOIN, you could use a chain of CTEs (expect a performance degradation...):
WITH big AS (
SELECT s.id AS sample_id
,r.id AS rule_id
, s.sha256
-- , ...
, ROW_NUMBER() OVER (PARTITION BY s.id) AS rn -- <<<--- HERE
FROM sample s
CROSS JOIN yararules r
WHERE r.status = 'Disabled' AND NOT EXISTS(
SELECT 1 FROM yaratracker q
WHERE q.sample_id = s.id AND q.rule_id = r.id
)
)
, ins AS (
INSERT INTO target_table(a,b,c,...)
SELECT b.sample_id, b.rule_id, b.sha256 , ...
FROM big b
WHERE b.rn = 1 -- <<<--- HERE
RETURNING *
)
INSERT INTO yaratracker (rule_id, sample_id, matchcount, complete )
SELECT b.rule_id, b.sample_id, 0, false
FROM big b
-- LEFT JOIN ins i ON i.a = b.sample_id AND i.b= b.rule_id
;
NOTE: yaratracker.rule_id and yaratracker.sample_id should not be SERIAL columns but plain integers referencing yararules(id) and samples(id).
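For example, a sketch of the corrected definition (the question's DDL mixes sample and samples as the table name; samples, as created above, is used here):
CREATE TABLE yaratracker (
    id SERIAL PRIMARY KEY,
    rule_id INT NOT NULL REFERENCES yararules(id) ON DELETE CASCADE,
    sample_id INT NOT NULL REFERENCES samples(id) ON DELETE CASCADE,
    matchcount INT NOT NULL,
    complete BOOL NOT NULL
);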

LEFT JOIN trouble with multiple tables

I have the following query
SELECT a.account_id, sum(p.amount) AS amount
FROM accounts a
LEFT JOIN users_accounts ua
JOIN users u
JOIN payments p on p.meta_id = u.user_id
ON u.user_id = ua.user_id
ON ua.account_id = a.account_id
WHERE p.date_prcsd BETWEEN '2017-08-01 00:00:00' AND '2017-08-31 23:59:59'
GROUP BY a.account_id
ORDER BY account_id ASC;
What I want is all the rows from accounts a, with zeroes where amount data is missing. I get the same result set for different types of joins and different join structures: only the rows that have some payments in p.
Where do I go wrong?
Simplified:
SELECT a.account_id
,sum(coalesce(p2.amount, 0)) AS amount
FROM accounts a
LEFT JOIN users_accounts ua ON (a.account_id = ua.account_id)
LEFT JOIN users u ON (ua.user_id = u.user_id)
LEFT JOIN (
SELECT p.meta_id
,p.amount
FROM payments p
WHERE p.date BETWEEN '2017-08-01' AND '2017-08-10'
) AS p2 ON (u.user_id = p2.meta_id)
GROUP BY a.account_id
ORDER BY account_id ASC;
Result:
account_id | amount
------------+--------
1 | 4
2 | 0
3 | 0
(3 rows)
Explanation: you need to take care of the NULL values that outer joins produce; coalesce() does that for you. The WHERE clause is the real problem in your solution: filtering on p.date_prcsd discards every row in which the payment columns are NULL, which silently turns your LEFT JOIN into an INNER JOIN. Moving the date filter into a subquery (as above) keeps the accounts that have no matching payments. On top of that, you left out the LEFT JOIN for the other tables. I created a simplified test db:
$ cat tables.sql
drop table users_accounts;
drop table payments;
drop table users;
drop table accounts;
create table accounts (account_id serial primary key, name varchar not null);
create table users (user_id serial primary key, name varchar not null);
create table users_accounts (user_id int references users(user_id),
                             account_id int references accounts(account_id));
create table payments (meta_id int references users(user_id),
                       amount int not null,
                       date date);
insert into accounts (account_id, name) values
    (1, 'Account A'), (2, 'Account B'), (3, 'Account C');
insert into users (user_id, name) values
    (1, 'Marc'), (2, 'Ruben'), (3, 'Isaak');
insert into users_accounts (user_id, account_id) values (1,1), (2,1);
insert into payments (meta_id, amount, date) values
    (1, 1, '2017-08-01'), (1, 2, '2017-08-11'), (1, 3, '2017-08-03'),
    (2, 1, null), (2, 2, null), (2, 3, null);
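Equivalently, you could keep payments as a plain LEFT JOIN and move the date filter into the join condition instead of a subquery; a sketch against the test schema above:
SELECT a.account_id,
       coalesce(sum(p.amount), 0) AS amount
FROM accounts a
LEFT JOIN users_accounts ua ON ua.account_id = a.account_id
LEFT JOIN users u ON u.user_id = ua.user_id
LEFT JOIN payments p ON p.meta_id = u.user_id
                    AND p.date BETWEEN '2017-08-01' AND '2017-08-10'
GROUP BY a.account_id
ORDER BY a.account_id;
Because the date predicate is evaluated as part of the outer join rather than after it, unmatched accounts survive with a NULL amount, which coalesce() then turns into 0.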

Can't get postgres to recognise input variables in a function

What am I doing wrong here?
I want to write a function that returns a table of all the products that we have had dismal performance for between two dates, where 'dismal' means marketing costs exceeded profits. I have three tables: products, spend and transactions.
CREATE TABLE products
(id int not null, name varchar(255), primary key (id));
CREATE TABLE spend
(id int not null, spenddate date, spendamount decimal (9, 2));
CREATE TABLE transactions
(id int not null, transactiondate date, profit decimal (9, 2));
What I'd do is union queries between the two tables and then sum them to get a line per product:
WITH a as (
    SELECT id,
           sum(profit) as profit,
           sum(0) as spendamount
    FROM transactions
    WHERE transactiondate BETWEEN startdate AND enddate
    GROUP BY id
    UNION ALL
    SELECT id,
           sum(0) as profit,
           sum(spendamount) as spendamount
    FROM spend
    WHERE spenddate BETWEEN startdate AND enddate
    GROUP BY id)
SELECT products.id, products.name, sum(profit)-sum(spendamount) as loss
FROM a, products
WHERE products.id = a.id
GROUP BY products.id, products.name
HAVING sum(profit)-sum(spendamount) < 0;
But I don't want to keep changing the startdate and enddate values every time I run my code.
I thought I'd do this:
CREATE FUNCTION report_dismal_products (startdate date, enddate date,
out id int,
out name varchar(255),
out loss decimal(9, 2)
)
RETURNS SETOF RECORD
as $$
SELECT products.id, products.name, sum(profit)-sum(spendamount) as loss
FROM
(
SELECT id,
sum(transactions.profit) as profit,
sum(0) as spendamount
FROM transactions
WHERE transactiondate BETWEEN startdate AND enddate
GROUP BY id
UNION ALL
SELECT id, sum(0) as profit,
sum(spendamount) as spendamount
FROM spend
WHERE spenddate BETWEEN startdate AND enddate
GROUP BY id) a, products
WHERE products.id = a.id
GROUP BY products.id, products.name
HAVING sum(profit)-sum(spendamount) < 0;
$$ Language sql;
But it returns
ERROR: column "startdate" does not exist
LINE 17: WHERE transactiondate BETWEEN startdate AND enddate
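For what it's worth, a likely cause (an assumption, since the PostgreSQL version isn't stated): before PostgreSQL 9.2, SQL-language functions could not refer to parameters by name, only positionally as $1, $2, and so on. Since startdate and enddate are the first two declared parameters, a sketch of the function with positional references would be:
CREATE FUNCTION report_dismal_products (startdate date, enddate date,
    out id int,
    out name varchar(255),
    out loss decimal(9, 2)
)
RETURNS SETOF RECORD
as $$
SELECT products.id, products.name, sum(profit) - sum(spendamount) as loss
FROM
(
    SELECT id,
           sum(profit) as profit,
           sum(0) as spendamount
    FROM transactions
    WHERE transactiondate BETWEEN $1 AND $2  -- $1 = startdate, $2 = enddate
    GROUP BY id
    UNION ALL
    SELECT id,
           sum(0) as profit,
           sum(spendamount) as spendamount
    FROM spend
    WHERE spenddate BETWEEN $1 AND $2
    GROUP BY id
) a, products
WHERE products.id = a.id
GROUP BY products.id, products.name
HAVING sum(profit) - sum(spendamount) < 0;
$$ LANGUAGE sql;
On 9.2 and later the named references should work as written, so if the error appears there, check whether the server is older than 9.2.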