Prevent and/or detect cycles in postgres

Prevent and/or detect cycles in postgres - postgresql

Assuming a schema like the following:
CREATE TABLE node (
id SERIAL PRIMARY KEY,
name VARCHAR,
parentid INT REFERENCES node(id)
);
Further, let's assume the following data is present:
INSERT INTO node (name,parentid) VALUES
('A',NULL),
('B',1),
('C',1);
Is there a way to prevent cycles from being created? Example:
UPDATE node SET parentid = 2 WHERE id = 1;
This would create a cycle of 1->2->1->...

Your trigger simplified and optimized, should be considerably faster:
CREATE OR REPLACE FUNCTION detect_cycle()
RETURNS TRIGGER
LANGUAGE plpgsql AS
$func$
BEGIN
IF EXISTS (
WITH RECURSIVE search_graph(parentid, path, cycle) AS ( -- relevant columns
-- check ahead, makes 1 step less
SELECT g.parentid, ARRAY[g.id, g.parentid], (g.id = g.parentid)
FROM node g
WHERE g.id = NEW.id -- only test starting from new row
UNION ALL
SELECT g.parentid, sg.path || g.parentid, g.parentid = ANY(sg.path)
FROM search_graph sg
JOIN node g ON g.id = sg.parentid
WHERE NOT sg.cycle
)
SELECT FROM search_graph
WHERE cycle
LIMIT 1 -- stop evaluation at first find
)
THEN
RAISE EXCEPTION 'Loop detected!';
ELSE
RETURN NEW;
END IF;
END
$func$;
You don't need dynamic SQL, you don't need to count, you don't need all the columns and you don't need to test the whole table for every single row.
CREATE TRIGGER detect_cycle_after_update
AFTER INSERT OR UPDATE ON node
FOR EACH ROW EXECUTE PROCEDURE detect_cycle();
An INSERT like this has to be prohibited, too:
INSERT INTO node (id, name,parentid) VALUES (8,'D',9), (9,'E',8);

To answer my own question, I came up with a trigger that prevents this:
CREATE OR REPLACE FUNCTION detect_cycle() RETURNS TRIGGER AS
$func$
DECLARE
loops INTEGER;
BEGIN
EXECUTE 'WITH RECURSIVE search_graph(id, parentid, name, depth, path, cycle) AS (
SELECT g.id, g.parentid, g.name, 1,
ARRAY[g.id],
false
FROM node g
UNION ALL
SELECT g.id, g.parentid, g.name, sg.depth + 1,
path || g.id,
g.id = ANY(path)
FROM node g, search_graph sg
WHERE g.id = sg.parentid AND NOT cycle
)
SELECT count(*) FROM search_graph where cycle = TRUE' INTO loops;
IF loops > 0 THEN
RAISE EXCEPTION 'Loop detected!';
ELSE
RETURN NEW;
END IF;
END
$func$ LANGUAGE plpgsql;
CREATE TRIGGER detect_cycle_after_update
AFTER UPDATE ON node
FOR EACH ROW EXECUTE PROCEDURE detect_cycle();
So, if you try to create a loop, like in the question:
UPDATE node SET parentid = 2 WHERE id = 1;
You get an EXCEPTION:
ERROR: Loop detected!

CREATE OR REPLACE FUNCTION detect_cycle()
RETURNS TRIGGER AS
$func$
DECLARE
cycle int[];
BEGIN
EXECUTE format('WITH RECURSIVE search_graph(%4$I, path, cycle) AS (
SELECT g.%4$I, ARRAY[g.%3$I, g.%4$I], (g.%3$I = g.%4$I)
FROM %1$I.%2$I g
WHERE g.%3$I = $1.%3$I
UNION ALL
SELECT g.%4$I, sg.path || g.%4$I, g.%4$I = ANY(sg.path)
FROM search_graph sg
JOIN %1$I.%2$I g ON g.%3$I = sg.%4$I
WHERE NOT sg.cycle)
SELECT path
FROM search_graph
WHERE cycle
LIMIT 1', TG_TABLE_SCHEMA, TG_TABLE_NAME, quote_ident(TG_ARGV[0]), quote_ident(TG_ARGV[1]))
INTO cycle
USING NEW;
IF cycle IS NULL
THEN
RETURN NEW;
ELSE
RAISE EXCEPTION 'Loop in %.% detected: %', TG_TABLE_SCHEMA, TG_TABLE_NAME, array_to_string(cycle, ' -> ');
END IF;
END
$func$ LANGUAGE plpgsql;
CREATE TRIGGER detect_cycle_after_update
AFTER INSERT OR UPDATE ON node
FOR EACH ROW EXECUTE PROCEDURE detect_cycle('id', 'parent_id');

While the current accepted answer by #Erwin Brandstetter is ok when you process one update/insert at a time, it still can fail when considering concurrent execution.
Assume the table content defined by
INSERT INTO node VALUES
(1, 'A', NULL),
(2, 'B', 1),
(3, 'C', NULL),
(4, 'D', 3);
and then in one transaction, execute
-- transaction A
UPDATE node SET parentid = 2 where id = 3;
and in another
-- transaction B
UPDATE node SET parentid = 4 where id = 1;
Both UPDATE commands will succeed, and you can afterwards commit both transactions.
-- transaction A
COMMIT;
-- transaction B
COMMIT;
You will then have a cycle 1->4->3->2->1 in the table.
To make it work, you will either have to use isolation level SERIALIZABLE or use explicit locking in the trigger.

slightly different from Erwin's
CREATE OR REPLACE FUNCTION detect_cycle ()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $func$
BEGIN
IF EXISTS ( WITH RECURSIVE search_graph (
id,
name,
parentid,
is_cycle,
path
) AS (
SELECT *, FALSE,ARRAY[ROW (n.id,n.parentid)]
FROM
node n
WHERE
n.id = NEW.id
UNION ALL
SELECT
n.*,
ROW (n.id,n.parentid) = ANY (path),
path || ROW (n.id,n.parentid)
FROM
node n,
search_graph sg
WHERE
n.id = sg.parentid
AND NOT is_cycle
)
SELECT *
FROM
search_graph
WHERE
is_cycle
LIMIT 1) THEN
RAISE EXCEPTION 'Loop detected!';
ELSE
RETURN new;
END IF;
END
$func$;

Related

Postgres query with variable in loop and condition on variable

I have a query which updates the records based on variables old_id and new_id. But condition is I need to fetch the variables dynamically. Here is simple query which I am using.
do
$$
declare
old_id bigint = 1561049391647687270;
declare new_id bigint = 2068236279446765699;
begin
update songs set poet_id = new_id where poet_id = old_id;
update poets set active = true where id = new_id;
update poets set deleted = true where id = old_id;
end
$$;
I need to assign the old_id and new_id dynamically
do
$$
declare
su record;
pc record;
old_id bigint;
new_id bigint;
begin
for pc in select name, count(name)
from poets
where deleted = false
group by name
having count(name) > 1
order by name
loop
for su in select * from poets where name ilike pc.name
loop
-- old_id could be null where I have 2 continue the flow without update
for old_id in (select id from su where su.link is null)
loop
raise notice 'old: %', old_id;
end loop;
-- new_id could be more than 2 skip this condition as well
for new_id in (select id from su where su.link is not null)
loop
raise notice 'new: %', new_id;
end loop;
end loop;
-- run the statement_1 example if new_id and old_id is not null
end loop;
end
$$;
The expected problem statement (to assign variable and use it in further execution) is with in comment.

(a) In your first "simple query", the update of the table poets could be automatically executed by a trigger function defined on the table songs :
CREATE OR REPLACE FUNCTION songs_update_id ()
RETURNS trigger LANGUAGE plpgsql AS
$$
BEGIN
UPDATE poets SET active = true WHERE id = NEW.poet_id ;
UPDATE poets SET deleted = true WHERE id = OLD.poet_id ; -- SET active = false to be added ?
END ;
$$ ;
CREATE OR REPLACE TRIGGER songs_update_id AFTER UPDATE OF id ON songs
FOR EACH ROW EXECUTE songs_update_id () ;
Your first query can then be reduced as :
do
$$
declare
old_id bigint = 1561049391647687270;
declare new_id bigint = 2068236279446765699;
begin
update songs set poet_id = new_id where poet_id = old_id;
end
$$;
(b) The tables update could be performed with a sql query instead of a plpgsql loop and with better performances :
do
$$
BEGIN
UPDATE songs
SET poet_id = list.new_id[1]
FROM
( SELECT b.name
, array_agg(b.id) FILTER (WHERE b.link IS NULL) AS old_id
, array_agg(b.id) FILTER (WHERE b.link IS NOT NULL) AS new_id
FROM
( SELECT name
FROM poets
WHERE deleted = false
GROUP BY name
HAVING COUNT(*) > 1
-- ORDER BY name -- this ORDER BY sounds like useless and resource-intensive
) AS a
INNER JOIN poets AS b
ON b.name ilike a.name
GROUP BY b.name
HAVING array_length(old_id, 1) = 1
AND array_length(new_id, 1) = 1
) AS list
WHERE poet_id = list.old_id[1] ;
END ;
$$;
This solution is not tested yet and could have to be adjusted in order to work correctly. Please provide the tables definition of songs and poets and a sample of data in dbfiddle so that I can test and adjust the proposed solution.

Is it worth Parallel/Concurrent INSERT INTO... (SELECT...) to the same Table in Postgres?

I was attempting an INSERT INTO.... ( SELECT... ) (inserting a batch of rows from SELECT... subquery), onto the same table in my database. For the most part it was working, however, I did see a "Deadlock" exception logged every now and then. Does it make sense to do this or is there a way to avoid a deadlock scenario? On a high-level, my queries both resemble this structure:
CREATE OR REPLACE PROCEDURE myConcurrentProc() LANGUAGE plpgsql
AS $procedure$
DECLARE
BEGIN
LOOP
EXIT WHEN row_count = 0
WITH cte AS (SELECT *
FROM TableA tbla
WHERE EXISTS (SELECT 1 FROM TableB tblb WHERE tblb.id = tbla.id)
INSERT INTO concurrent_table (SELECT id FROM cte);
COMMIT;
UPDATE log_tbl
SET status = 'FINISHED',
WHERE job_name = 'tblA_and_B_job';
END LOOP;
END
$procedure$;
And the other script that runs in parallel and INSERTS... also to the same table is also basically:
CREATE OR REPLACE PROCEDURE myConcurrentProc() LANGUAGE plpgsql
AS $procedure$
DECLARE
BEGIN
LOOP
EXIT WHEN row_count = 0
WITH cte AS (SELECT *
FROM TableC c
WHERE EXISTS (SELECT 1 FROM TableD d WHERE d.id = tblc.id)
INSERT INTO concurrent_table (SELECT id FROM cte);
COMMIT;
UPDATE log_tbl
SET status = 'FINISHED',
WHERE job_name = 'tbl_C_and_D_job';
END LOOP;
END
$procedure$;
So you can see I'm querying two different tables in each script, however inserting into the same some_table. I also have the UPDATE... statement that writes to a log table so I suppose that could also cause issues. Is there any way to use BEGIN... END here and COMMIT to avoid any deadlock/concurrency issues or should I just create a 2nd table to hold the "tbl_C_and_D_job" data?

How to return a table from iteration in function with postgres 11

Following functions are created for doing housekeeping within the database (PostgreSQL 11.4).
entity_with_multiple_taskexec: returns a list of entities for which the housekeeping should be done.
row_id_to_delete: returns tuples of id's to delete
Just for completeness, the function which works fine:
CREATE OR REPLACE FUNCTION entity_with_multiple_taskexec()
RETURNS TABLE(entitykey varchar) AS
$func$
BEGIN
RETURN QUERY select distinct task.entitykey from
(select task.entitykey from task where dtype = 'PropagationTask' group by task.entitykey having count(*) > (select count(*) from conninstance)) more_than_one_entry
inner join task on task.entitykey = more_than_one_entry.entitykey
inner join taskexec on taskexec.task_id = task.id order by task.entitykey asc;
END
$func$ LANGUAGE plpgsql;
But which the second function, I'm not able to return a table, created from looping through the results of the entity_with_multiple_taskexec function;
CREATE OR REPLACE FUNCTION row_id_to_delete()
RETURNS TABLE(task_id varchar, taskexec_id varchar) AS
$func$
DECLARE
entityrow RECORD;
resultset RECORD;
BEGIN
FOR entityrow IN SELECT entitykey FROM entity_with_multiple_taskexec() LOOP
insert into resultset select task.id as task_id, taskexec.id as taskexec_id from task
inner join taskexec on taskexec.task_id = task.id where taskexec.entitykey = entityrow.entitykey order by taskexec.enddate desc offset 1
END LOOP;
RETURN resultset;
END
$func$ LANGUAGE plpgsql;
This breaks with the following error
ERROR: syntax error at or near "END"
LINE 12: END LOOP;
I've tried different approaches. What would be a good solution to return the table?

You don't need a loop, just join to the function as if it is a table.
There is also no need to use PL/pgSQL for this, a simple language sql function will be more efficient.
CREATE OR REPLACE FUNCTION row_id_to_delete()
RETURNS TABLE(task_id varchar, taskexec_id varchar) AS
$func$
select task.id as task_id, taskexec.id as taskexec_id
from task
join taskexec on taskexec.task_id = task.id
join entity_with_multiple_taskexec() as mt on mt.entitykey = taskexec.entitykey
order by taskexec.enddate desc
offset 1
$func$
LANGUAGE sql;

How to optimize postgresql procedure

I have 61 million of non unique emails with statuses.
This emails need to deduplicate with logic by status.
I write stored procedure, but this procedure runs to long.
How I can optimize execution time of this procedure?
CREATE OR REPLACE FUNCTION public.load_oxy_emails() RETURNS boolean AS $$
DECLARE
row record;
rec record;
new_id int;
BEGIN
FOR row IN SELECT * FROM oxy_email ORDER BY id LOOP
SELECT * INTO rec FROM oxy_emails_clean WHERE email = row.email;
IF rec IS NOT NULL THEN
IF row.status = 3 THEN
UPDATE oxy_emails_clean SET status = 3 WHERE id = rec.id;
END IF;
ELSE
INSERT INTO oxy_emails_clean(id, email, status) VALUES(nextval('oxy_emails_clean_id_seq'), row.email, row.status);
SELECT currval('oxy_emails_clean_id_seq') INTO new_id;
INSERT INTO oxy_emails_clean_websites_relation(oxy_emails_clean_id, website_id) VALUES(new_id, row.website_id);
END IF;
END LOOP;
RETURN true;
END;
$$
LANGUAGE 'plpgsql';

How I can optimize execution time of this procedure?
Don't do it with a loop.
Doing a row-by-row processing (also known as "slow-by-slow") is almost always a lot slower then doing bulk changes where a single statement processes a lot of rows "in one go".
The change of the status can easily be done using a single statement:
update oxy_emails_clean oec
SET status = 3
from oxy_email oe
where oe.id = oec.id
and oe.status = 3;
The copying of the rows can be done using a chain of CTEs:
with to_copy as (
select *
from oxy_email
where status <> 3 --<< all those that have a different status
), clean_inserted as (
INSERT INTO oxy_emails_clean (id, email, status)
select nextval('oxy_emails_clean_id_seq'), email, status
from to_copy
returning id;
)
insert oxy_emails_clean_websites_relation (oxy_emails_clean_id, website_id)
select ci.id, tc.website_id
from clean_inserted ci
join to_copy tc on tc.id = ci.id;

Get IDs from multiple columns in multiple tables as one set or array

I have multiple tables with each two rows of interest: connection_node_start_id and connection_node_end_id. My goal is to get a collection of all those IDs, either as a flat ARRAY or as a new TABLE consisting of one row.
Example output ARRAY:
result = {1,4,7,9,2,5}
Example output TABLE:
IDS
-------
1
4
7
9
2
5
My fist attempt is somewhat clumsy and does not work properly as the SELECT statement just returns one row. It seems there must be a simple way to do this, can someone point me into the right direction?
CREATE OR REPLACE FUNCTION get_connection_nodes(anyarray)
RETURNS anyarray AS
$$
DECLARE
table_name varchar;
result integer[];
sel integer[];
BEGIN
FOREACH table_name IN ARRAY $1
LOOP
RAISE NOTICE 'table_name(%)',table_name;
EXECUTE 'SELECT ARRAY[connection_node_end_id,
connection_node_start_id] FROM ' || table_name INTO sel;
RAISE NOTICE 'sel(%)',sel;
result := array_cat(result, sel);
END LOOP;
RETURN result;
END
$$
LANGUAGE 'plpgsql';
Test table:
connection_node_start_id | connection_node_end_id
--------------------------------------------------
1 | 4
7 | 9
Call:
SELECT get_connection_nodes(ARRAY['test_table']);
Result:
{1,4} -- only 1st row, rest is missing

For Postgres 9.3+
CREATE OR REPLACE FUNCTION get_connection_nodes(text[])
RETURNS TABLE (ids int) AS
$func$
DECLARE
_tbl text;
BEGIN
FOREACH _tbl IN ARRAY $1
LOOP
RETURN QUERY EXECUTE format('
SELECT t.id
FROM %I, LATERAL (VALUES (connection_node_start_id)
, (connection_node_end_id)) t(id)'
, _tbl);
END LOOP;
END
$func$ LANGUAGE plpgsql;
Related answer on dba.SE:
SELECT DISTINCT on multiple columns
Or drop the loop and concatenate a single query. Probably fastest:
CREATE OR REPLACE FUNCTION get_connection_nodes2(text[])
RETURNS TABLE (ids int) AS
$func$
BEGIN
RETURN QUERY EXECUTE (
SELECT string_agg(format(
'SELECT t.id FROM %I, LATERAL (VALUES (connection_node_start_id)
, (connection_node_end_id)) t(id)'
, tbl), ' UNION ALL ')
FROM unnest($1) tbl
);
END
$func$ LANGUAGE plpgsql;
Related:
Loop through like tables in a schema
LATERAL was introduced with Postgres 9.3.
For older Postgres
You can use the set-returning function unnest() in the SELECT list, too:
CREATE OR REPLACE FUNCTION get_connection_nodes2(text[])
RETURNS TABLE (ids int) AS
$func$
BEGIN
RETURN QUERY EXECUTE (
SELECT string_agg(
'SELECT unnest(ARRAY[connection_node_start_id
, connection_node_end_id]) FROM ' || tbl
, ' UNION ALL '
)
FROM (SELECT quote_ident(tbl) AS tbl FROM unnest($1) tbl) t
);
END
$func$ LANGUAGE plpgsql;
Should work with pg 8.4+ (or maybe even older). Works with current Postgres (9.4) as well, but LATERAL is much cleaner.
Or make it very simple:
CREATE OR REPLACE FUNCTION get_connection_nodes3(text[])
RETURNS TABLE (ids int) AS
$func$
BEGIN
RETURN QUERY EXECUTE (
SELECT string_agg(format(
'SELECT connection_node_start_id FROM %1$I
UNION ALL
SELECT connection_node_end_id FROM %1$I'
, tbl), ' UNION ALL ')
FROM unnest($1) tbl
);
END
$func$ LANGUAGE plpgsql;
format() was introduced with pg 9.1.
Might be a bit slower with big tables because each table is scanned once for every column (so 2 times here). Sort order in the result is different, too - but that does not seem to matter for you.
Be sure to sanitize escape identifiers to defend against SQL injection and other illegal syntax. Details:
Table name as a PostgreSQL function parameter

The EXECUTE ... INTO statement can only return data from a single row:
If multiple rows are returned, only the first will be assigned to the INTO variable.
In order to concatenate values from all rows you have to aggregate them first by column and then append the arrays:
EXECUTE 'SELECT array_agg(connection_node_end_id) ||
array_agg(connection_node_start_id) FROM ' || table_name INTO sel;

You're probably looking for something like this:
CREATE OR REPLACE FUNCTION d (tblname TEXT [])
RETURNS TABLE (c INTEGER) AS $$
DECLARE sql TEXT;
BEGIN
WITH x
AS (SELECT unnest(tblname) AS tbl),
y AS (
SELECT FORMAT('
SELECT connection_node_end_id
FROM %s
UNION ALL
SELECT connection_node_start_id
FROM %s
', tbl, tbl) AS s
FROM x)
SELECT string_agg(s, ' UNION ALL ')
INTO sql
FROM y;
RETURN QUERY EXECUTE sql;
END;$$
LANGUAGE plpgsql;
CREATE TABLE a (connection_node_end_id INTEGER, connection_node_start_id INTEGER);
INSERT INTO A VALUES (1,2);
CREATE TABLE b (connection_node_end_id INTEGER, connection_node_start_id INTEGER);
INSERT INTO B VALUES (100, 101);
SELECT * from d(array['a','b']);
c
-----
1
2
100
101
(4 rows)

We Keep Coding

iphone swift flutter scala powershell matlab mongodb postgresql perl eclipse

Prevent and/or detect cycles in postgres - postgresql

Related

Postgres query with variable in loop and condition on variable

Is it worth Parallel/Concurrent INSERT INTO... (SELECT...) to the same Table in Postgres?

How to return a table from iteration in function with postgres 11

How to optimize postgresql procedure

Get IDs from multiple columns in multiple tables as one set or array

Categories

Resources