Performance of joining on multiple columns with potential NULL values - postgresql

Lets say we have the following table
-- Sample table: three nullable integer grouping keys and a price per record.
CREATE TABLE my_table (
    record_id SERIAL,   -- auto-generated row identifier
    column_1  INTEGER,  -- nullable grouping key
    column_2  INTEGER,  -- nullable grouping key
    column_3  INTEGER,  -- nullable grouping key
    price     NUMERIC
);
With the following data
-- Three key combinations, each with exactly one NULL key, two prices apiece.
-- Row order is kept because it determines the generated record_id values.
INSERT INTO my_table
    (column_1, column_2, column_3, price)
VALUES
    (1,    NULL, 1,    54.99),
    (1,    NULL, 1,    69.50),
    (NULL, 2,    2,    54.99),
    (NULL, 2,    2,    69.50),
    (3,    3,    NULL, 54.99),
    (3,    3,    NULL, 69.50);
Now we do something like
-- One row per distinct (column_1, column_2, column_3) combination, NULLs
-- included, each tagged with an arbitrary sequential aggregation_id.
CREATE TABLE my_table_aggregations AS
SELECT
    ROW_NUMBER() OVER () AS aggregation_id,
    src.column_1,
    src.column_2,
    src.column_3
FROM my_table AS src
GROUP BY
    src.column_1,
    src.column_2,
    src.column_3;
What I want to do now is assign an aggregation_id to each record_id in my_table. Because I have NULL values, I can't simply join on t1.column_1 = t2.column_1: NULL = NULL evaluates to NULL, so the join would exclude these records.
Now I know that I should use something like this
-- NULL-safe equi-join spelled out longhand: a key pair matches when the two
-- values are equal or when both sides are NULL.
SELECT
    rec.record_id,
    grp.aggregation_id
FROM my_table AS rec
INNER JOIN my_table_aggregations AS grp
    ON  (rec.column_1 = grp.column_1 OR (rec.column_1 IS NULL AND grp.column_1 IS NULL))
    AND (rec.column_2 = grp.column_2 OR (rec.column_2 IS NULL AND grp.column_2 IS NULL))
    AND (rec.column_3 = grp.column_3 OR (rec.column_3 IS NULL AND grp.column_3 IS NULL));
The problem here is that I am dealing with hundreds of millions of records and having an OR in the join seems to take forever to run.
There is an alternative, which is something like this
-- Sentinel variant: NULL keys are mapped to -1 on both sides so a plain
-- equality join can be used. Only valid while -1 never occurs as a real key.
SELECT
    rec.record_id,
    grp.aggregation_id
FROM my_table AS rec
INNER JOIN my_table_aggregations AS grp
    ON  COALESCE(rec.column_1, -1) = COALESCE(grp.column_1, -1)
    AND COALESCE(rec.column_2, -1) = COALESCE(grp.column_2, -1)
    AND COALESCE(rec.column_3, -1) = COALESCE(grp.column_3, -1);
But the problem with this is that I am assuming there is no value in any of those columns which is -1.
Do note, this is an example which I am well aware I can use DENSE_RANK to get the same result. So lets pretend that this isn't an option.
Is there some crazy awesome way to avoid the hard-coded COALESCE sentinel while keeping the performance advantage it has over the correct OR-based join? I ran tests, and the COALESCE version is over 10 times faster than the OR version.
I am running this on a Greenplum database so I am not sure if this performance difference is the same on a standard Postgres database.

Since my solution with NULLIF had performance problems, and your use of COALESCE was much faster, I wonder if you could try tweaking that solution to deal with the issue of -1. To do that, you could try casting to avoid false matches. I'm not sure what the performance hit would be, but it would look like:
-- Sentinel variant that cannot collide with real data: keys are cast to text
-- and NULLs become 'NA', which no integer can render as.
SELECT
    rec.record_id,
    grp.aggregation_id
FROM my_table AS rec
INNER JOIN my_table_aggregations AS grp
    ON  COALESCE(rec.column_1::varchar, 'NA') = COALESCE(grp.column_1::varchar, 'NA')
    AND COALESCE(rec.column_2::varchar, 'NA') = COALESCE(grp.column_2::varchar, 'NA')
    AND COALESCE(rec.column_3::varchar, 'NA') = COALESCE(grp.column_3::varchar, 'NA');

After doing some thinking, I decided the best approach to this is to dynamically find, for each column, a value that can safely be used as the second parameter in a COALESCE join. The function is rather long, but it does what I need and, more importantly, it keeps the COALESCE performance. The only downside is that computing the MIN values adds some time, but we are talking about a minute.
Here is the function:
-- Builds a one-row table holding, for every join column pair, an integer that
-- occurs in neither the left nor the right table, so it can be used as the
-- sentinel in COALESCE(col, sentinel) = COALESCE(col, sentinel) joins.
--
-- Parameters:
--   left_table_schema / right_table_schema / output_table_schema
--       Schema of the respective table; NULL means the table is temporary.
--   left_table_name / right_table_name / output_table_name
--       Table names (required). They are interpolated as given, so normal
--       identifier case folding applies to them.
--   left_table_columns / right_table_columns
--       Join columns, position i on the left pairs with position i on the
--       right. Must be non-empty and of equal length.
-- Returns:
--   The SQL text that was executed, for debugging.
-- Side effects:
--   Drops and recreates the output table; creates and drops the temp tables
--   temp_null_join_left_table and temp_null_join_right_table.
-- Raises:
--   An exception when a required parameter is NULL, a column array is empty,
--   or the two column arrays differ in length.
CREATE OR REPLACE FUNCTION pg_temp.get_null_join_int_value
(
    left_table_schema   TEXT,
    left_table_name     TEXT,
    left_table_columns  TEXT[],
    right_table_schema  TEXT,
    right_table_name    TEXT,
    right_table_columns TEXT[],
    output_table_schema TEXT,
    output_table_name   TEXT
) RETURNS TEXT AS
$$
DECLARE
    colum_name        TEXT;
    sql               TEXT;
    complete_sql      TEXT;
    full_left_table   TEXT;
    full_right_table  TEXT;
    full_output_table TEXT;
BEGIN
    /*****************************
    VALIDATE PARAMS
    ******************************/
    -- required values must not be NULL, arrays must be non-empty and equally long
    IF (left_table_name IS NULL) THEN
        RAISE EXCEPTION 'left_table_name cannot be NULL';
    ELSIF (left_table_columns IS NULL) THEN
        RAISE EXCEPTION 'left_table_columns cannot be NULL';
    ELSIF (right_table_name IS NULL) THEN
        RAISE EXCEPTION 'right_table_name cannot be NULL';
    ELSIF (right_table_columns IS NULL) THEN
        RAISE EXCEPTION 'right_table_columns cannot be NULL';
    ELSIF (output_table_name IS NULL) THEN
        RAISE EXCEPTION 'output_table_name cannot be NULL';
    ELSIF (array_upper(left_table_columns, 1) IS NULL) THEN
        RAISE EXCEPTION 'left_table_columns cannot be an empty array';
    ELSIF (array_upper(right_table_columns, 1) IS NULL) THEN
        RAISE EXCEPTION 'right_table_columns cannot be an empty array';
    ELSIF (array_upper(left_table_columns, 1) <> array_upper(right_table_columns, 1)) THEN
        RAISE EXCEPTION 'left_table_columns and right_table_columns must have a matching array length';
    END IF;

    /************************
    TABLE NAMES
    *************************/
    -- a NULL schema means the table is temporary, so no schema prefix is emitted
    IF (left_table_schema IS NOT NULL) THEN
        full_left_table := left_table_schema || '.' || left_table_name;
    ELSE
        full_left_table := left_table_name;
    END IF;

    IF (right_table_schema IS NOT NULL) THEN
        full_right_table := right_table_schema || '.' || right_table_name;
    ELSE
        full_right_table := right_table_name;
    END IF;

    IF (output_table_schema IS NOT NULL) THEN
        full_output_table := output_table_schema || '.' || output_table_name;
    ELSE
        full_output_table := output_table_name;
    END IF;

    /**********************
    LEFT TABLE
    ***********************/
    -- one row: MIN(col) - 1 for every left column, i.e. a value below everything present
    sql :=
        'DROP TABLE IF EXISTS temp_null_join_left_table;' || E'\n' ||
        'CREATE TEMP TABLE temp_null_join_left_table AS' || E'\n' ||
        'SELECT';
    FOR colum_name IN SELECT UNNEST(left_table_columns) LOOP
        -- quote_ident keeps mixed-case / reserved / quote-containing names valid
        sql := sql || E'\n\t' || 'MIN(' || quote_ident(colum_name) || ')-1 AS ' || quote_ident(colum_name) || ',';
    END LOOP;
    -- remove the trailing comma left by the loop
    sql := TRIM(TRAILING ',' FROM sql);
    sql := sql || E'\n' ||
        'FROM ' || full_left_table || ';';
    EXECUTE sql;
    complete_sql := sql;

    /************************
    RIGHT TABLE
    *************************/
    -- same as above for the right table
    sql :=
        'DROP TABLE IF EXISTS temp_null_join_right_table;' || E'\n' ||
        'CREATE TEMP TABLE temp_null_join_right_table AS' || E'\n' ||
        'SELECT';
    FOR colum_name IN SELECT UNNEST(right_table_columns) LOOP
        sql := sql || E'\n\t' || 'MIN(' || quote_ident(colum_name) || ')-1 AS ' || quote_ident(colum_name) || ',';
    END LOOP;
    sql := TRIM(TRAILING ',' FROM sql);
    -- FIX: read from the RIGHT table; the original read the left table here, so
    -- values present only in the right table could collide with the sentinel
    sql := sql || E'\n' ||
        'FROM ' || full_right_table || ';';
    EXECUTE sql;
    complete_sql := complete_sql || E'\n\n' || sql;

    /************************
    OUTPUT TABLE
    *************************/
    -- FIX: the CASE needs an ELSE branch; without it a non-NULL output schema
    -- turned the whole concatenation into NULL and EXECUTE failed
    sql :=
        'DROP TABLE IF EXISTS ' || full_output_table || ';' || E'\n' ||
        'CREATE ' || (CASE WHEN output_table_schema IS NULL THEN 'TEMP ' ELSE '' END) || 'TABLE ' || full_output_table || ' AS' || E'\n' ||
        'SELECT';
    FOR i IN coalesce(array_lower(left_table_columns, 1), 1)..coalesce(array_upper(left_table_columns, 1), 1) LOOP
        -- LEAST picks the smallest of both minima; the hard-coded -1 guarantees
        -- the sentinel is negative even when both minima are positive (or NULL)
        sql := sql || E'\n\t' || 'LEAST(l.' || quote_ident(left_table_columns[i]) || ', r.' || quote_ident(right_table_columns[i]) || ', -1) AS ' || quote_ident(left_table_columns[i]) || ',';
    END LOOP;
    sql := TRIM(TRAILING ',' FROM sql);
    sql := sql || E'\n' ||
        'FROM temp_null_join_left_table l' || E'\n' ||
        'CROSS JOIN temp_null_join_right_table r' || ';';
    EXECUTE sql;
    complete_sql := complete_sql || E'\n\n' || sql;

    -- the intermediate tables are no longer needed
    sql :=
        'DROP TABLE IF EXISTS temp_null_join_left_table;' || E'\n' ||
        'DROP TABLE IF EXISTS temp_null_join_right_table;';
    EXECUTE sql;
    complete_sql := complete_sql || E'\n\n' || sql;

    -- return everything that was run, useful for debugging
    RETURN complete_sql;
END;
$$
LANGUAGE plpgsql;
Below is an example usage of the function
-- Example: compute sentinels for the three join columns shared by
-- public.my_table and public.my_table_aggregations and store them in the
-- temporary table temp_null_join_values (NULL output schema = temp table).
SELECT pg_temp.get_null_join_int_value(
    'public',                                   -- left table schema
    'my_table',                                 -- left table name
    ARRAY['column_1', 'column_2', 'column_3'],  -- left join columns
    'public',                                   -- right table schema
    'my_table_aggregations',                    -- right table name
    ARRAY['column_1', 'column_2', 'column_3'],  -- right join columns
    NULL,                                       -- output schema
    'temp_null_join_values'                     -- output table name
);
Once the temp_null_join_values table is created you can do a sub select in the join for the COALESCE 2nd param.
-- Materialise record_id -> aggregation_id using the per-column sentinels from
-- temp_null_join_values as the COALESCE fallback on both sides of the join.
DROP TABLE IF EXISTS temp_result_table;
CREATE TEMP TABLE temp_result_table AS
SELECT
    rec.record_id,
    grp.aggregation_id
FROM public.my_table AS rec
INNER JOIN my_table_aggregations AS grp
    ON  COALESCE(rec.column_1, (SELECT column_1 FROM temp_null_join_values))
      = COALESCE(grp.column_1, (SELECT column_1 FROM temp_null_join_values))
    AND COALESCE(rec.column_2, (SELECT column_2 FROM temp_null_join_values))
      = COALESCE(grp.column_2, (SELECT column_2 FROM temp_null_join_values))
    AND COALESCE(rec.column_3, (SELECT column_3 FROM temp_null_join_values))
      = COALESCE(grp.column_3, (SELECT column_3 FROM temp_null_join_values));
I hope this helps someone

How about:
-- NULL-safe join built from NULLIF pairs. NULLIF(x, y) IS NULL holds when
-- x = y or when x itself is NULL; requiring it in both directions therefore
-- leaves exactly "equal, or both NULL".
SELECT
    t.record_id,
    agg.aggregation_id  -- FIX: the table alias is "agg"; "a" is not defined in this query
FROM my_table t
JOIN my_table_aggregations agg ON
(
    NULLIF(t.column_1, agg.column_1) IS NULL
    AND
    NULLIF(agg.column_1, t.column_1) IS NULL
    AND
    NULLIF(t.column_2, agg.column_2) IS NULL
    AND
    NULLIF(agg.column_2, t.column_2) IS NULL
    AND
    NULLIF(t.column_3, agg.column_3) IS NULL
    AND
    NULLIF(agg.column_3, t.column_3) IS NULL
);

Related

Getting A "Could Not Open Relation" Error On Simple Query

I have a function that creates a set of INSERT INTO ... VALUES scripts. If I uncomment the dvp.content line, the function fails with an "ERROR: could not open relation with OID ###", which refers to the temp table. The content column is a jsonb type. Not sure where to begin?
-- Generates INSERT statements (as text) for the documents listed in doc_list.
-- The statements are collected in the temp table doc_data_export, returned to
-- the caller via RETURN QUERY, and the temp table is dropped afterwards.
-- Parts of the body are elided ("...") in this excerpt.
CREATE OR REPLACE FUNCTION export_docs_as_sql(doc_list uuid[], to_org_id uuid)
RETURNS table(id integer, sql text)
AS $$
BEGIN
...
-- use a temp table to gather all INSERT statements
CREATE TEMP TABLE IF NOT EXISTS doc_data_export(
id serial PRIMARY KEY,
sql text
);
...
-- get doc_version_pages
-- one generated INSERT per doc_version_pages row reachable from doc_list;
-- persona / care category / patient group ids are emitted as name lookups
INSERT INTO doc_data_export(sql)
SELECT 'INSERT INTO doc_version_pages(id, doc_version_id, persona_id, care_category_id, patient_group_id, title, content, created_at, updated_at, is_guide, is_root) VALUES (' ||
quote_literal(dvp.id::TEXT) || ', ' ||
quote_literal(dvp.doc_version_id::TEXT) || ', ' ||
-- NOTE(review): this lookup filters px.org_id by dv.id while the two lookups
-- below use to_org_id -- confirm whether to_org_id was intended here
CASE WHEN p.name IS NOT NULL THEN '(SELECT px.id FROM personas px WHERE px.org_id = ' || quote_literal(dv.id::TEXT) || ' AND px.name = ' || quote_literal(p.name) || '), ' ELSE 'NULL, ' END ||
CASE WHEN c.name IS NOT NULL THEN '(SELECT cx.id FROM care_categories cx WHERE cx.org_id = ' || quote_literal(to_org_id) || ' AND cx.name = ' || quote_literal(c.name) || '), ' ELSE 'NULL, ' END ||
CASE WHEN g.name IS NOT NULL THEN '(SELECT gx.id FROM patient_groups gx WHERE gx.org_id = ' || quote_literal(to_org_id) || ' AND gx.name = ' || quote_literal(g.name) || '), ' ELSE 'NULL, ' END ||
quote_literal(dvp.title::TEXT) || ', ' ||
-- the content value is currently left out of the generated statement although
-- the column list above still names "content"
--dvp.content || ', ' ||
quote_literal(dvp.created_at::TEXT) || ', ' ||
quote_literal(now()::timestamp) || ', ' ||
quote_literal(dvp.is_guide::TEXT) || ', ' ||
quote_literal(dvp.is_root::TEXT) || ');'
FROM unnest(doc_list) l
INNER JOIN doc_versions dv ON l = dv.doc_id
INNER JOIN doc_version_pages dvp ON dv.id = dvp.doc_version_id
LEFT JOIN personas p ON dvp.persona_id = p.id
LEFT JOIN care_categories c ON dvp.care_category_id = c.id
LEFT JOIN patient_groups g ON dvp.patient_group_id = g.id;
...
-- output all inserts
-- NOTE(review): the rows are returned straight from the temp table that is
-- dropped on the next statement -- presumably the cause of the reported
-- "could not open relation" error once the sql values become large; verify
RETURN QUERY SELECT * FROM doc_data_export;
-- drop temp table
DROP TABLE doc_data_export;
END;
$$ LANGUAGE plpgsql;
The "Could Not Open Relation" problem is occurring due to the bug described here, which remains an issue as of Postgres 14.0:
What seems to be happening is that if the strings are large enough to be
toasted, then the data returned out of the function with RETURN QUERY
contains toast pointers referencing the temp table's toast table.
If you drop the temp table then those pointers will fail upon use.
To explain further, when a column value is greater than the TOAST_TUPLE_THRESHOLD configuration parameter (usually 2KB) and cannot be compressed or when the column is configured with a storage parameter of EXTERNAL, the value will be broken down into chunks and stored in a special secondary table called a TOAST table. This table will be stored in the pg_toast schema and will be named like pg_toast.pg_toast_<table OID>.
So when you add dvp.content to the sql statement you insert that into doc_data_export, some of these values are larger than the aforementioned constraints and are thus TOASTed. Your RETURN QUERY is only sending the pointers to the values in the toast table. After the return is done, the temporary table and its corresponding TOAST table is dropped. Thus when the outer query attempts to materialize the results, it can't find the TOAST table that these pointers reference - hence the cryptic error message you see.
You can avoid sending TOAST pointers for the temporary table -and thus safely DROP it after the RETURN QUERY -by performing an operation on the sql column that returns the same value:
RETURN QUERY SELECT id, sql || '' FROM doc_data_export;
The simple function below will reproduce a minimal example of the TOAST bug when you set fail to true and demonstrate the successful workaround when you set fail to false.
-- Minimal reproduction of returning rows from a temp table that is dropped
-- before the caller reads them.
--   fail = true  : returns the stored column as-is
--   otherwise    : returns enormous_data || '', i.e. a newly computed value
DROP FUNCTION IF EXISTS buttered_toast(boolean);
CREATE OR REPLACE FUNCTION buttered_toast(fail boolean)
RETURNS TABLE(id integer, enormous_data text)
AS $$
BEGIN
    CREATE TEMPORARY TABLE tbl_with_toasts (
        id integer PRIMARY KEY,
        enormous_data text
    ) ON COMMIT DROP;

    -- a single row holding 10000 UUIDs joined with '-': a very large string
    INSERT INTO tbl_with_toasts (id, enormous_data)
    SELECT
        1,
        string_agg(gen_random_uuid()::text, '-')
    FROM generate_series(1, 10000) AS ints(int);

    IF buttered_toast.fail THEN
        -- hand back the stored value untouched
        RETURN QUERY
            SELECT src.id, src.enormous_data
            FROM tbl_with_toasts AS src;
    ELSE
        -- hand back a freshly built copy of the value
        RETURN QUERY
            SELECT src.id, src.enormous_data || ''
            FROM tbl_with_toasts AS src;
    END IF;

    DROP TABLE tbl_with_toasts;
END;
$$ LANGUAGE plpgsql;
-- fails with "Could Not Open Relation"
-- (terminated with ';' so it is not parsed together with the next statement)
SELECT * FROM buttered_toast(true);
-- succeeds
SELECT * FROM buttered_toast(false);

create table in postgresql function not work

I am a newbie to PostgreSQL and am trying to create a table inside a PostgreSQL function. Although the function is created without raising any error, it always throws an error saying the table I created does not exist when the function tries to insert data into it.
I am using pgadmin to create and test my function.
my code:
-- Builds bpc.<targetTable> from bpc.<sourceTable> and returns the number of
-- rows in the target table.
--   $1 / sourceTable : source table name inside schema bpc
--   $2 / targetTable : target table name inside schema bpc (created if missing)
-- The two table names are quoted in three different ways below (manual "...",
-- format %I, and not at all); the review notes mark each spot.
create or replace function f_produceMultiroleWorkload(sourceTable text, targetTable text) returns integer as $$
declare
mysql text;
record_cnt integer;
begin
record_cnt=0;
-- staging table: (staff_num, cate_division) pairs selected by the query below
create temp table t_staff_job_division( staff_num varchar(30), cate_division varchar(30));
-- source table name is wrapped in literal double quotes, so it is case-sensitive
mysql:='insert into t_staff_job_division select staff_num, cate_division from (select staff_num, cate_division from bpc."' || $1 || '" group by 1,2) t1 where staff_num in (select distinct staff_num from bpc."' || $1 || '" where cate_division<>staff_division) and staff_num not in (select distinct staff_num from (select staff_num, count(distinct cate_division) as division_cnt from bpc."'|| $1 ||'" group by 1) t2 where division_cnt=1)' ;
execute mysql;
-- target table is created through format(%I)
EXECUTE format(
'
CREATE TABLE IF NOT EXISTS %I.%I (
staff_num varchar(30) PRIMARY KEY,
cate_division varchar(30),
score numeric(18,7)
);
',
'bpc', $2
);
-- NOTE(review): here $2 is concatenated WITHOUT any quoting, unlike the
-- CREATE TABLE above and the count query below -- a mixed-case target name
-- would resolve to a different (lower-cased) table name in this statement
mysql:='insert into bpc.' || $2 ||' select t1.staff_num, t1.cate_division, sum(normalized_individual_gross_score) from bpc."' || $1 || '" t1 inner join t_staff_job_division t2 on t1.staff_num=t2.staff_num and t1.cate_division=t2.cate_division group by 1,2';
execute mysql;
-- NOTE(review): this is static SQL, so the double-quoted identifier is the
-- literal text between the quotes rather than the value of $2; also,
-- select count(*) yields one row regardless of table contents -- verify intent
if exists(select count(*) from bpc."' || $2 || '") then
-- target table name wrapped in literal double quotes (case-sensitive)
mysql:='select count(*) from bpc."' || $2 || '"';
execute mysql into record_cnt;
else
record_cnt=0;
end if;
return record_cnt;
end;
$$ language plpgsql;
The error throws when function execute
mysql:='insert into bpc.' || $2 ||' select t1.staff_num, t1.cate_division, sum(normalized_individual_gross_score) from bpc."' || $1 || '" t1 inner join t_staff_job_division t2 on t1.staff_num=t2.staff_num and t1.cate_division=t2.cate_division group by 1,2';
Since the error message is in Chinese, there is no point in posting it here. Generally speaking, it says that the table named by bpc.$2 does not exist.
I will appreciate any help to assist me to solve this problem.
thanks in advance.
You are a victim of (accidental) SQL injection.
The second function argument (targettable) probably looks like this: CamelCase.
Now the CREATE TABLE statement, which is correctly constructed using format, looks like this:
CREATE TABLE bpc."CamelCase" ...
while the incorrectly constructed INSERT statement looks like
INSERT INTO bpc.CamelCase ...
Now SQL identifiers are folded to lower case in PostgreSQL unless they are (double) quoted, so the second statement will try to insert into bpc.camelcase. But table names are case sensitive, so that fails.
Recommendations:
Always use format to avoid SQL injection.
Avoid using anything but lower case ASCII letters, digits and _ in identifiers.
Unrelated, but the IF EXISTS in your function will also fail. You need dynamic SQL there too.
It seems that you need to make sure that your uppercase/lowercase naming is consistent. Here, you potentially run into a problem with uppercase/lowercase because $2 is inconsistently wrapped in double-quotes (").
Notice:
EXECUTE format(
'
CREATE TABLE IF NOT EXISTS %I.%I (
staff_num varchar(30) PRIMARY KEY,
cate_division varchar(30),
score numeric(18,7)
);
',
'bpc', $2
);
If $2 is "myTable", then the table will be created as bpc.mytable because when table/column names are not wrapped in double-quotes, PostgreSQL will automatically convert to lowercase. However, for this:
mysql:='select count(*) from bpc."' || $2 || '"';
The table name will get interpreted as bpc."myTable" and because it is in double-quotes, the uppercase T will be preserved. In Postgres mytable != "myTable", so you will get an error saying that bpc."myTable" does not exist.
Please consider using this format (notice double quotes):
-- %I already emits a correctly double-quoted identifier whenever quoting is
-- required (mixed case, spaces, reserved words). Wrapping it in literal double
-- quotes would produce ""name"" and fail, so the placeholder is used bare; the
-- other statements should build the table name with %I as well.
EXECUTE format(
'
CREATE TABLE IF NOT EXISTS %I.%I (
staff_num varchar(30) PRIMARY KEY,
cate_division varchar(30),
score numeric(18,7)
);
',
'bpc', $2
);

Find all tables having data in a given column

I have a set of around 500 schemas and many of them have common columns. Now whenever I have a update I have to manually see all schemas having those columns and update if they have the data.
I was trying to get all the tables having those columns against number of rows for a specific column data.
E.g. let's say I have a col1 column in schemas A, B and C. Can I get the data in the following format?
Col1 table number
1005 A 3
1005 B 4
1005 C 5
1006 A 7
Where 1005 is a row in col1. A is table. 3 is number of rows with 1005 in col1 in table A.
Kindly excuse my formatting and lack of queries because I posted this question from mobile.
Create below function and use that for the extraction of the data
DROP FUNCTION IF EXISTS fun_test (CHARACTER VARYING);
DROP TYPE IF EXISTS fun_test_out;

-- Row shape returned by fun_test: one row per (table, distinct column value).
CREATE TYPE fun_test_out AS (
    "schema_name"  VARCHAR(255)
   ,"table_name"   VARCHAR(255)
   ,"column_value" VARCHAR(255)
   ,"count"        INT
);

-- For every relation listed in information_schema.columns that has a column
-- named colname, returns each distinct value of that column (rendered as
-- text) together with the number of rows holding it.
--
-- Parameters:
--   colname  exact (case-sensitive) column name to look for.
-- Returns:
--   SETOF fun_test_out (schema_name, table_name, column_value, count).
--
-- Changes against the previous version:
--   * colname is compared as a value instead of being concatenated into SQL,
--     so it can no longer inject SQL;
--   * schema, table and column identifiers are quoted with format(%I), so
--     mixed-case or otherwise unusual names work;
--   * no temp tables are created, so same-named temp tables in the caller's
--     session are no longer dropped.
CREATE OR REPLACE FUNCTION fun_test (colname CHARACTER VARYING)
RETURNS SETOF fun_test_out
AS
$$
DECLARE
    r      fun_test_out%rowtype;
    target RECORD;
BEGIN
    FOR target IN
        SELECT c.table_schema, c.table_name, c.column_name
        FROM information_schema.columns AS c
        WHERE c.column_name = colname::information_schema.sql_identifier
    LOOP
        -- %L embeds the names as string literals, %I as identifiers;
        -- the explicit ::text casts give every output column a concrete type
        FOR r IN EXECUTE format(
            'SELECT %L::text, %L::text, %I::text, COUNT(*) FROM %I.%I GROUP BY %I',
            target.table_schema, target.table_name, target.column_name,
            target.table_schema, target.table_name, target.column_name
        )
        LOOP
            RETURN NEXT r;
        END LOOP;
    END LOOP;

    RETURN;
END
$$
LANGUAGE plpgsql;
You can call the function using below statement
-- Example call; replace 'Column_name' with the column to search for.
SELECT *
FROM fun_test('Column_name');

Postgresql create a log schema

So my problem is simple. I have a schema prod with many tables, and another one log with the exact same tables and structure (primary keys change that's it).
When I do UPDATE or DELETE in the schema prod, I want to record old data in the log schema.
I have the following function called after a update or delete:
-- Row-level AFTER UPDATE / AFTER DELETE trigger function: copies the OLD row
-- of the firing table into the table of the same name in schema "log".
-- Columns are matched by name, so the log table may order them differently.
--
-- Changes against the previous version:
--   * catalog lookup compares the raw TG_TABLE_NAME / TG_TABLE_SCHEMA; the
--     earlier quote_ident() wrapping added double quotes to any name that
--     needs quoting, so such tables never matched a catalog row;
--   * column and table identifiers in the generated INSERT are quoted;
--   * column lists are built in one ordered aggregate instead of a loop.
CREATE FUNCTION prod.log_data() RETURNS trigger
LANGUAGE plpgsql AS $$
DECLARE
    column_names text;
    value_names  text;
BEGIN
    -- "col_a,col_b,..." and "($1).col_a,($1).col_b,..." in table column order
    SELECT
        string_agg(quote_ident(c.column_name::text), ',' ORDER BY c.ordinal_position),
        string_agg('($1).' || quote_ident(c.column_name::text), ',' ORDER BY c.ordinal_position)
    INTO column_names, value_names
    FROM information_schema.columns AS c
    WHERE c.table_name = TG_TABLE_NAME::text
      AND c.table_schema = TG_TABLE_SCHEMA::text;

    -- $1 is bound to the OLD row; ($1).col extracts one field from it
    EXECUTE 'INSERT INTO log.' || quote_ident(TG_TABLE_NAME) ||
            ' ( ' || column_names || ' ) VALUES ( ' || value_names || ' )'
    USING OLD;

    RETURN NULL; -- the return value of a row-level AFTER trigger is ignored
END;$$;
The annoying part is that I have to get column names from information_schema for each row.
I would rather use this:
EXECUTE 'INSERT INTO log.' || TG_TABLE_NAME || ' SELECT ' || OLD;
But some values can be NULL so this will execute:
INSERT INTO log.user SELECT 2,,,"2015-10-28 13:52:44.785947"
instead of
INSERT INTO log.user SELECT 2,NULL,NULL,"2015-10-28 13:52:44.785947"
Any idea to convert ",," to ",NULL,"?
Thanks
-Quentin
First of all, I must say that in my opinion using the PostgreSQL system catalogs (like information_schema) is the proper way to handle such a use case, especially since you only have to write it once: you create the function prod.log_data() and you're done. Moreover, it may be dangerous to use OLD in that context (just like *), as always, because the order of the elements is not specified.
But,
to answer your exact question the only way I know is to do some operations on OLD. Just observe that you cast OLD to text by doing concatenation ... ' SELECT ' || OLD. The default casting create that ugly double-commas. So, next you can play with that text. In the end I propose:
DECLARE
tmp TEXT
...
BEGIN
...
/*to make OLD -> text like (2,,3,4,,)*/
SELECT '' || OLD INTO tmp; /*step 1*/
/*take care of commas at the begining and end: '(,' ',)'*/
tmp := replace(replace(tmp, '(,', '(NULL,'), ',)', ',NULL)'); /*step 2*/
/* replace rest of commas to commas with NULL between them */
SELECT array_to_string(string_to_array(tmp, ',', ''), ',', 'NULL') INTO tmp; /*step 3*/
/* Now we can do EXECUTE*/
EXECUTE 'INSERT INTO log.' || TG_TABLE_NAME || ' SELECT ' || tmp;
Of course you can do steps 1-3 in one big step
-- Uses OLD (not NEW): the log must contain the row as it was before the change.
SELECT array_to_string(string_to_array(replace(replace('' || OLD, '(,', '(NULL,'), ',)', ',NULL)'), ',', ''), ',', 'NULL') INTO tmp;
In my opinion this approach isn't any better from using information_schema, but it's your call.

Generic trigger to restrict insertions based on count

Background
In a PostgreSQL 9.0 database, there are various tables that have many-to-many relationships. The number of those relationships must be restricted. A couple of example tables include:
-- Example tables whose per-group row counts must be limited.
-- Each statement is terminated with ';' so the two CREATE TABLEs can be run
-- together as a script.
CREATE TABLE authentication (
    id bigserial NOT NULL,                     -- Primary key
    cookie character varying(64) NOT NULL,     -- Authenticates the user with a cookie
    ip_address character varying(40) NOT NULL  -- Device IP address (IPv6-friendly)
);
CREATE TABLE tag_comment (
    id bigserial NOT NULL,  -- Primary key
    comment_id bigint,      -- Foreign key to the comment table
    tag_name_id bigint      -- Foreign key to the tag name table
);
Different relationships, however, have different limitations. For example, in the authentication table, a given ip_address is allowed 1024 cookie values; whereas, in the tag_comment table, each comment_id can have 10 associated tag_name_ids.
Problem
Currently, a number of functions have these restrictions hard-coded; scattering the limitations throughout the database, and preventing them from being changed dynamically.
Question
How would you impose a maximum many-to-many relationship limit on tables in a generic fashion?
Idea
Create a table to track the limits:
-- One row per limit: within table_name, each distinct value of column_group
-- may be associated with at most max_size values of column_count.
CREATE TABLE imposed_maximums (
    id serial NOT NULL,
    table_name character varying(128) NOT NULL,    -- table the limit applies to
    column_group character varying(128) NOT NULL,  -- column identifying the group
    column_count character varying(128) NOT NULL,  -- column whose entries are counted
    max_size INTEGER                               -- allowed entries per group
);
Establish the restrictions:
-- At most 1024 cookies per ip_address in authentication.
INSERT INTO imposed_maximums
    (table_name, column_group, column_count, max_size) VALUES
    ('authentication', 'ip_address', 'cookie', 1024);
-- At most 10 tags per comment_id in tag_comment.
-- FIX: the counted column of tag_comment is tag_name_id; 'tag_id' does not exist.
INSERT INTO imposed_maximums
    (table_name, column_group, column_count, max_size) VALUES
    ('tag_comment', 'comment_id', 'tag_name_id', 10);
Create a trigger function:
CREATE OR REPLACE FUNCTION impose_maximum()
RETURNS trigger AS
$BODY$
BEGIN
-- Join this up with imposed_maximums somehow?
select
count(1)
from
-- the table name
where
-- the group column = NEW value to INSERT;
RETURN NEW;
END;
Attach the trigger to every table:
-- Run impose_maximum() for each row before it is inserted into authentication.
CREATE TRIGGER trigger_authentication_impose_maximum
    BEFORE INSERT ON authentication
    FOR EACH ROW
    EXECUTE PROCEDURE impose_maximum();
Obviously it won't work as written... is there a way to make it work, or otherwise enforce the restrictions such that they are:
in a single location; and
not hard-coded?
Thank you!
I've been doing a similar type of generic triggers.
The most tricky part is to get the value entry in the NEW record based on the column name.
I'm doing it the following way:
convert NEW data into array;
find the attnum of the column and use it as an index for the array.
This approach works as long as there're no commas in the data :( I don't know of other ways how to convert NEW or OLD variables into the array of values.
The following function might help:
-- Generic row-level trigger function enforcing the limits registered in
-- imposed_maximums for the firing table: for every registered
-- (column_group, column_count, max_size) it counts the rows that share NEW's
-- column_group value and raises an exception when the limit is reached.
--
-- Changes against the previous version:
--   * the group value is read with ($1).<column> from NEW instead of splitting
--     NEW::text on commas, so values containing commas work and the value
--     keeps its real type (the text-typed bind failed for non-text columns);
--   * the counted table is schema-qualified with TG_TABLE_SCHEMA and no
--     pg_class lookup by bare name is needed;
--   * off-by-one: in a BEFORE INSERT trigger the new row is not yet counted,
--     so the insert must be rejected when the count has already reached
--     max_size (previously max_size + 1 rows were accepted).
-- Limitation: a BEFORE UPDATE that moves a row into a group that is already
-- full is not rejected.
CREATE OR REPLACE FUNCTION impose_maximum() RETURNS trigger AS $impose_maximum$
DECLARE
    _sql      text;
    _cnt      int8;
    _val      text;     -- group value as text, used only in the error message
    _im       record;
    _exceeded boolean;
BEGIN
    FOR _im IN
        SELECT table_name, column_group, column_count, max_size
        FROM imposed_maximums
        WHERE table_name = TG_TABLE_NAME
    LOOP
        EXECUTE 'SELECT CAST(($1).' || quote_ident(_im.column_group) || ' AS text)'
            INTO _val USING NEW;

        _sql := 'SELECT count(' || quote_ident(_im.column_count) || ')' ||
                ' FROM ' || quote_ident(TG_TABLE_SCHEMA) || '.' || quote_ident(TG_TABLE_NAME) ||
                ' WHERE ' || quote_ident(_im.column_group) ||
                ' = ($1).' || quote_ident(_im.column_group);
        EXECUTE _sql INTO _cnt USING NEW;

        IF TG_WHEN = 'BEFORE' AND TG_OP = 'INSERT' THEN
            -- the row being inserted is not part of _cnt yet
            _exceeded := _cnt >= CAST(_im.max_size AS int8);
        ELSE
            -- the row is already included in _cnt
            _exceeded := _cnt > CAST(_im.max_size AS int8);
        END IF;

        IF _exceeded THEN
            RAISE EXCEPTION 'Maximum of % hit for column % in table %(%=%)',
                _im.max_size, _im.column_count,
                _im.table_name, _im.column_group, _val;
        END IF;
    END LOOP;
    RETURN NEW;
END; $impose_maximum$ LANGUAGE plpgsql;
This function will check for all conditions defined for a given table.
Yes, there is a way to make it work.
In my personal opinion your idea is the way to go. It just needs one level of "meta". So, the limits table (imposed_maximums in your example) should itself have trigger(s), which are fired after insert, update and delete. That code should then in turn create, modify or remove the triggers and functions on the restricted tables.
Take a look at execute statement of PL/pgSQL, which - essentially - allows you to execute any string. Needless to say, this string may contain definitions of triggers, functions, etc. Obviously, you have the access to OLD and NEW in the triggers, so you can fill in the placeholders in the string and you are done.
I believe you should be able to accomplish what you want with this answer. Please note that this is my personal view on the topic and it might not be an optimal solution - I would like to see a different, maybe also more efficient, approach.
Edit - Below is a sample from one of my old projects. It is located inside a function that is triggered before update (though now that I think of it, maybe it should have been called after ;). And yes, the code is messy, as it does not use the nice $escape$ syntax. I was really, really young then. Nonetheless, the snippet demonstrates that it is possible to achieve what you want.
query:=''CREATE FUNCTION '' || NEW.function_name || ''('';
IF NEW.parameter=''t'' THEN
query:=query || ''integer'';
END IF;
query:=query || '') RETURNS setof '' || type_name || '' AS'' || chr(39);
query:=query || '' DECLARE list '' || type_name || ''; '';
query:=query || ''BEGIN '';
query:=query || '' FOR list IN EXECUTE '' || chr(39) || chr(39);
query:=query || temp_s || '' FROM '' || NEW.table_name;
IF NEW.parameter=''t'' THEN
query:=query || '' WHERE id='' || chr(39) || chr(39) || ''||'' || chr(36) || ''1'';
ELSE
query:=query || '';'' || chr(39) || chr(39);
END IF;
query:=query || '' LOOP RETURN NEXT list; '';
query:=query || ''END LOOP; RETURN; END; '' || chr(39);
query:=query || ''LANGUAGE '' || chr(39) || ''plpgsql'' || chr(39) || '';'';
EXECUTE query;
This function + trigger could be used as a template. If you combine them with @Sorrow's technique of dynamically generating the functions + triggers, this could solve the OP's problem.
Please note that, instead of recalculating the count for every affected row (by calling the COUNT() aggregate function), I maintain an 'incremental' count. This should be cheaper.
-- Scratch schema for the demo. IF EXISTS lets the script run on a database
-- where the schema has not been created yet (a plain DROP SCHEMA errors there).
DROP SCHEMA IF EXISTS tmp CASCADE;
CREATE SCHEMA tmp ;
SET search_path='tmp';

-- Table whose rows are limited: at most N cookies per ip_address.
CREATE TABLE authentication
    ( id bigserial NOT NULL             -- Primary key
    , cookie varchar(64) NOT NULL       -- Authenticates the user with a cookie
    , ip_address varchar(40) NOT NULL   -- Device IP address (IPv6-friendly)
    , PRIMARY KEY (ip_address, cookie)
    );

-- Incrementally maintained number of authentication rows per ip_address.
CREATE TABLE authentication_ip_count (
    ip_address character varying(40) NOT NULL
        PRIMARY KEY -- REFERENCES authentication(ip_address)
    , refcnt INTEGER NOT NULL DEFAULT 0
    --
    -- This is much easier:
    -- keep the max value inside the table
    -- + use a table constraint
    -- , maxcnt INTEGER NOT NULL DEFAULT 2 -- actually 100
    -- , CONSTRAINT no_more_cookies CHECK (refcnt <= maxcnt)
    );

-- Registry of limits: per table_name, each column_group value may have at
-- most max_size column_count entries.
CREATE TABLE imposed_maxima
    ( id serial NOT NULL
    , table_name varchar NOT NULL
    , column_group varchar NOT NULL
    , column_count varchar NOT NULL
    , max_size INTEGER NOT NULL
    , PRIMARY KEY (table_name,column_group,column_count)
    );

-- Demo limit: two cookies per ip_address.
INSERT INTO imposed_maxima(table_name,column_group,column_count,max_size)
VALUES('authentication','ip_address','cookie', 2);
-- Row-level BEFORE INSERT/UPDATE/DELETE trigger function for authentication.
-- Keeps authentication_ip_count.refcnt in step with the number of rows per
-- ip_address and rejects an insert once refcnt exceeds the max_size registered
-- in imposed_maxima for ('authentication', 'ip_address', 'cookie').
--
-- Returns NEW for INSERT/UPDATE and OLD for DELETE.
-- FIX: the previous version always returned NEW; NEW is NULL for DELETE, and a
-- NULL result from a BEFORE row trigger cancels the operation, so deletes were
-- silently skipped while refcnt was still decremented.
-- Raises: 'Cookie moster detected' when the limit is exceeded.
CREATE OR REPLACE FUNCTION authentication_impose_maximum()
RETURNS trigger AS
$BODY$
BEGIN
    IF (TG_OP = 'INSERT') THEN
        -- make sure a counter row exists for this ip_address
        INSERT INTO authentication_ip_count (ip_address)
        SELECT NEW.ip_address
        WHERE NOT EXISTS (
            SELECT 1
            FROM authentication_ip_count nx
            WHERE nx.ip_address = NEW.ip_address
        );

        UPDATE authentication_ip_count
        SET refcnt = refcnt + 1
        WHERE ip_address = NEW.ip_address;

        -- the counter row is paired with the single matching limit row
        -- (a cross join filtered in WHERE, not an outer join)
        IF EXISTS (
            SELECT 1
            FROM authentication_ip_count ac
            CROSS JOIN imposed_maxima mx
            WHERE ac.ip_address = NEW.ip_address
              AND mx.table_name = 'authentication'
              AND mx.column_group = 'ip_address'
              AND mx.column_count = 'cookie'
              AND ac.refcnt > mx.max_size
        ) THEN
            RAISE EXCEPTION 'Cookie moster detected';
        END IF;

    ELSIF (TG_OP = 'DELETE') THEN
        UPDATE authentication_ip_count
        SET refcnt = refcnt - 1
        WHERE ip_address = OLD.ip_address;

        -- drop counters that reached zero
        DELETE FROM authentication_ip_count ac
        WHERE ac.ip_address = OLD.ip_address
          AND ac.refcnt <= 0;

        -- a BEFORE DELETE trigger must return a non-NULL row to let the delete proceed
        RETURN OLD;

    -- ELSIF (TG_OP = 'UPDATE') THEN
    -- (Only needed if we allow updates of ip-address)
    -- otherwise the count stays the same.
    END IF;

    RETURN NEW;
END;
$BODY$
LANGUAGE plpgsql;
-- Attach the limit check to every row-level insert, update and delete.
CREATE TRIGGER trigger_authentication_impose_maximum
    BEFORE INSERT OR UPDATE OR DELETE ON authentication
    FOR EACH ROW
    EXECUTE PROCEDURE authentication_impose_maximum();

-- Test it ...
-- Three cookies for one address against the demo limit of two:
-- the third insert is expected to be rejected.
INSERT INTO authentication (ip_address, cookie)
VALUES ('1.2.3.4', 'Some koekje' );
INSERT INTO authentication (ip_address, cookie)
VALUES ('1.2.3.4', 'kaakje' );
INSERT INTO authentication (ip_address, cookie)
VALUES ('1.2.3.4', 'Yet another cookie' );
RESULTS:
INSERT 0 1
CREATE FUNCTION
CREATE TRIGGER
INSERT 0 1
INSERT 0 1
ERROR: Cookie moster detected