I have table1 which contains around 900k line of records, and table2 around 500k rows and 26 columns. sample table
I want to update table1 with total of unique combinations from table2
Tried with different type of counts, but still the performance is extremally slow. Is there any alternative options to improve the performance please?
query #1, the completion time for just 10 lines is 1 min 47 secs.
do $$
declare
rec record;
current_joins text;
current_result int;
begin for rec in (select joins from table1 where line<=10) loop
select rec.joins into current_joins;
execute format('select count(*) from (select 1 from table2 group by %1$s) as some_alias;', current_joins) into current_result;
update table1 set result = current_result where joins=current_joins;
end loop;
end $$;
query #2, the completion time for just 10 lines is 1 min 48 secs
do $$
declare
rec record;
current_joins text;
current_result int;
begin for rec in (select joins from table1 where line<=10) loop
select rec.joins into current_joins;
execute format('select count(*) from (select 1 from table2 group by %1$s having count(*)>=1) as some_alias;', current_joins) into current_result;
update table1 set result = current_result where joins=current_joins;
end loop;
end $$;
query #3, the completion time for just 10 lines is 1 min 52 secs
do $$
declare
rec record;
current_joins text;
current_result int;
begin for rec in (select joins from table1 where line<=10) loop
select rec.joins into current_joins;
execute format('select count(distinct (%1$s)) from table2', current_joins) into current_result;
update table1 set result = current_result where joins=current_joins;
end loop;
end $$;
query #4, the completion time for just 10 lines is 1 min 55 secs
do $$
declare
rec record;
current_joins text;
current_result int;
begin for rec in (select joins from table1 where line<=10) loop
select rec.joins into current_joins;
execute format('select count(*) from (select distinct (%1$s) from table2) as temp', current_joins) into current_result;
update table1 set result = current_result where joins=current_joins;
end loop;
end $$;
other option, tried with Microsoft Excel, the completion time for just 10 lines is <10 secs which is much faster than all the above sql codes, but still slow with 500k data
=LET(A,Data!$C$2:$AB$12,B,ROWS(A),C,FILTERXML("<A><B>"&SUBSTITUTE(C2,",","</B><B>")&"</B></A>","//B"),ROWS(UNIQUE(INDEX(A,SEQUENCE(B),TRANSPOSE(C)))))
Related
below is my postgres procedure
create or replace procedure ds_rs.test_ad () as $$
declare
i integer;
v_command text;
rcd record;
v_count text; v_pk_1 text; v_pk_2 text;
v_account_id varchar(100);
v_query text;
begin
for rcd in ( select t.table_name as object_name, a.table_schema, a.table_name from TLDEPLOY.TABLES_TO_VERIFY t
left join information_schema.tables a on concat
(COALESCE(a.table_schema, ''),'.',COALESCE(a.table_name, '')) = t.table_name order by t.table_name) loop
select count(*), max(column_name), min(column_name) into v_count, v_pk_1, v_pk_2
from information_schema.table_constraints c
join information_schema.columns cl on cl.table_name =c.table_name
where c.constraint_type ='PRIMARY KEY'
and c.table_name =rcd.table_name
and c.TABLE_SCHEMA=rcd.table_schema
and cl.ordinal_position =1
and cl.TABLE_SCHEMA=rcd.table_schema ;
v_command :=('insert into ds_rs.test_table (pk_1,pk_2) select '||v_pk_1||' , '||md5(v_pk_2::text)||' from ds_rs.account');
execute v_command;
end loop;
end; $$
language plpgsql;
while insert into table this query -> md5(v_pk_2::text) -> hash column name and inserts - i want column values to hash and insert
when used as normal - select md5(account_id::text) from account it gives correct value - but when passed as variable it inserts column name not column data
DROP FUNCTION IF EXISTS top_5(customers.customerid%TYPE, products.prod_id%TYPE, orderlines.quantity%TYPE) CASCADE;
CREATE OR REPLACE FUNCTION top_5(c_id customers.customerid%TYPE, p_id products.prod_id%TYPE, quant orderlines.quantity%TYPE)
RETURNS orders.orderid%TYPE AS $$
DECLARE
top_prod CURSOR IS
SELECT inv.prod_id
FROM inventory AS inv, products AS prod
WHERE inv.prod_id=prod.prod_id
ORDER BY inv.quan_in_stock desc, inv.sales
limit 5;
ord_id orders.orderid%TYPE;
ord_date orders.orderdate%TYPE:= current_date;
ordln_id orderlines.orderlineid%TYPE:=1;
BEGIN
SELECT nova_orderid() INTO ord_id;
INSERT INTO orders(orderid, orderdate,customerid,netamount,tax,totalamount) VALUES(ord_id,ord_date,c_id,0,0,0);
PERFORM compra(c_id, p_id, 1::smallint, ord_id, ordln_id, ord_date);
IF (p_id = top_prod) THEN
UPDATE orders
SET totalamount = totalamount - (totalamount*0.2)
WHERE ord_id = (SELECT MAX(ord_id) FROM orders);
END IF;
END;
$$ LANGUAGE plpgsql;
I have the following code and when i try to execute this
SELECT top_5(1,1,'2');
i have this error
ERROR: operator does not exist: integer = refcursor
LINE 1: SELECT (p_id = top_prod)
You need to get the 'prod_id' value from the cursor 'top_prod'.
You cannot compare two types.
Try this,
DECLARE
top_prod_id top_prod%ROWTYPE;
BEGIN
OPEN top_prod;
LOOP
FETCH top_prod INTO top_prod_id;
EXIT WHEN top_prod %NOTFOUND;
IF (p_id = top_prod_id) THEN
UPDATE orders
SET totalamount = totalamount - (totalamount*0.2)
WHERE ord_id = (SELECT MAX(ord_id) FROM orders);
END IF;
END LOOP;
CLOSE top_prod;
END;
I have a function
drop function ProcessReward();
CREATE OR REPLACE FUNCTION ProcessReward()
RETURNS text AS $$
DECLARE
sessionid NO SCROLL CURSOR FOR SELECT pg."Setting",pg."UserId",pg."Id" FROM "Development"."PersonGame" pg inner join "Development"."Game" g on g."Id" = pg."GameId" pg."GameId"=1 for read only;
titles TEXT DEFAULT '';
rec record;
jsonrec record;
jsonrecord record;
BEGIN
OPEN sessionid;
loop
FETCH sessionid INTO rec;
if not found then
exit ;
end if;
EXECUTE 'select * from "Development"."GameRecipient" where "PersonGameId"=$1' into jsonrecord using rec."Id";
--I want to loop here every row returned by above query
--loop start
-- do your task
--loop end
end loop;
return titles;
END;
$$ LANGUAGE plpgsql;
My query
EXECUTE 'select * from "Development"."GameRecipient" where
"PersonGameId"=$1' into jsonrecord using rec."Id";
returns
col1 col2 col3
123 324 444
345 222 765
I want to process all rows returned by above query,How to achieve this in PostgreSQL.
I have multiple tables with each two rows of interest: connection_node_start_id and connection_node_end_id. My goal is to get a collection of all those IDs, either as a flat ARRAY or as a new TABLE consisting of one row.
Example output ARRAY:
result = {1,4,7,9,2,5}
Example output TABLE:
IDS
-------
1
4
7
9
2
5
My fist attempt is somewhat clumsy and does not work properly as the SELECT statement just returns one row. It seems there must be a simple way to do this, can someone point me into the right direction?
CREATE OR REPLACE FUNCTION get_connection_nodes(anyarray)
RETURNS anyarray AS
$$
DECLARE
table_name varchar;
result integer[];
sel integer[];
BEGIN
FOREACH table_name IN ARRAY $1
LOOP
RAISE NOTICE 'table_name(%)',table_name;
EXECUTE 'SELECT ARRAY[connection_node_end_id,
connection_node_start_id] FROM ' || table_name INTO sel;
RAISE NOTICE 'sel(%)',sel;
result := array_cat(result, sel);
END LOOP;
RETURN result;
END
$$
LANGUAGE 'plpgsql';
Test table:
connection_node_start_id | connection_node_end_id
--------------------------------------------------
1 | 4
7 | 9
Call:
SELECT get_connection_nodes(ARRAY['test_table']);
Result:
{1,4} -- only 1st row, rest is missing
For Postgres 9.3+
CREATE OR REPLACE FUNCTION get_connection_nodes(text[])
RETURNS TABLE (ids int) AS
$func$
DECLARE
_tbl text;
BEGIN
FOREACH _tbl IN ARRAY $1
LOOP
RETURN QUERY EXECUTE format('
SELECT t.id
FROM %I, LATERAL (VALUES (connection_node_start_id)
, (connection_node_end_id)) t(id)'
, _tbl);
END LOOP;
END
$func$ LANGUAGE plpgsql;
Related answer on dba.SE:
SELECT DISTINCT on multiple columns
Or drop the loop and concatenate a single query. Probably fastest:
CREATE OR REPLACE FUNCTION get_connection_nodes2(text[])
RETURNS TABLE (ids int) AS
$func$
BEGIN
RETURN QUERY EXECUTE (
SELECT string_agg(format(
'SELECT t.id FROM %I, LATERAL (VALUES (connection_node_start_id)
, (connection_node_end_id)) t(id)'
, tbl), ' UNION ALL ')
FROM unnest($1) tbl
);
END
$func$ LANGUAGE plpgsql;
Related:
Loop through like tables in a schema
LATERAL was introduced with Postgres 9.3.
For older Postgres
You can use the set-returning function unnest() in the SELECT list, too:
CREATE OR REPLACE FUNCTION get_connection_nodes2(text[])
RETURNS TABLE (ids int) AS
$func$
BEGIN
RETURN QUERY EXECUTE (
SELECT string_agg(
'SELECT unnest(ARRAY[connection_node_start_id
, connection_node_end_id]) FROM ' || tbl
, ' UNION ALL '
)
FROM (SELECT quote_ident(tbl) AS tbl FROM unnest($1) tbl) t
);
END
$func$ LANGUAGE plpgsql;
Should work with pg 8.4+ (or maybe even older). Works with current Postgres (9.4) as well, but LATERAL is much cleaner.
Or make it very simple:
CREATE OR REPLACE FUNCTION get_connection_nodes3(text[])
RETURNS TABLE (ids int) AS
$func$
BEGIN
RETURN QUERY EXECUTE (
SELECT string_agg(format(
'SELECT connection_node_start_id FROM %1$I
UNION ALL
SELECT connection_node_end_id FROM %1$I'
, tbl), ' UNION ALL ')
FROM unnest($1) tbl
);
END
$func$ LANGUAGE plpgsql;
format() was introduced with pg 9.1.
Might be a bit slower with big tables because each table is scanned once for every column (so 2 times here). Sort order in the result is different, too - but that does not seem to matter for you.
Be sure to sanitize escape identifiers to defend against SQL injection and other illegal syntax. Details:
Table name as a PostgreSQL function parameter
The EXECUTE ... INTO statement can only return data from a single row:
If multiple rows are returned, only the first will be assigned to the INTO variable.
In order to concatenate values from all rows you have to aggregate them first by column and then append the arrays:
EXECUTE 'SELECT array_agg(connection_node_end_id) ||
array_agg(connection_node_start_id) FROM ' || table_name INTO sel;
You're probably looking for something like this:
CREATE OR REPLACE FUNCTION d (tblname TEXT [])
RETURNS TABLE (c INTEGER) AS $$
DECLARE sql TEXT;
BEGIN
WITH x
AS (SELECT unnest(tblname) AS tbl),
y AS (
SELECT FORMAT('
SELECT connection_node_end_id
FROM %s
UNION ALL
SELECT connection_node_start_id
FROM %s
', tbl, tbl) AS s
FROM x)
SELECT string_agg(s, ' UNION ALL ')
INTO sql
FROM y;
RETURN QUERY EXECUTE sql;
END;$$
LANGUAGE plpgsql;
CREATE TABLE a (connection_node_end_id INTEGER, connection_node_start_id INTEGER);
INSERT INTO A VALUES (1,2);
CREATE TABLE b (connection_node_end_id INTEGER, connection_node_start_id INTEGER);
INSERT INTO B VALUES (100, 101);
SELECT * from d(array['a','b']);
c
-----
1
2
100
101
(4 rows)
I have the following procedure :
CREATE OR REPLACE FUNCTION findKNN()
RETURNS Text AS $body$
DECLARE
cur refcursor;
tempcur refcursor;
gid_ integer;
_var1 integer;
_var2 integer;
BEGIN
open cur for execute('select gid from polygons');
loop
fetch cur into gid_;
open tempcur for SELECT g1.gid , g2.gid FROM polygons AS g1, polygons AS g2
WHERE g1.gid = gid_ and g1.gid <> g2.gid ORDER BY g1.gid , ST_Distance(g1.the_geom,g2.the_geom)
LIMIT 5;
loop
fetch tempcur into _var1 , _var2;
-- how to return _var1 , _var2 here ?
end loop;
end loop;
close cur;
END;
$body$
LANGUAGE plpgsql;
But I don't know how to return the result out of this procedure. The query returns 5 rows for each execution within outer cursor loop. How can I retrieve these five rows for each query execution?
Unless you are trying to do something more complicated that is not in your question, you can radically simplify to:
CREATE OR REPLACE FUNCTION find_knn()
RETURNS TABLE(gid1 integer, gid2 integer) AS
$body$
BEGIN
RETURN QUERY
SELECT g1.gid , g2.gid
FROM polygons g1
JOIN polygons g2 ON g1.gid <> g2.gid
-- WHERE g1.gid = <some_condition> -- ???
ORDER BY g1.gid, st_distance(g1.the_geom, g2.the_geom)
LIMIT 5;
END;
$body$ LANGUAGE plpgsql;
Or even:
CREATE OR REPLACE FUNCTION find_knn()
RETURNS TABLE(gid1 integer, gid2 integer) AS
$body$
SELECT g1.gid , g2.gid
FROM polygons g1
JOIN polygons g2 ON g1.gid <> g2.gid
-- WHERE g1.gid = <some_condition> -- ???
ORDER BY g1.gid, st_distance(g1.the_geom, g2.the_geom)
LIMIT 5;
$body$ LANGUAGE sql;
Call:
SELECT * FROM x.find_knn();
The manual about Returning From a Function.
The manual about CREATE FUNCTION.
Retrieve a small slice of a huge join
(Answer to comment.)
There is many ways to pick a small slice of a huge join without actually evaluating the whole join. In most cases you don't even have to worry about it. For instance, run this at home:
EXPLAIN ANALYZE
SELECT *
FROM huge_tbl t1
CROSS JOIN huge_tbl t2
LIMIT 5
You will see that only 5 rows will be processed, not the whole cross join.
The same is true for a CTE:
WITH a AS (
SELECT *
FROM huge_tbl t1
CROSS JOIN huge_tbl t2
)
SELECT *
FROM a
LIMIT 5
Some limitations apply. I quote the excellent manual:
PostgreSQL's implementation evaluates only as many rows of a WITH
query as are actually fetched by the parent query.
To make absolutely sure, you could apply the LIMIT (or a fitting WHERE clause) at the source:
SELECT *
FROM (SELECT * FROM huge_table LIMIT 1) t1
CROSS JOIN (SELECT * FROM huge_table LIMIT 5) t2;