How do I drop duplicates from PostgreSQL in chunks? - postgresql

I have a table with many millions of rows and I want to drop the duplicates. I don't want to make a second table and I don't have enough memory to do it all in one go. I tried this code but I'm not sure if it's working correctly
-- Asker's attempt (pseudo-code: a Python-style loop wrapped around one SQL DELETE).
-- Each pass targets up to 10000 duplicate groups and deletes every row of a group
-- whose id is greater than the group's MIN(id).
-- NOTE(review): BEGIN/COMMIT enclose the whole loop, so all passes would run in a single transaction.
BEGIN;
for i in range(0, 80000000, 10000):
DELETE FROM table_name t
USING (SELECT MIN(id) as min_id, column1, column2, ...
FROM table_name
GROUP BY column1, column2, ...
HAVING COUNT(*) > 1
-- NOTE(review): there is no ORDER BY, so which groups a given LIMIT/OFFSET window returns is not defined.
LIMIT 10000
-- NOTE(review): groups cleaned by an earlier pass no longer satisfy HAVING COUNT(*) > 1,
-- so a growing OFFSET skips over groups that still contain duplicates.
OFFSET i) as dups
WHERE t.column1 = dups.column1 AND t.column2 = dups.column2 AND ...
AND t.id > dups.min_id;
COMMIT;

You can try the function below:
-- remove_dupes(): deletes duplicate rows from test_delete in batches and
-- returns the total number of rows deleted.
--
-- Rows are duplicates when bk, col1, col2, col3 and col4 are all equal; within
-- each such group the row with the smallest pk is kept. Every pass cleans at
-- most batch_size duplicate groups, and the loop ends after the first pass
-- that deletes nothing.
--
-- NOTE(review): the columns are compared with "=", so a group in which one of
-- them is NULL is selected by the GROUP BY but never matched by the DELETE; a
-- pass that selects only such groups deletes nothing and ends the loop.
-- NOTE(review): a function body runs inside the calling statement's
-- transaction, so the individual passes are not committed one by one.
CREATE OR REPLACE FUNCTION remove_dupes()
RETURNS int
LANGUAGE plpgsql
AS
$$
DECLARE
    -- Maximum number of duplicate groups cleaned per pass; make it 10,000 or more.
    batch_size    CONSTANT int := 1;
    total_deleted int := 0;
    pass_deleted  int := 0;
BEGIN
    LOOP
        DELETE FROM test_delete td
        USING (
            SELECT bk, col1, col2, col3, col4, min(pk) AS minpk
            FROM test_delete
            GROUP BY bk, col1, col2, col3, col4
            HAVING count(*) > 1
            LIMIT batch_size
        ) dups
        WHERE td.bk = dups.bk
          AND td.col1 = dups.col1
          AND td.col2 = dups.col2
          AND td.col3 = dups.col3
          AND td.col4 = dups.col4
          AND td.pk <> dups.minpk;

        GET DIAGNOSTICS pass_deleted = ROW_COUNT;
        total_deleted := total_deleted + pass_deleted;
        RAISE NOTICE 'deleted this loop: % ; total: %', pass_deleted, total_deleted;

        EXIT WHEN pass_deleted = 0;
    END LOOP;

    RETURN total_deleted;
END;
$$;
Invocation (autocommit assumed):
select * from remove_dupes();

Related

How to make COUNT(*) query faster

I have table1, which contains around 900k rows, and table2, which has around 500k rows and 26 columns. sample table
I want to update table1 with total of unique combinations from table2
I tried different kinds of counts, but the performance is still extremely slow. Are there any alternative options to improve the performance, please?
query #1, the completion time for just 10 lines is 1 min 47 secs.
-- Query #1: for every table1 row with line <= 10, count the groups produced by
-- GROUP BY <joins> on table2 and store that count in table1.result.
DO $$
DECLARE
    src      record;
    v_joins  text;
    v_total  int;
BEGIN
    FOR src IN
        SELECT joins FROM table1 WHERE line <= 10
    LOOP
        v_joins := src.joins;

        -- The joins text is spliced into the statement as raw SQL (%s, not %I).
        EXECUTE format(
            'select count(*) from (select 1 from table2 group by %1$s) as some_alias;',
            v_joins
        )
        INTO v_total;

        UPDATE table1
        SET result = v_total
        WHERE joins = v_joins;
    END LOOP;
END $$;
query #2, the completion time for just 10 lines is 1 min 48 secs
-- Query #2: same as query #1, but the grouped subquery carries an extra
-- HAVING count(*) >= 1 filter before the groups are counted.
DO $$
DECLARE
    src      record;
    v_joins  text;
    v_total  int;
BEGIN
    FOR src IN
        SELECT joins FROM table1 WHERE line <= 10
    LOOP
        v_joins := src.joins;

        -- The joins text is spliced into the statement as raw SQL (%s, not %I).
        EXECUTE format(
            'select count(*) from (select 1 from table2 group by %1$s having count(*)>=1) as some_alias;',
            v_joins
        )
        INTO v_total;

        UPDATE table1
        SET result = v_total
        WHERE joins = v_joins;
    END LOOP;
END $$;
query #3, the completion time for just 10 lines is 1 min 52 secs
-- Query #3: for every table1 row with line <= 10, run
-- count(distinct (<joins>)) over table2 and store the value in table1.result.
DO $$
DECLARE
    src      record;
    v_joins  text;
    v_total  int;
BEGIN
    FOR src IN
        SELECT joins FROM table1 WHERE line <= 10
    LOOP
        v_joins := src.joins;

        -- The joins text is spliced into the statement as raw SQL (%s, not %I).
        EXECUTE format(
            'select count(distinct (%1$s)) from table2',
            v_joins
        )
        INTO v_total;

        UPDATE table1
        SET result = v_total
        WHERE joins = v_joins;
    END LOOP;
END $$;
query #4, the completion time for just 10 lines is 1 min 55 secs
-- Query #4: for every table1 row with line <= 10, count the rows of
-- SELECT DISTINCT (<joins>) on table2 and store that count in table1.result.
DO $$
DECLARE
    src      record;
    v_joins  text;
    v_total  int;
BEGIN
    FOR src IN
        SELECT joins FROM table1 WHERE line <= 10
    LOOP
        v_joins := src.joins;

        -- The joins text is spliced into the statement as raw SQL (%s, not %I).
        EXECUTE format(
            'select count(*) from (select distinct (%1$s) from table2) as temp',
            v_joins
        )
        INTO v_total;

        UPDATE table1
        SET result = v_total
        WHERE joins = v_joins;
    END LOOP;
END $$;
other option, tried with Microsoft Excel, the completion time for just 10 lines is <10 secs which is much faster than all the above sql codes, but still slow with 500k data
=LET(A,Data!$C$2:$AB$12,B,ROWS(A),C,FILTERXML("<A><B>"&SUBSTITUTE(C2,",","</B><B>")&"</B></A>","//B"),ROWS(UNIQUE(INDEX(A,SEQUENCE(B),TRANSPOSE(C)))))

Postgresql: ERROR: syntax error at or near "integer"

I am getting this error while declaring variables in this Postgres query, and I am really not sure what the issue is; I am new to Postgres. Thanks for any feedback.
-- Asker's script as posted: it pages through UNLABELED_IMAGE_PREDICTION ordered by score,
-- using SET assignments and a WHILE ... BEGIN ... END loop.
-- NOTE(review): in PostgreSQL, variable declarations like these are only accepted inside a
-- PL/pgSQL body (function or DO block); as a plain SQL statement DECLARE introduces a cursor,
-- which would explain the reported syntax error at "integer".
DECLARE
maxpagecount integer;
rowsofpage integer;
pagenumber integer;
-- NOTE(review): PL/pgSQL assigns with ":=" or "=" and no SET keyword.
Set rowsofpage :=3;
Set pagenumber :=1;
-- NOTE(review): in PostgreSQL "maxpagecount=count(*)" in a select list is a comparison, not an
-- assignment; SELECT ... INTO is the PL/pgSQL way to fill a variable from a query.
select maxpagecount=count(*) from UNLABELED_IMAGE_PREDICTION;
-- NOTE(review): both operands are declared integer, so the division is already truncated
-- before CEILING is applied.
Set maxpagecount = CEILING(Maxpagecount/rowsofpage);
-- NOTE(review): a PL/pgSQL loop is written WHILE ... LOOP ... END LOOP; rather than BEGIN ... END.
While maxpagecount >= pagenumber
BEGIN
Select x.image_id,x.score
from (
select *,row_number() over (order by score desc) as row_count
from UNLABELED_IMAGE_PREDICTION
) x
-- This filter compares row_count with the table's total row count.
where x.row_count <= (select count(*) from UNLABELED_IMAGE_PREDICTION)
OFFSET (pagenumber-1)* rowsofpage ROWS
FETCH first rowsofpage ROWS ONLY;
-- The page size grows by one on every iteration; the two statements below have no terminating semicolon.
Set rowsOfpage =rowsOfpage + 1
Set pagenumber=pagenumber+1
END;

How to compare two table value using if condition in function of Postgres

-- Asker's version as posted (this is the code that raises the error quoted after it).
-- Intended to return a message chosen by comparing the newest created_time of table_1 and table_2 for an id.
-- NOTE(review): the parameter is named id, the same as the column compared in "t1.id = id" / "t2.id = id".
create or replace function trace.get_latest_exception_custom_msg(id varchar)
returns varchar
language plpgsql
as $$
declare
msg varchar ;
begin
-- PERFORM runs the query and discards its result, so nothing from these two lookups is kept.
perform t1.message, t1.created_time from table_1 t1 where t1.id = id order by t1.created_time desc limit 1;
perform t2.message, t2.created_time from table_2 t2 where t2.id = id order by t2.created_time desc limit 1;
-- The aliases t1 and t2 exist only inside the two queries above; no variable with those names is declared here.
-- All three branches assign t1.message.
if date(t1.created_time ) >= date(t2.created_time) then msg= t1.message;
-- NOTE(review): the lone "d" after elsif looks like a typo.
elsif d date(t1.created_time ) < date(t2.created_time) then msg= t1.message;
else msg =t1.message;
end if;
return msg;
end;
-- NOTE(review): the closing $$ of the function body is not part of the posted snippet.
When I call this function, it gives the error: ERROR: missing FROM-clause entry for table "t_1"
You need to store the result of the two SELECT queries into variables in order to be able to use them in an IF statement.
Your IF statement is also a bit confusing as all three parts assign the same value to msg. I assume that you want to use t2.message at least in one case.
-- Returns a message for p_id taken from the newest row (by created_time) of
-- table_1 or table_2. The two rows are compared by creation day only:
-- table_2's message is returned when its day is strictly later, otherwise
-- table_1's message is returned.
create or replace function trace.get_latest_exception_custom_msg(p_id varchar)
returns varchar
language plpgsql
as
$$
declare
    v_msg_1 varchar;
    v_day_1 date;
    v_msg_2 varchar;
    v_day_2 date;
begin
    -- newest table_1 row for p_id: message and creation day
    select t1.message, t1.created_time::date
      into v_msg_1, v_day_1
      from table_1 t1
     where t1.id = p_id
     order by t1.created_time desc
     limit 1;

    -- newest table_2 row for p_id: message and creation day
    select t2.message, t2.created_time::date
      into v_msg_2, v_day_2
      from table_2 t2
     where t2.id = p_id
     order by t2.created_time desc
     limit 1;

    -- The comparison is not true when either day is NULL, so equal days and
    -- NULL days both fall through to the table_1 message.
    if v_day_1 < v_day_2 then
        return v_msg_2;
    end if;

    return v_msg_1;
end;
$$

Select 1 into variable postgresql?

I have this select statement inside a trigger procedure:
-- Yields the constant 1 for each some_table row that has the new row's user_id
-- and a created value on or after today's date (now() cast to date).
-- Only valid inside the trigger function, where "new" is defined.
SELECT 1 FROM some_table WHERE "user_id" = new."user_id"
AND created >= now()::date;
How can I store the result in a variable and reuse it in an IF statement like this:
IF NOT EXISTS (var_name) THEN ...;
procedure (for now i have select right in IF statement, but i want it separately)
-- Trigger function add_row(): inserts the row (1, 2, 3) into another_table
-- unless some_table already holds a row with the new row's user_id whose
-- created value is on or after today's date.
CREATE OR REPLACE FUNCTION add_row() RETURNS TRIGGER AS $$
BEGIN
    -- need to check if row was created around today
    IF NOT EXISTS (
        SELECT 1
        FROM some_table
        WHERE "user_id" = new."user_id"
          AND created >= now()::date
    ) THEN
        INSERT INTO another_table VALUES (1, 2, 3);
    END IF;

    -- A trigger function must end with RETURN. Returning NEW lets the
    -- triggering row operation proceed in a BEFORE row trigger; the value is
    -- ignored in an AFTER row trigger.
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
To store the result of a query into a variable, you need to declare a variable. Then you can use select .. into .. to store the result. But I would use a boolean and an exists condition for this purpose.
-- Trigger function add_row(): stores in a boolean whether some_table already
-- holds a row with the new row's user_id created on or after the current date,
-- and inserts (1, 2, 3) into another_table when no such row exists.
CREATE OR REPLACE FUNCTION add_row()
RETURNS TRIGGER
AS $$
DECLARE
    l_row_exists boolean;
BEGIN
    SELECT EXISTS (
               SELECT *
               FROM some_table
               WHERE user_id = new.user_id
                 AND created >= current_date
           )
    INTO l_row_exists;

    IF NOT l_row_exists THEN
        INSERT INTO another_table (col1, col2, col3)
        VALUES (1, 2, 3);
    END IF;

    -- A trigger function must end with RETURN. Returning NEW lets the
    -- triggering row operation proceed in a BEFORE row trigger; the value is
    -- ignored in an AFTER row trigger.
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
However, you don't really need an IF statement to begin with. You can simplify this to a single INSERT statement:
-- Single-statement form of the check: the row (1, 2, 3) is inserted only when
-- some_table has no row for new.user_id created on or after the current date.
-- Only valid inside the trigger function, where "new" is defined.
INSERT INTO another_table (col1, col2, col3)
SELECT 1, 2, 3
WHERE NOT EXISTS (
    SELECT 1
    FROM some_table AS st
    WHERE st.user_id = new.user_id
      AND st.created >= CURRENT_DATE
);

PostgreSQL function

I've created this function to re-sequence the sequence number on a BOM table (bomitem).
-- seqincr(parent item id): walks the bomitem rows whose bomitem_parent_item_id
-- equals the argument and sets their bomitem_seqnumber to 10, 12, 14, ... in
-- the order the rows are read (the query has no ORDER BY).
-- Returns each visited row as it was read, i.e. before its update.
CREATE OR REPLACE FUNCTION seqincr(integer)
RETURNS SETOF bomitem AS
$BODY$
DECLARE
    _item      bomitem%ROWTYPE;
    _next_seq  int := 8;
BEGIN
    FOR _item IN
        SELECT *
        FROM bomitem
        WHERE bomitem_parent_item_id = $1
    LOOP
        -- Emit the row first; the returned copy keeps the old sequence number.
        RETURN NEXT _item;

        _next_seq := _next_seq + 2;

        UPDATE bomitem
        SET bomitem_seqnumber = _next_seq
        WHERE bomitem_id = _item.bomitem_id;
    END LOOP;

    RETURN;
END;
$BODY$
LANGUAGE plpgsql VOLATILE
COST 100
ROWS 1000;

ALTER FUNCTION seqincr(integer)
OWNER TO admin;
The example works on an individual bomitem_parent_item_id like below:
SELECT * from seqincr(14917);
I would like to rewrite this function to loop through
SELECT distinct bomitem_parent_item_id FROM bomitem;
so that it resequences the entire BOM table.
What you are trying to do is much simpler with a CTE:
-- Re-sequence the whole bomitem table in one statement: within every
-- bomitem_parent_item_id the rows receive 10, 12, 14, ... (8 + 2 * position),
-- the same numbers seqincr() assigns for a single parent.
-- Rows are numbered per parent (PARTITION BY) and matched back by their own
-- bomitem_id, the column seqincr() uses to address a single row.
-- NOTE(review): seqincr() reads rows in no defined order; ordering by the
-- current bomitem_seqnumber (ties broken by bomitem_id) is an assumption that
-- the existing relative order should be kept. Adjust if another order is wanted.
WITH numbered AS (
    SELECT
        bomitem_id,
        row_number() OVER (
            PARTITION BY bomitem_parent_item_id
            ORDER BY bomitem_seqnumber, bomitem_id
        ) AS rn
    FROM bomitem
    -- seqincr() matches the parent with "=", so rows without a parent are never renumbered
    WHERE bomitem_parent_item_id IS NOT NULL
)
UPDATE bomitem AS b
SET bomitem_seqnumber = 8 + 2 * numbered.rn
FROM numbered
WHERE numbered.bomitem_id = b.bomitem_id;
You need at least PostgreSQL 9.1 for data-modifying CTE.
Or use a subquery, works in earlier versions, too:
-- Re-sequence the whole bomitem table in one statement (subquery form):
-- within every bomitem_parent_item_id the rows receive 10, 12, 14, ...
-- (8 + 2 * position), the same numbers seqincr() assigns for a single parent.
-- Rows are numbered per parent (PARTITION BY) and matched back by their own
-- bomitem_id, the column seqincr() uses to address a single row.
-- NOTE(review): seqincr() reads rows in no defined order; ordering by the
-- current bomitem_seqnumber (ties broken by bomitem_id) is an assumption that
-- the existing relative order should be kept. Adjust if another order is wanted.
UPDATE bomitem AS b
SET bomitem_seqnumber = 8 + 2 * numbered.rn
FROM (
    SELECT
        bomitem_id,
        row_number() OVER (
            PARTITION BY bomitem_parent_item_id
            ORDER BY bomitem_seqnumber, bomitem_id
        ) AS rn
    FROM bomitem
    -- seqincr() matches the parent with "=", so rows without a parent are never renumbered
    WHERE bomitem_parent_item_id IS NOT NULL
) AS numbered
WHERE numbered.bomitem_id = b.bomitem_id;
But you need at least PostgreSQL 8.4 for the window function row_number().