PostgreSQL: Iterate through a tables rows with for loop, retrieve column value based on current row - postgresql

I have the following 2 tables
-- Sales staff. salesterritoryid is matched against territory2_t.territoryid
-- by the queries in this thread; no foreign key is declared for it.
CREATE TABLE salesperson_t (
    salespersonid        numeric(4,0) NOT NULL,
    salespersonname      character varying(25),
    salespersontelephone character varying(50),
    salespersonfax       character varying(50),
    salespersonaddress   character varying(30),
    salespersoncity      character varying(20),
    salespersonstate     character(2),
    salespersonzip       character varying(20),
    salesterritoryid     numeric(4,0),
    CONSTRAINT salesperson_pk PRIMARY KEY (salespersonid)
);

-- Sample rows (id 7 is intentionally absent from the data set).
INSERT INTO salesperson_t (
    salespersonid, salespersonname, salespersontelephone, salespersonfax,
    salespersonaddress, salespersoncity, salespersonstate, salespersonzip,
    salesterritoryid
)
VALUES
    (1,  'Doug Henny',     '8134445555', NULL, NULL,                NULL,        NULL, NULL,    2),
    (2,  'Robert Lewis',   '8139264006', NULL, '124 Deerfield',     'Lutz',      'FL', '33549', 13),
    (3,  'William Strong', '3153821212', NULL, '787 Syracuse Lane', 'Syracuse',  'NY', '33240', 3),
    (4,  'Julie Dawson',   '4355346677', NULL, NULL,                NULL,        NULL, NULL,    4),
    (5,  'Jacob Winslow',  '2238973498', NULL, NULL,                NULL,        NULL, NULL,    5),
    (6,  'Pepe Lepue',     NULL,         NULL, NULL,                'Platsburg', 'NY', NULL,    13),
    (8,  'Fred Flinstone', NULL,         NULL, '1 Rock Lane',       'Bedrock',   'Ca', '99999', 2),
    (9,  'Mary James',     '3035555454', NULL, '9 Red Line',        'Denver',    'CO', '55555', 4),
    (10, 'Mary Smithson',  '4075555555', NULL, '4585 Maple Dr',     'Orlando',   'FL', '32826', 15);
-- Territories; total_sales_person starts out NULL and is filled in later
-- by the UPDATE statements discussed in this thread.
CREATE TABLE territory2_t (
    territoryid        numeric(4,0),
    territoryname      character varying(50),
    total_sales_person integer,
    CONSTRAINT territory2_t_pk PRIMARY KEY (territoryid)
);

-- Note that the ids are not contiguous (7-11 and 14 are unused).
INSERT INTO territory2_t (territoryid, territoryname, total_sales_person)
VALUES
    (1,  'SouthEast', NULL),
    (2,  'SouthWest', NULL),
    (3,  'NorthEast', NULL),
    (4,  'NorthWest', NULL),
    (5,  'Central',   NULL),
    (6,  'Alaska',    NULL),
    (12, 'Hawaii',    NULL),
    (13, 'Colorado',  NULL),
    (15, 'Arizona',   NULL);
I have the following pseudo code:
-- Sets territory2_t.total_sales_person to the number of salesperson_t rows
-- assigned to each territory, visiting one territory per loop iteration.
DO $$
DECLARE
    -- Holds the territory currently being processed.
    current_row RECORD;
BEGIN
    -- Loop over the rows themselves rather than over 1..count(*): the
    -- territory ids are not contiguous (1-6, 12, 13, 15), so a plain counter
    -- would never reach ids 12, 13 and 15.
    FOR current_row IN
        SELECT territoryid
        FROM territory2_t
        ORDER BY territoryid
    LOOP
        RAISE NOTICE 'Territory: %', current_row.territoryid; -- debugging purposes

        UPDATE territory2_t
        SET total_sales_person = (
                SELECT count(*)
                FROM salesperson_t
                WHERE salesterritoryid = current_row.territoryid
            )
        WHERE territoryid = current_row.territoryid;
    END LOOP;
END;
$$;
Its purpose is to count how many rows in the salesperson table (salesperson_t) have a salesterritoryid equal to the current row's territory2_t.territoryid, and then assign that count to the current row's territory2_t.total_sales_person.

You don't need a loop or even a function for this.
What you want to do can be done in a single update statement because the total count per territory can be calculated with a single aggregation:
-- Number of salespeople assigned to each territory id.
SELECT
    sp.salesterritoryid,
    count(*) AS total_count
FROM salesperson_t AS sp
GROUP BY sp.salesterritoryid;
This can then be used as the source to update the territory table:
-- Aggregate once, then join the counts onto the territory table.
-- Territories with no salesperson are not touched by this statement.
WITH territory_counts AS (
    SELECT
        sp.salesterritoryid,
        count(*) AS total_count
    FROM salesperson_t AS sp
    GROUP BY sp.salesterritoryid
)
UPDATE territory2_t AS tg
SET total_sales_person = tc.total_count
FROM territory_counts AS tc
WHERE tg.territoryid = tc.salesterritoryid;
An alternative that might be easier to understand, but will be slower for larger tables, is an update with a correlated subquery:
-- Every territory row is updated; territories without any salesperson
-- receive a count of 0.
UPDATE territory2_t AS terr
SET total_sales_person = (
        SELECT count(*)
        FROM salesperson_t AS person
        WHERE person.salesterritoryid = terr.territoryid
    );
There is a slight difference between the first and second update: the second one will update the total_sales_person to 0 (zero) for those territories where there is no salesperson at all. The first one will only update the count for territories that are actually present in the salesperson table.
Unrelated, but: having a "type identifying" prefix or suffix for an identifier is usually useless and doesn't really help at all. See a related discussion on dba.stackexchange

Related

Cannot enter data into tables with inter related foreign keys

I have created the tables with the following code, but the foreign key constraints do not allow me to add data. What can I do to solve this problem?
-- Employee and Department reference each other (Employee.DNumber ->
-- Department, Department.MgrSsn -> Employee), so Employee is created first
-- without foreign keys and its constraints are added once both tables exist.
CREATE TABLE Employee (
    Ssn           VARCHAR(10) PRIMARY KEY NOT NULL,
    BDate         DATE,
    FName         VARCHAR(25),
    MInit         VARCHAR(5),
    LName         VARCHAR(25),
    Address       VARCHAR(40),
    Sex           VARCHAR(6),
    Salary        INT,
    SupervisorSsn VARCHAR(10),
    DNumber       INT
);

-- (A stray "CREATE TABLE" line, apparently pasted command output, was
-- removed here; it made the script fail to parse.)
CREATE TABLE Department (
    DNumber           INT PRIMARY KEY NOT NULL,
    DName             VARCHAR(15),
    MgrSsn            VARCHAR(10),
    MgrStartDate      DATE,
    NumberofEmployees INT,
    CONSTRAINT Department_MgrSsn_FK FOREIGN KEY (MgrSsn)
        REFERENCES Employee (Ssn)
        ON DELETE SET DEFAULT ON UPDATE CASCADE
);

-- The referencing columns declare no DEFAULT, so SET DEFAULT falls back to
-- the implicit column default (NULL).
ALTER TABLE Employee
    ADD CONSTRAINT Employee_SupervisorSsn_FK FOREIGN KEY (SupervisorSsn)
        REFERENCES Employee (Ssn)
        ON DELETE SET DEFAULT ON UPDATE CASCADE,
    ADD CONSTRAINT Employee_DNumber_FK FOREIGN KEY (DNumber)
        REFERENCES Department (DNumber)
        ON DELETE SET DEFAULT ON UPDATE CASCADE;
There are several ways to do that in Postgres.
Update later
The most obvious one: insert null values first, then update them later:
-- Step 1: departments without a manager yet (mgrssn stays NULL).
INSERT INTO department (dnumber, dname)
VALUES
    (1, 'One'),
    (2, 'Two'),
    (3, 'Three');

-- Step 2: employees. The departments already exist, and the self-reference
-- ('123' -> '456') is satisfied within this single statement.
INSERT INTO employee (ssn, fname, lname, supervisorssn, dnumber)
VALUES
    ('123', 'Arthur', 'Dent',       '456', 1),
    ('456', 'Ford',   'Prefect',    NULL,  2),
    ('789', 'Zaphod', 'Beeblebrox', NULL,  3);

-- Step 3: now that the employees exist, assign the managers.
UPDATE department
SET mgrssn = CASE
                 WHEN dnumber = 3 THEN '789'
                 ELSE '456'
             END
WHERE dnumber IN (1, 2, 3);
Online example
Deferred constraints
Make the constraints deferred, so that they will be checked at the end of the transaction rather than when running the INSERT:
-- Deferrable variant of the department -> employee foreign key: the check
-- is postponed until the transaction commits.
ALTER TABLE department
    ADD CONSTRAINT fk_dempt2emp
        FOREIGN KEY (mgrssn)
        REFERENCES employee
        DEFERRABLE INITIALLY DEFERRED; --<<
Then you can insert the rows in any order you like as long as everything happens in a single transaction:
-- With deferred foreign keys the rows can be inserted in any order, as long
-- as both INSERTs run inside the same transaction.
BEGIN TRANSACTION; --<< important!

INSERT INTO department (dnumber, dname, mgrssn)
VALUES
    (1, 'One',   '456'),
    (2, 'Two',   '456'),
    (3, 'Three', '789');  -- the terminator was missing here, which fused the two INSERTs

INSERT INTO employee (ssn, fname, lname, supervisorssn, dnumber)
VALUES
    ('123', 'Arthur', 'Dent',       '456', 1),
    ('456', 'Ford',   'Prefect',    NULL,  2),
    ('789', 'Zaphod', 'Beeblebrox', NULL,  3);

COMMIT; -- the FKs will be checked here
Do everything in a single statement
You can use a data modifying CTE to insert rows into two tables. As this is evaluated as a single statement, the constraints do not need to be deferrable.
-- Both INSERTs form one statement, so the foreign keys are only checked
-- after all rows of both tables are in place.
WITH inserted_departments AS (
    INSERT INTO department (dnumber, dname, mgrssn)
    VALUES
        (1, 'One',   '456'),
        (2, 'Two',   '456'),
        (3, 'Three', '789')
)
INSERT INTO employee (ssn, fname, lname, supervisorssn, dnumber)
VALUES
    ('123', 'Arthur', 'Dent',       '456', 1),
    ('456', 'Ford',   'Prefect',    NULL,  2),
    ('789', 'Zaphod', 'Beeblebrox', NULL,  3);
Online example

Postgresql find by count, joined table

Given 3 tables. I need to build SQL query to find two actors who CAST TOGETHER THE MOST and list the titles of those movies. Sort alphabetically
https://www.db-fiddle.com/f/r2Y9CpH8n7MHTeBaqEHe9S/0
The data for reproducing below:
-- Link table: one row per (actor, film) appearance.
CREATE TABLE film_actor (
    actor_id integer,
    film_id  integer
);

CREATE TABLE film (
    film_id integer,
    title   varchar
);

CREATE TABLE actor (
    actor_id   integer,
    first_name varchar,
    last_name  varchar
);

-- Actors 1 and 2 share films 1-3; actor 3 appears only in film 1.
INSERT INTO public.film_actor (actor_id, film_id)
VALUES
    (1, 1),
    (1, 2),
    (1, 3),
    (2, 1),
    (2, 2),
    (2, 3),
    (3, 1);

INSERT INTO public.film (film_id, title)
VALUES
    (1, 'First'),
    (2, 'Second'),
    (3, 'Third'),
    (4, 'Fourth');

INSERT INTO public.actor (actor_id, first_name, last_name)
VALUES
    (1, 'John',   'Snow'),
    (2, 'Spider', 'Man'),
    (3, 'Mike',   'Kameron');
Is this what you are looking for?
-- Finds the pair of actors who appear together in the most films.
WITH acting_pairs AS (
    -- One row per film and unordered actor pair; the "<" keeps each pair
    -- once and excludes an actor paired with themselves.
    SELECT
        a1.actor_id AS a1_id,
        a2.actor_id AS a2_id
    FROM film_actor AS a1
    INNER JOIN film_actor AS a2
        ON a1.film_id = a2.film_id
    WHERE a1.actor_id < a2.actor_id
)
SELECT
    a1_id,
    a2_id,
    count(*) AS total
FROM acting_pairs
GROUP BY a1_id, a2_id
-- The actor ids act as tie-breakers so that LIMIT 1 returns the same pair
-- on every run when several pairs share the top count.
ORDER BY total DESC, a1_id, a2_id
LIMIT 1;
Giving us expected output for the example input would be nice.

List ranges and the total count based on condition

Table Schema
-- Master rows: one per prefix (CharToAdd) and number range [Start, End].
CREATE TABLE [dbo].[TblMaster]
(
    [SID]         [int] IDENTITY(1,1) NOT NULL PRIMARY KEY,
    [VID]         [int] NOT NULL,
    [CreatedDate] [datetime] DEFAULT (getdate()) NOT NULL,
    [CharToAdd]   [varchar](10) NULL,
    [Start]       [int] NOT NULL,
    [End]         [int] NOT NULL
)
GO
-- Detail rows: one per serial number (Sno) under a master row, with two
-- bit flags that both default to 0.
CREATE TABLE [dbo].[TblDetails]
(
    [DetailsID]        [int] IDENTITY(1,1) NOT NULL PRIMARY KEY,
    [SID]              [int] NOT NULL,
    [Sno]              [int] NOT NULL,
    [ConcatenatedText] [varchar](20) NOT NULL,
    [isIssued]         [bit] DEFAULT (0) NOT NULL,
    [isUsed]           [bit] DEFAULT (0) NOT NULL
)
GO
Sample Data:
-- Two master rows, both for VID 1; CreatedDate takes its column default.
INSERT INTO dbo.TblMaster (VID, CreatedDate, CharToAdd, [Start], [End])
VALUES
    (1, DEFAULT, 'CA', 1, 5),
    (1, DEFAULT, 'PA', 1, 5)
GO
-- Five detail rows for each SID 1-3. Only two master rows are inserted
-- above, so the SID 3 rows presumably have no matching master row.
INSERT INTO dbo.TblDetails ([SID], Sno, ConcatenatedText, isIssued, isUsed)
VALUES
    (1, 1, 'CA1', 0, 0),
    (1, 2, 'CA2', 0, 0),
    (1, 3, 'CA3', 0, 0),
    (1, 4, 'CA4', 1, 0),
    (1, 5, 'CA5', 0, 0),
    (2, 1, 'PA1', 0, 0),
    (2, 2, 'PA2', 0, 0),
    (2, 3, 'PA3', 1, 0),
    (2, 4, 'PA4', 0, 0),
    (2, 5, 'PA5', 0, 0),
    (3, 1, '1', 0, 0),
    (3, 2, '2', 1, 0),
    (3, 3, '3', 1, 0),
    (3, 4, '4', 0, 0),
    (3, 5, '5', 0, 0)
GO
Expected Output:
Query I have built as of now:
-- Asker's attempt at listing ranges of available serial numbers for one VID.
-- T-SQL variables use the "@" sigil; "#VID" is not a valid variable name.
-- NOTE: ROW_NUMBER() below is not partitioned by prefix, so serial numbers
-- from different prefixes share one numbering; this is the part the
-- question is asking about and it is left as asked.
DECLARE @VID INT = 1;

WITH Tmp AS
(
    SELECT
        TM.CharToAdd AS Prefix,
        sno,
        sno - ROW_NUMBER() OVER (ORDER BY sno) AS grp
    FROM dbo.TblDetails TD
    LEFT JOIN dbo.TblMaster TM
        ON TM.[SID] = TD.[SID]
    WHERE isIssued = 0
      AND isUsed = 0
      AND TM.VID = @VID
)
SELECT
    Prefix,
    MIN(sno) AS RangeStart,
    MAX(sno) AS RangeEnd,
    COUNT(*) AS [Count]
FROM Tmp
GROUP BY grp, Prefix
In the TblDetails table I want to find the ranges of available values and their total counts, considering only records whose bit columns are both 0. If a bit column is 1, the record is already used, so I am trying to skip it and list the rest as available records. I am not sure whether I am explaining the problem well, so I have provided sample data and the expected output for better understanding. I did try a recursive function, but the result does not match the expected output, so I am looking for help to resolve this.
You were very close...
CODE
-- Lists each run of consecutive available serial numbers per prefix
-- ("gaps and islands"): inside a run, sno minus its row number is constant,
-- so that difference (grp) identifies the run.
DECLARE @VID INT = 1;  -- T-SQL variables use "@"; "#VID" does not parse

WITH cte AS
(
    SELECT
        m.CharToAdd,
        d.sno,
        -- Numbering restarts for every prefix so runs are found per prefix.
        d.sno - ROW_NUMBER() OVER (PARTITION BY m.CharToAdd ORDER BY d.sno) AS grp
    FROM TblMaster m
    INNER JOIN TblDetails d
        ON d.sid = m.sid
    WHERE d.isIssued = 0
      AND d.isUsed = 0
      AND m.vid = @VID
)
SELECT
    CharToAdd,
    MIN(sno) AS Start,
    MAX(sno) AS [End],
    (MAX(sno) - MIN(sno) + 1) AS [Count]
FROM cte
GROUP BY
    CharToAdd, grp
-- MIN(sno) is added so that the ranges of one prefix come back in a
-- defined order.
ORDER BY
    CharToAdd, MIN(sno)
RESULTS
CharToAdd Start End Count
CA 1 3 3
CA 5 5 1
PA 1 2 2
PA 4 5 2

triggers and functions trouble

I'm currently going through the growing pains of trying to learn about functions and triggers. I'm trying to do a problem from a book I'm reading , but i dont understand how to do certain parts.
using this table
-- Schema and sample data for the movie-rental exercise.
-- The e-mail addresses are written with "@"; they appeared with "#" in its
-- place, which looks like an extraction artifact.
CREATE TABLE movies (
    id    integer PRIMARY KEY,
    title varchar(255) NOT NULL,
    year  integer
);

INSERT INTO movies (id, title, year)
VALUES
    (1, 'The Croods',     2013),
    (2, 'Now You See Me', 2013),
    (3, 'Argo',           2012),
    (4, 'Jurassic World', 2015);

-- NOTE(review): disc_types is referenced here but is not defined anywhere in
-- this excerpt; it must exist (with ids 1 and 2) before this table is created.
CREATE TABLE discs (
    id        integer PRIMARY KEY,
    movie_id  integer NOT NULL REFERENCES movies(id),
    type_id   integer REFERENCES disc_types(id),
    price     decimal(10,2),
    available boolean
);

INSERT INTO discs (id, movie_id, type_id, price, available)
VALUES
    (1, 1, 1, 1.59, 't'),
    (2, 1, 1, 1.59, 'f'),
    (3, 1, 2, 2.99, 'f'),
    (4, 2, 1, 1.29, 't'),
    (5, 2, 1, 1.29, 't'),
    (6, 2, 2, 2.99, 't'),
    (7, 3, 2, 2.59, 't'),
    (8, 3, 2, 2.59, 't');

CREATE TABLE customers (
    id    integer PRIMARY KEY,
    name  varchar(255),
    email varchar(255)
);

INSERT INTO customers (id, name, email)
VALUES
    (1, 'John', 'john@hotmail.com'),
    (2, 'Jane', 'jane@gmail.com');

-- A rental is tied to a physical disc, not directly to a movie;
-- date_returned is NULL while the disc is still out.
CREATE TABLE rentals (
    id            integer PRIMARY KEY,
    customer_id   integer NOT NULL REFERENCES customers(id),
    disc_id       integer NOT NULL REFERENCES discs(id),
    date_rented   date,
    date_returned date
);

INSERT INTO rentals (id, customer_id, disc_id, date_rented, date_returned)
VALUES
    (1, 1, 7, '2013-10-01', '2013-10-03'),
    (2, 2, 5, '2013-10-05', '2013-10-06'),
    (3, 2, 2, '2013-11-02', NULL),
    (4, 2, 3, '2013-11-02', NULL);

-- At most one rating per customer and movie.
CREATE TABLE ratings (
    customer_id integer NOT NULL REFERENCES customers(id),
    movie_id    integer NOT NULL REFERENCES movies(id),
    rating      integer,
    PRIMARY KEY (customer_id, movie_id)
);

INSERT INTO ratings (customer_id, movie_id, rating)
VALUES
    (1, 1, 1),
    (1, 2, 4),
    (1, 3, 5),
    (2, 1, 4);
My logic was that I would take the new values about to be inserted into or updated in the ratings table and compare them with what is in the rentals table, to see whether that customer had already rented that movie; if they had, they could enter a rating. But I can't translate that logic into this function, unless there is an easier way to do it.
The loop inside the function complicates matters a bit, let's see if we can get rid of it. Your ratings table has a reference to customer and movie so we need a join.
-- Fragment for use inside the trigger function: counts how many times the
-- rating customer has rented any disc of the rated movie. rentals only
-- knows the disc, so the movie is reached through discs.
-- (The earlier form selected a bare "id" from discs joined to rentals,
-- which is ambiguous because both tables have an id column.)
SELECT COUNT(*)
INTO rented
FROM rentals AS r
INNER JOIN discs AS d
    ON d.id = r.disc_id
WHERE d.movie_id = NEW.movie_id
  AND r.customer_id = NEW.customer_id;
Right this should make the logic of your stored procedure a lot easier. I am now leaving you to finish it because this after all is a learning exercise.
You need this sort of a join because it's more efficient and simpler than the loop. The ratings table has a reference to the movie_id but the rentals table only has a disc_id thus to find out if the user has rented a particular movie, you need to join it through the disc table.
You will need to change the return values. ref: http://www.postgresql.org/docs/9.2/static/plpgsql-trigger.html
Row-level triggers fired BEFORE can return null to signal the trigger
manager to skip the rest of the operation for this row (i.e.,
subsequent triggers are not fired, and the INSERT/UPDATE/DELETE does
not occur for this row). If a nonnull value is returned then the
operation proceeds with that row value
And also note that you do not do an INSERT inside your trigger function. You just return a non null value for the insert to proceed.
This is the EXISTS() version. (BTW: the definition for movies is missing)
-- Trigger function for ratings: rejects a rating unless the customer has a
-- rental of some disc of the rated movie.
-- Returns NEW when such a rental exists; raises an exception otherwise.
CREATE OR REPLACE FUNCTION rate_only_rented()
RETURNS TRIGGER AS $func$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM rentals r
        JOIN discs d ON r.disc_id = d.id
        WHERE d.movie_id = NEW.movie_id
          AND r.customer_id = NEW.customer_id
    ) THEN
        -- RAISE EXCEPTION leaves the function immediately, so no RETURN is
        -- needed (or reachable) on this path.
        RAISE EXCEPTION 'you(%) have not rented this movie(%) before'
            , NEW.customer_id, NEW.movie_id;
    END IF;

    RETURN NEW;
END;
$func$ LANGUAGE plpgsql;
And the trigger:
-- Runs rate_only_rented() once per inserted or updated ratings row; an
-- exception raised by the function aborts the statement.
CREATE TRIGGER rate_only_rented
    AFTER INSERT OR UPDATE ON ratings
    FOR EACH ROW
    EXECUTE PROCEDURE rate_only_rented();

Updating duplicates from one-to-many relationships.

This isn't your standard "how do I find duplicates" question; I know how to find duplicates (see below). The question is: how do I update those records when they also have child items with matching records?
Alright, I'm going to give you whole scenario so that you can work with this problem.
Duplicate records could be inserted as a result of critical system failure.
Finding later duplicates and marking the parent commission_import_commission_junction "is_processed = True" solves this problem.
The complication is that the commission_import_commission_junction and its children commission_import_commission_junction_line_items must be identical on the columns to compare.
the tables are:
commission_import_commission_junction
- id
- created_date
- some columns that are checked for duplication
- some columns that are not checked for duplication
commission_import_commission_junction_line_items
- id
- some columns that are checked for duplication
- some columns that are not checked for duplication
(For the full table spec, check out the CREATE TABLE statements in the bottom-most block of code.)
The query to mark duplicates on just the parent table commission_import_commission_junction:
-- Marks later duplicates in the parent table as processed. Rows count as
-- duplicates when all compared columns are equal (NULLs compare as equal);
-- the row(s) with the earliest created_date in each group are left alone.
-- The line-items table is deliberately not joined: this is a parent-only
-- check, and joining it made posting_date and sale_amount ambiguous because
-- both tables have columns with those names.
WITH dated AS (
    SELECT
        commission_import_commission_junction_id AS junction_id,
        -- PARTITION BY groups NULLs together, like IS NOT DISTINCT FROM.
        MIN(created_date) OVER (
            PARTITION BY
                member_id, site_id, action_status, action_type, ad_id,
                commission_id, country, event_date, locking_date, order_id,
                original, original_action_id, posting_date, website_id,
                advertiser_name, commission_amount, sale_amount,
                aggregator_affiliate_id
        ) AS first_date
    FROM commission_import_commission_junction
)
UPDATE commission_import_commission_junction cicj
SET is_processed = TRUE
FROM dated
WHERE cicj.commission_import_commission_junction_id = dated.junction_id
  -- Only rows newer than the first of their group; a group of one never matches.
  AND cicj.created_date <> dated.first_date
  AND cicj.is_processed = FALSE;
Somewhere and somehow I need to check that the line_items are also duplicates.
THE CODE BELOW IS TO SETUP THE DATABASE, remember this is postgres specific.
-- "commission_import_build" is a record that keeps information about the process of collecting the commission information. Duplicate commission_import_commission_junction records will not exist with the same commission_import_build_id
-- "commission_import_commission_junction" is a record description commission information from a customers purchase.
-- "commission_import_commission_junction_line_items" are records describing items in that purchase.
-- Drop children before parents so the foreign keys do not block the drops.
DROP TABLE IF EXISTS commission_import_commission_junction_line_items;
DROP TABLE IF EXISTS commission_import_commission_junction;
DROP TABLE IF EXISTS commission_import_builds;

CREATE TABLE commission_import_builds
(
    commission_import_build_id serial NOT NULL,
    build_date timestamp with time zone NOT NULL,
    CONSTRAINT pkey_commission_import_build_id PRIMARY KEY (commission_import_build_id),
    CONSTRAINT commission_import_builds_build_date_key UNIQUE (build_date)
);

INSERT INTO commission_import_builds (commission_import_build_id, build_date)
VALUES
    (1, '2011-01-01'),
    (2, '2011-01-02'),
    (3, '2011-01-03');

-- The ids above were supplied explicitly, which does not advance the serial
-- sequence; move it past them so a later insert that relies on the default
-- does not collide with id 1.
SELECT setval(
    pg_get_serial_sequence('commission_import_builds', 'commission_import_build_id'),
    (SELECT max(commission_import_build_id) FROM commission_import_builds)
);
-- Parent table: one row per imported commission record.
CREATE TABLE commission_import_commission_junction
(
    commission_import_commission_junction_id serial NOT NULL,
    member_id integer,
    site_id integer,
    action_status character varying NOT NULL,
    action_type character varying NOT NULL,
    ad_id bigint,
    commission_id bigint NOT NULL,
    country character varying,
    event_date timestamp with time zone NOT NULL,
    locking_date timestamp with time zone,
    order_id character varying NOT NULL,
    original boolean,
    original_action_id bigint NOT NULL,
    posting_date timestamp with time zone NOT NULL,
    website_id bigint NOT NULL,
    advertiser_name character varying,
    commission_amount numeric(19,2) NOT NULL,
    sale_amount numeric(19,2) NOT NULL,
    aggregator_affiliate_id integer NOT NULL,
    is_processed boolean NOT NULL DEFAULT false,
    created_date timestamp with time zone NOT NULL DEFAULT now(),
    member_transaction_id integer,
    commission_import_build_id integer NOT NULL,
    CONSTRAINT pkey_commission_import_commission_junction_commission_import_co PRIMARY KEY (commission_import_commission_junction_id),
    CONSTRAINT fk_commission_import_commission_junction_commission_import_buil FOREIGN KEY (commission_import_build_id)
        REFERENCES commission_import_builds (commission_import_build_id) MATCH SIMPLE
        ON UPDATE NO ACTION ON DELETE NO ACTION
);

CREATE INDEX idx_commission_import_commission_junction_is_processed
    ON commission_import_commission_junction
    USING btree
    (is_processed);

-- Three rows that are identical in every compared column; they differ only
-- in id, build id and created_date, so rows 2 and 3 duplicate row 1.
INSERT INTO commission_import_commission_junction (
    commission_import_commission_junction_id, action_status, action_type,
    commission_id, event_date, order_id, original_action_id, posting_date,
    website_id, commission_amount, sale_amount, aggregator_affiliate_id,
    commission_import_build_id, created_date
)
VALUES
    (1, 'new', 'sale', 1234, '2011-02-04 14:39:52.989499-07', 'test-order', 1234567, '2011-02-04 14:39:52.989499-07', 123, 12.35, 123.45, 9876, 1, '2011-02-05'),
    (2, 'new', 'sale', 1234, '2011-02-04 14:39:52.989499-07', 'test-order', 1234567, '2011-02-04 14:39:52.989499-07', 123, 12.35, 123.45, 9876, 2, '2011-02-06'),
    (3, 'new', 'sale', 1234, '2011-02-04 14:39:52.989499-07', 'test-order', 1234567, '2011-02-04 14:39:52.989499-07', 123, 12.35, 123.45, 9876, 3, '2011-02-07');

-- The ids above were supplied explicitly, which does not advance the serial
-- sequence; move it past them so later default-id inserts do not collide.
SELECT setval(
    pg_get_serial_sequence('commission_import_commission_junction', 'commission_import_commission_junction_id'),
    (SELECT max(commission_import_commission_junction_id) FROM commission_import_commission_junction)
);

SELECT * FROM commission_import_commission_junction;
-- Child table: the items belonging to one commission record.
CREATE TABLE commission_import_commission_junction_line_items
(
    commission_import_commission_junction_line_item_id serial NOT NULL,
    commission_import_commission_junction_id integer NOT NULL,
    sku character varying,
    quantity integer,
    posting_date timestamp with time zone,
    sale_amount numeric(19,2),
    discount numeric(19,2),
    CONSTRAINT pkey_commission_import_commission_junction_link_items_commissio PRIMARY KEY (commission_import_commission_junction_line_item_id),
    CONSTRAINT fkey_commission_import_commission_junction_line_items_commissio FOREIGN KEY (commission_import_commission_junction_id)
        REFERENCES commission_import_commission_junction (commission_import_commission_junction_id) MATCH SIMPLE
        ON UPDATE NO ACTION ON DELETE NO ACTION
);

-- Each of the three parents gets the same three items, so the children are
-- duplicates as well. Line-item ids come from the serial default.
INSERT INTO commission_import_commission_junction_line_items
    (commission_import_commission_junction_id, sku, quantity, sale_amount)
VALUES
    (1, 'test1', 3, 23.45),
    (1, 'test2', 3, 67.50),
    (1, 'test3', 3, 32.50),
    (2, 'test1', 3, 23.45),
    (2, 'test2', 3, 67.50),
    (2, 'test3', 3, 32.50),
    (3, 'test1', 3, 23.45),
    (3, 'test2', 3, 67.50),
    (3, 'test3', 3, 32.50);
Reminds me of duplicate elimination in direct marketing mailing lists
Regardless of the details of your tables, a parent-child dupe elimination algorithm follows these steps:
1) Get duplicates into a list that matches old key to new key (temp table)
2) Update the foreign key in the child table
3) Delete the dupes from the parent
I admire the detail in your post, but I'm going to keep it simple and easier to read with some example table/column names:
-- step 1, get the list
-- Warning: t-sql syntax, adjust for Postgres
-- if it doesn't like placement of "into..." clause
-- Result: one row per duplicate, pairing the key to keep (the lowest
-- primaryKey in its group) with the key of the duplicate to remove.
select keep.primaryKey as keepKey
     , dupe.primaryKey as dupeKey
into #DupeList
from (
        -- One row per group that contains more than one record.
        select min(primaryKey) as primaryKey
             , dupeCriteria1
             , dupeCriteria2
        from theTable
        group by dupeCriteria1, dupeCriteria2  -- was misspelled "dupeCritera2"
        having count(*) > 1
     ) keep
join theTable dupe
  on keep.dupeCriteria1 = dupe.dupeCriteria1
 and keep.dupeCriteria2 = dupe.dupeCriteria2
 and keep.primaryKey <> dupe.primaryKey
Once you have that, update the foreign key in the child table:
-- step 2: re-point child rows from each duplicate parent to the parent
-- that is kept. Reads #DupeList, the temp table built in step 1 (the
-- statement previously referred to it as #temp1, which is never created).
update childTable
set foreignKey = #DupeList.keepKey
from #DupeList
where foreignKey = #DupeList.dupeKey
Then just delete everything out of the parent table:
-- step 3: remove the duplicate parents listed in #DupeList, the temp table
-- built in step 1 (previously referred to as #temp1, which is never created).
delete from parentTable
where primaryKey in (select dupeKey from #DupeList)
-- Marks duplicate commission_import_commission_junction rows as processed.
--
-- A row is marked when all of the following hold:
--   * another row has the same values in every compared column (NULLs
--     compare as equal) and an earlier created_date,
--   * the row itself is not yet processed, and
--   * its line items match the line items of the original row exactly, as a
--     multiset over (sku, quantity, posting_date, sale_amount, discount).
-- The "original" of a group is the row with the earliest created_date
-- (lowest id if several rows share that date).
--
-- Differences from the earlier row-by-row implementation:
--   * line items are compared as true multisets (EXCEPT ALL in both
--     directions), so [A, A, B] no longer matches [A, B, B];
--   * a duplicate and original that both have no line items now match
--     (array_upper() returned NULL for empty arrays, so they never did).
--
-- Returns: the number of rows marked as processed.
CREATE FUNCTION removeCommissionImportCommissionJunctionDuplicates() RETURNS INT AS $BODY$
DECLARE
    num_updates INT := 0;
BEGIN
    WITH ranked AS (
        -- For every row: the earliest created_date in its duplicate group
        -- and the id of the row that counts as the group's original.
        SELECT
            cicj.commission_import_commission_junction_id AS junction_id,
            cicj.created_date,
            cicj.is_processed,
            min(cicj.created_date) OVER dup_group AS first_date,
            first_value(cicj.commission_import_commission_junction_id) OVER (
                dup_group
                ORDER BY cicj.created_date, cicj.commission_import_commission_junction_id
            ) AS original_id
        FROM commission_import_commission_junction AS cicj
        -- PARTITION BY groups NULLs together, like IS NOT DISTINCT FROM.
        WINDOW dup_group AS (
            PARTITION BY
                cicj.member_id, cicj.site_id, cicj.action_status, cicj.action_type,
                cicj.ad_id, cicj.commission_id, cicj.country, cicj.event_date,
                cicj.locking_date, cicj.order_id, cicj.original,
                cicj.original_action_id, cicj.posting_date, cicj.website_id,
                cicj.advertiser_name, cicj.commission_amount, cicj.sale_amount,
                cicj.aggregator_affiliate_id
        )
    ),
    candidates AS (
        -- Unprocessed rows that are newer than the first row of their group.
        SELECT junction_id, original_id
        FROM ranked
        WHERE created_date <> first_date
          AND is_processed = FALSE
    )
    UPDATE commission_import_commission_junction AS target
    SET is_processed = TRUE
    FROM candidates AS c
    WHERE target.commission_import_commission_junction_id = c.junction_id
      -- No line item may be left over on either side.
      AND NOT EXISTS (
            (
                SELECT li.sku, li.quantity, li.posting_date, li.sale_amount, li.discount
                FROM commission_import_commission_junction_line_items AS li
                WHERE li.commission_import_commission_junction_id = c.original_id
                EXCEPT ALL
                SELECT li.sku, li.quantity, li.posting_date, li.sale_amount, li.discount
                FROM commission_import_commission_junction_line_items AS li
                WHERE li.commission_import_commission_junction_id = c.junction_id
            )
            UNION ALL
            (
                SELECT li.sku, li.quantity, li.posting_date, li.sale_amount, li.discount
                FROM commission_import_commission_junction_line_items AS li
                WHERE li.commission_import_commission_junction_id = c.junction_id
                EXCEPT ALL
                SELECT li.sku, li.quantity, li.posting_date, li.sale_amount, li.discount
                FROM commission_import_commission_junction_line_items AS li
                WHERE li.commission_import_commission_junction_id = c.original_id
            )
      );

    -- Number of rows the UPDATE above changed.
    GET DIAGNOSTICS num_updates = ROW_COUNT;
    RETURN num_updates;
END;
$BODY$ LANGUAGE plpgsql;