Avoid duplicates when migrating from one table to another - tsql

I need to migrate data from an old Books table:
create table dbo.Books_OLD (
Id int identity not null constraint PK_Books_OLD_Id primary key (Id),
Title nvarchar (200) not null,
Image varbinary (max) null,
Preview varbinary (max) null
)
To a new table structure:
create table dbo.Books (
Id int identity not null constraint PK_Books_Id primary key (Id),
Title nvarchar (200) not null
)
create table dbo.Files (
Id int identity not null constraint PK_Files_Id primary key (Id),
Content varbinary (max) null,
Name nvarchar (280) null
)
create table dbo.BookFiles (
BookId int not null,
FileId int not null,
constraint PK_BookFiles_Id primary key (BookId, FileId)
)
alter table dbo.BookFiles
add constraint FK_BookFiles_BookId foreign key (BookId) references Books(Id) on delete cascade on update cascade,
constraint FK_BookFiles_FileId foreign key (FileId) references Files(Id) on delete cascade on update cascade;
The migration should run as follows:
Books_OLD.Title => Create new Book with given Title value
Books_OLD.Image => Create new File with Image content.
Create new BookFile to associate File to Book.
Books_OLD.Preview => Create new File with Preview content.
Create new BookFile to associate File to Book.
I was able to migrate the data but somehow when I run this:
select FileId
from BookFiles
group by FileId
having count(*) > 1;
I have duplicates. I should not have duplicate FileIds. What am I missing?
The migration code I have is:
DECLARE @BOOKS table (
BookId int,
Image varbinary(max),
Preview varbinary(max)
)
MERGE Books AS d
USING Books_OLD AS s
ON 0 = 1
WHEN NOT MATCHED
THEN INSERT (Title)
VALUES (s.Title)
OUTPUT INSERTED.Id, s.Image, s.Preview
INTO @BOOKS;
INSERT Files (Content, Created)
SELECT t.Content, GETUTCDATE()
FROM @BOOKS i
CROSS APPLY (VALUES (Preview, 'Preview'), (Image, 'Image')) t(Content, ContentType)
WHERE Content IS NOT NULL
INSERT BookFiles (BookId, FileId)
SELECT i.BookId, f.Id
FROM @BOOKS i
JOIN Files f
ON f.Content = i.Image
UNION ALL
SELECT i.BookId, f.Id
FROM @BOOKS i
JOIN Files f
ON f.Content = i.Preview
Some Books can have two files (Image and Preview), so a BookId can appear more than once in BookFiles.
But each file (Image or Preview) in the Books_OLD table should only be associated with one Book, so it is strange that I have duplicated FileIds in BookFiles.
What am I missing?

If different books in your Books_OLD share the same image or preview, then this part of your original code:
INSERT BookFiles (BookId, FileId)
SELECT i.BookId, f.Id
FROM @BOOKS i
JOIN Files f
ON f.Content = i.Image
will return extra rows from the INNER JOIN, because identical images or previews from different books join to each other. Each duplicate FileId is actually a bad record: its BookId does not correspond to that particular Image or Preview, even though the contents are the same.
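You can check whether this is happening directly in Books_OLD; a quick sketch (comparing the varbinary content with =, just as your join does, and checking Image here; the same check applies to Preview):
SELECT b1.Id, b2.Id
FROM Books_OLD b1
JOIN Books_OLD b2
ON b2.Image = b1.Image
AND b2.Id > b1.Id;
Every pair returned shares identical Image content, and each such pair makes the join on f.Content fan out into duplicates.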
What you could do is use another table variable called @Files, similar to the Files table structure; you just need to add one more column, BookId. Then:
INSERT BookFiles (BookId, FileId)
SELECT i.BookId, f.Id
FROM #BOOKS i
JOIN @Files f
ON f.Content = i.Image
AND f.BookId = i.BookId --added joining condition
--assumes earlier code has inserted BookId into @Files
Finally, you pick all the needed columns from @Files and insert them into Files.
UPDATE: see below for the full code:
DECLARE @BOOKS table (
BookId int,
Image varbinary(max),
Preview varbinary(max)
)
--Added @Files variable
DECLARE @Files table
(
BookId int,
Content varbinary (max) null,
Created nvarchar (280) null,
Id int identity(1,1) not null primary key
)
MERGE Books AS d
USING Books_OLD AS s
ON 0 = 1
WHEN NOT MATCHED
THEN INSERT (Title)
VALUES (s.Title)
OUTPUT INSERTED.Id, s.Image, s.Preview
INTO @BOOKS;
INSERT @Files (BookId, Content, Created)
SELECT i.BookId, t.Content, GETUTCDATE()
FROM @BOOKS i
CROSS APPLY (VALUES (Preview, 'Preview'), (Image, 'Image')) t(Content, ContentType)
WHERE Content IS NOT NULL
INSERT BookFiles (BookId, FileId)
SELECT i.BookId, f.Id
FROM @BOOKS i
JOIN @Files f
ON f.Content = i.[Image]
AND f.BookId = i.BookId --added joining condition
UNION ALL
SELECT i.BookId, f.Id
FROM @BOOKS i
JOIN @Files f
ON f.Content = i.Preview
AND f.BookId = i.BookId --added joining condition
--Last, insert everything needed from @Files into Files
INSERT INTO Files (Content, Created)
SELECT Content, Created
FROM @Files
PS: Not sure whether there is a typo in dbo.Files: you have Name in your table definition, but when inserting, it's Created.
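One more caveat, as a hedged sketch rather than a drop-in fix: the Files table generates its own identity values, so the Ids staged in @Files will not match the Ids the final INSERT produces in Files, and BookFiles would then reference the wrong (or not yet existing) rows. You can reuse the same MERGE ... OUTPUT trick from above on Files, so the real INSERTED.Id and the source's BookId land together (this assumes Name is where the content type belongs, per the PS above):
DECLARE @NewBookFiles table (BookId int, FileId int);
MERGE Files AS d
USING (
SELECT i.BookId, t.Content, t.ContentType
FROM @BOOKS i
CROSS APPLY (VALUES (i.Preview, 'Preview'), (i.Image, 'Image')) t(Content, ContentType)
WHERE t.Content IS NOT NULL
) AS s
ON 0 = 1 -- never matches, so every source row is inserted
WHEN NOT MATCHED
THEN INSERT (Content, Name)
VALUES (s.Content, s.ContentType)
OUTPUT s.BookId, INSERTED.Id
INTO @NewBookFiles (BookId, FileId);
INSERT BookFiles (BookId, FileId)
SELECT BookId, FileId
FROM @NewBookFiles;
As a bonus, this avoids joining on varbinary content at all.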

Related

Select rows with and without match of join

This allegedly easy task is one I currently cannot solve.
SQL Fiddle
http://sqlfiddle.com/#!17/90dce/1
Schema
Given this schema and data
CREATE TABLE asset (
"id" BIGINT NULL DEFAULT NULL,
"name" TEXT NULL DEFAULT NULL,
PRIMARY KEY ("id")
);
CREATE INDEX IF NOT EXISTS "IDX_asset_id" ON asset (id);
CREATE TABLE category (
"id" BIGINT NULL DEFAULT NULL,
"ctype" TEXT NULL DEFAULT NULL,
"name" TEXT NULL DEFAULT NULL,
PRIMARY KEY ("id")
);
CREATE INDEX IF NOT EXISTS "IDX_category_id" ON category (id);
CREATE TABLE asset_category (
"asset_id" BIGINT NULL DEFAULT NULL,
"category_id" BIGINT NULL DEFAULT NULL,
CONSTRAINT "FK_asset_id" FOREIGN KEY ("asset_id") REFERENCES "asset" ("id") ON UPDATE CASCADE ON DELETE SET NULL,
CONSTRAINT "FK_category_id" FOREIGN KEY ("category_id") REFERENCES "category" ("id") ON UPDATE CASCADE ON DELETE SET NULL,
UNIQUE (asset_id, category_id)
);
INSERT INTO asset (id, "name") VALUES(1, 'Awesome Asset with a hit');
INSERT INTO asset (id, "name") VALUES(2, 'Great Asset without a hit');
INSERT INTO category (id, "name", "ctype") VALUES(1, 'First Category', NULL);
INSERT INTO category (id, "name", "ctype") VALUES(2, 'Second Category', 'directory');
INSERT INTO asset_category ("asset_id", "category_id") VALUES(1, 1);
INSERT INTO asset_category ("asset_id", "category_id") VALUES(1, 2);
INSERT INTO asset_category ("asset_id", "category_id") VALUES(2, 1);
Task
I want to get all assets with their category id (in case they have one of type "directory"; otherwise NULL as the category).
See my query below: I wrote two joins, letting me limit the results in the ON clause. However, since both assets are also related to the other category, the first JOIN keeps me from getting a clean result.
What I tried
Query A:
SELECT a.id "assetId", c.id "categoryId"
FROM asset a
LEFT JOIN asset_category ac ON ac.asset_id = a.id
left join category c on (
c.id = ac.category_id
AND
c.ctype = 'directory'
)
resulting in:
assetId categoryId
1 (null)
1 2
2 (null)
That is almost good, except that assetId 1 appears twice. This is probably due to the first JOIN, which creates a relation to asset_category and thus to the other category, the one not of type 'directory'. The same happens for assetId 2.
Query B uses inner join:
SELECT a.id "assetId", c.id "categoryId"
FROM asset a
LEFT JOIN asset_category ac ON ac.asset_id = a.id
inner join category c on (
c.id = ac.category_id
AND
c.ctype = 'directory'
)
resulting in
assetId categoryId
1 2
However, here the problem is that it hides the asset with id 2, as the join does not successfully resolve asset id 2.
Desired output
assetId | categoryId
1 | 2
2 | null
I would be really happy about a solution to this seemingly simple task.
demo:db<>fiddle
Your first query is a good approach. It seems you want only one record per id. This is what DISTINCT ON is for:
SELECT DISTINCT ON (a.id)
a.id, c.id
FROM asset a
LEFT JOIN asset_category ac ON a.id = ac.asset_id
LEFT JOIN category c ON c.id = ac.category_id AND c."ctype" = 'directory'
ORDER BY a.id, ctype NULLS LAST
So, just order your joined result by id first, sorting the ctype = NULL records to the bottom, which makes the 'directory' values bubble up to the top. DISTINCT ON then takes the first record for each id, which is the one you expect.
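If you ever need the same result where DISTINCT ON is not available, a ROW_NUMBER() window function is a sketch of the same idea (the NULLS LAST placement syntax varies by database):
SELECT "assetId", "categoryId"
FROM (
SELECT a.id AS "assetId", c.id AS "categoryId",
ROW_NUMBER() OVER (PARTITION BY a.id ORDER BY c.ctype NULLS LAST) AS rn
FROM asset a
LEFT JOIN asset_category ac ON ac.asset_id = a.id
LEFT JOIN category c ON c.id = ac.category_id AND c."ctype" = 'directory'
) sub
WHERE rn = 1;
The window ranks each asset's rows with the 'directory' match first, and the outer query keeps only that row.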

Copying records in a table with self referencing ids

I have a table with records which can reference another row in the same table so there is a parent-child relationship between rows in the same table.
What I am trying to achieve is to create the same data for another user so that they can see and manage their own version of this structure through the web UI, where these rows are displayed as a tree.
The problem is that when I bulk insert this data changing only user_id, I lose the relation between rows, because the parent_id values will be invalid for the new records; they need to be updated with the newly generated ids as well.
Here is what I tried (it did not work):
Iterate over main_table
copy-paste the static values after each
do another insert on a temp table for holding old and new ids
update old parent_ids with new ids after loop ends
My attempt at doing this (the last step is not included here):
create or replace function test_x()
returns void as
$BODY$
declare
r RECORD;
userId int8;
rowPK int8;
begin
userId := (select 1)
create table if not exists id_map (old_id int8, new_id int8);
create table if not exists temp_table as select * from main_table;
for r in select * from temp_table
loop
rowPK := insert into main_table(id, user_id, code, description, parent_id)
values(nextval('hibernate_sequence'), userId, r.code, r.description, r.parent_id) returning id;
insert into id_map (old_id, new_id) values (r.id, rowPK);
end loop;
end
$BODY$
language plpgsql;
My PostgreSQL version is 9.6.14.
DDL below for testing.
create table main_table(
id bigserial not null,
user_id int8 not null,
code varchar(3) not null,
description varchar(100) not null,
parent_id int8 null,
constraint mycompkey unique (user_id, code, parent_id),
constraint mypk primary key (id),
constraint myfk foreign key (parent_id) references main_table(id)
);
insert into main_table (id, user_id, code, description, parent_id)
values(0, 0, '01', 'Root row', null);
insert into main_table (id, user_id, code, description, parent_id)
values(1, 0, '001', 'Child row 1', 0);
insert into main_table (id, user_id, code, description, parent_id)
values(2, 0, '002', 'Child row 2', 0);
insert into main_table (id, user_id, code, description, parent_id)
values(3, 0, '002', 'Grand child row 1', 2);
How to write a procedure to accomplish this?
Thanks in advance.
It appears your task is copying all data for a given user to another user while maintaining the hierarchical relationship within the new rows. The following accomplishes that.
It begins by creating a new copy of the existing rows with the new user_id, still carrying the old parent_id. That old parent_id is used in the next (update) step.
The CTE logically begins with the new rows that have a parent_id and joins them to the old parent row. From there it joins the old parent row to the new parent row using code and description. At that point we have the new id along with the new parent id, so we just update with those values. For the update, the CTE needs only those two columns, but I've left the intermediate columns in so you can trace through if you wish.
create or replace function copy_user_data_to_user(
source_user_id bigint
, target_user_id bigint
)
returns void
language plpgsql
as $$
begin
insert into main_table ( user_id,code, description, parent_id )
select target_user_id, code, description, parent_id
from main_table
where user_id = source_user_id ;
with n_list as
(select mt.id, mt.code, mt.description, mt.parent_id
, mtp.id p_id,mtp.code p_code,mtp.description p_des
, mtc.id c_id, mtc.code c_code, mtc.description c_description
from main_table mt
join main_table mtp on mtp.id = mt.parent_id
join main_table mtc on ( mtc.user_id = target_user_id
and mtc.code = mtp.code
and mtc.description = mtp.description
)
where mt.parent_id is not null
and mt.user_id = target_user_id
)
update main_table mt
set parent_id = n_list.c_id
from n_list
where mt.id = n_list.id;
return;
end ;
$$;
-- test
select * from copy_user_data_to_user(0,1);
select * from main_table;
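To eyeball the result, a small verification sketch: list each row with its parent's code and compare the two users' trees.
select c.user_id, c.code, c.description, p.code as parent_code
from main_table c
left join main_table p on p.id = c.parent_id
order by c.user_id, c.code;
Both users should show the same code/parent_code pairs.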
CREATE TABLE <table name you want to create> AS SELECT * FROM myset;
The new table's column names will be the same as myset's. You can also list column names in place of *, but every listed column must exist in the source, otherwise you get errors.
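Against the schema in this question, that would be, as a sketch:
create table main_table_copy as
select * from main_table
where user_id = 0; -- copy one user's rows
Keep in mind that CREATE TABLE ... AS copies data only: primary keys, foreign keys, defaults, and the sequence behind bigserial are not carried over.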

PostgreSQL Transaction to Use Results from Query to Insert and Query another Table then Return Original Query Results

I am writing an application that stores data on file samples and YARA signatures. Essentially, in a single transaction, I need to execute a query, reference those results in an insert and another query, then return the original results. I have three tables that are relevant to this discussion:
samples - this is the table that stores information on files that need to be scanned with the associated YARA signatures.
yararules - the table that stores information on the YARA rules.
yaratracker - a table that tracks the sample/rule pairs that have been processed thus far.
In a single transaction, the application needs to:
Get a batch of unique sample/rule pairs that have not yet been processed. Preferably, this query will get all non-processed rules associated with a single sample (i.e. if I'm going to run the YARA rules on a sample, I want to run all of the YARA rules not yet processed on that sample so that I only have to load the sample into memory once).
Get a unique list of id,sha256 from the batch found in step 1.
Insert the batch from step 1 into the yaratracker with the matchcount column equal to 0 and the complete column set to false.
I can accomplish Step 1 with the query below, but I don't know how to reference those results to accomplish step 2. I've tried looking into variables, but apparently there isn't one that can hold multiple rows. I've looked into using a cursor, but I can't seem to use the cursor with a subsequent command and then return the cursor.
SELECT s.id,r.id
FROM sample s CROSS JOIN yararules r
WHERE r.status = 'Disabled' AND NOT EXISTS(
SELECT 1 FROM yaratracker q
WHERE q.sample_id = s.id AND q.rule_id = r.id
)
ORDER BY s.id
LIMIT 1000;
The relevant database schema looks like this.
CREATE TYPE samplelist AS ENUM ('Whitelist', 'Blacklist', 'Greylist', 'Unknown');
CREATE TABLE samples (
id SERIAL PRIMARY KEY,
md5 CHAR(32) NOT NULL,
sha1 CHAR(40) NOT NULL,
sha256 CHAR(64) NOT NULL,
total INT NOT NULL,
positives INT NOT NULL,
list SAMPLELIST NOT NULL,
filetype VARCHAR(16) NOT NULL,
submitted TIMESTAMP WITH TIME ZONE NOT NULL,
user_id SERIAL REFERENCES users
);
CREATE UNIQUE INDEX md5_idx ON samples (md5);
CREATE UNIQUE INDEX sha1_idx ON samples (sha1);
CREATE UNIQUE INDEX sha256_idx ON samples (sha256);
CREATE TYPE rulestatus AS ENUM ('Enabled', 'Disabled');
CREATE TABLE yararules (
id SERIAL PRIMARY KEY,
name VARCHAR(32) NOT NULL UNIQUE,
description TEXT NOT NULL,
rules TEXT NOT NULL,
lastmodified TIMESTAMP WITH TIME ZONE NOT NULL,
status rulestatus NOT NULL,
user_id SERIAL REFERENCES users ON DELETE CASCADE
);
CREATE TABLE yaratracker (
id SERIAL PRIMARY KEY,
rule_id SERIAL REFERENCES yararules ON DELETE CASCADE,
sample_id SERIAL REFERENCES samples ON DELETE CASCADE,
matchcount INT NOT NULL,
complete BOOL NOT NULL
);
CREATE INDEX composite_idx ON yaratracker (rule_id, sample_id);
CREATE INDEX complete_idx ON yaratracker (complete);
INSERT INTO target_table(a,b,c,...)
SELECT sid, rid, sha, ...
FROM (
SELECT s.id AS sid
,r.id AS rid
, s.sha256 AS sha
, ...
, ROW_NUMBER() OVER (PARTITION BY s.id) AS rn -- <<<--- HERE
FROM sample s CROSS JOIN yararules r
WHERE r.status = 'Disabled' AND NOT EXISTS(
SELECT 1 FROM yaratracker q
WHERE q.sample_id = s.id
AND q.rule_id = r.id
)
ORDER BY s.id
LIMIT 1000
) src
WHERE src.rn = 1; -- <<<--- HERE
The WHERE src.rn = 1 will restrict the cross-join to deliver only one tuple per sample.id (both id and sha256 are unique in the samples table, so picking a unique id has the same effect as picking a unique sha256).
The complete cross-join result will never be generated; the optimiser is smart enough to push the WHERE rn = 1 condition down into the subquery.
Note: the LIMIT 1000 should probably be removed (or pulled up to a higher level).
If you REALLY need to save the results from the CROSS JOIN, you could use a chain of CTEs (expect a performance degradation ...)
WITH big AS (
SELECT s.id AS sample_id
,r.id AS rule_id
, s.sha256
-- , ...
, ROW_NUMBER() OVER (PARTITION BY s.id) AS rn -- <<<--- HERE
FROM sample s
CROSS JOIN yararules r
WHERE r.status = 'Disabled' AND NOT EXISTS(
SELECT 1 FROM yaratracker q
WHERE q.sample_id = s.id AND q.rule_id = r.id
)
)
, ins AS (
INSERT INTO target_table(a,b,c,...)
SELECT b.sample_id, b.rule_id, b.sha256 , ...
FROM big b
WHERE b.rn = 1 -- <<<--- HERE
RETURNING *
)
INSERT INTO yaratracker (rule_id, sample_id, matchcount, complete )
SELECT b.rule_id, b.sample_id, 0, False
FROM big b
-- LEFT JOIN ins i ON i.a = b.sample_id AND i.b= b.rule_id
;
NOTE: yaratracker(rule_id, sample_id) should not be serials but just plain integers, referencing yararules(id) and samples(id).
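To also hand the batch from step 1 back to the application within the same statement, the queue insert can end the chain with a RETURNING clause. A minimal sketch, assuming the corrected schema above (RETURNING only sees the inserted columns, so sha256 would be looked up afterwards by sample_id):
WITH batch AS (
SELECT s.id AS sample_id, r.id AS rule_id
FROM samples s
CROSS JOIN yararules r
WHERE r.status = 'Disabled'
AND NOT EXISTS (
SELECT 1 FROM yaratracker q
WHERE q.sample_id = s.id AND q.rule_id = r.id
)
ORDER BY s.id
LIMIT 1000
)
INSERT INTO yaratracker (rule_id, sample_id, matchcount, complete)
SELECT rule_id, sample_id, 0, false -- queue each pair as unprocessed
FROM batch
RETURNING rule_id, sample_id; -- the original batch, back to the caller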

How to load data as nested JSONB from non-JSONB postgres tables

I'm trying to construct an object for use from my postgres backend. The tables in question look something like this:
We have some Things that essentially act as rows for a matrix where the columns are Field_Columns. Field_Values are filled cells.
Create Table Platform_User (
serial id PRIMARY KEY
)
Create Table Things (
serial id PRIMARY KEY,
INTEGER user_id REFERENCES Platform_User(id)
)
Create Table Field_Columns (
serial id PRIMARY KEY,
TEXT name,
)
Create Table Field_Values (
INTEGER field_column_id REFERENCES Field_Columns(id),
INTEGER thing_id REFERENCES Things(id)
TEXT content,
PRIMARY_KEY(field_column_id, thing_id)
)
This would be simple if I were trying to load just the Field_Values for a single Thing as JSON, which would look like this:
SELECT JSONB_OBJECT(
ARRAY(
SELECT name
FROM Field_Columns
ORDER BY Field_Columns.id
),
ARRAY(
SELECT Field_Values.content
FROM Field_Columns
LEFT JOIN Field_Values ON Field_Values.field_column_id = Field_Columns.id
AND Field_Values.thing_id = Things.id
ORDER BY Field_Columns.id)
)
FROM Things
WHERE Things.id = $1
However, I'd like the constructed JSON object to look like this when returned: I want an object of all the Field:Field_Value objects for the Things that a user owns.
{
14:
{
'first field':'asdf',
'other field':''
},
25:
{
'first field':'qwer',
'other field':'dfgdsfg'
},
43:
{
'first field':'',
'other field':''
}
}
My efforts to construct this query look like this, but I'm running into the problem that the JSONB_OBJECT function doesn't want to construct an object where the value of a field is itself an object:
SELECT (
JSONB_OBJECT(
ARRAY(SELECT Things.id::TEXT
FROM Things
WHERE Things.user_id = $2
ORDER BY Things.id
),
ARRAY(SELECT JSONB_OBJECT(
ARRAY(
SELECT name
FROM Field_Columns
ORDER BY Field_Columns.id),
ARRAY(
SELECT Field_Values.content
FROM Field_Columns
LEFT JOIN Field_Values ON Field_Values.field_column_Id = Field_Columns.id
AND Field_Values.thing_id = Things.id
ORDER BY Field_Columns.id)
)
FROM Things
WHERE Things.user_id = $2
ORDER BY Things.id
)
)
) AS thing_fields
The specific error I get is: function jsonb_object(text[], jsonb[]) does not exist. Is there a way to do this that doesn't involve copious text conversions and nonsense like that? Or will I just need to abandon trying to shape my data in the query and do it in my code instead?
Your DDL scripts are syntactically incorrect so I created these for you:
create table platform_users (
id int8 PRIMARY KEY
);
create table things (
id int8 PRIMARY KEY,
user_id int8 REFERENCES platform_users(id)
);
create table field_columns (
id int8 PRIMARY KEY,
name text
);
create table field_values (
field_column_id int8 REFERENCES field_columns(id),
thing_id int8 REFERENCES things(id),
content text,
PRIMARY KEY(field_column_id, thing_id)
);
I also created some scripts to populate the db:
insert into platform_users(id) values (1);
insert into platform_users(id) values (2);
insert into platform_users(id) values (3);
insert into platform_users(id) values (4);
insert into platform_users(id) values (5);
insert into things(id, user_id) values(1, 1);
insert into things(id, user_id) values(2, 1);
insert into things(id, user_id) values(3, 2);
insert into things(id, user_id) values(4, 2);
insert into field_columns(id, name) values(1, 'col1');
insert into field_columns(id, name) values(2, 'col2');
insert into field_values(field_column_id, thing_id, content) values(1, 1, 'thing1 val1');
insert into field_values(field_column_id, thing_id, content) values(2, 1, 'thing1 val2');
insert into field_values(field_column_id, thing_id, content) values(1, 2, 'thing2 val1');
insert into field_values(field_column_id, thing_id, content) values(2, 2, 'thing2 val2');
Please include such scripts next time when you ask for help, and make sure that your scripts are correct. This will reduce the work needed to answer your question.
You can get your jsonb value by aggregating the key/value pairs with jsonb_object_agg:
select
t.id,
jsonb_object_agg(fc.name, fv.content)
from
things t inner join
field_values fv on fv.thing_id = t.id inner join
field_columns fc on fv.field_column_id = fc.id
group by 1
The results look like this:
thing_id | jsonb_value
1 | {"col1": "thing1 val1", "col2": "thing1 val2"}
2 | {"col1": "thing2 val1", "col2": "thing2 val2"}

How to use WHILE EXISTS in a loop

CREATE TABLE [CandidateDocsAssociation](
[Row_ID] [bigint] IDENTITY(1,1) NOT NULL,
[Doc_ID] [bigint] NOT NULL,
[Candidate_ID] [bigint] NOT NULL
) ON [PRIMARY]
GO
I have the above table structure to store the association between documents and candidates. Row_ID is an auto generated primary key. Doc_ID is a foreign key referencing the documents table. Candidate_ID is also a foreign key referencing the Candidates table.
A candidate can be associated with more than one document and one document can be associated with multiple candidates.
What I want to achieve is to insert a default common document (Doc_ID) for all candidates (DISTINCT) if a Candidate_ID row with a Doc_ID of 2 does not already exist.
Below is what I'm trying, but it isn't working:
WHILE EXISTS (SELECT DISTINCT Candidate_ID from CandidateDocsAssociation
WHERE Doc_ID <> (SELECT Doc_ID FROM Doc_Table WHERE Doc_Name = N'Default'))
BEGIN
INSERT CandidateDocsAssociation (Doc_ID, Candidate_ID) VALUES ((SELECT Doc_ID FROM Doc_Table WHERE Doc_Name = N'Default'),Candidate_ID)
END
GO
Forget the loop and do a set-based operation. Assuming you have a Candidates table:
INSERT INTO CandidateDocsAssociation (Doc_ID, Candidate_ID)
SELECT dt.Doc_ID, c.Candidate_ID
FROM Doc_Table dt
CROSS JOIN Candidates c
WHERE dt.Doc_Name = N'Default'
AND NOT EXISTS(SELECT * FROM CandidateDocsAssociation cda
WHERE cda.Candidate_ID=c.Candidate_ID
AND cda.Doc_ID=dt.Doc_ID)
Try this (using a NOT IN clause):
WHILE EXISTS (SELECT DISTINCT Candidate_ID from CandidateDocsAssociation
WHERE Doc_ID NOT IN (SELECT Doc_ID FROM Doc_Table WHERE Doc_Name = N'Default'))
BEGIN
INSERT CandidateDocsAssociation (Doc_ID, Candidate_ID) VALUES ((SELECT Doc_ID FROM Doc_Table WHERE Doc_Name = N'Default'),Candidate_ID)
END
GO