Assume the following tables:
CREATE TABLE main
(
id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY
);
CREATE TABLE apple
(
id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
main_id INTEGER NOT NULL REFERENCES main(id)
);
CREATE TABLE orange
(
id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
main_id INTEGER NOT NULL REFERENCES main(id)
);
CREATE TABLE main_history
(
id INTEGER NOT NULL,
history_valid_from TIMESTAMPTZ NOT NULL,
history_valid_to TIMESTAMPTZ
);
CREATE TABLE apple_history
(
id INTEGER NOT NULL,
main_id INTEGER NOT NULL REFERENCES main(id), -- main_history does not have a PK
history_valid_from TIMESTAMPTZ NOT NULL,
history_valid_to TIMESTAMPTZ
);
CREATE TABLE orange_history
(
id INTEGER NOT NULL,
main_id INTEGER NOT NULL REFERENCES main(id), -- main_history does not have a PK
history_valid_from TIMESTAMPTZ NOT NULL,
history_valid_to TIMESTAMPTZ
);
So there is a root (main) and two tables which reference it. The referencing tables hold at most 20 records (in 99% of cases) for each main record.
The problem starts when I try to retrieve the history:
SELECT * FROM main_history
LEFT JOIN apple_history ON apple_history.main_id = main_history.id AND
apple_history.history_valid_from <= '2021-12-20T10:46:52.482620Z' AND
(apple_history.history_valid_to IS NULL OR '2021-12-20T10:46:52.482620Z' < apple_history.history_valid_to)
LEFT JOIN orange_history ON orange_history.main_id = main_history.id AND
orange_history.history_valid_from <= '2021-12-20T10:46:52.482620Z' AND
(orange_history.history_valid_to IS NULL OR '2021-12-20T10:46:52.482620Z' < orange_history.history_valid_to)
WHERE
main_history.id IN (1,2,3,4) AND
main_history.history_valid_from <= '2021-12-20T10:46:52.482620Z' AND
(main_history.history_valid_to IS NULL OR '2021-12-20T10:46:52.482620Z' < main_history.history_valid_to)
The point is: I'm now joining over a non-referencing column. PostgreSQL overestimates the rows by up to 20,000 times (see https://explain.dalibo.com/plan/oIF).
Then I tried
SET enable_hashjoin = off; SET enable_mergejoin = off;
because I knew that there are not that many rows; with that, the query went down to 100 ms.
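As an aside, a sketch of how such planner toggles can be scoped to a single transaction with SET LOCAL, so they do not leak into the rest of the session:
BEGIN;
SET LOCAL enable_hashjoin = off;
SET LOCAL enable_mergejoin = off;
-- run the history query here
COMMIT;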
When I change the query to:
SELECT * FROM main
LEFT JOIN apple_history ON apple_history.main_id = main.id AND
apple_history.history_valid_from <= '2021-12-20T10:46:52.482620Z' AND
(apple_history.history_valid_to IS NULL OR '2021-12-20T10:46:52.482620Z' < apple_history.history_valid_to)
LEFT JOIN orange_history ON orange_history.main_id = main.id AND
orange_history.history_valid_from <= '2021-12-20T10:46:52.482620Z' AND
(orange_history.history_valid_to IS NULL OR '2021-12-20T10:46:52.482620Z' < orange_history.history_valid_to)
WHERE
main.id IN (1,2,3,4)
It can use the original statistics and everything works fine.
My question: can I create manual statistics for joining non-REFERENCEd columns? Goal: use main_history as the main table in the query without forcing JOIN strategies.
You need some indexes.
To replicate the 100 ms performance, all you need to do is create an index on main_history (id, history_valid_from, history_valid_to), plus matching indexes on apple_history and orange_history (main_id, history_valid_from, history_valid_to), and you should be set.
But why, though? When you query main_history WHERE main_history.id IN (1,2,3,4), Postgres
doesn't know how many rows will match
doesn't know where the rows are on disk/in memory
It has no choice but to scan the entire main_history table to find the rows you're looking for. The same goes for apple_history/orange_history: it cannot possibly know where the corresponding main_ids are, nor how many there are. Full scans are needed, and hash joins are the best choice (since the data isn't ordered).
So to help it along, you create indexes on the referenced/referencing columns. They give count and order, and will help Postgres choose the correct plan.
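A minimal sketch of those indexes, using the column names from the schema above:
CREATE INDEX main_history_id_valid_idx ON main_history (id, history_valid_from, history_valid_to);
CREATE INDEX apple_history_main_id_valid_idx ON apple_history (main_id, history_valid_from, history_valid_to);
CREATE INDEX orange_history_main_id_valid_idx ON orange_history (main_id, history_valid_from, history_valid_to);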
Related
As part of an ETL process, the table continuous_trips has a continuous flow of incoming records.
New records are aggregated and inserted into a temp table called trips_agg every 5 minutes.
CREATE TABLE IF NOT EXISTS trips_agg AS (
SELECT start_time, station_id, from_station, to_station, from_terminus, end_terminus, previous_station, next_station,
AVG(wait_span) AS wait_span,
AVG(walk_span) AS walk_span,
AVG(delay_span) AS delay_span,
SUM(passengers_requests) AS passengers_requests
FROM continuous_trips
GROUP BY start_time, station_id, from_station, to_station, from_terminus, end_terminus, previous_station, next_station
)
The table trips_agg gets dropped after inserting all records into the table daily_trips, and is recreated during the next cycle.
Tables daily_trips & trips_agg have the same columns.
CREATE TABLE IF NOT EXISTS daily_trips (
start_time timestamp without time zone NOT NULL,
station_id text NOT NULL,
from_station text NOT NULL,
to_station text NOT NULL,
from_terminus text NOT NULL,
end_terminus text NOT NULL,
previous_station text,
next_station text,
wait_span interval NOT NULL,
walk_span interval NOT NULL,
delay_span interval NOT NULL,
passengers_requests numeric NOT NULL
)
Note: columns 'previous_station' and 'next_station' allow null.
A composite unique key is added as follows:
ALTER TABLE daily_trips ADD CONSTRAINT daily_trips_unique_row UNIQUE
(start_time, station_id, from_station, to_station, from_terminus, end_terminus, previous_station, next_station);
In case the unique key is violated upon insertion, the record should be updated, so an upsert strategy is used:
INSERT INTO daily_trips SELECT * FROM trips_agg
ON CONFLICT (start_time, station_id, from_station, to_station, from_terminus, end_terminus,
previous_station, next_station) DO UPDATE
set wait_span = (daily_trips.wait_span + EXCLUDED.wait_span)/2,
walk_span = (daily_trips.walk_span + EXCLUDED.walk_span)/2 ,
delay_span = (daily_trips.delay_span + EXCLUDED.delay_span)/2,
passengers_requests =(daily_trips.passengers_requests + EXCLUDED.passengers_requests);
When values for all columns are present this setup works perfectly, but that's not the case when any of the nullable columns holds a null value.
Since Postgres doesn't treat NULL values as equal for unique constraints, whenever any of the nullable columns holds NULL a new row is inserted instead of updated. This results in multiple rows for the unique key.
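A quick illustration of that NULL behaviour (a throwaway demo table, not part of the schema above):
CREATE TABLE null_demo (a int, b int, UNIQUE (a, b));
INSERT INTO null_demo VALUES (1, NULL); -- accepted
INSERT INTO null_demo VALUES (1, NULL); -- also accepted: NULL never compares equal to NULL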
To overcome this, I added an index on the table daily_trips after referring to this article.
create unique index daily_trips_unique_trip_idx ON daily_trips
(start_time, station_id, from_station, to_station, from_terminus, end_terminus,
(previous_station IS NULL), (next_station IS NULL))
where previous_station IS NULL or next_station IS NULL;
However, only one row could be added with a null value in any nullable column.
For the next row with a null value in any nullable column, the update does not happen; instead I get the following error:
ERROR: duplicate key value violates unique constraint "daily_trips_unique_trip_idx"
What is needed?
The unique constraint should be respected, and an update should happen when there is a null value in either of the nullable columns 'previous_station' or 'next_station'.
Any help is appreciated.
The solution is to translate NULL to some other value, more specifically the zero-length string (''). The coalesce function does precisely that when used as coalesce(column_name, ''). The problem is that creating a unique constraint with that expression generates a syntax error, so you cannot create the constraint. However, there is a workaround, although not an easy one: Postgres enforces unique constraints through a unique index, so just create the index directly.
create unique index daily_trips_unique_row on daily_trips
( start_time
, station_id
, from_station
, to_station
, from_terminus
, end_terminus
, coalesce(previous_station , '')
, coalesce(next_station, '')
);
However, while the above respects the null-ability of the indexed columns, it is no longer recognized by INSERT ... ON CONFLICT (see example here). You will either need a function/procedure to handle the exception, or use select-then-update-else-insert logic.
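A minimal sketch of the second option, using the tables above. The COALESCE comparisons must match the index expressions, and note this is not safe against concurrent writers without extra locking:
UPDATE daily_trips d
SET wait_span = (d.wait_span + t.wait_span)/2,
    walk_span = (d.walk_span + t.walk_span)/2,
    delay_span = (d.delay_span + t.delay_span)/2,
    passengers_requests = d.passengers_requests + t.passengers_requests
FROM trips_agg t
WHERE d.start_time = t.start_time
  AND d.station_id = t.station_id
  AND d.from_station = t.from_station
  AND d.to_station = t.to_station
  AND d.from_terminus = t.from_terminus
  AND d.end_terminus = t.end_terminus
  AND coalesce(d.previous_station, '') = coalesce(t.previous_station, '')
  AND coalesce(d.next_station, '') = coalesce(t.next_station, '');

INSERT INTO daily_trips
SELECT t.*  -- both tables have the same columns in the same order
FROM trips_agg t
WHERE NOT EXISTS (
  SELECT 1 FROM daily_trips d
  WHERE d.start_time = t.start_time
    AND d.station_id = t.station_id
    AND d.from_station = t.from_station
    AND d.to_station = t.to_station
    AND d.from_terminus = t.from_terminus
    AND d.end_terminus = t.end_terminus
    AND coalesce(d.previous_station, '') = coalesce(t.previous_station, '')
    AND coalesce(d.next_station, '') = coalesce(t.next_station, '')
);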
I'm currently a student and I'm setting up a PostgreSQL DB for my airsoft shop, plus some queries on it. I need to find a function similar to SELECT LAST(xx) FROM yy, which I think is usable on SQL Server and Oracle DB, to return the last inserted values in the column targeted by LAST().
I have this table:
CREATE TABLE munition.suivi_ammo (
type_ammo integer NOT NULL,
calibre integer NOT NULL,
event integer NOT NULL,
date_event date NOT NULL,
entrance integer NOT NULL,
exit integer NOT NULL,
inventory integer NOT NULL,
FOREIGN KEY (calibre) REFERENCES munition.index(numero),
FOREIGN KEY (event) REFERENCES munition.index(numero),
FOREIGN KEY (type_ammo) REFERENCES munition.index(numero)
);
and an index table for definitions by number id:
CREATE TABLE munition.index (
numero integer NOT NULL,
definition text NOT NULL,
PRIMARY KEY (numero)
);
I want to select the last inventory inserted in the table and calculate the current inventory according to the inflow and outflow made after that inventory.
It works when I do this type of request with a specific date to be sure to only get the last inventory, but I don't want to have to do that:
SELECT index.definition,
Sum(suivi_ammo.inventory) + Sum(suivi_ammo.entrance) - Sum(suivi_ammo.exit) AS Stock
FROM munition.suivi_ammo
INNER JOIN munition.index ON suivi_ammo.type_ammo = index.numero
WHERE date_event < '2019-05-03' AND date_event >= '2019-04-10'
GROUP BY index.definition;
I also tried the last_value() window function, but it doesn't work.
Thx!
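For reference, a common PostgreSQL idiom for "latest row per group" is DISTINCT ON; a minimal sketch, assuming "last" means the most recent date_event per ammo type:
SELECT DISTINCT ON (type_ammo)
       type_ammo, date_event, inventory
FROM munition.suivi_ammo
ORDER BY type_ammo, date_event DESC;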
I am writing an application that stores data on file samples and YARA signatures. Essentially, in a single transaction, I need to execute a query, reference those results in an insert and another query, then return the original results. I have three tables that are relevant to this discussion:
samples - this is the table that stores information on files that need to be scanned with the associated YARA signatures.
yararules - the table that stores information on the YARA rules.
yaratracker - a table that tracks the sample/rule pairs that have been processed thus far.
In a single transaction, the application needs to:
1. Get a batch of unique sample/rule pairs that have not yet been processed. Preferably, this query will get all non-processed rules associated with a single sample (i.e. if I'm going to run the YARA rules on a sample, I want to run all of the YARA rules not yet processed on that sample so that I only have to load the sample into memory once).
2. Get a unique list of id,sha256 from the batch found in step 1.
3. Insert the batch from step 1 into the yaratracker table with the matchcount column equal to 0 and the complete column set to false.
I can accomplish Step 1 with the query below, but I don't know how to reference those results to accomplish step 2. I've tried looking into variables, but apparently there isn't one that can hold multiple rows. I've looked into using a cursor, but I can't seem to use the cursor with a subsequent command and then return the cursor.
SELECT s.id,r.id
FROM samples s CROSS JOIN yararules r
WHERE r.status = 'Disabled' AND NOT EXISTS(
SELECT 1 FROM yaratracker q
WHERE q.sample_id = s.id AND q.rule_id = r.id
)
ORDER BY s.id
LIMIT 1000;
The relevant database schema looks like this.
CREATE TYPE samplelist AS ENUM ('Whitelist', 'Blacklist', 'Greylist', 'Unknown');
CREATE TABLE samples (
id SERIAL PRIMARY KEY,
md5 CHAR(32) NOT NULL,
sha1 CHAR(40) NOT NULL,
sha256 CHAR(64) NOT NULL,
total INT NOT NULL,
positives INT NOT NULL,
list SAMPLELIST NOT NULL,
filetype VARCHAR(16) NOT NULL,
submitted TIMESTAMP WITH TIME ZONE NOT NULL,
user_id SERIAL REFERENCES users
);
CREATE UNIQUE INDEX md5_idx ON samples (md5);
CREATE UNIQUE INDEX sha1_idx ON samples (sha1);
CREATE UNIQUE INDEX sha256_idx ON samples (sha256);
CREATE TYPE rulestatus AS ENUM ('Enabled', 'Disabled');
CREATE TABLE yararules (
id SERIAL PRIMARY KEY,
name VARCHAR(32) NOT NULL UNIQUE,
description TEXT NOT NULL,
rules TEXT NOT NULL,
lastmodified TIMESTAMP WITH TIME ZONE NOT NULL,
status rulestatus NOT NULL,
user_id SERIAL REFERENCES users ON DELETE CASCADE
);
CREATE TABLE yaratracker (
id SERIAL PRIMARY KEY,
rule_id SERIAL REFERENCES yararules ON DELETE CASCADE,
sample_id SERIAL REFERENCES samples ON DELETE CASCADE,
matchcount INT NOT NULL,
complete BOOL NOT NULL
);
CREATE INDEX composite_idx ON yaratracker (rule_id, sample_id);
CREATE INDEX complete_idx ON yaratracker (complete);
INSERT INTO target_table(a,b,c,...)
SELECT sid, rid, sha, ...
FROM (
    SELECT s.id AS sid
         , r.id AS rid
         , s.sha256 AS sha
         , ...
         , ROW_NUMBER() OVER (PARTITION BY s.id) AS rn -- <<<--- HERE
    FROM samples s CROSS JOIN yararules r
    WHERE r.status = 'Disabled' AND NOT EXISTS(
        SELECT 1 FROM yaratracker q
        WHERE q.sample_id = s.id
        AND q.rule_id = r.id
    )
    ORDER BY s.id
    LIMIT 1000
) src
WHERE src.rn = 1; -- <<<--- HERE
The WHERE src.rn = 1 restricts the cross join to deliver only one tuple per sample.id (both id and sha256 are unique in the samples table, so picking a unique id has the same effect as picking a unique sha256).
The complete cross-join result will never be generated; the optimiser is smart enough to push the WHERE rn = 1 condition down into the subquery.
Note: the LIMIT 1000 should probably be removed (or pulled up to a higher level).
If you REALLY need to save the results from the CROSS JOIN, you could use a chain of CTEs (expect a performance degradation ...)
WITH big AS (
    SELECT s.id AS sample_id
         , r.id AS rule_id
         , s.sha256
         -- , ...
         , ROW_NUMBER() OVER (PARTITION BY s.id) AS rn -- <<<--- HERE
    FROM samples s
    CROSS JOIN yararules r
    WHERE r.status = 'Disabled' AND NOT EXISTS(
        SELECT 1 FROM yaratracker q
        WHERE q.sample_id = s.id AND q.rule_id = r.id
    )
)
, ins AS (
    INSERT INTO target_table(a,b,c,...)
    SELECT b.sample_id, b.rule_id, b.sha256, ...
    FROM big b
    WHERE b.rn = 1 -- <<<--- HERE
    RETURNING *
)
INSERT INTO yaratracker (rule_id, sample_id, matchcount, complete)
SELECT b.rule_id, b.sample_id, 0, False
FROM big b
-- LEFT JOIN ins i ON i.a = b.sample_id AND i.b = b.rule_id
;
NOTE: yaratracker(rule_id, sample_id) should not be serials but just plain integers, referencing yararules(id) and samples(id).
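A corrected sketch of that table definition, following the note above:
CREATE TABLE yaratracker (
    id SERIAL PRIMARY KEY,
    rule_id INTEGER NOT NULL REFERENCES yararules (id) ON DELETE CASCADE,
    sample_id INTEGER NOT NULL REFERENCES samples (id) ON DELETE CASCADE,
    matchcount INT NOT NULL,
    complete BOOL NOT NULL
);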
I have 2 tables:
CREATE TABLE sf.dir_current (
id BIGINT primary key,
volume_id INTEGER NOT NULL,
path VARCHAR NOT NULL
);
CREATE index dir_volid_path_indx on dir_current (volume_id, path);
CREATE TABLE sf.event (
id BIGINT, -- no primary key here!
volume_id INTEGER NOT NULL,
parent_path VARCHAR NOT NULL,
type BIGINT,
depth INTEGER
);
Table dir_current contains ~50 million rows, and in all of them volume_id = 1. Table event contains ~20K rows.
I execute the following query (in a PL/pgSQL function - VOL_ID, MIN_ID, MAX_ID and so on are function params):
select dir.id as parent_id, event as event_row
from sf.event as event
left outer join sf.dir_current as dir on dir.volume_id = VOL_ID and parent_path = dir.path
where event.volume_id = VOL_ID
and event.id between MIN_ID and MAX_ID
and (DEPTH_FILTER is null or event.depth = DEPTH_FILTER)
and (TYPE_FILTER is null or event.type = TYPE_FILTER)
order by event.depth;
Everything works fine while all rows in the dir_current table have volume_id = 1. After adding a few thousand rows with volume_id = 2 (and running ANALYZE), this query takes a very long time.
Here is the explain of the long-running query: explain.depesz.com
As is clearly visible, the query planner had no idea that there are so many rows with volume_id = 2 and created a plan far from optimal.
After some debugging I found out that ANALYZE did not find any row with volume_id = 2. I confirmed it with this query:
starfish=# SELECT most_common_vals, n_distinct FROM pg_stats WHERE tablename = 'dir_current' and attname = 'volume_id';
most_common_vals | n_distinct
------------------+------------
{1} | 1
(1 row)
After a few ANALYZEs it finally finds some values with vol_id = 2 and the query gets back to its normal execution time: explain.depesz.com
Question: how can I prevent such extremely long query times? Is there a way to force ANALYZE to find these rows? Or maybe manually modify the stats for this column (setting n_distinct for the vol_id column does not help)?
I'm using PostgreSQL 9.5.
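One knob worth trying (a sketch, not a guaranteed fix): raising the per-column statistics target makes ANALYZE sample more rows, which makes it far more likely to notice a rare value such as volume_id = 2:
ALTER TABLE sf.dir_current ALTER COLUMN volume_id SET STATISTICS 1000;
ANALYZE sf.dir_current;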
There is this field in a table:
room_id INT NOT NULL CONSTRAINT room_id_ref_room REFERENCES room
I have 2 tables for two kinds of rooms: standard_room and family_room.
How to do something like this:
room_id INT NOT NULL CONSTRAINT room_id_ref_room REFERENCES standard_room or family_room
I mean, room_id should reference either standard_room or family_room.
Is it possible to do so?
Here is the pattern I've been using.
CREATE TABLE room (
room_id serial primary key,
room_type VARCHAR not null,
CHECK CONSTRAINT room_type in ("standard_room","family_room"),
UNIQUE (room_id, room_type)
);
CREATE_TABLE standard_room (
room_id integer primary key,
room_type VARCHAR not null default "standard_room",
FOREIGN KEY (room_id, room_type) REFERENCES room (room_id, room_type),
CHECK CONSTRAINT room_type = "standard_room"
);
CREATE_TABLE family_room (
room_id integer primary key,
room_type VARCHAR not null default "family_room",
FOREIGN KEY (room_id, room_type) REFERENCES room (room_id, room_type),
CHECK CONSTRAINT room_type = "family_room"
);
That is, the 'subclasses' point at the super-class by way of a type discriminator column, such that the pointed-to base class is of the correct type and the primary key of the super-class is the same as the child classes'.
Here's the same SQL from the accepted answer, fixed so that it works on Postgres 12.8. There are a few issues, not only the CREATE_TABLE syntax mistake:
CREATE TABLE room (
room_id serial primary key,
room_type VARCHAR not null,
CONSTRAINT room_in_scope CHECK (room_type in ('standard_room','family_room')),
CONSTRAINT unique_room_type_combo UNIQUE (room_id, room_type)
);
CREATE TABLE standard_room (
room_id integer primary key,
room_type VARCHAR not null default 'standard_room',
CONSTRAINT roomid_std_roomtype_fk FOREIGN KEY (room_id, room_type) REFERENCES public."room" (room_id, room_type),
CONSTRAINT std_room_constraint CHECK (room_type = 'standard_room')
);
CREATE TABLE family_room (
room_id integer primary key,
room_type VARCHAR not null default 'family_room',
CONSTRAINT roomid_fam_roomtype_fk FOREIGN KEY (room_id, room_type) REFERENCES "room" (room_id, room_type),
CONSTRAINT fam_room_constraint CHECK (room_type = 'family_room')
);
NOTE: The SQL above uses constraints to enforce that the child room_type values default to the parent table's room_type values: 'standard_room' or 'family_room'.
PROBLEM: Since each child table's primary key is the room's id, you can't insert more than one record per room into these two child tables.
insert into room (room_type) VALUES ('standard_room'); -- works
insert into room (room_type) values ('family_room'); -- works
insert into standard_room (room_id,pictureAttachment) VALUES (1,'Before Paint'); -- works
insert into standard_room (room_id,pictureAttachment) VALUES (1,'After Paint'); -- fails
insert into standard_room (room_id,pictureAttachment) VALUES (1,'With Furniture'); -- fails
insert into family_room (room_id,pictureAttachment) VALUES (2,'Before Kids'); -- works
insert into family_room (room_id,pictureAttachment) VALUES (2,'With Kids'); -- fails
(pictureAttachment is an extra demo column, not part of the DDL above.)
To make the tables accept more than one row per room you would have to remove the primary keys from the standard_room and family_room tables, which is BAD database design.
Despite 26 upvotes I will ping the OP about this, as I can see the answer was typed freehand.
Alternate Solutions
For smallish tables with less than a handful of variations, a simple alternative is a single table with Bool columns for the different room types.
Single Table "Room"
Id
IsStandardRoom
IsFamilyRoom
Desc
Dimensions
1
True
False
Double Bed, BIR
3 x 4
2
False
True
3 Set Lounge
5.5 x 7
SELECT * FROM Room WHERE IsStandardRoom = true;
At the end of the day, in a relational database it's not very common to be adding room types, since doing so involves creating the related tables with DDL commands (CREATE, ALTER, DROP).
A typical future proof database design allowing for more Tables would look something like this:
Multi Many-To-Many Table "Room"
Id | TableName | TableId
---+-----------+--------
 1 | Std       |    8544
 2 | Fam       |     236
 3 | Std       |    4351
Either Standard or Family:
select * from standard_room sr where sr.room_id in
(select TableId from room where TableName = 'Std');
select * from family_room fr where fr.room_id in
(select TableId from room where TableName = 'Fam');
Or both:
select * from standard_room sr where sr.room_id in
(select TableId from room where TableName = 'Std')
UNION
select * from family_room fr where fr.room_id in
(select TableId from room where TableName = 'Fam');
Sample SQL to demo polymorphic fields:
If you want different data types in the polymorphic foreign key fields, you can use this solution. Table r1 stores a TEXT column, r2 a TEXT[] array column, and r3 a POLYGON column:
CREATE OR REPLACE FUNCTION null_zero(anyelement)
RETURNS INTEGER
LANGUAGE SQL
AS $$
SELECT CASE WHEN $1 IS NULL THEN 0 ELSE 1 END;
$$;
CREATE TABLE r1 (
r1_id SERIAL PRIMARY KEY
, r1_text TEXT
);
INSERT INTO r1 (r1_text)
VALUES ('foo bar'); --TEXT
CREATE TABLE r2 (
r2_id SERIAL PRIMARY KEY
, r2_text_array TEXT[]
);
INSERT INTO r2 (r2_text_array)
VALUES ('{"baz","blurf"}'); --TEXT[] ARRAY
CREATE TABLE r3 (
r3_id SERIAL PRIMARY KEY
, r3_poly POLYGON
);
INSERT INTO r3 (r3_poly)
VALUES ( '((1,2),(3,4),(5,6),(7,8))' ); --POLYGON
CREATE TABLE flex_key_shadow (
flex_key_shadow_id SERIAL PRIMARY KEY
, r1_id INTEGER REFERENCES r1(r1_id)
, r2_id INTEGER REFERENCES r2(r2_id)
, r3_id INTEGER REFERENCES r3(r3_id)
);
ALTER TABLE flex_key_shadow ADD CONSTRAINT only_one_r
CHECK(
null_zero(r1_id)
+ null_zero(r2_id)
+ null_zero(r3_id)
= 1)
;
CREATE VIEW flex_key AS
SELECT
flex_key_shadow_id as Id
, CASE
WHEN r1_id IS NOT NULL THEN 'r1'
WHEN r2_id IS NOT NULL THEN 'r2'
WHEN r3_id IS NOT NULL THEN 'r3'
ELSE 'wtf?!?'
END AS "TableName"
, CASE
WHEN r1_id IS NOT NULL THEN r1_id
WHEN r2_id IS NOT NULL THEN r2_id
WHEN r3_id IS NOT NULL THEN r3_id
ELSE NULL
END AS "TableId"
FROM flex_key_shadow
;
INSERT INTO public.flex_key_shadow (r1_id,r2_id,r3_id) VALUES
(1,NULL,NULL),
(NULL,1,NULL),
(NULL,NULL,1);
SELECT * FROM flex_key;
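Expected output of that final query, given the three inserts above (ids assigned by the serial columns, starting at 1):
id | TableName | TableId
---+-----------+--------
 1 | r1        |       1
 2 | r2        |       1
 3 | r3        |       1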