Join large table with small table in postgres with index

We have a database containing 96 million mortgage loans, including the original house price at loan origination. We want to update these house prices using a very simple house price index that I extracted from the internet as CSV and imported into a table in the same database as the mortgage loans. I am already able to join the tables, but it is very slow; I think I'm not using the index correctly. This is what the tables look like:
mortgage loans:
CREATE TABLE mydb.mortgageloans
(
pkrmbloan bigint NOT NULL,
fkdeal bigint NOT NULL,
edcode character varying(50) NOT NULL,
poolcutoffdate character varying(50) NOT NULL,
recno integer NOT NULL,
submissiontimestamp timestamp without time zone NOT NULL,
col1 character varying(10),
col2 character varying(100),
country character varying(10),
col......
col199 character varying(25),
CONSTRAINT rmb_loan_pkey PRIMARY KEY (pkrmbloan),
CONSTRAINT fk_rmbloan2deal FOREIGN KEY (fkdeal)
REFERENCES mydb_data.deal (pkdeal) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE CASCADE
)
WITH (
OIDS=FALSE
);
ALTER TABLE mydb.mortgageloans
OWNER TO mydb_admin;
GRANT ALL ON TABLE mydb.mortgageloans TO mydb_admin;
GRANT SELECT ON TABLE mydb.mortgageloans TO mydb_addon;
CREATE INDEX idx_rmbloan_edcode_poolcod
ON mydb.mortgageloans
USING btree
(edcode COLLATE pg_catalog."default", poolcutoffdate COLLATE pg_catalog."default");
CREATE INDEX idx_rmbloan_fkdeal
ON mydb.mortgageloans
USING btree
(fkdeal);
CREATE INDEX idx_rmbloan_recno
ON mydb.mortgageloans
USING btree
(recno);
The house price index table, which I imported myself:
CREATE TABLE mydb.hpi
(
period character varying(100),
au character varying(100),
be character varying(100),
ca character varying(100),
ch character varying(100),
de character varying(100),
dk character varying(100),
es character varying(100),
fi character varying(100),
fr character varying(100),
uk character varying(100),
ie character varying(100),
it character varying(100),
jp character varying(100),
nl character varying(100),
no character varying(100),
nz character varying(100),
us character varying(100),
pt character varying(100)
)
WITH (
OIDS=FALSE
);
ALTER TABLE mydb.hpi
OWNER TO mydb_admin;
And the query to add the original house price index based on the loan origination date (col55):
ALTER TABLE mydb.mortgageloans ADD COLUMN OriginalHPI varchar(130);
UPDATE mydb.mortgageloans set OriginalHPI = test.rv
FROM
(
select
CASE
WHEN a.country = 'NL'::text THEN c.nl::numeric
WHEN a.country = 'BE'::text THEN c.be::numeric
WHEN a.country = 'ES'::text THEN c.es::numeric
WHEN a.country = 'FR'::text THEN c.fr::numeric
WHEN a.country = 'IT'::text THEN c.IT::numeric
WHEN a.country = 'DE'::text THEN c.de::numeric
WHEN a.country = 'IE'::text THEN c.ie::numeric
else NULL::numeric
END AS rv,
a.pkrmbloan
FROM mydb.mortgageloans a
LEFT JOIN mydb.hpi c on a.col55 = c.period
)
as test
where test.pkrmbloan = mydb.mortgageloans.pkrmbloan
Any help would be much appreciated!
Best regards,
Tim
Edit: added the EXPLAIN output below.
I used slightly different db names because I wanted to anonymize them first.
Actual query:
EXPLAIN
UPDATE edp_data.rmb_loan set OriginalHPI = test.rv
FROM
(
select
CASE
WHEN "substring"(a.edcode::text, 5, 2)::text = 'NL'::text THEN c.nl::numeric
WHEN "substring"(a.edcode::text, 5, 2)::text = 'BE'::text THEN c.be::numeric
WHEN "substring"(a.edcode::text, 5, 2)::text = 'ES'::text THEN c.es::numeric
WHEN "substring"(a.edcode::text, 5, 2)::text = 'FR'::text THEN c.fr::numeric
WHEN "substring"(a.edcode::text, 5, 2)::text = 'IT'::text THEN c.IT::numeric
WHEN "substring"(a.edcode::text, 5, 2)::text = 'DE'::text THEN c.de::numeric
WHEN "substring"(a.edcode::text, 5, 2)::text = 'IE'::text THEN c.ie::numeric
else 12::numeric
END AS rv,
a.pkrmbloan, a.fkdeal
FROM edp_data.rmb_loan a
LEFT JOIN edp_data.hpi c on a.ar55 = c.period
)
as test
where test.pkrmbloan = edp_data.rmb_loan.pkrmbloan and test.fkdeal = edp_data.rmb_loan.fkdeal;
Output:
Update on rmb_loan  (cost=22.11..60667621.09 rows=342266 width=4090)
  ->  Hash Left Join  (cost=22.11..60667621.09 rows=342266 width=4090)
        Hash Cond: ((a.ar55)::text = (c.period)::text)
        ->  Merge Join  (cost=0.00..60635941.00 rows=341941 width=4049)
              Merge Cond: (rmb_loan.pkrmbloan = a.pkrmbloan)
              Join Filter: (rmb_loan.fkdeal = a.fkdeal)
              ->  Index Scan using rmb_loan_pkey on rmb_loan  (cost=0.00..28746023.33 rows=179651105 width=4014)
              ->  Index Scan using rmb_loan_pkey on rmb_loan a  (cost=0.00..28746023.33 rows=179651105 width=51)
        ->  Hash  (cost=15.38..15.38 rows=538 width=56)
              ->  Seq Scan on hpi c  (cost=0.00..15.38 rows=538 width=56)

I think your convoluted FROM clause stems from the fact that you want the column default to be 12. To avoid that, just declare the default when adding the column:
alter table mydb.mortgageloans
add column OriginalHPI varchar(130) default '12';
update edp_data.rmb_loan a
set OriginalHPI = (
case substring(a.edcode::text, 5, 2)
when 'NL' then c.nl
when 'BE' then c.be
when 'ES' then c.es
when 'FR' then c.fr
when 'IT' then c.IT
when 'DE' then c.de
when 'IE' then c.ie
else '12'
end)::numeric
from edp_data.hpi c
where a.ar55 = c.period
Why do you cast the case result to numeric just to save it in a varchar column?
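If the house price index value really is a number, it may be cleaner to declare the column as numeric in the first place and skip the round trip entirely. A minimal sketch, assuming every hpi value parses as a numeric literal:
-- Sketch: store the index as a number rather than text
-- (assumes all hpi values are valid numeric literals).
alter table mydb.mortgageloans
    add column OriginalHPI numeric default 12;
Then the CASE result can be assigned directly, without any cast back to varchar.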

Related

Postgres - how to bulk insert table with foreign keys

I am looking to do a bulk insert into my PostgreSQL database.
The database is not yet live, and it is PostgreSQL 13.
I have a temporary staging table into which I bulk inserted data:
TABLE public.temp_inverter_location
(
id integer ,
inverter_num_in_sld integer,
lift_requirements character varying,
geo_location_id integer NOT NULL, -- foreign key references geo_location.id
location_name character varying,
project_info_id integer NOT NULL -- foreign key references project_info.id
)
I am trying to populate the two foreign key columns temp_inverter_location.geo_location_id and temp_inverter_location.project_info_id.
The two referenced tables are referenced by their id columns:
geo_location
CREATE TABLE public.geo_location
(
id integer,
country character varying(50) COLLATE pg_catalog."default",
region character varying(50) COLLATE pg_catalog."default",
city character varying(100) COLLATE pg_catalog."default",
location_name character varying COLLATE pg_catalog."default"
)
and
project_info
CREATE TABLE public.project_info
(
id integer,
operation_name character varying,
project_num character varying(10),
grafana_site_num character varying(10)
)
I want to populate the correct foreign keys into the columns temp_inverter_location.geo_location_id and temp_inverter_location.project_info_id.
I am trying to use INSERT INTO SELECT to populate temp_inverter_location.geo_location_id with a JOIN that matches geo_location.location_name and temp_inverter_location.location_name.
I have tried this query, however temp_inverter_location.geo_location_id remains blank:
INSERT INTO temp_inverter_location (geo_location_id)
SELECT geo_location.id
FROM geo_location
INNER JOIN temp_inverter_location
        ON geo_location.location_name = temp_inverter_location.location_name;
Please let me know if more info is needed, thanks!
I was able to resolve this issue using an UPDATE that references another table.
Basically, I updated the geo_location_id column using
UPDATE temp_inverter_location SET geo_location_id = geo_location.id FROM geo_location WHERE geo_location.location_name = temp_inverter_location.location_name;
and updated the project_info_id using
UPDATE load_table SET project_info_id = project_info.id FROM project_info WHERE project_info.operation_name = load_table.location_name;
It seems to have worked.
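A quick sanity check after such updates is to count staging rows whose foreign keys are still unmatched. A hypothetical query using the table names above:
-- Hypothetical check: staging rows whose location_name found no match
SELECT count(*) AS missing_geo_location
FROM temp_inverter_location
WHERE geo_location_id IS NULL;
A result of zero means every row found its referenced id.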

query a jsonb array in a select of another query

I have a user table with a column favorites that is a jsonb
favorites:
[
{
"id_doc:": 9,
"type": "post"
},
{
"id_doc": 10,
"type": "post"
}
]
And I have another table posts that I want to query by id, where the id must be one of the id_doc values in the user's favorites:
select * from posts where id in (select favorites -> id_doc from users )
This is the schema
CREATE TABLE dev.users
(
id integer NOT NULL GENERATED BY DEFAULT AS IDENTITY ( INCREMENT 1 START 1 MINVALUE 1 MAXVALUE 2147483647 CACHE 1 ),
firstname character varying COLLATE pg_catalog."default" NOT NULL,
lastname character varying COLLATE pg_catalog."default" NOT NULL,
email character varying COLLATE pg_catalog."default" NOT NULL,
password character varying COLLATE pg_catalog."default" NOT NULL,
favorites jsonb[],
CONSTRAINT users_pkey PRIMARY KEY (id),
CONSTRAINT email_key UNIQUE (email)
)
WITH (
OIDS = FALSE
)
TABLESPACE pg_default;
ALTER TABLE dev.users
OWNER to postgres;
CREATE TABLE dev.posts
(
id integer NOT NULL DEFAULT nextval('dev.posts_id_seq'::regclass),
title character varying COLLATE pg_catalog."default" NOT NULL,
userid integer NOT NULL,
description character varying COLLATE pg_catalog."default" NOT NULL,
CONSTRAINT posts_pkey PRIMARY KEY (id)
)
WITH (
OIDS = FALSE
)
TABLESPACE pg_default;
ALTER TABLE dev.posts
OWNER to postgres;
How can I do this?
Thank you
There are other ways to accomplish this, but I prefer using CTEs for clarity. Please let me know in the comments if you have questions about what this does.
with elements as (
select jsonb_array_elements(favorites) as favitem
from users
), fav_ids as (
select distinct (favitem->>'id_doc')::int as id_doc
from elements
)
select p.*
from posts p
join fav_ids f on f.id_doc = p.id
;
Update
Since the column is defined as jsonb[] rather than plain jsonb, we need unnest() instead of jsonb_array_elements():
with elements as (
select unnest(favorites) as favitem
from users
), fav_ids as (
select distinct (favitem->>'id_doc')::int as id_doc
from elements
)
select p.*
from posts p
join fav_ids f on f.id_doc = p.id
;
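For what it's worth, the same result can be had without CTEs by unnesting the jsonb[] inline and correlating through EXISTS. A sketch under the same schema assumptions:
-- Equivalent formulation: unnest the jsonb[] per user and
-- correlate with posts via EXISTS.
select p.*
from posts p
where exists (
    select 1
    from users u
    cross join lateral unnest(u.favorites) as f(favitem)
    where (f.favitem ->> 'id_doc')::int = p.id
);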

How to create index on table which is partitioned?

How to create an index on a partitioned table in PostgreSQL 11.2?
My table is:
CREATE TABLE sometablename
(
column1 character varying(255) COLLATE pg_catalog."default" NOT NULL,
column2 integer NOT NULL,
column3 character varying(255) COLLATE pg_catalog."default" NOT NULL,
"timestamp" timestamp without time zone NOT NULL,
avg_val double precision,
max_val double precision,
min_val double precision,
p95_val double precision,
sample_count double precision,
sum_val double precision,
unit character varying(255) COLLATE pg_catalog."default",
user_id bigint NOT NULL,
CONSTRAINT testtable_pkey PRIMARY KEY (column1, column2, column3, "timestamp", user_id)
)
PARTITION BY HASH (user_id)
WITH (
OIDS = FALSE
)
TABLESPACE pg_default;
CREATE UNIQUE INDEX testtable_unique_pkey
    ON sometablename USING btree
    (column1 COLLATE pg_catalog."default", column2, "timestamp", user_id)
    TABLESPACE pg_default;
As you can see, testtable_unique_pkey is my index, but when I run:
SELECT tablename, indexname, indexdef
FROM pg_indexes
WHERE tablename = 'sometablename'
I can't see my index.
I also checked EXPLAIN ANALYZE on my queries, and they are not using the index either.
The index for the base table is never really created, so it doesn't show up in pg_indexes:
CREATE TABLE base_table
(
column1 varchar(255) NOT NULL,
column2 integer NOT NULL,
user_id bigint NOT NULL
)
PARTITION BY HASH (user_id);
CREATE UNIQUE INDEX idx_one ON base_table (column1, column2, user_id);
So the following returns nothing:
select *
from pg_indexes
where tablename = 'base_table';
It is however stored in pg_class:
select i.relname as indexname, t.relname as tablename
from pg_class i
join pg_index idx on idx.indexrelid = i.oid
join pg_class t on t.oid = idx.indrelid
where i.relkind = 'I'
and t.relname = 'base_table';
returns:
indexname | tablename
----------+-----------
idx_one | base_table
But for each partition the index will show up in pg_indexes:
create table st_p1 partition of base_table for values with (modulus 4, remainder 0);
create table st_p2 partition of base_table for values with (modulus 4, remainder 1);
create table st_p3 partition of base_table for values with (modulus 4, remainder 2);
create table st_p4 partition of base_table for values with (modulus 4, remainder 3);
And then:
select tablename, indexname
from pg_indexes
where tablename in ('st_p1', 'st_p2', 'st_p3', 'st_p4');
returns:
tablename | indexname
----------+----------------------------------
st_p1 | st_p1_column1_column2_user_id_idx
st_p2 | st_p2_column1_column2_user_id_idx
st_p3 | st_p3_column1_column2_user_id_idx
st_p4 | st_p4_column1_column2_user_id_idx
Update 2020-06-26:
The fact that the index did not show up in pg_indexes was acknowledged as a bug by the Postgres team and was fixed in Postgres 12.
So the above explanation is only valid for Postgres 10 and 11. Starting with Postgres 12, the index on base_table is shown in pg_indexes.
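On 10 and 11 you can still map the parent index to its per-partition indexes through pg_inherits, which tracks index partitioning as well as table partitioning. A small sketch against the example above:
-- Partition indexes attached to the partitioned index idx_one
select c.relname as partition_index
from pg_inherits i
join pg_class c on c.oid = i.inhrelid
where i.inhparent = 'idx_one'::regclass;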

Update column using dblink

I am using the reference below to update two columns (customername, serviceid) in table cust_eq_memory_dy.
The loopbackip in table msrouterlistfinal2 should match the ipaddress in cust_eq_memory_dy.
Can someone help me with this? I get a syntax error at or near "FROM".
Update between 2 databases using dblink not working
UPDATE cust_eq_memory_dy B
SET customername = A.customername
WHERE B.ipaddress = A.loopbackip
FROM (
SELECT *
FROM DBLINK ( 'host= 10.X.80.160 user=123 password=123 dbname=postgres',
'select customername, serviceid, loopbackip FROM msrouterlistfinal2 ')
as temp (
customername character varying (100),
serviceid character varying (50),
loopbackip character varying (30) )
)A
If you are using Postgres, I highly recommend using a WITH statement (CTE).
WITH A AS (
    SELECT *
    FROM DBLINK('host= 10.X.80.160 user=123 password=123 dbname=postgres',
                'select customername, serviceid, loopbackip FROM msrouterlistfinal2')
         AS temp (customername character varying(100),
                  serviceid character varying(50),
                  loopbackip character varying(30))
)
UPDATE cust_eq_memory_dy B
SET customername = (SELECT A.customername FROM A WHERE B.ipaddress = A.loopbackip);
Check this link for more information.
https://www.postgresql.org/docs/8.4/static/queries-with.html
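For completeness, the original statement also works once the clauses are in the order UPDATE ... SET ... FROM ... WHERE, which is what the parser requires. An untested sketch of the reordered query:
UPDATE cust_eq_memory_dy B
SET customername = A.customername
FROM (
    SELECT *
    FROM DBLINK('host= 10.X.80.160 user=123 password=123 dbname=postgres',
                'select customername, serviceid, loopbackip FROM msrouterlistfinal2')
         AS temp (customername character varying(100),
                  serviceid character varying(50),
                  loopbackip character varying(30))
) A
WHERE B.ipaddress = A.loopbackip;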

How to approach data warehouse (PostgreSQL) documentation?

We have a small data warehouse in a PostgreSQL database and I have to document all the tables.
I thought I could add a comment to every column and table and use a pipe "|" separator to pack in more attributes. Then I can use the information schema and the array function to extract the documentation and use any reporting software to create the desired output.
select
ordinal_position,
column_name,
data_type,
character_maximum_length,
numeric_precision,
numeric_scale,
is_nullable,
column_default,
(string_to_array(descr.description,'|'))[1] as cs_name,
(string_to_array(descr.description,'|'))[2] as cs_description,
(string_to_array(descr.description,'|'))[3] as en_name,
(string_to_array(descr.description,'|'))[4] as en_description,
(string_to_array(descr.description,'|'))[5] as other
from
information_schema.columns columns
join pg_catalog.pg_class klass on (columns.table_name = klass.relname and klass.relkind = 'r')
left join pg_catalog.pg_description descr on (descr.objoid = klass.oid and descr.objsubid = columns.ordinal_position)
where
columns.table_schema = 'data_warehouse'
order by
columns.ordinal_position;
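For reference, the comments this query parses would be attached with COMMENT ON, one pipe-separated string per object. A hypothetical example (the table and column names are illustrative):
COMMENT ON COLUMN data_warehouse.dim_customer.customer_name IS
    'cs name|cs description|en name|en description|other notes';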
Is this a good idea, or is there a better approach?
Unless you must include descriptions of the system tables, I wouldn't try to shoehorn your descriptions into pg_catalog.pg_description. Make your own table. That way you get to keep the columns as columns, and not have to use clunky string functions.
Alternatively, consider adding specially formatted comments to your master schema file, along the lines of javadoc. Then write a tool to extract those comments and create a document. That way the comments stay close to the thing they're commenting, and you don't have to mess with the database at all to produce the report. For example:
--* Used for authentication.
create table users
(
--* standard Rails-friendly primary key. Also an example of
--* a long comment placed before the item, rather than on the
--* the same line.
id serial primary key,
name text not null, --* Real name (hopefully)
login text not null, --* Name used for authentication
...
);
Your documentation tool reads the file, looks for the --* comments, figures out what comments go with what things, and produces some kind of report, e.g.:
table users: Used for authentication
id: standard Rails-friendly primary key. Also an example of a
long comment placed before the item, rather than on the same
line.
name: Real name
login: Name used for authentication
You might note that with appropriate comments, the master schema file itself is a pretty good report in its own right, and that perhaps nothing else is needed.
If anyone is interested, here is what I've used for the initial load of my small documentation project. The documentation lives in two tables, one for describing tables and one for describing columns and constraints. I'd appreciate any feedback.
/* -- Initial Load - Tables */
drop table dw_description_table cascade;
create table dw_description_table (
table_description_key serial primary key,
physical_full_name character varying,
physical_schema_name character varying,
physical_table_name character varying,
Table_Type character varying, -- Fact Dimension ETL Transformation
Logical_Name_CS character varying,
Description_CS character varying,
Logical_Name_EN character varying,
Description_EN character varying,
ToDo character varying,
Table_Load_Type character varying, --Manually TruncateLoad AddNewRows
Known_Exclusions character varying,
Table_Clover_Script character varying
);
insert into dw_description_table (physical_full_name, physical_schema_name, physical_table_name) (
select
table_schema || '.' || table_name as physical_full_name,
table_schema,
table_name
from
information_schema.tables
where
table_name like 'dw%' or table_name like 'etl%'
);
/* -- Initial Load - Columns */
CREATE TABLE dw_description_column (
column_description_key serial,
table_description_key bigint,
physical_full_name text,
physical_schema_name character varying,
physical_table_name character varying,
physical_column_name character varying,
ordinal_position character varying,
column_default character varying,
is_nullable character varying,
data_type character varying,
logical_name_cs character varying,
description_cs character varying,
logical_name_en character varying,
description_en character varying,
derived_rule character varying,
todo character varying,
pk_name character varying,
fk_name character varying,
foreign_table_name character varying,
foreign_column_name character varying,
is_primary_key boolean,
is_foreign_key boolean,
CONSTRAINT dw_description_column_pkey PRIMARY KEY (column_description_key ),
CONSTRAINT fk_dw_description_table_key FOREIGN KEY (table_description_key)
REFERENCES dw_description_table (table_description_key) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION
);
insert into dw_description_column (
table_description_key ,
physical_full_name ,
physical_schema_name ,
physical_table_name ,
physical_column_name ,
ordinal_position ,
column_default ,
is_nullable ,
data_type ,
logical_name_cs ,
description_cs ,
logical_name_en ,
description_en ,
derived_rule ,
todo ,
pk_name ,
fk_name ,
foreign_table_name ,
foreign_column_name ,
is_primary_key ,
is_foreign_key )
(
with
dw_constraints as (
SELECT
tc.constraint_name,
tc.constraint_schema || '.' || tc.table_name || '.' || kcu.column_name as physical_full_name,
tc.constraint_schema,
tc.table_name,
kcu.column_name,
ccu.table_name AS foreign_table_name,
ccu.column_name AS foreign_column_name,
TC.constraint_type
FROM
information_schema.table_constraints AS tc
JOIN information_schema.key_column_usage AS kcu ON (tc.constraint_name = kcu.constraint_name and tc.table_name = kcu.table_name)
JOIN information_schema.constraint_column_usage AS ccu ON ccu.constraint_name = tc.constraint_name
WHERE
constraint_type in ('PRIMARY KEY','FOREIGN KEY')
AND tc.constraint_schema = 'bizdata'
and (tc.table_name like 'dw%' or tc.table_name like 'etl%')
group by
tc.constraint_name,
tc.constraint_schema,
tc.table_name,
kcu.column_name,
ccu.table_name ,
ccu.column_name,
TC.constraint_type
)
select
dwdt.table_description_key,
col.table_schema || '.' || col.table_name || '.' || col.column_name as physical_full_name,
col.table_schema as physical_schema_name,
col.table_name as physical_table_name,
col.column_name as physical_column_name,
col.ordinal_position,
col.column_default,
col.is_nullable,
col.data_type,
null as Logical_Name_CS ,
null as Description_CS ,
null as Logical_Name_EN,
null as Description_EN ,
null as Derived_Rule ,
null as ToDo,
dwc1.constraint_name pk_name,
dwc2.constraint_name as fk_name,
dwc2.foreign_table_name,
dwc2.foreign_column_name,
case when dwc1.constraint_name is not null then true else false end as is_primary_key,
case when dwc2.constraint_name is not null then true else false end as foreign_key
from
information_schema.columns col
join dw_description_table dwdt on (col.table_schema || '.' || col.table_name = dwdt.physical_full_name )
left join dw_constraints dwc1 on ((col.table_schema || '.' || col.table_name || '.' || col.column_name) = dwc1.physical_full_name and dwc1.constraint_type = 'PRIMARY KEY')
left join dw_constraints dwc2 on ((col.table_schema || '.' || col.table_name || '.' || col.column_name) = dwc2.physical_full_name and dwc2.constraint_type = 'FOREIGN KEY')
where
col.table_name like 'dw%' or col.table_name like 'etl%'
);
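Once loaded, a simple report can be pulled by joining the two description tables. A hypothetical example (the selected columns and ordering are illustrative):
-- Sketch: one row per documented column, in table order
select
    t.physical_full_name,
    c.physical_column_name,
    c.data_type,
    c.description_en
from dw_description_table t
join dw_description_column c
  on c.table_description_key = t.table_description_key
-- ordinal_position was stored as varchar above, so cast it for ordering
order by t.physical_full_name, c.ordinal_position::int;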