Postgres: Optimize complex Query? - postgresql

I have trouble optimizing queries once they start getting rather large like the one below.
select distinct tm.proto_location from track_message tm where
workflow_analytic_instance_id = 204 and tm.id in
(Select track_message_id from track_message_to_track_mapping where track_id in
(select distinct t.id from track t, track_item item where t.id = item.track_id and
item.item_time between 1328816277089000 and 1328816287089000 and item.id in
(Select track_item_id from track_point tp where ST_Intersects(tp.track_position,
ST_GeomFromText('POLYGON((-144 59, -41 46, -75 15, -127 25, -144 59))',4326)))));
I'm not sure if I need to restructure the query or add additional index's as I currently only have 1 on track_position. Below is my analysis on the query
HashAggregate (cost=3321073.27..3321099.83 rows=2656 width=126) (actual
time=38937.642..38937.781 rows=341 loops=1)
-> Hash Semi Join (cost=3312041.16..3321066.63 rows=2656 width=126) (actual
time=38860.624..38937.235 rows=341 loops=1)"
Hash Cond: (tm.id = track_message_to_track_mapping.track_message_id)
-> Seq Scan on track_message tm (cost=0.00..8441.48 rows=5280 width=134) (actual time=31.643..81.135 rows=5027 loops=1)
Filter: (workflow_analytic_instance_id = 204)
-> Hash (cost=3310705.63..3310705.63 rows=81402 width=8) (actual time=38824.785..38824.785 rows=1026 loops=1)
Buckets: 4096 Batches: 4 Memory Usage: 11kB
-> Hash Join (cost=3306662.03..3310705.63 rows=81402 width=8) (actual time=38741.641..38820.901 rows=1026 loops=1)
Hash Cond: (track_message_to_track_mapping.track_id = t.id)
-> Seq Scan on track_message_to_track_mapping (cost=0.00..2995.04 rows=162804 width=16) (actual time=0.023..36.404 rows=162678 loops=1)
-> Hash (cost=3306623.23..3306623.23 rows=3104 width=8) (actual time=38737.721..38737.721 rows=1026 loops=1)
Buckets: 1024 Batches: 1 Memory Usage: 41kB"
-> Unique (cost=3299618.84..3306592.19 rows=3104 width=8) (actual time=38578.330..38737.166 rows=1026 loops=1)
-> Merge Join (cost=3299618.84..3306584.43 rows=3104 width=8) (actual time=38578.327..38735.062 rows=10303 loops=1)
Merge Cond: (t.id = item.track_id)
-> Index Scan using track_pkey on track t (cost=0.00..6763.86 rows=162639 width=8) (actual time=0.020..122.626 rows=160111 loops=1)
-> Sort (cost=3299617.79..3299625.55 rows=3104 width=8) (actual time=38571.786..38574.074 rows=10303 loops=1)
Sort Key: item.track_id
Sort Method: quicksort Memory: 867kB
-> Hash Semi Join (cost=2688037.93..3299437.75 rows=3104 width=8) (actual time=25663.691..38562.198 rows=10303 loops=1)
Hash Cond: (item.id = tp.track_item_id)
-> Seq Scan on track_item item (cost=0.00..598761.77 rows=17867 width=16) (actual time=1177.986..3128.122 rows=20606 loops=1)
Filter: ((item_time >= 1328816277089000::bigint) AND (item_time <= 1328816287089000::bigint))
-> Hash (cost=2636161.58..2636161.58 rows=3161948 width=8) (actual time=24330.672..24330.672 rows=9485846 loops=1)
Buckets: 4096 Batches: 512 (originally 128) Memory Usage: 1025kB"
-> Seq Scan on track_point tp (cost=0.00..2636161.58 rows=3161948 width=8) (actual time=5.506..20772.158 rows=9485846 loops=1)
Filter: ((track_position && '0103000020E6100000010000000500000000000000000062C00000000000804D4000000000008044C000000000000047400000000000C052C00000000000002E400000000000C05FC0000000000000394000000000000062C00000000000804D40'::geometry) AND _st_intersects(track_position, '0103000020E6100000010000000500000000000000000062C00000000000804D4000000000008044C000000000000047400000000000C052C00000000000002E400000000000C05FC0000000000000394000000000000062C00000000000804D40'::geometry))
Total runtime: 38938.104 ms
I dont have the ability to change the tables as the database was created by another company. I do however have the liberty to add additional indexes as I see fit. The tables used in the query are below.
CREATE TABLE d2d.track_message
(
id bigserial NOT NULL,
proto_location text,
workflow_analytic_instance_id bigint NOT NULL,
CONSTRAINT track_message_pkey PRIMARY KEY (id),
CONSTRAINT track_message_workflow_analytic_instance_id_fkey FOREIGN KEY(workflow_analytic_instance_id)
REFERENCES d2d.workflow_analytic_instance (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION
);
CREATE TABLE d2d.track_message_to_track_mapping
(
id bigserial NOT NULL,
track_message_id bigint NOT NULL,
track_id bigint NOT NULL,
CONSTRAINT track_message_to_track_mapping_pkey PRIMARY KEY (id),
CONSTRAINT track_message_to_track_mapping_track_id_fkey FOREIGN KEY (track_id)
REFERENCES d2d.track (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION,
CONSTRAINT track_message_to_track_mapping_track_message_id_fkey FOREIGN KEY (track_message_id)
REFERENCES d2d.track_message (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION
);
CREATE TABLE d2d.track
(
id bigserial NOT NULL,
track_uuid text,
track_number text,
track_exercise_indicator_id bigint NOT NULL,
track_simulation_indicator_id bigint NOT NULL,
track_status_id bigint,
last_modified timestamp with time zone DEFAULT timezone('utc'::text, now()),
CONSTRAINT track_pkey PRIMARY KEY (id),
CONSTRAINT track_track_exercise_indicator_id_fkey FOREIGN KEY (track_exercise_indicator_id)
REFERENCES d2d.track_exercise_indicator (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION,
CONSTRAINT track_track_simulation_indicator_id_fkey FOREIGN KEY (track_simulation_indicator_id)
REFERENCES d2d.track_simulation_indicator (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION,
CONSTRAINT track_track_status_id_fkey FOREIGN KEY (track_status_id)
REFERENCES d2d.track_status (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION,
CONSTRAINT track_track_uuid_key UNIQUE (track_uuid)
);
CREATE TABLE d2d.track_item
(
id bigserial NOT NULL,
track_item_type_id bigint NOT NULL,
item_time bigint NOT NULL,
image_source text,
track_id bigint NOT NULL,
CONSTRAINT track_item_pkey PRIMARY KEY (id),
CONSTRAINT track_item_track_id_fkey FOREIGN KEY (track_id)
REFERENCES d2d.track (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION,
CONSTRAINT track_item_track_item_type_id_fkey FOREIGN KEY (track_item_type_id)
REFERENCES d2d.track_item_type (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION
);
CREATE TABLE d2d.track_point
(
id bigserial NOT NULL,
track_position d2d.geometry(PointZ,4326),
track_point_type_id bigint,
track_point_source_type_id bigint,
last_modified timestamp with time zone DEFAULT timezone('utc'::text, now()),
track_item_id bigint NOT NULL,
CONSTRAINT track_point_pkey PRIMARY KEY (id),
CONSTRAINT track_point_track_item_id_fkey FOREIGN KEY (track_item_id)
REFERENCES d2d.track_item (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION,
CONSTRAINT track_point_track_point_source_type_id_fkey FOREIGN KEY (track_point_source_type_id)
REFERENCES d2d.track_point_source_type (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION,
CONSTRAINT track_point_track_point_type_id_fkey1 FOREIGN KEY (track_point_type_id)
REFERENCES d2d.track_point_type (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION
);

First attempt: use EXISTS() instead of IN(), and add an index (assuming this could be UNIQUE, not sure) to track_item.item_time (untested, since I have no data, obviously):
CREATE UNIQUE INDEX ON track_item ( item_time);
-- -----
EXPLAIN ANALYZE
SELECT DISTINCT tm.proto_location
FROM track_message tm
WHERE tm.workflow_analytic_instance_id = 204
AND EXISTS ( SELECT *
FROM track_message_to_track_mapping tm2tm
JOIN track t ON t.id = tm2tm.track_id
JOIN track_item ti ON t.id = ti.track_id
JOIN track_point tp ON ti.id = tp.track_item_id
WHERE tm.id =tm2tm.track_message_id
AND ti.item_time BETWEEN 1328816277089000 AND 1328816287089000
AND ST_Intersects
(tp.track_position
, ST_GeomFromText('POLYGON((-144 59, -41 46, -75 15, -127 25, -144 59))',4326)
)
)
;

I think you could use something like this, if I didn't make mistake:
SELECT DISTINCT tm.proto_location
FROM track_message tm
INNER JOIN track_message_to_track_mapping ON track_message_to_track_mapping.track_message_id = tm.id
INNER JOIN track t ON track_message_to_track_mapping.track_id = t.id
INNER JOIN track_item item ON t.id = item.track_id
INNER JOIN track_point ON track_point.track_item_id = item.id
WHERE workflow_analytic_instance_id = 204
AND item.item_time BETWEEN 1328816277089000 AND 1328816287089000
AND ST_Intersects(tp.track_position, ST_GeomFromText('POLYGON((-144 59, -41 46, -75 15, -127 25, -144 59))',4326))

Related

How to unique index over primary key constraint?

I have a partitioned table and created a unique index on it.
I am trying to run some queries, some of these using primary key constraint and some using my created index. I want my queries to use unique index instead of primary constraint.
I tried reindexing, didn't work.
Here are two queries
1) Here my created index is getting used.
Query plan is :
Finalize Aggregate (cost=296958.94..296958.95 rows=1 width=8) (actual time=927.948..927.948 rows=1 loops=1)
-> Gather (cost=296958.72..296958.93 rows=2 width=8) (actual time=927.887..933.730 rows=3 loops=1)
Workers Planned: 2
Workers Launched: 2
-> Partial Aggregate (cost=295958.72..295958.73 rows=1 width=8) actual time=924.885..924.885 rows=1 loops=3)
-> Parallel Append (cost=0.68..293370.57 rows=1035261 width=8) (actual time=0.076..852.758 rows=825334 loops=3)
-> Parallel Index Only Scan using testdate2019jan_april_cost_mo_user_id_account_id_resource__idx5 on testdate2019jan_april_cost_mod3rem2 (cost=0.68..146591.56 rows=525490 width=8) (actual time=0.082..388.130 rows=421251 loops=3)
Index Cond: (user_id = 1)
Heap Fetches: 3922
-> Parallel Index Only Scan using testdate2018sept_dec_cost_mod_user_id_account_id_resource__idx5 on testdate2018sept_dec_cost_mod3rem2 (cost=0.68..141570.15 rows=509767 width=8) (actual time=0.057..551.572 rows=606125 loops=2)
Index Cond: (user_id = 1)
Heap Fetches: 0
-> Parallel Index Scan using testdate2018jan_april_cost_mo_account_id_user_id_resource__idx2 on testdate2018jan_april_cost_mod3rem2 (cost=0.12..8.14 rows=1 width=8) (actual time=0.001..0.001 rows=0 loops=1)
Index Cond: (user_id = 1)
-> Parallel Index Scan using testdate2018may_august_cost_m_account_id_user_id_resource__idx1 on testdate2018may_august_cost_mod3rem2 (cost=0.12..8.14 rows=1 width=8) (actual time=0.001..0.001 rows=0 loops=1)
Index Cond: (user_id = 1)
-> Parallel Index Scan using testdate2019may_august_cost_m_account_id_user_id_resource__idx2 on testdate2019may_august_cost_mod3rem2 (cost=0.12..8.14 rows=1 width=8) (actual time=0.002..0.002 rows=0 loops=1)
Index Cond: (user_id = 1)
-> Parallel Index Scan using testdate2019sept_dec_cost_mod_account_id_user_id_resource__idx2 on testdate2019sept_dec_cost_mod3rem2 (cost=0.12..8.14 rows=1 width=8) (actual time=0.003..0.003 rows=0 loops=1)
Index Cond: (user_id = 1)
Planning Time: 0.754 ms
Execution Time: 933.797 ms
In the above query my index testdate2018may_august_cost_m_account_id_user_id_resource__idx1 is used like I want.
2) Here my created index is not getting used instead primary constraint is getting used.
Sort Method: quicksort Memory: 25kB
Buffers: shared hit=2 read=66080
-> Finalize GroupAggregate (cost=388046.40..388187.55 rows=6 width=61) (actual time=510.710..513.262 rows=10 loops=1)
Group Key: c_1.instance_type, c_1.currency
Buffers: shared hit=2 read=66080
-> Gather Merge (cost=388046.40..388187.24 rows=12 width=85) (actual time=510.689..513.303 rows=28 loops=1)
Workers Planned: 2
Workers Launched: 2
Buffers: shared hit=26 read=206407
-> Partial GroupAggregate (cost=387046.38..387185.83 rows=6 width=85) (actual time=504.731..507.277 rows=9 loops=3)
Group Key: c_1.instance_type, c_1.currency
Buffers: shared hit=26 read=206407
-> Sort (cost=387046.38..387056.71 rows=4130 width=36) (actual time=504.694..504.933 rows=3895 loops=3)
Sort Key: c_1.instance_type, c_1.currency
Sort Method: quicksort Memory: 404kB
Worker 0: Sort Method: quicksort Memory: 354kB
Worker 1: Sort Method: quicksort Memory: 541kB
Buffers: shared hit=20 read=206407
-> Parallel Append (cost=0.13..386798.33 rows=4130 width=36) (actual time=0.081..501.720 rows=3895 loops=3)
Buffers: shared hit=6 read=206405
Subplans Removed: 3
-> Parallel Index Scan using testdate2019may_august_cost_mod3rem2_pkey on testdate2019may_august_cost_mod3rem2 c_1 (cost=0.13..8.15 rows=1 width=36) (actual time=0.008..0.008 rows=0 loops=1)
Index Cond: ((usage_start_date >= (CURRENT_DATE - 30)) AND (user_id = '1'::bigint))
Filter: ((instance_type IS NOT NULL) AND ((account_id)::text = '807331824280'::text) AND (usage_end_date <= CURRENT_DATE))
Buffers: shared hit=1
-> Parallel Index Scan using testdate2019sept_dec_cost_mod3rem2_pkey on testdate2019sept_dec_cost_mod3rem2 c_2 (cost=0.13..8.15 rows=1 width=36) (actual time=0.006..0.006 rows=0 loops=1)
Index Cond: ((usage_start_date >= (CURRENT_DATE - 30)) AND (user_id = '1'::bigint))
Filter: ((instance_type IS NOT NULL) AND ((account_id)::text = '807331824280'::text) AND (usage_end_date <= CURRENT_DATE))
Buffers: shared hit=1
-> Parallel Seq Scan on testdate2019jan_april_cost_mod3rem2 c (cost=0.00..258266.58 rows=4125 width=36) (actual time=0.076..501.060 rows=3895 loops=3)
Filter: ((instance_type IS NOT NULL) AND (user_id = '1'::bigint) AND ((account_id)::text = '807331824280'::text) AND (usage_end_date <= CURRENT_DATE) AND (usage_start_date >= (CURRENT_DATE - 30)))
Rows Removed by Filter: 1504689
Buffers: shared hit=4 read=206405
Planning Time: 1.290 ms
Execution Time: 513.439 ms
In above query testdate2019sept_dec_cost_mod3rem2_pkey, which is the primary constraint, is getting used.
I want it to use my created index instead of primary constraint.
Is my 2nd query plan is correct according to partition?
Table Creation Queries:
CREATE TABLE a2i.testawscost_line_item (
line_item_id uuid NOT NULL,
account_id character varying(255) COLLATE pg_catalog."default",
availability_zone character varying(255) COLLATE pg_catalog."default",
base_cost double precision,
base_rate double precision,
cost double precision,
currency character varying(255) COLLATE pg_catalog."default",
instance_family character varying(255) COLLATE pg_catalog."default",
instance_type character varying(255) COLLATE pg_catalog."default",
line_item_type character varying(255) COLLATE pg_catalog."default",
operating_system character varying(255) COLLATE pg_catalog."default",
operation character varying(255) COLLATE pg_catalog."default",
payer_account_id character varying(255) COLLATE pg_catalog."default",
product_code character varying(255) COLLATE pg_catalog."default",
product_family character varying(255) COLLATE pg_catalog."default",
product_group character varying(255) COLLATE pg_catalog."default",
product_name character varying(255) COLLATE pg_catalog."default",
rate double precision,
rate_description character varying(255) COLLATE pg_catalog."default",
reservation_id character varying(255) COLLATE pg_catalog."default",
resource_type character varying(255) COLLATE pg_catalog."default",
sku character varying(255) COLLATE pg_catalog."default",
tax_type character varying(255) COLLATE pg_catalog."default",
unit character varying(255) COLLATE pg_catalog."default",
usage_end_date timestamp without time zone,
usage_quantity double precision,
usage_start_date timestamp without time zone,
usage_type character varying(255) COLLATE pg_catalog."default",
user_id bigint,
resource_id character varying(255) COLLATE pg_catalog."default",
CONSTRAINT testawscost_line_item_pkey PRIMARY KEY
(line_item_id, usage_start_date, user_id),
CONSTRAINT fkptp4hyur3i4yj88wo3rxnaf05 FOREIGN KEY (resource_id)
REFERENCES a2i.awscost_resource (resource_id) MATCH SIMPLE
ON UPDATE NO ACTION
ON DELETE NO ACTION
) PARTITION BY hash(user_id);
The partitions:
create table a2i.testuser_cost_mod3rem0
partition of a2i.testawscost_line_item
for values with (MODULUS 3, REMAINDER 0)
partition by range(usage_start_date);
create table a2i.testuser_cost_mod3rem1
partition of a2i.testawscost_line_item
for values with (MODULUS 3, REMAINDER 1)
partition by range(usage_start_date);
create table a2i.testuser_cost_mod3rem2
partition of a2i.testawscost_line_item
for values with (MODULUS 3, REMAINDER 2)
partition by range(usage_start_date);
Partitions of the partitions for 2019:
create table a2i.testdate2019jan_april_cost_mod3rem0
partition of a2i.testuser_cost_mod3rem0
for values from ('2019-01-01 00:00:00') to ('2019-05-01 00:00:00');
create table a2i.testdate2019may_august_cost_mod3rem0
partition of a2i.testuser_cost_mod3rem0
for values from ('2019-05-01 00:00:00') to ('2019-09-01 00:00:00');
create table a2i.testdate2019sept_dec_cost_mod3rem0
partition of a2i.testuser_cost_mod3rem0
for values from ('2019-09-01 00:00:00') to ('2020-01-01 00:00:00');
create table a2i.testdate2019jan_april_cost_mod3rem1
partition of a2i.testuser_cost_mod3rem1
for values from ('2019-01-01 00:00:00') to ('2019-05-01 00:00:00');
create table a2i.testdate2019may_august_cost_mod3rem1
partition of a2i.testuser_cost_mod3rem1
for values from ('2019-05-01 00:00:00') to ('2019-09-01 00:00:00');
create table a2i.testdate2019sept_dec_cost_mod3rem1
partition of a2i.testuser_cost_mod3rem1
for values from ('2019-09-01 00:00:00') to ('2020-01-01 00:00:00');
create table a2i.testdate2019jan_april_cost_mod3rem2
partition of a2i.testuser_cost_mod3rem2
for values from ('2019-01-01 00:00:00') to ('2019-05-01 00:00:00');
create table a2i.testdate2019may_august_cost_mod3rem2
partition of a2i.testuser_cost_mod3rem2
for values from ('2019-05-01 00:00:00') to ('2019-09-01 00:00:00');
create table a2i.testdate2019sept_dec_cost_mod3rem2
partition of a2i.testuser_cost_mod3rem2
for values from ('2019-09-01 00:00:00') to ('2020-01-01 00:00:00');
The index:
CREATE UNIQUE INDEX awscost_line_item_unique_pkey ON a2i.awscost_line_item (
account_id, user_id, resource_id, usage_start_date, usage_end_date, usage_type,
usage_quantity, line_item_type, sku, rate, base_rate, base_cost,
"cost", currency, product_code, operation
);
For 1st query plan,query is :
explain analyze select sum(cost) from testawscost_line_item where
user_id='1';
2nd query :
explain (analyze/*, buffers*/) SELECT sum (c.cost),
sum (case when c.resource_type = 'Compute' then c.cost end) as computeCost,
sum (case when c.resource_type = 'Storage' then c.cost end) as storageCost,
sum (case when c.resource_type = 'Network' then c.cost end) as networkCost,
sum (case when c.resource_type not in ('Compute', 'Network', 'Storage')
then c.cost end) as otherCost,
c.currency,
c.instance_type as productFamily,
avg (c.rate) FROM testawscost_line_item c WHERE
(c.user_id ='1') AND (c.account_id = '807331824280') AND
(c.usage_start_date >= current_date-30 AND c.usage_end_date <=
current_date) AND
(c.instance_type is not null )
GROUP BY c.instance_type, c.currency
ORDER BY 1 desc
The problem is that your index works well for the first query, but not for the second.
resource_id is in your index, but not in your query, so all index columns after that cannot be used for the query. PostgreSQL decides to use the much smaller primary key index.
The perfect index for this query is:
CREATE INDEX ON a2i.testawscost_line_item (user_id, account_id, usage_start_date)
WHERE instance_type IS NOT NULL;
I assume that the condition on usage_end_date is not more selective than the one on usage_start_date.

Cannot find tables that would join using nested loops

I feel like I will get lots of downvotes here, but let's give it a go.
I am trying to explain nested loops vs hash vs merge join to my students on real examples. However, I am struggling to find tables that would join with nested loops (I tried many different sizes, indexes setups, etc.). Postgres always uses hash join regardless of the table sizes, indexes, etc.
Could someone give an example of tables (with data) that would join with nested loops without explicitly running set enable_hashjoin = true; beforehand?
The following does a nested loop for me (without disabling hashjoins) on Postgres 10.5
create table one (id integer primary key, some_ts timestamp, some_value integer);
insert into one values (1, clock_timestamp(), 42),(2, clock_timestamp(), 42);
create table two (id integer primary key, one_id integer not null references one, some_ts timestamp);
insert into two
select i, 1, clock_timestamp()
from generate_series(1,10) i;
insert into two
select i, 2, clock_timestamp()
from generate_series(11,20) i;
create index on two (one_id);
explain (analyze)
select one.*, two.id, two.some_ts
from one
join two on one.id = two.one_id
where one.id = 1;
Results in:
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------
Nested Loop (cost=0.15..4.23 rows=1 width=28) (actual time=0.029..0.033 rows=10 loops=1)
-> Index Scan using one_pkey on one (cost=0.15..3.16 rows=1 width=16) (actual time=0.016..0.016 rows=1 loops=1)
Index Cond: (id = 1)
-> Seq Scan on two (cost=0.00..1.07 rows=1 width=16) (actual time=0.011..0.014 rows=10 loops=1)
Filter: (one_id = 1)
Rows Removed by Filter: 10
Planning time: 0.130 ms
Execution time: 0.058 ms
Online example: http://rextester.com/CXZZ12304
Create some tables:
CREATE TABLE a (
a_id integer PRIMARY KEY,
a_val text NOT NULL
);
CREATE TABLE b (
b_id integer PRIMARY KEY,
a_id integer REFERENCES a(a_id) NOT NULL,
b_val text NOT NULL
);
-- *never* forget an index on the foreign key column
CREATE INDEX ON b(a_id);
Add some sample data:
INSERT INTO a
SELECT i, 'value ' || i FROM generate_series(1, 1000) i;
INSERT INTO b
SELECT i, (i + 1) / 2, 'value ' || i FROM generate_series(1, 2000) i;
Analyze the tables to get good statistics:
ANALYZE a;
ANALYZE b;
Let's run a sample query:
EXPLAIN SELECT a.a_val, b.b_val FROM a JOIN b USING (a_id) WHERE a_id = 42;
QUERY PLAN
---------------------------------------------------------------------------
Nested Loop (cost=0.55..16.62 rows=2 width=19)
-> Index Scan using a_pkey on a (cost=0.28..8.29 rows=1 width=13)
Index Cond: (a_id = 42)
-> Index Scan using b_a_id_idx on b (cost=0.28..8.31 rows=2 width=14)
Index Cond: (a_id = 42)
(5 rows)

Storing 'ties' for Contests in Postgres

I'm trying to determine if there a "low cost" optimization for the following query. We've implemented a system whereby 'tickets' earn 'points' and thus can be ranked. In order to support analytical type of queries, we store the rank of every ticket and wether the ticket is tied along with the ticket.
I've found that, at scale, storing the is_tied field is very slow. I'm attempting to run the scenario below on a set of "tickets" that is about 20-75k tickets.
I'm hoping that someone can help identify why and offer some help.
We're on postgres 9.3.6
Here's a simplified ticket table schema:
ogs_1=> \d api_ticket
Table "public.api_ticket"
Column | Type | Modifiers
------------------------------+--------------------------+---------------------------------------------------------
id | integer | not null default nextval('api_ticket_id_seq'::regclass)
status | character varying(3) | not null
points_earned | integer | not null
rank | integer | not null
event_id | integer | not null
user_id | integer | not null
is_tied | boolean | not null
Indexes:
"api_ticket_pkey" PRIMARY KEY, btree (id)
"api_ticket_4437cfac" btree (event_id)
"api_ticket_e8701ad4" btree (user_id)
"api_ticket_points_earned_idx" btree (points_earned)
"api_ticket_rank_idx" btree ("rank")
Foreign-key constraints:
"api_ticket_event_id_598c97289edc0e3e_fk_api_event_id" FOREIGN KEY (event_id) REFERENCES api_event(id) DEFERRABLE INITIALLY DEFERRED
(user_id) REFERENCES auth_user(id) DEFERRABLE INITIALLY DEFERRED
Here's the query that I'm executing:
UPDATE api_ticket t SET is_tied = False
WHERE t.event_id IN (SELECT id FROM api_event WHERE status = 'c');
UPDATE api_ticket t SET is_tied = True
FROM (
SELECT event_id, rank
FROM api_ticket tt
WHERE event_id in (SELECT id FROM api_event WHERE status = 'c')
AND tt.status <> 'x'
GROUP BY rank, event_id
HAVING count(*) > 1
) AS tied_tickets
WHERE t.rank = tied_tickets.rank AND
tied_tickets.event_id = t.event_id;
Here's the explain on a set of about 35k rows:
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------
Update on api_ticket t (cost=3590.01..603570.21 rows=157 width=128)
-> Nested Loop (cost=3590.01..603570.21 rows=157 width=128)
-> Subquery Scan on tied_tickets (cost=2543.31..2556.18 rows=572 width=40)
-> HashAggregate (cost=2543.31..2550.46 rows=572 width=8)
Filter: (count(*) > 1)
-> Nested Loop (cost=0.84..2539.02 rows=572 width=8)
-> Index Scan using api_event_status_idx1 on api_event (cost=0.29..8.31 rows=1 width=4)
Index Cond: ((status)::text = 'c'::text)
-> Index Scan using api_ticket_4437cfac on api_ticket tt (cost=0.55..2524.99 rows=572 width=8)
Index Cond: (event_id = api_event.id)
Filter: ((status)::text <> 'x'::text)
-> Bitmap Heap Scan on api_ticket t (cost=1046.70..1050.71 rows=1 width=92)
Recheck Cond: (("rank" = tied_tickets."rank") AND (event_id = tied_tickets.event_id))
-> BitmapAnd (cost=1046.70..1046.70 rows=1 width=0)
-> Bitmap Index Scan on api_ticket_rank_idx (cost=0.00..26.65 rows=708 width=0)
Index Cond: ("rank" = tied_tickets."rank")
-> Bitmap Index Scan on api_ticket_4437cfac (cost=0.00..1019.79 rows=573 width=0)
Index Cond: (event_id = tied_tickets.event_id)

Why is this simple SQL query so slow?

I have a query
select p.id
from Product p,Identifier i
where p.id=i.product_id
and p.aggregatorId='5109037'
and p.deletionDate is null
and i.type='03'
and i.value='9783639382891'
which takes about 3.5 seconds to run. Product has about 4.7m entries, Identifier about 20m. As you can see in the schema below, every column used in the query is indexed, but not all indexes are used. If I exclude the columns p.aggregatorId and i.type, the query runs as fast as I would have expected. I have also tried to join the Identifier table, but with no change in the explain plan.
Why the indexes are not used?
The explain plan looks like this:
Nested Loop (cost=3.21..63.48 rows=1 width=33) (actual time=10.856..3236.830 rows=1 loops=1)
-> Index Scan using idx_prod_aggr on product p (cost=0.43..2.45 rows=1 width=33) (actual time=0.041..191.339 rows=146692 loops=1)
Index Cond: ((aggregatorid)::text = '5109037'::text)
Filter: (deletiondate IS NULL)
-> Bitmap Heap Scan on identifier i (cost=2.78..61.01 rows=1 width=33) (actual time=0.019..0.019 rows=0 loops=146692)
Recheck Cond: ((product_id)::text = (p.id)::text)
Filter: (((type)::text = '03'::text) AND ((value)::text = '9783639382891'::text))
Rows Removed by Filter: 2"
-> Bitmap Index Scan on idx_id_prod_id (cost=0.00..2.78 rows=29 width=0) (actual time=0.016..0.016 rows=2 loops=146692)
Index Cond: ((product_id)::text = (p.id)::text)
The reduced DB schema looks like this:
CREATE TABLE product
(
id character varying(32) NOT NULL,
version integer,
active boolean,
aggregatorid character varying(15),
deletiondate timestamp without time zone,
)
WITH (
OIDS=FALSE
);
CREATE INDEX idx_prod_active
ON product
USING btree
(active);
CREATE INDEX idx_prod_aggr
ON product
USING btree
(aggregatorid COLLATE pg_catalog."default");
CREATE INDEX idx_prod_del_date
ON product
USING btree
(deletiondate);
CREATE TABLE identifier
(
id character varying(32) NOT NULL,
version integer,
typename character varying(50),
type character varying(3) NOT NULL,
value character varying(512) NOT NULL,
product_id character varying(32),
CONSTRAINT identifier_pkey PRIMARY KEY (id),
CONSTRAINT fk165a88c9c93f3e7f FOREIGN KEY (product_id)
REFERENCES product (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION
)
WITH (
OIDS=FALSE
);
CREATE INDEX idx_id_prod_type
ON identifier
USING btree
(type COLLATE pg_catalog."default");
CREATE INDEX idx_id_prod_value
ON identifier
USING btree
(value COLLATE pg_catalog."default");
CREATE INDEX idx_id_typename
ON identifier
USING btree
(typename COLLATE pg_catalog."default");
CREATE INDEX idx_prod_ident
ON identifier
USING btree
(product_id COLLATE pg_catalog."default");

postgresql hashaggregate query optimization

I am trying to optimize the query below.
select cellid2 as cellid, max(endeks) as turkcell
from (select a.cellid2 as cellid2, b.endeks
from (select geom, cellid as cellid2 from grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000 ) a join (select endeks, st_transform(geom, 2320) as geom_tmp from turkcell_data ) b on st_intersects(a.geom, b.geom_tmp) ) x
group by cellid2 limit 5
and explain analyze returns
"Limit (cost=81808.31..81808.36 rows=5 width=12) (actual time=271376.201..271376.204 rows=5 loops=1)"
" -> HashAggregate (cost=81808.31..81879.63 rows=7132 width=12) (actual time=271376.200..271376.203 rows=5 loops=1)"
" -> Nested Loop (cost=0.00..81772.65 rows=7132 width=12) (actual time=5.128..269753.647 rows=1237707 loops=1)"
" Join Filter: _st_intersects(grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000.geom, st_transform(turkcell_data.geom, 2320))"
" -> Seq Scan on turkcell_data (cost=0.00..809.40 rows=3040 width=3045) (actual time=0.031..7.426 rows=3040 loops=1)"
" -> Index Scan using grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_geom_gist on grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000 (cost=0.00..24.76 rows=7 width=124) (actual time=0.012..0.799 rows=647 loops=3040)"
" Index Cond: (geom && st_transform(turkcell_data.geom, 2320))"
"Total runtime: 271387.499 ms"
There exist indexes on geometry column and cellid columns. I read that instead of using max, order by desc and limit 1 works better. However, since I have group by clause, it does not work I think. Is there any way to do this or any other way which improves the performance?
Table Definitions:
CREATE TABLE grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000
(
regionid numeric,
geom geometry(Geometry,2320),
cellid integer,
turkcell double precision
)
WITH (
OIDS=FALSE
);
ALTER TABLE grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000
OWNER TO postgres;
-- Index: grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_cellid
-- DROP INDEX grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_cellid;
CREATE INDEX grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_cellid
ON grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000
USING btree
(cellid );
-- Index: grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_geom_gist
-- DROP INDEX grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_geom_gist;
CREATE INDEX grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_geom_gist
ON grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000
USING gist
(geom );
CREATE TABLE turkcell_data
(
gid serial NOT NULL,
objectid_1 integer,
objectid integer,
neighbourh numeric,
endeks numeric,
coorx numeric,
coory numeric,
shape_leng numeric,
shape_le_1 numeric,
shape_area numeric,
geom geometry(MultiPolygon,4326),
CONSTRAINT turkcell_data_pkey PRIMARY KEY (gid )
)
WITH (
OIDS=FALSE
);
ALTER TABLE turkcell_data
OWNER TO postgres;
-- Index: turkcell_data_geom_gist
-- DROP INDEX turkcell_data_geom_gist;
CREATE INDEX turkcell_data_geom_gist
ON turkcell_data
USING gist
(geom );
Either store your data re-projected to 2320, index that column, and use it in your join, or create an index on the transformed projection of the geometry in turkcell_data. I usually prefer the latter:
CREATE INDEX turkcell_data_geom_gist2320
ON turkcell_data
USING gist
(st_transform(geom, 2320) );
The other issue might be if your geometries are very complex - if any of your polygons have a relatively large number of points you might get stuck crunching away on the intersection. Try the index first, though.