PostGIS intersect/summary query very slow for particular table - postgresql

Edited the question based on a comment by @peter, so now the two tables in question use the same geometry type.
I'm running a query that intersects a table's geometry with a generic input geometry and then summarizes the results based on a specific attribute.
I'm puzzled because I'm very happy with the speed (~400 ms) of this query on a large table with 1.3M records. However, on a particular table with 30,000 records the same query takes 40+ seconds to complete.
What makes me suspicious (and frustrated) is that the fast table is roughly 40x the size of the slow one, yet the exact same query runs on it in about 1/100 of the time.
Here's the query:
WITH input_geom AS (
    SELECT ST_Transform(
        ST_SetSRID(
            ST_GeomFromGeoJSON(
                '{"type":"Polygon","coordinates":[[[-91.865616,47.803339],[-91.830597,47.780274],[-91.810341,47.817404],[-91.865616,47.803339]]]}'
            ), 4326
        ), 26915
    ) AS geom
)
-- find total area and proportion for each type
SELECT
    attr,
    total_area_summtable AS area,
    total_area_summtable / buff_area.area_sqm AS percent
FROM
    (-- group by attribute and buffer
        SELECT
            attr,
            sum(area_sqm) AS total_area_summtable
        FROM
            (-- find intersected area of each type
             -- Clip ownership by input geom
                SELECT
                    %attr% AS attr,
                    CASE
                        -- speed intersection calculation by using summary table
                        -- geom when it covers the entire buffer
                        -- otherwise use intersection of geometries
                        WHEN ST_CoveredBy(input_geom.geom, summtable.geom) THEN ST_Area(input_geom.geom)
                        ELSE ST_Area(ST_Multi(ST_Intersection(input_geom.geom, summtable.geom)))
                    END AS area_sqm
                FROM input_geom
                INNER JOIN %table% AS summtable ON ST_Intersects(input_geom.geom, summtable.geom)
            ) AS summtable_inter
        -- group by type
        GROUP BY attr
    ) AS summtable_area,
    (-- find total area for the buffer
        SELECT
            ST_Area(ST_Collect(geom)) AS area_sqm
        FROM input_geom
    ) AS buff_area
That produces results like this:
attr    area                percent
6       17106063.3199902    0.0630578194718625
8       41892903.9272884    0.154429170732226
2       4441738.70688669    0.016373513430921
...
Here are the Explain Analyze results for this query:
Nested Loop (cost=31.00..31.34 rows=9 width=23) (actual time=49042.306..49042.309 rows=5 loops=1)
Output: mown.owner_desc, (sum(CASE WHEN ((input_geom_1.geom @ mown.geom) AND _st_coveredby(input_geom_1.geom, mown.geom)) THEN st_area(input_geom_1.geom) ELSE st_area(st_multi(st_intersection(input_geom_1.geom, mown.geom))) END)), ((sum(CASE WHEN ((input (...)
CTE input_geom
-> Result (cost=0.00..0.01 rows=1 width=0) (actual time=0.002..0.002 rows=1 loops=1)
Output: '01030000202369000001000000040000003D484506D9D92141B7A4EA61F6325441A3BEFDC8A2EE2141E731F0497F305441774E0C95FEF9214173B409BA8C3454413D484506D9D92141B7A4EA61F6325441'::geometry
-> Aggregate (cost=0.02..0.06 rows=1 width=32) (actual time=0.035..0.036 rows=1 loops=1)
Output: st_area(st_collect(input_geom.geom))
-> CTE Scan on input_geom (cost=0.00..0.02 rows=1 width=32) (actual time=0.005..0.006 rows=1 loops=1)
Output: input_geom.geom
-> HashAggregate (cost=30.96..31.05 rows=9 width=18085) (actual time=49042.264..49042.266 rows=5 loops=1)
Output: mown.owner_desc, sum(CASE WHEN ((input_geom_1.geom @ mown.geom) AND _st_coveredby(input_geom_1.geom, mown.geom)) THEN st_area(input_geom_1.geom) ELSE st_area(st_multi(st_intersection(input_geom_1.geom, mown.geom))) END)
Group Key: mown.owner_desc
-> Nested Loop (cost=4.32..25.34 rows=18 width=18085) (actual time=3.304..791.829 rows=39 loops=1)
Output: input_geom_1.geom, mown.owner_desc, mown.geom
-> CTE Scan on input_geom input_geom_1 (cost=0.00..0.02 rows=1 width=32) (actual time=0.001..0.003 rows=1 loops=1)
Output: input_geom_1.geom
-> Bitmap Heap Scan on public.gap_stewardship_2008_all_ownership_types mown (cost=4.32..25.30 rows=2 width=18053) (actual time=3.299..791.762 rows=39 loops=1)
Output: mown.gid, mown.wetland_ty, mown.county, mown.name, mown.unit, mown.owner, mown.owner_ver1, mown.owner_desc, mown.owner_name, mown.agency, mown.agncy_ver1, mown.agency_nam, mown.new_manage, mown.name_manag, mown.comments, mown.or (...)
Recheck Cond: (input_geom_1.geom && mown.geom)
Filter: _st_intersects(input_geom_1.geom, mown.geom)
Rows Removed by Filter: 208
Heap Blocks: exact=142
-> Bitmap Index Scan on gap_stewardship_2008_all_ownership_types_geom_idx (cost=0.00..4.31 rows=5 width=0) (actual time=0.651..0.651 rows=247 loops=1)
Index Cond: (input_geom_1.geom && mown.geom)
Planning time: 1.245 ms
Execution time: 49046.184 ms
Here is the SQL to recreate the tables in question:
Slow table (50,000 rows):
CREATE TABLE public.gap_stewardship_2008_all_ownership_types
(
gid integer NOT NULL DEFAULT nextval('gap_stewardship_2008_all_ownership_types_gid_seq'::regclass),
wetland_ty character varying(50),
county character varying(50),
name character varying(50),
unit character varying(50),
owner smallint,
owner_ver1 smallint,
owner_desc character varying(50),
owner_name character varying(50),
agency smallint,
agncy_ver1 smallint,
agency_nam character varying(50),
new_manage smallint,
name_manag character varying(50),
comments character varying(100),
origin character varying(50),
area numeric,
acres numeric,
perfeet numeric,
perimeter numeric,
km2 numeric,
shape_leng numeric,
shape_area numeric,
geom geometry(MultiPolygon,26915),
CONSTRAINT gap_stewardship_2008_all_ownership_types_pkey PRIMARY KEY (gid)
)
Fast table (1,300,000 rows):
CREATE TABLE public.nwi_combine
(
id integer NOT NULL DEFAULT nextval('"NWI_combine_AOI_id_seq"'::regclass),
geom geometry(MultiPolygon,26915),
attribute character varying(254),
wetland_ty character varying(254),
acres numeric,
hgm_code character varying(254),
hgm_desc character varying(254),
spcc_desc character varying(254),
cow_class1 character varying(254),
circ39_cla bigint,
hgm_ll_des character varying(254),
shape_leng numeric,
shape_area numeric,
nwi_code character varying(254),
new_cow character varying(254),
system character varying(254),
subsystem character varying(254),
class1 character varying(254),
subclass1 character varying(254),
class2 character varying(254),
subclass2 character varying(254),
wreg character varying(254),
soilm character varying(254),
spec_mod1 character varying(254),
spec_mod2 character varying(254),
circ39 character varying(254),
old_cow character varying(254),
mnwet character varying(254),
circ39_com bigint,
CONSTRAINT "NWI_combine_AOI_pkey" PRIMARY KEY (id)
)
Each of these tables has a GIST index on the geometry field.
Does anyone have any idea what could be contributing to such a difference?
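One diagnostic worth running (a sketch added here, not part of the original post): almost all of the 49 s is spent in the HashAggregate that evaluates the CASE with ST_Intersection, and the cost of ST_Intersection grows with the number of vertices, so comparing geometry complexity between the slow and the fast table would show whether the slow table simply has much larger polygons:
SELECT count(*) AS n_rows,
       avg(ST_NPoints(geom)) AS avg_points,
       max(ST_NPoints(geom)) AS max_points
FROM public.gap_stewardship_2008_all_ownership_types;
-- compare with the fast table
SELECT count(*) AS n_rows,
       avg(ST_NPoints(geom)) AS avg_points,
       max(ST_NPoints(geom)) AS max_points
FROM public.nwi_combine;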

Related

Postgres slow performance when order is added

I have a simple setup with a single table into which I import some users, each having a short description (bio) of at most 255 characters. Next to the description field I have another field of type tsvector, which is the bio but tokenized.
My goal is to find user bios that contain certain keywords. It's a full-text search on a single table, so I'm not sure the tsvector field is even needed (I'm new to Postgres and have only used MySQL in the past), since it seems to become really useful mainly when querying across different fields/tables.
My actual problem is that the query runs OK when no ordering is involved (0.5 s) but extremely slowly when a single ORDER BY clause on an indexed field is added (8 s). I only have about 1 million records in the table.
Table setup:
CREATE TABLE public.django_user
(
id integer NOT NULL DEFAULT nextval('django_user_id_seq'::regclass),
username character varying(32) COLLATE pg_catalog."default" NOT NULL,
description character varying(255) COLLATE pg_catalog."default" NOT NULL,
description_tokens tsvector,
streams_count integer NOT NULL,
CONSTRAINT django_user_pkey PRIMARY KEY (id),
CONSTRAINT django_user_streams_count_check CHECK (streams_count >= 0)
)
WITH (
OIDS = FALSE
)
TABLESPACE pg_default;
CREATE INDEX django_user_description_tokens_07422d46
ON public.django_user USING btree
(description_tokens)
TABLESPACE pg_default;
CREATE INDEX django_user_streams_count_66aa1edc
ON public.django_user USING btree
(streams_count)
TABLESPACE pg_default;
The slow query:
SELECT
streams_count, username, description
FROM
"django_user"
WHERE
to_tsvector('english'::regconfig, COALESCE(("django_user"."description_tokens")::text, '')) @@ (plainto_tsquery('english'::regconfig, 'react redux')) = true
ORDER BY streams_count ASC LIMIT 20
If I remove the ORDER BY streams_count ASC everything works fine. Here's an explain for the query:
"Limit (cost=174377.42..174379.75 rows=20 width=106) (actual time=7363.660..7368.257 rows=20 loops=1)"
" -> Gather Merge (cost=174377.42..174379.99 rows=22 width=106) (actual time=7363.658..7368.245 rows=20 loops=1)"
" Workers Planned: 2"
" Workers Launched: 2"
" -> Sort (cost=173377.40..173377.42 rows=11 width=106) (actual time=7359.708..7359.710 rows=15 loops=3)"
" Sort Key: streams_count"
" Sort Method: top-N heapsort Memory: 31kB"
" Worker 0: Sort Method: top-N heapsort Memory: 32kB"
" Worker 1: Sort Method: top-N heapsort Memory: 32kB"
" -> Parallel Seq Scan on django_user (cost=0.00..173377.21 rows=11 width=106) (actual time=24.870..7359.379 rows=109 loops=3)"
" Filter: (to_tsvector('english'::regconfig, COALESCE((description_tokens)::text, ''::text)) ## '''react'' & ''redux'''::tsquery)"
" Rows Removed by Filter: 347231"
"Planning Time: 0.298 ms"
"Execution Time: 7368.293 ms"
Any idea what I'm missing?
A btree index on a tsvector is pretty useless. Make it a GIN index, so that the query can make use of it with the @@ operator.
As for why the ORDER BY makes it slower, consider how much harder it would be to find the 20 tallest people in Chicago wearing red ties, versus 20 random people in Chicago wearing red ties.
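For example (a sketch; the index name is made up), an expression GIN index that matches the filter expression in the query and plan exactly would let the @@ match use the index:
CREATE INDEX django_user_description_tokens_gin
    ON public.django_user
    USING gin (to_tsvector('english'::regconfig, COALESCE((description_tokens)::text, ''::text)));
Alternatively, since description_tokens is already a tsvector, a plain GIN index on that column together with a WHERE description_tokens @@ plainto_tsquery('english', 'react redux') condition would avoid re-parsing the text for every row.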

How to make queries use a unique index instead of the primary key constraint?

I have a partitioned table and created a unique index on it.
I am running some queries; some of them use the primary key index and some use the unique index I created. I want my queries to use the unique index instead of the primary key.
I tried reindexing; it didn't work.
Here are the two queries.
1) Here my created index is used. The query plan is:
Finalize Aggregate (cost=296958.94..296958.95 rows=1 width=8) (actual time=927.948..927.948 rows=1 loops=1)
-> Gather (cost=296958.72..296958.93 rows=2 width=8) (actual time=927.887..933.730 rows=3 loops=1)
Workers Planned: 2
Workers Launched: 2
-> Partial Aggregate (cost=295958.72..295958.73 rows=1 width=8) (actual time=924.885..924.885 rows=1 loops=3)
-> Parallel Append (cost=0.68..293370.57 rows=1035261 width=8) (actual time=0.076..852.758 rows=825334 loops=3)
-> Parallel Index Only Scan using testdate2019jan_april_cost_mo_user_id_account_id_resource__idx5 on testdate2019jan_april_cost_mod3rem2 (cost=0.68..146591.56 rows=525490 width=8) (actual time=0.082..388.130 rows=421251 loops=3)
Index Cond: (user_id = 1)
Heap Fetches: 3922
-> Parallel Index Only Scan using testdate2018sept_dec_cost_mod_user_id_account_id_resource__idx5 on testdate2018sept_dec_cost_mod3rem2 (cost=0.68..141570.15 rows=509767 width=8) (actual time=0.057..551.572 rows=606125 loops=2)
Index Cond: (user_id = 1)
Heap Fetches: 0
-> Parallel Index Scan using testdate2018jan_april_cost_mo_account_id_user_id_resource__idx2 on testdate2018jan_april_cost_mod3rem2 (cost=0.12..8.14 rows=1 width=8) (actual time=0.001..0.001 rows=0 loops=1)
Index Cond: (user_id = 1)
-> Parallel Index Scan using testdate2018may_august_cost_m_account_id_user_id_resource__idx1 on testdate2018may_august_cost_mod3rem2 (cost=0.12..8.14 rows=1 width=8) (actual time=0.001..0.001 rows=0 loops=1)
Index Cond: (user_id = 1)
-> Parallel Index Scan using testdate2019may_august_cost_m_account_id_user_id_resource__idx2 on testdate2019may_august_cost_mod3rem2 (cost=0.12..8.14 rows=1 width=8) (actual time=0.002..0.002 rows=0 loops=1)
Index Cond: (user_id = 1)
-> Parallel Index Scan using testdate2019sept_dec_cost_mod_account_id_user_id_resource__idx2 on testdate2019sept_dec_cost_mod3rem2 (cost=0.12..8.14 rows=1 width=8) (actual time=0.003..0.003 rows=0 loops=1)
Index Cond: (user_id = 1)
Planning Time: 0.754 ms
Execution Time: 933.797 ms
In the above plan my index testdate2018may_august_cost_m_account_id_user_id_resource__idx1 is used, as I want.
2) Here my created index is not used; instead, the primary key index is used:
Sort Method: quicksort Memory: 25kB
Buffers: shared hit=2 read=66080
-> Finalize GroupAggregate (cost=388046.40..388187.55 rows=6 width=61) (actual time=510.710..513.262 rows=10 loops=1)
Group Key: c_1.instance_type, c_1.currency
Buffers: shared hit=2 read=66080
-> Gather Merge (cost=388046.40..388187.24 rows=12 width=85) (actual time=510.689..513.303 rows=28 loops=1)
Workers Planned: 2
Workers Launched: 2
Buffers: shared hit=26 read=206407
-> Partial GroupAggregate (cost=387046.38..387185.83 rows=6 width=85) (actual time=504.731..507.277 rows=9 loops=3)
Group Key: c_1.instance_type, c_1.currency
Buffers: shared hit=26 read=206407
-> Sort (cost=387046.38..387056.71 rows=4130 width=36) (actual time=504.694..504.933 rows=3895 loops=3)
Sort Key: c_1.instance_type, c_1.currency
Sort Method: quicksort Memory: 404kB
Worker 0: Sort Method: quicksort Memory: 354kB
Worker 1: Sort Method: quicksort Memory: 541kB
Buffers: shared hit=20 read=206407
-> Parallel Append (cost=0.13..386798.33 rows=4130 width=36) (actual time=0.081..501.720 rows=3895 loops=3)
Buffers: shared hit=6 read=206405
Subplans Removed: 3
-> Parallel Index Scan using testdate2019may_august_cost_mod3rem2_pkey on testdate2019may_august_cost_mod3rem2 c_1 (cost=0.13..8.15 rows=1 width=36) (actual time=0.008..0.008 rows=0 loops=1)
Index Cond: ((usage_start_date >= (CURRENT_DATE - 30)) AND (user_id = '1'::bigint))
Filter: ((instance_type IS NOT NULL) AND ((account_id)::text = '807331824280'::text) AND (usage_end_date <= CURRENT_DATE))
Buffers: shared hit=1
-> Parallel Index Scan using testdate2019sept_dec_cost_mod3rem2_pkey on testdate2019sept_dec_cost_mod3rem2 c_2 (cost=0.13..8.15 rows=1 width=36) (actual time=0.006..0.006 rows=0 loops=1)
Index Cond: ((usage_start_date >= (CURRENT_DATE - 30)) AND (user_id = '1'::bigint))
Filter: ((instance_type IS NOT NULL) AND ((account_id)::text = '807331824280'::text) AND (usage_end_date <= CURRENT_DATE))
Buffers: shared hit=1
-> Parallel Seq Scan on testdate2019jan_april_cost_mod3rem2 c (cost=0.00..258266.58 rows=4125 width=36) (actual time=0.076..501.060 rows=3895 loops=3)
Filter: ((instance_type IS NOT NULL) AND (user_id = '1'::bigint) AND ((account_id)::text = '807331824280'::text) AND (usage_end_date <= CURRENT_DATE) AND (usage_start_date >= (CURRENT_DATE - 30)))
Rows Removed by Filter: 1504689
Buffers: shared hit=4 read=206405
Planning Time: 1.290 ms
Execution Time: 513.439 ms
In the above plan, testdate2019sept_dec_cost_mod3rem2_pkey, which is the primary key index, is used.
I want it to use my created index instead of the primary key.
Is my 2nd query plan correct with respect to the partitioning?
Table Creation Queries:
CREATE TABLE a2i.testawscost_line_item (
line_item_id uuid NOT NULL,
account_id character varying(255) COLLATE pg_catalog."default",
availability_zone character varying(255) COLLATE pg_catalog."default",
base_cost double precision,
base_rate double precision,
cost double precision,
currency character varying(255) COLLATE pg_catalog."default",
instance_family character varying(255) COLLATE pg_catalog."default",
instance_type character varying(255) COLLATE pg_catalog."default",
line_item_type character varying(255) COLLATE pg_catalog."default",
operating_system character varying(255) COLLATE pg_catalog."default",
operation character varying(255) COLLATE pg_catalog."default",
payer_account_id character varying(255) COLLATE pg_catalog."default",
product_code character varying(255) COLLATE pg_catalog."default",
product_family character varying(255) COLLATE pg_catalog."default",
product_group character varying(255) COLLATE pg_catalog."default",
product_name character varying(255) COLLATE pg_catalog."default",
rate double precision,
rate_description character varying(255) COLLATE pg_catalog."default",
reservation_id character varying(255) COLLATE pg_catalog."default",
resource_type character varying(255) COLLATE pg_catalog."default",
sku character varying(255) COLLATE pg_catalog."default",
tax_type character varying(255) COLLATE pg_catalog."default",
unit character varying(255) COLLATE pg_catalog."default",
usage_end_date timestamp without time zone,
usage_quantity double precision,
usage_start_date timestamp without time zone,
usage_type character varying(255) COLLATE pg_catalog."default",
user_id bigint,
resource_id character varying(255) COLLATE pg_catalog."default",
CONSTRAINT testawscost_line_item_pkey PRIMARY KEY
(line_item_id, usage_start_date, user_id),
CONSTRAINT fkptp4hyur3i4yj88wo3rxnaf05 FOREIGN KEY (resource_id)
REFERENCES a2i.awscost_resource (resource_id) MATCH SIMPLE
ON UPDATE NO ACTION
ON DELETE NO ACTION
) PARTITION BY hash(user_id);
The partitions:
create table a2i.testuser_cost_mod3rem0
partition of a2i.testawscost_line_item
for values with (MODULUS 3, REMAINDER 0)
partition by range(usage_start_date);
create table a2i.testuser_cost_mod3rem1
partition of a2i.testawscost_line_item
for values with (MODULUS 3, REMAINDER 1)
partition by range(usage_start_date);
create table a2i.testuser_cost_mod3rem2
partition of a2i.testawscost_line_item
for values with (MODULUS 3, REMAINDER 2)
partition by range(usage_start_date);
Partitions of the partitions for 2019:
create table a2i.testdate2019jan_april_cost_mod3rem0
partition of a2i.testuser_cost_mod3rem0
for values from ('2019-01-01 00:00:00') to ('2019-05-01 00:00:00');
create table a2i.testdate2019may_august_cost_mod3rem0
partition of a2i.testuser_cost_mod3rem0
for values from ('2019-05-01 00:00:00') to ('2019-09-01 00:00:00');
create table a2i.testdate2019sept_dec_cost_mod3rem0
partition of a2i.testuser_cost_mod3rem0
for values from ('2019-09-01 00:00:00') to ('2020-01-01 00:00:00');
create table a2i.testdate2019jan_april_cost_mod3rem1
partition of a2i.testuser_cost_mod3rem1
for values from ('2019-01-01 00:00:00') to ('2019-05-01 00:00:00');
create table a2i.testdate2019may_august_cost_mod3rem1
partition of a2i.testuser_cost_mod3rem1
for values from ('2019-05-01 00:00:00') to ('2019-09-01 00:00:00');
create table a2i.testdate2019sept_dec_cost_mod3rem1
partition of a2i.testuser_cost_mod3rem1
for values from ('2019-09-01 00:00:00') to ('2020-01-01 00:00:00');
create table a2i.testdate2019jan_april_cost_mod3rem2
partition of a2i.testuser_cost_mod3rem2
for values from ('2019-01-01 00:00:00') to ('2019-05-01 00:00:00');
create table a2i.testdate2019may_august_cost_mod3rem2
partition of a2i.testuser_cost_mod3rem2
for values from ('2019-05-01 00:00:00') to ('2019-09-01 00:00:00');
create table a2i.testdate2019sept_dec_cost_mod3rem2
partition of a2i.testuser_cost_mod3rem2
for values from ('2019-09-01 00:00:00') to ('2020-01-01 00:00:00');
The index:
CREATE UNIQUE INDEX awscost_line_item_unique_pkey ON a2i.awscost_line_item (
account_id, user_id, resource_id, usage_start_date, usage_end_date, usage_type,
usage_quantity, line_item_type, sku, rate, base_rate, base_cost,
"cost", currency, product_code, operation
);
For the 1st query plan, the query is:
explain analyze select sum(cost) from testawscost_line_item where user_id = '1';
2nd query:
explain (analyze /*, buffers*/)
SELECT
    sum(c.cost),
    sum(case when c.resource_type = 'Compute' then c.cost end) as computeCost,
    sum(case when c.resource_type = 'Storage' then c.cost end) as storageCost,
    sum(case when c.resource_type = 'Network' then c.cost end) as networkCost,
    sum(case when c.resource_type not in ('Compute', 'Network', 'Storage') then c.cost end) as otherCost,
    c.currency,
    c.instance_type as productFamily,
    avg(c.rate)
FROM testawscost_line_item c
WHERE (c.user_id = '1')
  AND (c.account_id = '807331824280')
  AND (c.usage_start_date >= current_date - 30 AND c.usage_end_date <= current_date)
  AND (c.instance_type is not null)
GROUP BY c.instance_type, c.currency
ORDER BY 1 desc
The problem is that your index works well for the first query, but not for the second.
resource_id is in your index, but not in your query, so all index columns after that cannot be used for the query. PostgreSQL decides to use the much smaller primary key index.
The perfect index for this query is:
CREATE INDEX ON a2i.testawscost_line_item (user_id, account_id, usage_start_date)
WHERE instance_type IS NOT NULL;
I assume that the condition on usage_end_date is not more selective than the one on usage_start_date.

Why is this simple SQL query so slow?

I have a query
select p.id
from Product p,Identifier i
where p.id=i.product_id
and p.aggregatorId='5109037'
and p.deletionDate is null
and i.type='03'
and i.value='9783639382891'
which takes about 3.5 seconds to run. Product has about 4.7m entries, Identifier about 20m. As you can see in the schema below, every column used in the query is indexed, but not all indexes are used. If I exclude the columns p.aggregatorId and i.type, the query runs as fast as I would have expected. I have also tried to join the Identifier table, but with no change in the explain plan.
Why are the indexes not used?
The explain plan looks like this:
Nested Loop (cost=3.21..63.48 rows=1 width=33) (actual time=10.856..3236.830 rows=1 loops=1)
-> Index Scan using idx_prod_aggr on product p (cost=0.43..2.45 rows=1 width=33) (actual time=0.041..191.339 rows=146692 loops=1)
Index Cond: ((aggregatorid)::text = '5109037'::text)
Filter: (deletiondate IS NULL)
-> Bitmap Heap Scan on identifier i (cost=2.78..61.01 rows=1 width=33) (actual time=0.019..0.019 rows=0 loops=146692)
Recheck Cond: ((product_id)::text = (p.id)::text)
Filter: (((type)::text = '03'::text) AND ((value)::text = '9783639382891'::text))
Rows Removed by Filter: 2
-> Bitmap Index Scan on idx_id_prod_id (cost=0.00..2.78 rows=29 width=0) (actual time=0.016..0.016 rows=2 loops=146692)
Index Cond: ((product_id)::text = (p.id)::text)
The reduced DB schema looks like this:
CREATE TABLE product
(
id character varying(32) NOT NULL,
version integer,
active boolean,
aggregatorid character varying(15),
deletiondate timestamp without time zone
)
WITH (
OIDS=FALSE
);
CREATE INDEX idx_prod_active
ON product
USING btree
(active);
CREATE INDEX idx_prod_aggr
ON product
USING btree
(aggregatorid COLLATE pg_catalog."default");
CREATE INDEX idx_prod_del_date
ON product
USING btree
(deletiondate);
CREATE TABLE identifier
(
id character varying(32) NOT NULL,
version integer,
typename character varying(50),
type character varying(3) NOT NULL,
value character varying(512) NOT NULL,
product_id character varying(32),
CONSTRAINT identifier_pkey PRIMARY KEY (id),
CONSTRAINT fk165a88c9c93f3e7f FOREIGN KEY (product_id)
REFERENCES product (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION
)
WITH (
OIDS=FALSE
);
CREATE INDEX idx_id_prod_type
ON identifier
USING btree
(type COLLATE pg_catalog."default");
CREATE INDEX idx_id_prod_value
ON identifier
USING btree
(value COLLATE pg_catalog."default");
CREATE INDEX idx_id_typename
ON identifier
USING btree
(typename COLLATE pg_catalog."default");
CREATE INDEX idx_prod_ident
ON identifier
USING btree
(product_id COLLATE pg_catalog."default");
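For illustration (a hedged sketch, not part of the original post; the index name is made up): most of the 3.5 s in the plan is the 146,692 inner-loop probes of identifier, each of which fetches heap rows by product_id and then filters them on type and value. A multi-column index that also covers those filter columns would let the conditions be checked inside the index:
CREATE INDEX idx_identifier_prod_type_value
    ON identifier
    USING btree
    (product_id, type, value);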

Limit slows down my postgres query

Hi, I have a simple query on a single table which runs pretty fast, but I want to page my results, and the LIMIT slows down the select incredibly. The table contains about 80 million rows. I'm on Postgres 9.2.
Without LIMIT it takes 330ms and returns 2100 rows
EXPLAIN SELECT * from interval where username='1228321f131084766f3b0c6e40bc5edc41d4677e' order by time desc
Sort (cost=156599.71..156622.43 rows=45438 width=108)
Sort Key: "time"
-> Bitmap Heap Scan on "interval" (cost=1608.05..155896.71 rows=45438 width=108)
Recheck Cond: ((username)::text = '1228321f131084766f3b0c6e40bc5edc41d4677e'::text)
-> Bitmap Index Scan on interval_username (cost=0.00..1605.77 rows=45438 width=0)
Index Cond: ((username)::text = '1228321f131084766f3b0c6e40bc5edc41d4677e'::text)
EXPLAIN ANALYZE SELECT * from interval where
username='1228321f131084766f3b0c6e40bc5edc41d4677e' order by time desc
Sort (cost=156599.71..156622.43 rows=45438 width=108) (actual time=1.734..1.887 rows=2131 loops=1)
Sort Key: id
Sort Method: quicksort Memory: 396kB
-> Bitmap Heap Scan on "interval" (cost=1608.05..155896.71 rows=45438 width=108) (actual time=0.425..0.934 rows=2131 loops=1)
Recheck Cond: ((username)::text = '1228321f131084766f3b0c6e40bc5edc41d4677e'::text)
-> Bitmap Index Scan on interval_username (cost=0.00..1605.77 rows=45438 width=0) (actual time=0.402..0.402 rows=2131 loops=1)
Index Cond: ((username)::text = '1228321f131084766f3b0c6e40bc5edc41d4677e'::text)
Total runtime: 2.065 ms
With LIMIT it takes several minutes (I never waited for it to finish).
EXPLAIN SELECT * from interval where username='1228321f131084766f3b0c6e40bc5edc41d4677e' order by time desc LIMIT 10
Limit (cost=0.00..6693.99 rows=10 width=108)
-> Index Scan Backward using interval_time on "interval" (cost=0.00..30416156.03 rows=45438 width=108)
Filter: ((username)::text = '1228321f131084766f3b0c6e40bc5edc41d4677e'::text)
Table definition
-- Table: "interval"
-- DROP TABLE "interval";
CREATE TABLE "interval"
(
uuid character varying(255) NOT NULL,
deleted boolean NOT NULL,
id bigint NOT NULL,
"interval" bigint NOT NULL,
"time" timestamp without time zone,
trackerversion character varying(255),
username character varying(255),
CONSTRAINT interval_pkey PRIMARY KEY (uuid),
CONSTRAINT fk_272h71b2gfyov9fwnksyditdd FOREIGN KEY (username)
REFERENCES appuser (panelistcode) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE CASCADE,
CONSTRAINT uk_hyi5iws50qif6jwky9xcch3of UNIQUE (id)
)
WITH (
OIDS=FALSE
);
ALTER TABLE "interval"
OWNER TO postgres;
-- Index: interval_time
-- DROP INDEX interval_time;
CREATE INDEX interval_time
ON "interval"
USING btree
("time");
-- Index: interval_username
-- DROP INDEX interval_username;
CREATE INDEX interval_username
ON "interval"
USING btree
(username COLLATE pg_catalog."default");
-- Index: interval_uuid
-- DROP INDEX interval_uuid;
CREATE INDEX interval_uuid
ON "interval"
USING btree
(uuid COLLATE pg_catalog."default");
Further results
SELECT n_distinct FROM pg_stats WHERE tablename='interval' AND attname='username';
n_distinct=1460
SELECT AVG(length) FROM (SELECT username, COUNT(*) AS length FROM interval GROUP BY username) as freq;
45786.022605591910
SELECT COUNT(*) FROM interval WHERE username='1228321f131084766f3b0c6e40bc5edc41d4677e';
2131
The planner is expecting 45438 rows for username '1228321f131084766f3b0c6e40bc5edc41d4677e', while in reality there are only 2131 rows with it, thus it thinks it will find the 10 rows you want faster by looking backward through the interval_time index.
Try increasing the stats on the username column and see whether the query plan will change.
ALTER TABLE interval ALTER COLUMN username SET STATISTICS 100;
ANALYZE interval;
You can try different values of statistics up to 10000.
If you are still not satisfied with the plan, and you are sure that you can do better than the planner and know what you are doing, then you can easily prevent the use of an index by applying an operation to the column that does not change its value.
For example, instead of ORDER BY time, you can use ORDER BY time + '0 seconds'::interval. That way any index on the time value stored in the table will be bypassed. For integer values you can multiply by 1, etc.
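Applied to the query above, the trick looks like this (a sketch based on the suggestion just described): the expression hides the "time" column from the planner, so it should no longer pick the backward scan on interval_time and should fall back to the fast bitmap scan on interval_username:
SELECT *
FROM "interval"
WHERE username = '1228321f131084766f3b0c6e40bc5edc41d4677e'
ORDER BY "time" + '0 seconds'::interval DESC
LIMIT 10;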
The page http://thebuild.com/blog/2014/11/18/when-limit-attacks/ showed that I could force Postgres to do better by using a CTE:
WITH inner_query AS (SELECT * from interval where username='7823721a3eb9243be63c6c3a13dffee44753cda6')
SELECT * FROM inner_query order by time desc LIMIT 10;

postgresql hashaggregate query optimization

I am trying to optimize the query below.
select cellid2 as cellid, max(endeks) as turkcell
from (
    select a.cellid2 as cellid2, b.endeks
    from (select geom, cellid as cellid2
          from grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000) a
    join (select endeks, st_transform(geom, 2320) as geom_tmp
          from turkcell_data) b
      on st_intersects(a.geom, b.geom_tmp)
) x
group by cellid2
limit 5
and explain analyze returns
"Limit (cost=81808.31..81808.36 rows=5 width=12) (actual time=271376.201..271376.204 rows=5 loops=1)"
" -> HashAggregate (cost=81808.31..81879.63 rows=7132 width=12) (actual time=271376.200..271376.203 rows=5 loops=1)"
" -> Nested Loop (cost=0.00..81772.65 rows=7132 width=12) (actual time=5.128..269753.647 rows=1237707 loops=1)"
" Join Filter: _st_intersects(grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000.geom, st_transform(turkcell_data.geom, 2320))"
" -> Seq Scan on turkcell_data (cost=0.00..809.40 rows=3040 width=3045) (actual time=0.031..7.426 rows=3040 loops=1)"
" -> Index Scan using grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_geom_gist on grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000 (cost=0.00..24.76 rows=7 width=124) (actual time=0.012..0.799 rows=647 loops=3040)"
" Index Cond: (geom && st_transform(turkcell_data.geom, 2320))"
"Total runtime: 271387.499 ms"
There are indexes on the geometry column and the cellid column. I have read that ORDER BY ... DESC with LIMIT 1 can work better than max(), but since I have a GROUP BY clause I don't think that applies directly. Is there any way to do this, or any other way to improve the performance?
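For reference, the per-group equivalent of ORDER BY ... DESC LIMIT 1 in Postgres is DISTINCT ON. A sketch of that rewrite (added here, not from the original post; whether it actually beats max() depends on the data):
select distinct on (a.cellid)
       a.cellid,
       b.endeks as turkcell
from (select geom, cellid
      from grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000) a
join (select endeks, st_transform(geom, 2320) as geom_tmp
      from turkcell_data) b
  on st_intersects(a.geom, b.geom_tmp)
order by a.cellid, b.endeks desc
limit 5;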
Table Definitions:
CREATE TABLE grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000
(
regionid numeric,
geom geometry(Geometry,2320),
cellid integer,
turkcell double precision
)
WITH (
OIDS=FALSE
);
ALTER TABLE grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000
OWNER TO postgres;
-- Index: grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_cellid
-- DROP INDEX grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_cellid;
CREATE INDEX grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_cellid
ON grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000
USING btree
(cellid );
-- Index: grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_geom_gist
-- DROP INDEX grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_geom_gist;
CREATE INDEX grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000_geom_gist
ON grd_90098780_7c48_11e3_8876_f0bf97e0dd001000000000
USING gist
(geom );
CREATE TABLE turkcell_data
(
gid serial NOT NULL,
objectid_1 integer,
objectid integer,
neighbourh numeric,
endeks numeric,
coorx numeric,
coory numeric,
shape_leng numeric,
shape_le_1 numeric,
shape_area numeric,
geom geometry(MultiPolygon,4326),
CONSTRAINT turkcell_data_pkey PRIMARY KEY (gid )
)
WITH (
OIDS=FALSE
);
ALTER TABLE turkcell_data
OWNER TO postgres;
-- Index: turkcell_data_geom_gist
-- DROP INDEX turkcell_data_geom_gist;
CREATE INDEX turkcell_data_geom_gist
ON turkcell_data
USING gist
(geom );
Either store your data re-projected to 2320, index that column, and use it in your join, or create an index on the transformed projection of the geometry in turkcell_data. I usually prefer the latter:
CREATE INDEX turkcell_data_geom_gist2320
ON turkcell_data
USING gist
(st_transform(geom, 2320) );
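The first alternative, storing a re-projected copy of the geometry, might look roughly like this (a sketch added here; the geom_2320 column name is made up):
ALTER TABLE turkcell_data ADD COLUMN geom_2320 geometry(MultiPolygon, 2320);
UPDATE turkcell_data SET geom_2320 = ST_Transform(geom, 2320);
CREATE INDEX turkcell_data_geom_2320_gist
    ON turkcell_data
    USING gist (geom_2320);
-- then join on st_intersects(a.geom, b.geom_2320) instead of transforming per row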
The other issue might be that your geometries are very complex: if any of your polygons has a relatively large number of points, you might get stuck crunching away on the intersection. Try the index first, though.