Please help me to optimize the following query:
EXPLAIN ANALYZE
SELECT
"subscriptions"."id" AS t0_r0,
"subscriptions"."created_at" AS t0_r3,
"subscriptions"."updated_at" AS t0_r4,
"subscriptions"."next_date" AS t0_r5,
"subscriptions"."number_of_games" AS t0_r6,
"subscriptions"."renewal_date" AS t0_r7,
"subscriptions"."type" AS t0_r8,
"subscriptions"."order_id" AS t0_r9,
"orders"."id" AS t1_r0,
"orders"."customer_id" AS t1_r1,
"orders"."created_at" AS t1_r2,
"orders"."updated_at" AS t1_r3,
"orders"."payment_id" AS t1_r4,
"orders"."status" AS t1_r5,
"orders"."col13" AS t1_r13,
"orders"."col14" AS t1_r14,
"orders"."col15" AS t1_r15,
"orders"."active_subscription_id" AS t1_r21,
"orders"."product_id" AS t1_r22
FROM
"subscriptions"
INNER JOIN "orders" ON "orders"."id" = "subscriptions"."order_id"
WHERE
"subscriptions"."type" IN ('Const1')
AND "orders"."status" = 'confirm'
AND "orders"."product_id" IN (1, 95, 79, 22)
AND ("subscriptions"."renewal_date" BETWEEN '2017-09-23' AND '2017-09-29') AND (orders.active_subscription_id = subscriptions.id)
AND ("subscriptions"."number_of_games" >= 5)
AND ("subscriptions"."id" NOT IN (
SELECT subscriptions.id
FROM "subscriptions"
INNER JOIN "orders" ON "orders"."id" = "subscriptions"."order_id"
INNER JOIN "table1" ON "table1"."order_id" = "orders"."id"
WHERE "subscriptions"."type" IN ('Const1')
AND "orders"."status" = 'confirm'
AND "orders"."product_id" IN (1, 95, 79, 22)
AND "table1"."col1" IN ('1041', '1042')
AND ("subscriptions"."renewal_date" BETWEEN '2017-09-23' AND '2017-09-29')
AND (orders.active_subscription_id = subscriptions.id)
AND ("subscriptions"."number_of_games" >= 5))
) ;
Initially there are the following B-tree indexes:
CREATE INDEX index_table1_on_order_id ON table1 USING btree (order_id);
CREATE INDEX index_orders_on_active_subscription_id ON orders USING btree (active_subscription_id);
CREATE INDEX index_orders_on_status ON orders USING btree (status);
CREATE INDEX orders_payment_id_idx ON orders USING btree (payment_id);
CREATE INDEX index_subscriptions_on_order_id ON subscriptions USING btree (order_id);
All columns named "id" are primary keys.
Execution plan:
Nested Loop (cost=18699.70..38236.80 rows=1 width=466) (actual time=11185.634..11336.548 rows=3352 loops=1)
-> Seq Scan on subscriptions (cost=18699.28..37754.22 rows=57 width=76) (actual time=11185.610..11309.520 rows=3356 loops=1)
Filter: ((renewal_date >= '2017-09-23'::date) AND (renewal_date <= '2017-09-29'::date) AND (number_of_games >= 5) AND (NOT (hashed SubPlan 1)) AND ((type)::text = 'Const1'::text))
Rows Removed by Filter: 522626
SubPlan 1
-> Nested Loop (cost=0.85..18699.28 rows=1 width=4) (actual time=6743.644..11185.269 rows=31 loops=1)
-> Nested Loop (cost=0.42..18697.21 rows=1 width=12) (actual time=0.150..1792.440 rows=3383 loops=1)
-> Seq Scan on subscriptions subscriptions_1 (cost=0.00..17740.06 rows=114 width=8) (actual time=0.114..145.256 rows=3387 loops=1)
Filter: ((renewal_date >= '2017-09-23'::date) AND (renewal_date <= '2017-09-29'::date) AND (number_of_games >= 5) AND ((type)::text = 'Const1'::text))
Rows Removed by Filter: 522595
-> Index Scan using index_orders_on_active_subscription_id on orders orders_1 (cost=0.42..8.39 rows=1 width=8) (actual time=0.471..0.484 rows=1 loops=3387)
Index Cond: (active_subscription_id = subscriptions_1.id)
Filter: (((status)::text = 'confirm'::text) AND (subscriptions_1.order_id = id) AND (product_id = ANY ('{1,95,79,22}'::integer[])))
Rows Removed by Filter: 0
-> Index Scan using index_table1_on_order_id on table1 (cost=0.43..2.05 rows=1 width=4) (actual time=2.775..2.775 rows=0 loops=3383)
Index Cond: (order_id = orders_1.id)
Filter: ((col1)::text = ANY ('{1041,1042}'::text[]))
Rows Removed by Filter: 5
-> Index Scan using index_orders_on_active_subscription_id on orders (cost=0.42..8.46 rows=1 width=390) (actual time=0.007..0.007 rows=1 loops=3356)
Index Cond: (active_subscription_id = subscriptions.id)
Filter: (((status)::text = 'confirm'::text) AND (subscriptions.order_id = id) AND (product_id = ANY ('{1,95,79,22}'::integer[])))
Rows Removed by Filter: 0
Planning time: 3.928 ms
Execution time: 11337.023 ms
Creating the following index:
CREATE INDEX index_subscriptions_on_renewal_date ON subscriptions USING btree (renewal_date);
doesn't make things much better. Rewriting the query doesn't improve performance either:
EXPLAIN ANALYZE
With subscriptions_1 as (
SELECT
"subscriptions"."id" AS t0_r0,
"subscriptions"."created_at" AS t0_r3,
"subscriptions"."updated_at" AS t0_r4,
"subscriptions"."next_date" AS t0_r5,
"subscriptions"."number_of_games" AS t0_r6,
"subscriptions"."renewal_date" AS t0_r7,
"subscriptions"."type" AS t0_r8,
"subscriptions"."order_id" AS t0_r9
FROM
"subscriptions"
WHERE
"subscriptions"."type" IN ('Const1')
AND ("subscriptions"."renewal_date" >= '2017-09-23' AND "subscriptions"."renewal_date" <= '2017-09-29')
AND ("subscriptions"."number_of_games" >= 5)
ORDER BY "subscriptions"."id"
)
SELECT
Subscriptions_1.*,
"orders"."id" AS t1_r0,
"orders"."customer_id" AS t1_r1,
"orders"."created_at" AS t1_r2,
"orders"."updated_at" AS t1_r3,
"orders"."payment_id" AS t1_r4,
"orders"."status" AS t1_r5,
"orders"."col13" AS t1_r13,
"orders"."col14" AS t1_r14,
"orders"."col15" AS t1_r15,
"orders"."active_subscription_id" AS t1_r21,
"orders"."product_id" AS t1_r22
FROM
Subscriptions_1
INNER JOIN "orders" ON "orders"."id" = subscriptions_1.t0_r9
WHERE
"orders"."status" = 'confirm'
AND "orders"."product_id" IN (1,95,79,22)
AND (orders.active_subscription_id = subscriptions_1.t0_r0)
AND (subscriptions_1.t0_r0 NOT IN (
SELECT subscriptions_1.t0_r0
FROM subscriptions_1
INNER JOIN "orders" ON "orders"."id" = subscriptions_1.t0_r9
INNER JOIN "table1" ON "table1"."order_id" = "orders"."id"
WHERE "orders"."status" = 'confirm'
AND "orders"."product_id" IN (1,95,79,22)
AND "table1"."col1" IN ('1041', '1042')
AND (orders.active_subscription_id = subscriptions_1.t0_r0))
) ;
The plan is so bad because PostgreSQL underestimates the number of result rows (1 instead of the actual 3383 in the join between subscriptions and orders).
That causes PostgreSQL to pick a nested loop join for the join with table1, which is where 9 of your 11 seconds are spent.
There are several approaches (all three are sketched below):
Run ANALYZE, perhaps with an increased default_statistics_target, on all affected tables. Fresh statistics may lead to a better estimate.
If that doesn't help, create an index on table1 (order_id, col1), which will speed up the nested loop join as much as possible.
The brutal way: set enable_nestloop to off for this one query.
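For concreteness, here is a minimal sketch of all three options; the index name, the statistics target of 1000, and the SET LOCAL wrapper are my own choices, not part of the original answer:
-- 1. Refresh statistics, optionally with a larger per-column sample
ALTER TABLE subscriptions ALTER COLUMN renewal_date SET STATISTICS 1000;
ANALYZE subscriptions;
ANALYZE orders;
ANALYZE table1;
-- 2. Index that supports the nested loop probe into table1
CREATE INDEX index_table1_on_order_id_col1 ON table1 USING btree (order_id, col1);
-- 3. Last resort: disable nested loops for this statement only
BEGIN;
SET LOCAL enable_nestloop = off;
-- ... run the original query here ...
COMMIT;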
Related
While executing the two queries below, I notice a serious difference in the query plans. Why is that?
select * from table1
where id = 'dummy' or id in (select id from table2 where id = 'dummy')
Query plan
Seq Scan on table1 (cost=8.30..49611.63 rows=254478 width=820) (actual time=535.477..557.431 rows=1 loops=1)
Filter: (((code)::text = 'dummy'::text) OR (hashed SubPlan 1))
Rows Removed by Filter: 510467
SubPlan 1
-> Index Scan using idx on table2 (cost=0.29..8.30 rows=1 width=8) (actual time=0.009..0.012 rows=0 loops=1)
Index Cond: ((id)::text = 'dummy'::text)
Planning Time: 0.165 ms
Execution Time: 557.517 ms
select * from table1
where id = 'dummy'
union
select * from table1
where id in (select id from table2 where id = 'dummy')
Unique (cost=25.22..25.42 rows=2 width=5818) (actual time=0.045..0.047 rows=1 loops=1)
-> Sort (cost=25.22..25.23 rows=2 width=5818) (actual time=0.045..0.046 rows=1 loops=1)
Sort Method: quicksort Memory: 25kB
-> Append (cost=0.42..25.21 rows=2 width=5818) (actual time=0.016..0.026 rows=1 loops=1)
-> Index Scan using id on table1 (cost=0.42..8.44 rows=1 width=820) (actual time=0.015..0.016 rows=1 loops=1)
Index Cond: ((id)::text = 'dummy'::text)
-> Nested Loop (cost=0.71..16.74 rows=1 width=820) (actual time=0.009..0.009 rows=0 loops=1)
-> Index Scan using idx on table2 (cost=0.29..8.30 rows=1 width=8) (actual time=0.008..0.008 rows=0 loops=1)
Index Cond: ((id)::text = 'dummy'::text)
-> Index Scan using pkey on table1 (cost=0.42..8.44 rows=1 width=820) (never executed)
Index Cond: (id = table2.id)
Planning Time: 0.753 ms
Execution Time: 0.131 ms
So the main difference you can see is that the first plan estimates 254478 rows while the second estimates just 2 rows.
Why is that?
Please do another test -- run both these queries -- do they give the same results as the queries without my changes?
select * from table1
where table1.id = 'dummy' or
table1.id in (select table2.id from table2 where table2.id = 'dummy')
select * from table1
where table1.id = 'dummy'
union
select * from table1
where table1.id in (select table2.id from table2 where table2.id = 'dummy')
I don't think you are sharing your actual code with us, because as written your code makes little sense: you are returning a list of ids in the sub-query that equal 'dummy', so you will just get 'dummy' repeated multiple times.
Note: these comments turned out not to be true, since the changes had no impact on the results; the order of operations was working as expected.
What result do you get when you do this:
select * from table1
where (id = 'dummy') or id in (select id from table2 where id = 'dummy')
The reason your query was giving more results is that it was selecting records from table1 where id equals 'dummy' or id = id. The query in the original post gives you all the records: the OR was being applied to the first expression rather than splitting the two expressions.
Use case: I need to find the index and totalCount of a particular id in the table.
I have a table ann_details with 60 million records; based on a WHERE condition, I need to retrieve the rows along with the index of that id.
Query:
with a as (
select an.id, row_number() over (partition by created_at) as rn
from annotation an
where ( an.layer_id = '47afb169-aed2-4378-ab13-897836275da3' or an.job_id = '' or an.task_id = '') and
an.category_id in (10019)
) select (select count(1) from a ) as totalCount , rn-1 as index from a where a.id= '47afb169-aed2-4378-ab13-897836275da3_a93f0758-8fe0-4c76-992f-0be17e5618bf_484484101';
Output:
totalCount    index
1797124       1791143
Execution Time: 5 sec 487 ms
EXPLAIN ANALYZE output:
CTE Scan on a (cost=872778.54..907545.00 rows=7722 width=16) (actual time=5734.572..5735.989 rows=1 loops=1)
Filter: ((id)::text = '47afb169-aed2-4378-ab13-897836275da3_a93f0758-8fe0-4c76-992f-0be17e5618bf_484484101'::text)
Rows Removed by Filter: 1797123
CTE a
-> WindowAgg (cost=0.68..838031.38 rows=1544318 width=97) (actual time=133.660..3831.998 rows=1797124 loops=1)
-> Index Only Scan using test_index_test_2 on annotation an (cost=0.68..814866.61 rows=1544318 width=89) (actual time=133.647..2660.009 rows=1797124 loops=1)
Index Cond: (category_id = 10019)
Filter: (((layer_id)::text = '47afb169-aed2-4378-ab13-897836275da3'::text) OR ((job_id)::text = ''::text) OR ((task_id)::text = ''::text))
Rows Removed by Filter: 3773007
Heap Fetches: 101650
InitPlan 2 (returns $1)
-> Aggregate (cost=34747.15..34747.17 rows=1 width=8) (actual time=2397.391..2397.392 rows=1 loops=1)
-> CTE Scan on a a_1 (cost=0.00..30886.36 rows=1544318 width=0) (actual time=0.017..2156.210 rows=1797124 loops=1)
Planning time: 0.487 ms
Execution time: 5771.080 ms
Index:
CREATE INDEX test_index_test_2 ON public.annotation USING btree (category_id,created_at,layer_id,job_id,task_id,id);
From the application we will pass either job_id, task_id, or layer_id, and the other two will be passed as empty strings.
I need help optimizing the query so the response comes back within 2 seconds.
Query Plan: https://explain.depesz.com/s/mXme
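For what it's worth, one way to avoid the second scan of the CTE (the InitPlan aggregate that accounts for roughly 2.4 s in the plan) is to compute the total as a window aggregate in the same pass. This is only an untested sketch against the schema above, using the literal values from the question:
WITH a AS (
    SELECT an.id,
           row_number() OVER (PARTITION BY created_at) AS rn,
           count(*) OVER () AS total_count
    FROM annotation an
    WHERE (an.layer_id = '47afb169-aed2-4378-ab13-897836275da3' OR an.job_id = '' OR an.task_id = '')
      AND an.category_id IN (10019)
)
SELECT total_count AS totalCount, rn - 1 AS index
FROM a
WHERE a.id = '47afb169-aed2-4378-ab13-897836275da3_a93f0758-8fe0-4c76-992f-0be17e5618bf_484484101';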
I have a query where Postgres performs a hash join with sequential scans instead of a nested loop with index scans when I use an OR condition. This causes the query to take 2 seconds instead of completing in under 100 ms. I have run VACUUM ANALYZE and rebuilt the index on the PATIENTCHARTNOTE table (which is about 5 GB), but it is still using a hash join. Do you have any suggestions on how I can improve this?
explain analyze
SELECT Count (_pcn.id) AS total_open_note
FROM patientchartnote _pcn
INNER JOIN appointment _appt
ON _appt.id = _pcn.appointment_id
INNER JOIN patient _pt
ON _pt.id = _appt.patient_id
LEFT OUTER JOIN person _ps
ON _ps.id = _pt.appuser_id
WHERE _pcn.active = true
AND _pt.active = true
AND _appt.datecomplete IS NULL
AND _pcn.title IS NOT NULL
AND _pcn.title <> ''
AND ( _pt.assigned_to_user_id = '136964'
OR _pcn.createdby_id = '136964'
);
Aggregate (cost=237655.59..237655.60 rows=1 width=8) (actual time=1602.069..1602.069 rows=1 loops=1)
-> Hash Join (cost=83095.43..237645.30 rows=4117 width=4) (actual time=944.850..1602.014 rows=241 loops=1)
Hash Cond: (_appt.patient_id = _pt.id)
Join Filter: ((_pt.assigned_to_user_id = 136964) OR (_pcn.createdby_id = 136964))
Rows Removed by Join Filter: 94036
-> Hash Join (cost=46650.68..182243.64 rows=556034 width=12) (actual time=415.862..1163.812 rows=94457 loops=1)
Hash Cond: (_pcn.appointment_id = _appt.id)
-> Seq Scan on patientchartnote _pcn (cost=0.00..112794.20 rows=1073978 width=12) (actual time=0.016..423.262 rows=1073618 loops=1)
Filter: (active AND (title IS NOT NULL) AND ((title)::text <> ''::text))
Rows Removed by Filter: 22488
-> Hash (cost=35223.61..35223.61 rows=696486 width=8) (actual time=414.749..414.749 rows=692839 loops=1)
Buckets: 131072 Batches: 16 Memory Usage: 2732kB
-> Seq Scan on appointment _appt (cost=0.00..35223.61 rows=696486 width=8) (actual time=0.010..271.208 rows=692839 loops=1)
Filter: (datecomplete IS NULL)
Rows Removed by Filter: 652426
-> Hash (cost=24698.57..24698.57 rows=675694 width=12) (actual time=351.566..351.566 rows=674929 loops=1)
Buckets: 131072 Batches: 16 Memory Usage: 2737kB
-> Seq Scan on patient _pt (cost=0.00..24698.57 rows=675694 width=12) (actual time=0.013..197.268 rows=674929 loops=1)
Filter: active
Rows Removed by Filter: 17426
Planning time: 1.533 ms
Execution time: 1602.715 ms
When I replace "OR _pcn.createdby_id = '136964'" with "AND _pcn.createdby_id = '136964'", Postgres performs an index scan
Aggregate (cost=29167.56..29167.57 rows=1 width=8) (actual time=937.743..937.743 rows=1 loops=1)
-> Nested Loop (cost=1.28..29167.55 rows=7 width=4) (actual time=19.136..937.669 rows=37 loops=1)
-> Nested Loop (cost=0.85..27393.03 rows=1654 width=4) (actual time=2.154..910.250 rows=1649 loops=1)
-> Index Scan using patient_activeassigned_idx on patient _pt (cost=0.42..3075.00 rows=1644 width=8) (actual time=1.599..11.820 rows=1627 loops=1)
Index Cond: ((active = true) AND (assigned_to_user_id = 136964))
Filter: active
-> Index Scan using appointment_datepatient_idx on appointment _appt (cost=0.43..14.75 rows=4 width=8) (actual time=0.543..0.550 rows=1 loops=1627)
Index Cond: ((patient_id = _pt.id) AND (datecomplete IS NULL))
-> Index Scan using patientchartnote_activeappointment_idx on patientchartnote _pcn (cost=0.43..1.06 rows=1 width=8) (actual time=0.014..0.014 rows=0 loops=1649)
Index Cond: ((active = true) AND (createdby_id = 136964) AND (appointment_id = _appt.id) AND (title IS NOT NULL))
Filter: (active AND ((title)::text <> ''::text))
Planning time: 1.489 ms
Execution time: 937.910 ms
(13 rows)
Using OR in SQL queries usually results in bad performance.
That is because – different from AND – it does not restrict, but extends the number of rows in the query result. With AND, you can use an index scan for one part of the condition and further restrict the result set with a filter on the second condition. That is not possible with OR.
So PostgreSQL does the only thing left: it computes the whole join and then filters out all rows that do not match the condition. Of course that is very inefficient when you are joining three tables (I didn't count the outer join).
Assuming that all columns called id are primary keys, you could rewrite the query as follows:
SELECT count(*) FROM
(SELECT _pcn.id
FROM patientchartnote _pcn
INNER JOIN appointment _appt
ON _appt.id = _pcn.appointment_id
INNER JOIN patient _pt
ON _pt.id = _appt.patient_id
LEFT OUTER JOIN person _ps
ON _ps.id = _pt.appuser_id
WHERE _pcn.active = true
AND _pt.active = true
AND _appt.datecomplete IS NULL
AND _pcn.title IS NOT NULL
AND _pcn.title <> ''
AND _pt.assigned_to_user_id = '136964'
UNION
SELECT _pcn.id
FROM patientchartnote _pcn
INNER JOIN appointment _appt
ON _appt.id = _pcn.appointment_id
INNER JOIN patient _pt
ON _pt.id = _appt.patient_id
LEFT OUTER JOIN person _ps
ON _ps.id = _pt.appuser_id
WHERE _pcn.active = true
AND _pt.active = true
AND _appt.datecomplete IS NULL
AND _pcn.title IS NOT NULL
AND _pcn.title <> ''
AND _pcn.createdby_id = '136964'
) q;
Even though this is running the query twice, indexes can be used to filter out all but a few rows early on, so this query should perform better.
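If indexes comparable to the ones visible in the second plan (patient_activeassigned_idx, patientchartnote_activeappointment_idx) are not already in place, each branch of the UNION can be supported by an index on its own filter column. A minimal sketch; the index names and the choice of partial indexes are my assumptions:
CREATE INDEX patient_assigned_active_idx
    ON patient (assigned_to_user_id)
    WHERE active;
CREATE INDEX patientchartnote_createdby_active_idx
    ON patientchartnote (createdby_id)
    WHERE active;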
SELECT "Series".*
,"SeriesTranslations"."id" AS "SeriesTranslations.id"
,"SeriesTranslations"."title" AS "SeriesTranslations.title"
,"SeriesTranslations"."subtitle" AS "SeriesTranslations.subtitle"
,"SeriesTranslations"."slug" AS "SeriesTranslations.slug"
,"SeriesTranslations"."language" AS "SeriesTranslations.language"
,"SeriesTranslations"."seoTitle" AS "SeriesTranslations.seoTitle"
,"SeriesTranslations"."seoDescription" AS "SeriesTranslations.seoDescription"
,"Posts"."id" AS "Posts.id"
,"Posts"."type" AS "Posts.type"
,"Posts"."contentDuration" AS "Posts.contentDuration"
,"Posts"."publishDate" AS "Posts.publishDate"
,"Posts"."publishedAt" AS "Posts.publishedAt"
,"Posts"."thumbnailUrl" AS "Posts.thumbnailUrl"
,"Posts"."imageUrl" AS "Posts.imageUrl"
,"Posts"."media" AS "Posts.media"
,"Posts.PostTranslations"."id" AS "Posts.PostTranslations.id"
,"Posts.PostTranslations"."slug" AS "Posts.PostTranslations.slug"
,"Posts.PostTranslations"."title" AS "Posts.PostTranslations.title"
,"Posts.PostTranslations"."subtitle" AS "Posts.PostTranslations.subtitle"
,"Posts.PostTranslations"."language" AS "Posts.PostTranslations.language"
FROM (
SELECT "Series"."id"
,"Series"."thumbnailUrl"
,"Series"."imageUrl"
,"Series"."coverUrl"
FROM "Series" AS "Series"
WHERE EXISTS (
SELECT *
FROM "SeriesTranslations" AS t
WHERE t.LANGUAGE IN ('en-us')
AND t.slug = 'in-residence-architecture-design-video-series'
AND t."SeriesId" = "Series"."id" LIMIT 1
) LIMIT 1
) AS "Series"
INNER JOIN "SeriesTranslations" AS "SeriesTranslations" ON "Series"."id" = "SeriesTranslations"."SeriesId"
AND "SeriesTranslations"."language" IN ('en-us')
LEFT JOIN "Posts" AS "Posts" ON "Series"."id" = "Posts"."SeriesId"
AND EXISTS (
SELECT *
FROM "PostTranslations" AS pt
WHERE pt.LANGUAGE IN ('en-us')
AND pt."PostId" = "Posts"."id" LIMIT 1
)
LEFT JOIN "PostTranslations" AS "Posts.PostTranslations" ON "Posts"."id" = "Posts.PostTranslations"."PostId"
AND "Posts.PostTranslations"."language" IN ('en-us')
ORDER BY "Posts"."publishDate" DESC;
It loads data from 4 tables: "Series", "SeriesTranslations", "Posts" and "PostTranslations". It retrieves a single "Series" based on the "SeriesTranslations" slug, and also all "Posts" that belong to this series along with their translations.
This query takes ~1.5 sec when the series is returned with 14 posts (14 rows in total are returned by the query). There are just a few series in the DB (no more than 5), each with 2 translations. However, there are many posts, around 2000, and each has 2 translations, so around 4k PostTranslations rows...
Here is EXPLAIN result
I have unique indexes on "slug", "language" in "SeriesTranslations" and "PostTranslations", and I also have foreign keys on "Posts"."SeriesId", "SeriesTranslations"."SeriesId" and "PostTranslations"."PostId".
EXPLAIN here http://explain.depesz.com/s/fhm
I simplified the query as suggested (removed one subquery and moved the conditions into the inner join); however, the query is still slow:
SELECT "Series"."id"
,"Series"."thumbnailUrl"
,"Series"."imageUrl"
,"Series"."coverUrl"
,"SeriesTranslations"."id" AS "SeriesTranslations.id"
,"SeriesTranslations"."title" AS "SeriesTranslations.title"
,"SeriesTranslations"."subtitle" AS "SeriesTranslations.subtitle"
,"SeriesTranslations"."slug" AS "SeriesTranslations.slug"
,"SeriesTranslations"."language" AS "SeriesTranslations.language"
,"SeriesTranslations"."seoTitle" AS "SeriesTranslations.seoTitle"
,"SeriesTranslations"."seoDescription" AS "SeriesTranslations.seoDescription"
,"Posts"."id" AS "Posts.id"
,"Posts"."type" AS "Posts.type"
,"Posts"."contentDuration" AS "Posts.contentDuration"
,"Posts"."publishDate" AS "Posts.publishDate"
,"Posts"."publishedAt" AS "Posts.publishedAt"
,"Posts"."thumbnailUrl" AS "Posts.thumbnailUrl"
,"Posts"."imageUrl" AS "Posts.imageUrl"
,"Posts"."media" AS "Posts.media"
,"Posts.PostTranslations"."id" AS "Posts.PostTranslations.id"
,"Posts.PostTranslations"."slug" AS "Posts.PostTranslations.slug"
,"Posts.PostTranslations"."title" AS "Posts.PostTranslations.title"
,"Posts.PostTranslations"."subtitle" AS "Posts.PostTranslations.subtitle"
,"Posts.PostTranslations"."language" AS "Posts.PostTranslations.language"
FROM "Series" AS "Series"
INNER JOIN "SeriesTranslations" AS "SeriesTranslations" ON "Series"."id" = "SeriesTranslations"."SeriesId"
AND "SeriesTranslations"."language" IN ('en-us')
AND "SeriesTranslations"."slug" = 'sdf'
LEFT JOIN "Posts" AS "Posts" ON "Series"."id" = "Posts"."SeriesId"
AND EXISTS (
SELECT *
FROM "PostTranslations" AS pt
WHERE pt.LANGUAGE IN ('en-us')
AND pt."PostId" = "Posts"."id" LIMIT 1
)
LEFT JOIN "PostTranslations" AS "Posts.PostTranslations" ON "Posts"."id" = "Posts.PostTranslations"."PostId"
AND "Posts.PostTranslations"."language" IN ('en-us')
WHERE (1 = 1)
ORDER BY "Posts"."publishDate" DESC
,"Posts"."id" DESC;
And here is new query plan:
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Sort (cost=1014671.76..1014671.76 rows=1 width=695) (actual time=2140.906..2140.908 rows=14 loops=1)
Sort Key: "Posts"."publishDate", "Posts".id
Sort Method: quicksort Memory: 45kB
-> Nested Loop Left Join (cost=0.03..1014671.76 rows=1 width=695) (actual time=85.862..2140.745 rows=14 loops=1)
Join Filter: ("Posts".id = "Posts.PostTranslations"."PostId")
Rows Removed by Join Filter: 28266
-> Nested Loop (cost=0.03..1014165.24 rows=1 width=564) (actual time=85.307..2042.304 rows=14 loops=1)
Join Filter: ("Series".id = "SeriesTranslations"."SeriesId")
Rows Removed by Join Filter: 35
-> Index Scan using "SeriesTranslations-slug-language-unique" on "SeriesTranslations" (cost=0.03..4.03 rows=1 width=200) (actual time=0.044..0.046 rows=1 loops=1)
Index Cond: ((slug = 'in-residence-architecture-design-video-series'::text) AND (language = 'en-us'::text))
-> Nested Loop Left Join (cost=0.00..1014159.63 rows=450 width=368) (actual time=85.243..2042.207 rows=49 loops=1)
Join Filter: ("Series".id = "Posts"."SeriesId")
Rows Removed by Join Filter: 18131
-> Seq Scan on "Series" (cost=0.00..11.35 rows=450 width=100) (actual time=0.006..0.046 rows=9 loops=1)
-> Materialize (cost=0.00..1012330.79 rows=1010 width=272) (actual time=4.422..226.499 rows=2020 loops=9)
-> Seq Scan on "Posts" (cost=0.00..1012329.78 rows=1010 width=272) (actual time=39.785..2020.448 rows=2020 loops=1)
Filter: (SubPlan 1)
SubPlan 1
-> Limit (cost=0.00..500.94 rows=1 width=1267) (actual time=0.995..0.995 rows=1 loops=2020)
-> Seq Scan on "PostTranslations" pt (cost=0.00..500.94 rows=1 width=1267) (actual time=0.992..0.992 rows=1 loops=2020)
Filter: ((language = 'en-us'::text) AND ("PostId" = "Posts".id))
Rows Removed by Filter: 1591
-> Seq Scan on "PostTranslations" "Posts.PostTranslations" (cost=0.00..499.44 rows=2020 width=135) (actual time=0.003..3.188 rows=2020 loops=14)
Filter: (language = 'en-us'::text)
Rows Removed by Filter: 964
Total runtime: 2141.432 ms
(27 rows)
An index on the FKs might help the JOINs:
CREATE INDEX ON "PostTranslations" ("PostId"); -- for the FK
VACUUM ANALYZE "PostTranslations"; -- refresh statistics
CREATE INDEX ON "SeriesTranslations" ("SeriesId"); -- FK
VACUUM ANALYZE "SeriesTranslations";
CREATE INDEX ON "Posts" ("SeriesId"); -- FK
VACUUM ANALYZE "Posts";
And REMOVE the LIMIT 1 from the EXISTS(...) subqueries. They can only do harm.
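Concretely, the LEFT JOIN condition from the query above would then read (same tables and columns, just without the LIMIT 1):
LEFT JOIN "Posts" AS "Posts" ON "Series"."id" = "Posts"."SeriesId"
    AND EXISTS (
        SELECT *
        FROM "PostTranslations" AS pt
        WHERE pt.language IN ('en-us')
            AND pt."PostId" = "Posts"."id"
    )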
I'm looking for ways to abstract database access to Postgres. In my examples I will use a hypothetical Twitter clone in Node.js, but in the end it's a question about how Postgres handles prepared statements, so the language and library don't really matter:
Suppose I want to be able to access a list of all tweets from a user by username:
name: "tweets by username"
text: "SELECT (SELECT * FROM tweets WHERE tweets.user_id = users.user_id) FROM users WHERE users.username = $1"
values: [username]
That works fine, but it seems inefficient, both in practical and code-quality terms, to have to write another function to handle getting tweets by email rather than by username:
name: "tweets by email"
text: "SELECT (SELECT * FROM tweets WHERE tweets.user_id = users.user_id) FROM users WHERE users.email = $1"
values: [email]
Is it possible to include a field as a parameter to the prepared statement?
name: "tweets by user"
text: "SELECT (SELECT * FROM tweets WHERE tweets.user_id = users.user_id) FROM users WHERE users.$1 = $2"
values: [field, value]
While it's true that this might be a bit less efficient in the corner case of accessing tweets by user_id, that's a trade I'm willing to make to improve code quality, and hopefully overall improve efficiency by reducing the number of query templates to 1 instead of 3+.
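One thing worth noting before the answers: the $n placeholders in a prepared statement can only stand for values, never for identifiers, so users.$1 = $2 will not even parse. If you want a single entry point that accepts the column name, one option (outside of what is discussed below) is a server-side function that builds the statement with dynamic SQL; this is only an illustrative sketch with hypothetical names:
CREATE FUNCTION tweets_by_user(field text, value text)
RETURNS SETOF tweets
LANGUAGE plpgsql STABLE AS $$
BEGIN
    -- %I quotes the column name as an identifier, %L the value as a literal
    RETURN QUERY EXECUTE format(
        'SELECT t.* FROM tweets t JOIN users u ON t.user_id = u.user_id WHERE u.%I = %L',
        field, value);
END;
$$;
-- usage: SELECT * FROM tweets_by_user('username', 'someuser');
Because the value is interpolated as a literal, each call is planned for the concrete column and value, which is what lets the right index be used, at the cost of not reusing a single cached prepared statement.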
@Clodoaldo's answer is correct in that it provides the capability you want and should return the right results. Unfortunately, it results in rather slow execution.
I set up an experimental database with tweets and users, populated with 10K users and 100 tweets each (1M tweet records). I indexed the PKs u.id and t.id, the FK t.user_id, and the predicate fields u.name and u.email.
create table t(id serial PRIMARY KEY, data integer, user_id bigint);
create index t1 on t(user_id);
create table u(id serial PRIMARY KEY, name text, email text);
create index u1 on u(name);
create index u2 on u(email);
insert into u(name,email) select i::text, i::text from generate_series(1,10000) i;
insert into t(data,user_id) select i, (i/100)::bigint from generate_series(1,1000000) i;
analyze t;
analyze u;
A simple query using one field as predicate is very fast:
prepare qn as select t.* from t join u on t.user_id = u.id where u.name = $1;
explain analyze execute qn('1111');
Nested Loop (cost=0.00..19.81 rows=1 width=16) (actual time=0.030..0.057 rows=100 loops=1)
-> Index Scan using u1 on u (cost=0.00..8.46 rows=1 width=4) (actual time=0.020..0.020 rows=1 loops=1)
Index Cond: (name = $1)
-> Index Scan using t1 on t (cost=0.00..10.10 rows=100 width=16) (actual time=0.007..0.023 rows=100 loops=1)
Index Cond: (t.user_id = u.id)
Total runtime: 0.093 ms
A query using CASE in the WHERE clause, as @Clodoaldo proposed, takes almost 30 seconds:
prepare qen as select t.* from t join u on t.user_id = u.id
where case $2 when 'e' then u.email = $1 when 'n' then u.name = $1 end;
explain analyze execute qen('1111','n');
Merge Join (cost=25.61..38402.69 rows=500000 width=16) (actual time=27.771..26345.439 rows=100 loops=1)
Merge Cond: (t.user_id = u.id)
-> Index Scan using t1 on t (cost=0.00..30457.35 rows=1000000 width=16) (actual time=0.023..17.741 rows=111200 loops=1)
-> Index Scan using u_pkey on u (cost=0.00..42257.36 rows=500000 width=4) (actual time=0.325..26317.384 rows=1 loops=1)
Filter: CASE $2 WHEN 'e'::text THEN (u.email = $1) WHEN 'n'::text THEN (u.name = $1) ELSE NULL::boolean END
Total runtime: 26345.535 ms
Observing that plan, I thought that using a UNION subselect, then filtering its results to get the id appropriate to the parameterized predicate choice, would allow the planner to use a specific index for each predicate. It turns out I was right:
prepare qen2 as
select t.*
from t
join (
SELECT id from
(
SELECT 'n' as fld, id from u where u.name = $1
UNION ALL
SELECT 'e' as fld, id from u where u.email = $1
) poly
where poly.fld = $2
) uu
on t.user_id = uu.id;
explain analyze execute qen2('1111','n');
Nested Loop (cost=0.00..28.31 rows=100 width=16) (actual time=0.058..0.120 rows=100 loops=1)
-> Subquery Scan poly (cost=0.00..16.96 rows=1 width=4) (actual time=0.041..0.073 rows=1 loops=1)
Filter: (poly.fld = $2)
-> Append (cost=0.00..16.94 rows=2 width=4) (actual time=0.038..0.070 rows=2 loops=1)
-> Subquery Scan "*SELECT* 1" (cost=0.00..8.47 rows=1 width=4) (actual time=0.038..0.038 rows=1 loops=1)
-> Index Scan using u1 on u (cost=0.00..8.46 rows=1 width=4) (actual time=0.038..0.038 rows=1 loops=1)
Index Cond: (name = $1)
-> Subquery Scan "*SELECT* 2" (cost=0.00..8.47 rows=1 width=4) (actual time=0.031..0.032 rows=1 loops=1)
-> Index Scan using u2 on u (cost=0.00..8.46 rows=1 width=4) (actual time=0.030..0.031 rows=1 loops=1)
Index Cond: (email = $1)
-> Index Scan using t1 on t (cost=0.00..10.10 rows=100 width=16) (actual time=0.015..0.028 rows=100 loops=1)
Index Cond: (t.user_id = poly.id)
Total runtime: 0.170 ms
SELECT t.*
FROM tweets t
inner join users u on t.user_id = u.user_id
WHERE case $2
when 'username' then u.username = $1
when 'email' then u.email = $1
else u.user_id = $1
end