I have a collection with 62k documents in it. The same collection has a bunch of indexes too, most of them simple, single field ones. What I am observing is that the following query takes extremely long to return:
db.jobs.count({"status":"complete","$or":[{"groups":{"$exists":false}},{"groups":{"$size":0}},{"groups":{"$in":["5e65ffc2a1e6ef0007bc5fa8"]}}]})
The executionStats for the above query are as follows
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "xxxxxx.jobs",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"$or" : [
{
"groups" : {
"$size" : 0
}
},
{
"groups" : {
"$eq" : "5e65ffc2a1e6ef0007bc5fa8"
}
},
{
"$nor" : [
{
"groups" : {
"$exists" : true
}
}
]
}
]
},
{
"status" : {
"$eq" : "complete"
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"$or" : [
{
"groups" : {
"$size" : 0
}
},
{
"groups" : {
"$eq" : "5e65ffc2a1e6ef0007bc5fa8"
}
},
{
"$nor" : [
{
"groups" : {
"$exists" : true
}
}
]
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"status" : 1,
"groups" : 1
},
"indexName" : "status_1_groups_1",
"isMultiKey" : true,
"multiKeyPaths" : {
"status" : [ ],
"groups" : [
"groups"
]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"status" : [
"[\"complete\", \"complete\"]"
],
"groups" : [
"[MinKey, MaxKey]"
]
}
}
},
"rejectedPlans" : [
{
"stage" : "FETCH",
"filter" : {
"$or" : [
{
"groups" : {
"$size" : 0
}
},
{
"groups" : {
"$eq" : "5e65ffc2a1e6ef0007bc5fa8"
}
},
{
"$nor" : [
{
"groups" : {
"$exists" : true
}
}
]
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"status" : 1
},
"indexName" : "status_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"status" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"status" : [
"[\"complete\", \"complete\"]"
]
}
}
}
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 62092,
"executionTimeMillis" : 9992,
"totalKeysExamined" : 62092,
"totalDocsExamined" : 62092,
"executionStages" : {
"stage" : "FETCH",
"filter" : {
"$or" : [
{
"groups" : {
"$size" : 0
}
},
{
"groups" : {
"$eq" : "5e65ffc2a1e6ef0007bc5fa8"
}
},
{
"$nor" : [
{
"groups" : {
"$exists" : true
}
}
]
}
]
},
"nReturned" : 62092,
"executionTimeMillisEstimate" : 9929,
"works" : 62093,
"advanced" : 62092,
"needTime" : 0,
"needYield" : 0,
"saveState" : 682,
"restoreState" : 682,
"isEOF" : 1,
"invalidates" : 0,
"docsExamined" : 62092,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 62092,
"executionTimeMillisEstimate" : 60,
"works" : 62093,
"advanced" : 62092,
"needTime" : 0,
"needYield" : 0,
"saveState" : 682,
"restoreState" : 682,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"status" : 1,
"groups" : 1
},
"indexName" : "status_1_groups_1",
"isMultiKey" : true,
"multiKeyPaths" : {
"status" : [ ],
"groups" : [
"groups"
]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"status" : [
"[\"complete\", \"complete\"]"
],
"groups" : [
"[MinKey, MaxKey]"
]
},
"keysExamined" : 62092,
"seeks" : 1,
"dupsTested" : 62092,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
},
"serverInfo" : {
"host" : "xxxxxxx",
"port" : 27017,
"version" : "3.6.15",
"gitVersion" : "xxxxxx"
},
"ok" : 1}
What I am trying to understand is why the FETCH stage takes 10 seconds when the index scan in the inputStage takes 60ms. Since I am eventually doing a count() I don't really need MongoDB to return the documents; I only need it to sum up the number of matching keys and give me the grand total.
Any idea what I am doing wrong?
The query explained there was not a count; it returned quite a few documents:
"nReturned" : 62092,
The estimated execution time for each stage suggests that the index scan was expected to take 60ms, and that fetching the documents from disk took the additional 9.8 seconds.
There are a couple of reasons this count required fetching the documents:
Key existence cannot be fully determined from the index
The {"$exists":false} predicate is also troublesome. When building an index the value for a document contains the value of each indexed field. There is no value for "nonexistent", so it uses null. Since a document that contains a field whose value is explicitly set to null should not match {"$exists":false}, the query executor must load each document from disk in order to determine if the field was null nor nonexistent. This means that a COUNTSCAN stage cannot be used, which further means that all of the documents to be counted must be loaded from disk.
The $or predicate does not ensure exclusivity
The query executor cannot know ahead of time that the clauses in the $or are mutually exclusive. They are in your query, but in the general case it is possible for a single document to match more than one clause in the $or, so the query executor must load the documents to ensure deduplication.
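As a contrived illustration of why deduplication matters in the general case (again using a hypothetical jobs_test collection; the clauses in your actual query happen to be mutually exclusive):

db.jobs_test.insert({status: "complete", groups: ["a", "b"]})
// The same document satisfies both clauses, so the executor cannot simply
// add up per-clause counts; it must deduplicate the matched documents.
db.jobs_test.count({status: "complete", $or: [{groups: "a"}, {groups: "b"}]})  // 1, not 2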
So how to eliminate the fetch stage?
If you were to query with only the $in clause, or with only the $size clause you should find the count is derived from the index scan, without needing to load any documents.
That is, if you were to run these queries separately from the client and sum the results, you should find that the overall execution time is less than that of the query that requires fetching:
db.jobs.count({"status":"complete","groups":{"$size":0}})
db.jobs.count({"status":"complete","groups":{"$in":["5e65ffc2a1e6ef0007bc5fa8"]}})
For the {"groups":{"$exists":false}} predicate, you might modify the data slightly, such as ensure that the field always exists, but assign it a value that means "undefined" that can be indexed and queried.
As an example, if you were to run the following update, the groups field would then exist in all documents:
db.jobs.update({"groups":{"$exists":false}},{"$set":{"groups":false}})
And you could get the equivalent of the above count by running these 2 queries that should both be covered by an index scan, and should run faster together than the query that requires loading documents:
db.jobs.count({"status":"complete","groups":{"$size":0}})
db.jobs.count({"status":"complete","groups":{"$in":[false, "5e65ffc2a1e6ef0007bc5fa8"]}})
The same count can also be expressed as an aggregation ($count takes the name of the output field):
db.jobs.aggregate([
  {"$match": {"status": "complete", "$or": [
    {"groups": {"$exists": false}},
    {"groups": {"$in": ["5e65ffc2a1e6ef0007bc5fa8"]}},
    {"groups": {"$size": 0}}
  ]}},
  {"$count": "total"}
])
If you can somehow avoid the empty array case, then the following query can be used: db.jobs.count({"status":"complete", "groups": { $in: [ null, "5e65ffc2a1e6ef0007bc5fa8" ] } })
Matching on null covers the {$exists: false} case; it also matches documents where the field is explicitly null.
Also: I'd suggest to use ObjectId instead of string as type for the groups field.
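Assuming the values stored in groups really are ObjectId values (a string and an ObjectId with the same hex representation do not compare equal), the count would then look like:

db.jobs.count({"status": "complete", "groups": {"$in": [null, ObjectId("5e65ffc2a1e6ef0007bc5fa8")]}})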
Update
$size never hits an index!
You can use the following query:
db.jobs.count({"status":"complete","$or":[
{"groups":[],
{"groups": {$in: [ null, "5e65ffc2a1e6ef0007bc5fa8" ]}
]})
Using mongo server v3.6.16.
I have a mongo collection with about 18m records. Records are being added at about 100k a day. I have a query that runs fairly often on the collection that depends on two values - user_id and server_time_stamp. I have a compound index set up for those two fields.
The index is regularly getting stale - and queries are taking minutes to complete and causing the server to burn all the CPU it can grab. As soon as I regenerate the index, queries happen quickly. But then a day or two later, the index is stale again. (ed. the index is failing more quickly now - within 30 mins.) I have no idea why the index is going stale - what can I look for?
Edit
Here are the index Fields:
{
"uid" : 1,
"server_time_stamp" : -1
}
and index options:
{
"v" : 2,
"name" : "server_time_stamp_1_uid_1",
"ns" : "sefaria.user_history"
}
This appears to be a Heisenbug. When I use explain, the query performs well. Here is one of the pathological queries, from the slow query log, taking 445 seconds:
sefaria.user_history command: find { find: "user_history", filter: { server_time_stamp: { $gt: 1577918252 }, uid: 80588 }, sort: { _id: 1 }, lsid: { id: UUID("4936fb55-8514-4442-b852-306686985126") }, $db: "sefaria", $readPreference: { mode: "primaryPreferred" } } planSummary: IXSCAN { _id: 1 } keysExamined:17286277 docsExamined:17286277 cursorExhausted:1 numYields:142780 nreturned:79 reslen:35375 locks:{ Global: { acquireCount: { r: 285562 } }, Database: { acquireCount: { r: 142781 } }, Collection: { acquireCount: { r: 142781 } } } protocol:op_msg 445101ms
Here's the results of explain for a performant query, right after regenerating the index:
{
"queryPlanner" : {
"plannerVersion" : NumberInt(1),
"namespace" : "sefaria.user_history",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"uid" : {
"$eq" : 80588.0
}
},
{
"server_time_stamp" : {
"$gt" : 1577918252.0
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"uid" : NumberInt(1),
"server_time_stamp" : NumberInt(-1)
},
"indexName" : "server_time_stamp_1_uid_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"uid" : [
],
"server_time_stamp" : [
]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "forward",
"indexBounds" : {
"uid" : [
"[80588.0, 80588.0]"
],
"server_time_stamp" : [
"[inf.0, 1577918252.0)"
]
}
}
},
"rejectedPlans" : [
{
"stage" : "FETCH",
"filter" : {
"server_time_stamp" : {
"$gt" : 1577918252.0
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"uid" : NumberInt(1),
"book" : NumberInt(1),
"last_place" : NumberInt(1)
},
"indexName" : "uid_1_book_1_last_place_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"uid" : [
],
"book" : [
],
"last_place" : [
]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "forward",
"indexBounds" : {
"uid" : [
"[80588.0, 80588.0]"
],
"book" : [
"[MinKey, MaxKey]"
],
"last_place" : [
"[MinKey, MaxKey]"
]
}
}
},
{
"stage" : "FETCH",
"filter" : {
"server_time_stamp" : {
"$gt" : 1577918252.0
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"uid" : NumberInt(1)
},
"indexName" : "uid",
"isMultiKey" : false,
"multiKeyPaths" : {
"uid" : [
]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "forward",
"indexBounds" : {
"uid" : [
"[80588.0, 80588.0]"
]
}
}
}
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : NumberInt(97),
"executionTimeMillis" : NumberInt(1),
"totalKeysExamined" : NumberInt(97),
"totalDocsExamined" : NumberInt(97),
"executionStages" : {
"stage" : "FETCH",
"nReturned" : NumberInt(97),
"executionTimeMillisEstimate" : NumberInt(0),
"works" : NumberInt(99),
"advanced" : NumberInt(97),
"needTime" : NumberInt(0),
"needYield" : NumberInt(0),
"saveState" : NumberInt(3),
"restoreState" : NumberInt(3),
"isEOF" : NumberInt(1),
"invalidates" : NumberInt(0),
"docsExamined" : NumberInt(97),
"alreadyHasObj" : NumberInt(0),
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : NumberInt(97),
"executionTimeMillisEstimate" : NumberInt(0),
"works" : NumberInt(98),
"advanced" : NumberInt(97),
"needTime" : NumberInt(0),
"needYield" : NumberInt(0),
"saveState" : NumberInt(3),
"restoreState" : NumberInt(3),
"isEOF" : NumberInt(1),
"invalidates" : NumberInt(0),
"keyPattern" : {
"uid" : NumberInt(1),
"server_time_stamp" : NumberInt(-1)
},
"indexName" : "server_time_stamp_1_uid_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"uid" : [
],
"server_time_stamp" : [
]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "forward",
"indexBounds" : {
"uid" : [
"[80588.0, 80588.0]"
],
"server_time_stamp" : [
"[inf.0, 1577918252.0)"
]
},
"keysExamined" : NumberInt(97),
"seeks" : NumberInt(1),
"dupsTested" : NumberInt(0),
"dupsDropped" : NumberInt(0),
"seenInvalidated" : NumberInt(0)
}
}
},
"serverInfo" : {
"host" : "mongo-deployment-5cf4f4fff6-dz84r",
"port" : NumberInt(27017),
"version" : "3.6.15",
"gitVersion" : "18934fb5c814e87895c5e38ae1515dd6cb4c00f7"
},
"ok" : 1.0
}
The issue is that a query which normally runs well and uses the index suddenly stops using it, resulting in very poor performance. This is seen in the query plan and the log, respectively.
The explain's output:
The query plan's "executionStats" says "totalKeysExamined" : NumberInt(97). The query filter uses an index defined on the collection ("stage" : "IXSCAN"), namely the compound index "server_time_stamp_1_uid_1". Also, the query's sort uses an index (the index on _id). As such, the query and the indexes are working as they are meant to. And "executionTimeMillis" : NumberInt(1) says that it is a performant query.
Details from the log:
{ ...
find: "user_history", filter: { server_time_stamp: { $gt: 1577918252 }, uid: 80588 }, sort: { _id: 1 }
planSummary: IXSCAN { _id: 1 } keysExamined:17286277 docsExamined:17286277 numYields:142780 nreturned:79
... }
From the log, note that the index "server_time_stamp_1_uid_1" is not used.
Discussion:
The data and the indexes used by frequently run queries (called the working set) are kept in memory (RAM plus the file system cache). If the working set is not in memory, the system has to load it into memory during the operation, and this results in slower performance. Reading from a disk drive is much slower than reading from memory. Note that SSD drives are much faster than HDD drives, and when there is no option to increase memory this could be an option.
Also, if the query uses an index and the index is large and cannot fit in memory, the index has to be read from the disk drive and this will slow down the operation. More memory is a solution, and when that is not possible the solution can be to redesign (or re-model) the data and its indexes.
But the problem in this case was not the available memory; there is enough of it.
The following info gives an idea about how much memory might be used for the working set for a given query:
db.collection.stats(): indexSizes, size, count and avgObjSize.
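For example, against the user_history collection from the question (collection name taken from the namespace above), those numbers can be read in the shell like this:

var s = db.user_history.stats()
s.indexSizes   // size of each index, in bytes
s.size         // total size of the documents
s.count        // number of documents
s.avgObjSize   // average document size, in bytes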
Solution:
The query log with slow performance shows that the index "server_time_stamp_1_uid_1" is not used: planSummary: IXSCAN { _id: 1 }.
One way to make sure the query always uses the index is to use a hint on the query. The hint needs to reference the index "server_time_stamp_1_uid_1". This way the situation seen in the log will not happen.
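A sketch of the hinted query (filter values taken from the slow-query log above):

db.user_history.find(
    { uid: 80588, server_time_stamp: { $gt: 1577918252 } }
).sort({ _id: 1 }).hint("server_time_stamp_1_uid_1")

With the hint, the sort on _id has to be done in memory, but that should be cheap here given how few documents match (nreturned:79 in the log).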
Another way is to keep the index active in memory. This can be achieved by running a query on the indexed fields only (a covered query: the query filter and the returned fields are indexed fields only). Running this dummy query often, or just before the actual query, will help make sure the index is available in memory.
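One possible shape for such a dummy query (the filter values are illustrative); it touches only fields in server_time_stamp_1_uid_1 and excludes _id so it can be covered by the index:

db.user_history.find(
    { uid: 80588, server_time_stamp: { $gt: 0 } },
    { _id: 0, uid: 1, server_time_stamp: 1 }
).itcount()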
In this case, as @Laizer mentioned, supplying the hint to the query helped resolve the issue.
This behavior is due to the index not being able to both be selective and service the sort.
The log line for the slow operation shows the operation using the _id index. The query planner likely made this selection to avoid having to sort results in memory (note the lack of hasSortStage: 1). As a consequence, however, it had to scan considerably more documents (docsExamined:17286277), which made the operation take considerably longer.
Memory contention likely also played a part. Depending on load, the overhead from sorting results in memory may have contributed to pushing the index out of RAM and the _id index being selected.
A few comments:
As Babu noted, the explain posted above does not include a sort. Including the sort would likely show that stage consuming more time than the IXSCAN.
The name for the index (server_time_stamp_1_uid_1) suggests that server_time_stamp is placed first in the index, followed by uid. Equality matches should be prioritized; i.e. uid should be placed before ranges.
Some options to consider:
Create the index { "uid" : 1, "_id" : 1, "server_time_stamp" : 1 }. See here for guidance on sorting using indexes. Results may be mixed though given that both _id and server_time_stamp are likely to have a high cardinality, which means you may still be trading off scanning documents for avoiding a sort.
Assuming that the _id values are auto-generated, consider sorting by server_time_stamp rather than _id. This will allow you to bound AND sort using server_time_stamp_1_uid_1 (see the sketch below). The server_time_stamp is a timestamp, so it will also be relatively unique.
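A minimal sketch of the second option, assuming the application can accept ordering by server_time_stamp instead of _id:

// The equality on uid, the range on server_time_stamp and the sort can all be
// satisfied by the existing { uid: 1, server_time_stamp: -1 } index, so no
// in-memory sort is needed.
db.user_history.find(
    { uid: 80588, server_time_stamp: { $gt: 1577918252 } }
).sort({ server_time_stamp: -1 })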
sefaria.user_history command: find { find: "user_history", filter: { server_time_stamp: { $gt: 1577918252 }, uid: 80588 }, sort: { _id: 1 }, lsid: { id: UUID("4936fb55-8514-4442-b852-306686985126") }, $db: "sefaria", $readPreference: { mode: "primaryPreferred" } } planSummary: IXSCAN { _id: 1 } keysExamined:17286277 docsExamined:17286277 cursorExhausted:1 numYields:142780 nreturned:79 reslen:35375 locks:{ Global: { acquireCount: { r: 285562 } }, Database: { acquireCount: { r: 142781 } }, Collection: { acquireCount: { r: 142781 } } } protocol:op_msg 445101ms
Looking at the query plan, the query uses the _id index. Is it because you have a sort on the _id field? I looked at your other plan attached.
"executionSuccess" : true,
"nReturned" : NumberInt(97),
"executionTimeMillis" : NumberInt(1),
"totalKeysExamined" : NumberInt(97),
"totalDocsExamined" : NumberInt(97),
The number of documents returned and examined is a 1:1 ratio.
Also, the query is using the index:
"indexName" : "server_time_stamp_1_uid_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"uid" : [
],
"server_time_stamp" : [
]
},
I think there is something missing in both queries. Maybe the sort is not mentioned in the good plan. Can you please check?
I believe that the issue here was memory. The instance was operating near the limit of physical memory. I can't say for sure, but I believe that the relevant index was being removed from memory, and that the poor query performance was a result of that. Regenerating the index forced it back into memory (presumably, something else got kicked out of memory).
I've put the instance on node with much more memory, and so far it seems to be performing well.
I have a collection with millions of documents; each document represents an event: {_id, product, timestamp}
In my query, I need to group by product and take the top 10 for example.
"aggregate" : "product_events",
"pipeline" : [
{
"$match" : {
"timeEvent" : {
"$gt" : ISODate("2017-07-17T00:00:00Z")
}
}
},
{
"$group" : {
"_id" : "$product",
"count" : {
"$sum" : 1
}
}
},
{
"$sort" : {
"count" : -1
}
},
{
"$limit" : 10
}
]
My query is now very slow (10 seconds). I am wondering if there is a way to store the data differently to optimise this query?
db.product_events.explain("executionStats").aggregate([ {"$match" :
{"timeEvent" : {"$gt" : ISODate("2017-07-17T00:00:00Z")}}},{"$group" :
{"_id" : "$product","count" : {"$sum" : 1}}}, {"$project": {"_id": 1,
"count": 1}} , {"$sort" : {"count" : -1}},{"$limit" : 500}],
{"allowDiskUse": true})
{
"stages" : [
{
"$cursor" : {
"query" : {
"timeEvent" : {
"$gt" : ISODate("2017-07-17T00:00:00Z")
}
},
"fields" : {
"product" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "mydb.product_events",
"indexFilterSet" : false,
"parsedQuery" : {
"timeEvent" : {
"$gt" : ISODate("2017-07-17T00:00:00Z")
}
},
"winningPlan" : {
"stage" : "COLLSCAN",
"filter" : {
"timeEvent" : {
"$gt" : ISODate("2017-07-17T00:00:00Z")
}
},
"direction" : "forward"
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 2127315,
"executionTimeMillis" : 940,
"totalKeysExamined" : 0,
"totalDocsExamined" : 2127315,
"executionStages" : {
"stage" : "COLLSCAN",
"filter" : {
"timeEvent" : {
"$gt" : ISODate("2017-07-17T00:00:00Z")
}
},
"nReturned" : 2127315,
"executionTimeMillisEstimate" : 810,
"works" : 2127317,
"advanced" : 2127315,
"needTime" : 1,
"needYield" : 0,
"saveState" : 16620,
"restoreState" : 16620,
"isEOF" : 1,
"invalidates" : 0,
"direction" : "forward",
"docsExamined" : 2127315
}
}
}
},
{
"$group" : {
"_id" : "$product",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : true,
"count" : true
}
},
{
"$sort" : {
"sortKey" : {
"count" : -1
},
"limit" : NumberLong(500)
}
}
],
"ok" : 1
}
Below are my indexes:
db.product_events.getIndexes()
[
{
"v" : 2,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "mydb.product_events"
},
{
"v" : 2,
"key" : {
"product" : 1,
"timeEvent" : -1
},
"name" : "product_1_timeEvent_-1",
"ns" : "mydb.product_events"
}
]
Creating indexes on the fields of a collection helps optimise data retrieval from that collection.
Indexes are generally created on the fields that queries filter on according to specific criteria.
Data in indexed fields is stored in sorted order, so once the matching range is found the scan does not need to examine the other documents, which makes fetching data faster.
Based on the description in the question above, to optimise the performance of the aggregate query, try creating an index on the timeEvent field, since timeEvent is used as the filter expression in the $match stage of the aggregation pipeline.
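A sketch of that suggestion (the field direction is a matter of choice for a single-field index):

// Lets the $match on timeEvent use an IXSCAN instead of the COLLSCAN
// shown in the explain output.
db.product_events.createIndex({ timeEvent: -1 })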
The documentation on compound indexes states the following.
db.products.createIndex( { "item": 1, "stock": 1 } )
The order of the fields listed in a compound index is important. The index will contain references to documents sorted first by the values of the item field and, within each value of the item field, sorted by values of the stock field.
In addition to supporting queries that match on all the index fields, compound indexes can support queries that match on the prefix of the index fields. That is, the index supports queries on the item field as well as both item and stock fields.
Your product_1_timeEvent_-1 index looks like this:
{
"product" : 1,
"timeEvent" : -1
}
which is why it cannot be used to support a query that only filters on timeEvent.
Options you have to get that sorted:
Flip the order of the fields in your index (see the sketch below)
Remove the product field from your index
Create an additional index with only the timeEvent field in it.
(Include some additional filter on the product field so the existing index gets used)
And keep in mind that any creation/deletion/modification of an index may impact other queries, too. So make sure you test your changes properly.
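As a sketch of the first option (flipping the field order so that timeEvent becomes the index prefix; an illustration, not a drop-in replacement for the existing index):

// timeEvent leads, so the range filter in the $match stage can use the index;
// product remains available for queries that filter on both fields.
db.product_events.createIndex({ timeEvent: -1, product: 1 })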
This is what I have tried so far with an aggregation query:
db.getCollection('storage').aggregate([
{
"$match": {
"user_id": 2
}
},
{
"$project": {
"formattedDate": {
"$dateToString": { "format": "%Y-%m", "date": "$created_on" }
},
"size": "$size"
}
},
{ "$group": {
"_id" : "$formattedDate",
"size" : { "$sum": "$size" }
} }
])
This is the result:
/* 1 */
{
"_id" : "2018-02",
"size" : NumberLong(10860595386)
}
/* 2 */
{
"_id" : "2017-12",
"size" : NumberLong(524288)
}
/* 3 */
{
"_id" : "2018-01",
"size" : NumberLong(21587971)
}
And this is the document structure:
{
"_id" : ObjectId("5a59efedd006b9036159e708"),
"user_id" : NumberLong(2),
"is_transferred" : false,
"is_active" : false,
"process_id" : NumberLong(0),
"ratio" : 0.000125759169459343,
"type_id" : 201,
"size" : NumberLong(1687911),
"is_processed" : false,
"created_on" : ISODate("2018-01-13T11:39:25.000Z"),
"processed_on" : ISODate("1970-01-01T00:00:00.000Z")
}
And last, the explain result:
/* 1 */
{
"stages" : [
{
"$cursor" : {
"query" : {
"user_id" : 2.0
},
"fields" : {
"created_on" : 1,
"size" : 1,
"_id" : 1
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "data.storage",
"indexFilterSet" : false,
"parsedQuery" : {
"user_id" : {
"$eq" : 2.0
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1
},
"indexName" : "user_id",
"isMultiKey" : false,
"multiKeyPaths" : {
"user_id" : []
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"[2.0, 2.0]"
]
}
}
},
"rejectedPlans" : []
}
}
},
{
"$project" : {
"_id" : true,
"formattedDate" : {
"$dateToString" : {
"format" : "%Y-%m",
"date" : "$created_on"
}
},
"size" : "$size"
}
},
{
"$group" : {
"_id" : "$formattedDate",
"size" : {
"$sum" : "$size"
}
}
}
],
"ok" : 1.0
}
The problem:
I can navigate and get all results almost instantly, in around 0.002s. However, when I specify user_id and sum the sizes grouped by month, the result takes between 0.300s and 0.560s. I do similar tasks in one request, and it then takes more than a second to finish.
What I tried so far:
I've added an index for user_id
I've added an index for created_on
I used more $match conditions. However, this made it even worse.
This collection currently has almost 200,000 documents in it, and approximately 150,000 of them belong to user_id = 2.
How can I minimize the response time for this query?
Note: MongoDB 3.4.10 used.
Pratha,
try adding a sort on the "created_on" and "size" fields as the first stage in the aggregation pipeline.
db.getCollection('storage').aggregate([
{
"$sort": {
"created_on": 1, "size": 1
}
}, ....
Before that, add compound key index:
db.getCollection('storage').createIndex({created_on:1,size:1})
If you sort the data before the $group stage, it will improve the efficiency of accumulating the totals.
Note about sort aggregation stage:
The $sort stage has a limit of 100 megabytes of RAM. By default, if the stage exceeds this limit, $sort will produce an error. To allow for the handling of large datasets, set the allowDiskUse option to true to enable $sort operations to write to temporary files.
P.S. Get rid of the $match stage on user_id to test performance, or also add user_id to the compound index (see the sketch below).
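A sketch of the second suggestion, with user_id leading so the equality match can use the index prefix (the exact index shape is an assumption, not from the original answer):

db.getCollection('storage').createIndex({ user_id: 1, created_on: 1, size: 1 })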
This is not a duplicate question. All other answers say that the solution is to create an index on the sort key. In my case, I do have an index and still face this error.
Given a mongodb collection with documents similar to:
{
'_id': ...,
'title': ...,
'price': ...,
'category_id': ...,
'last_updated': ...,
... other keys
}
I have an ascending single field index on category_id and a descending single field index on last_updated.
The following query crashes:
> var c = db.collection_name.find({category_id: "categ_id"}, {_id: 0, price: 1, title: 1}).sort({last_updated: -1}).limit(20000).batchSize(500)
> c.forEach(function(doc) {
... ;
... })
2015-05-13T10:00:46.561+0000 E QUERY Error: error: {
"$err" : "getMore executor error: Overflow sort stage buffered data usage of 33554596 bytes exceeds internal limit of 33554432 bytes",
"code" : 17406
}
at Error (<anonymous>)
at DBQuery.next (src/mongo/shell/query.js:259:15)
at DBQuery.forEach (src/mongo/shell/query.js:414:20)
at (shell):1:3 at src/mongo/shell/query.js:259
Here's the explanation of the query if that helps:
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "db_name.collection_name",
"indexFilterSet" : false,
"parsedQuery" : {
"category_id" : {
"$eq" : "categ_id"
}
},
"winningPlan" : {
"stage" : "PROJECTION",
"transformBy" : {
"_id" : 0,
"price" : 1,
"title" : 1
},
"inputStage" : {
"stage" : "SORT",
"sortPattern" : {
"last_updated" : -1
},
"limitAmount" : 500,
"inputStage" : {
"stage" : "KEEP_MUTATIONS",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"category_id" : 1
},
"indexName" : "category_id_1",
"isMultiKey" : false,
"direction" : "forward",
"indexBounds" : {
"category_id" : [
"[\"categ_id\", \"categ_id\"]"
]
}
}
}
}
}
},
"rejectedPlans" : [
{
"stage" : "LIMIT",
"limitAmount" : 500,
"inputStage" : {
"stage" : "PROJECTION",
"transformBy" : {
"_id" : 0,
"price" : 1,
"title" : 1
},
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"category_id" : {
"$eq" : "categ_id"
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"last_updated" : 1
},
"indexName" : "last_updated_1",
"isMultiKey" : false,
"direction" : "backward",
"indexBounds" : {
"last_updated" : [
"[MaxKey, MinKey]"
]
}
}
}
}
}
]
},
"serverInfo" : {
"host" : "host",
"port" : 27017,
"version" : "3.0.2",
"gitVersion" : "6201872043ecbbc0a4cc169b5482dcf385fc464f"
},
"ok" : 1
}
Interestingly, this error only happens on specific categories and not all. Also, if I remove the batchSize option the query does not crash (regardless of the size that I set for the batch).
It's worth noting that the last_updated field may not exist in all documents.
So, it turns out the clue was in the query explanation in my question. Since category_id is being used in the query, the query optimizer chooses to use the category_id index and completely ignores the last_updated index. My thinking was that it would use category_id for fetching and last_updated for sorting, but that doesn't seem to be the way MongoDB queries work. In order to fix this, a compound index needs to be created on category_id and last_updated, in that order:
db.collection_name.createIndex({category_id: 1, last_updated: -1})
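To confirm that the new index removes the in-memory SORT stage, the original query can be re-run under explain (query shape copied from the question):

db.collection_name.find(
    { category_id: "categ_id" },
    { _id: 0, price: 1, title: 1 }
).sort({ last_updated: -1 }).limit(20000).explain("executionStats")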