Mongo Remove records in a huge collection - mongodb

I have a huge collection (1000 Million) and I would like to search and remove the records older than a timestamp.
I have an index created on the field lastUpdatedTime
db.MyCol.remove({"lastUpdatedTime" : {$lt: ISODate("2016-10-06 00:00:00 AM") }})
The above remove query times out, and I modified it to use BulkOperation as well.
Failed execution of command 'delete' with id 4334 on connection
'connectionId{localValue:13, serverValue:22}' to server 'XXXXX:27017'
with exception 'com.mongodb.MongoSocketReadTimeoutException: Timeout
while receiving message'
I understand mongo doesn't support limit in remove yet. So, I am implementing something like below
//Read 10K records
BasicDBObject query = new BasicDBObject();
query.append("lastUpdatedTime",
new BasicDBObject("$lte", new Timestamp(cal.getTimeInMillis())));
DBCursor cursorDocBuilder = myCol.find(query).limit(10000);
// Get Ids
BasicDBList inList = new BasicDBList();
while (cursorDocBuilder.hasNext())
{
inList.add(cursorDocBuilder.next().get("_id"));
}
//construct In clause
BasicDBObject deleteQuery = new BasicDBObject();
deleteQuery.put("_id", new BasicDBObject(MongoOps.$IN, inList));
WriteResult result =myCol.remove(deleteQuery);
What would be a good number to remove using $IN clause?
Will it be better to fire multiple remove statements instead of one big one with a large $IN clause?
I think deleting the top N records in a database is an everyday situation. Is there a better way of achieving this?
P.S : I can do multiple threads to clean up. I don't want to throttle the database as I anticipate high read /write operations to the same collection.
Adding the explain() for fetching 1000 records
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "XXX",
"indexFilterSet" : false,
"parsedQuery" : {
"lastUpdatedTime" : {
"$lt" : ISODate("2016-10-06T00:00:00Z")
}
},
"winningPlan" : {
"stage" : "LIMIT",
"limitAmount" : 1000,
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"lastUpdatedTime" : 1
},
"indexName" : "lastUpdatedTime_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"lastUpdatedTime" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"lastUpdatedTime" : [
"(true, new Date(1475712000000))"
]
}
}
}
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 1000,
"executionTimeMillis" : 200,
"totalKeysExamined" : 1000,
"totalDocsExamined" : 1000,
"executionStages" : {
"stage" : "LIMIT",
"nReturned" : 1000,
"executionTimeMillisEstimate" : 201,
"works" : 1001,
"advanced" : 1000,
"needTime" : 0,
"needYield" : 0,
"saveState" : 10,
"restoreState" : 10,
"isEOF" : 1,
"invalidates" : 0,
"limitAmount" : 1000,
"inputStage" : {
"stage" : "FETCH",
"nReturned" : 1000,
"executionTimeMillisEstimate" : 201,
"works" : 1000,
"advanced" : 1000,
"needTime" : 0,
"needYield" : 0,
"saveState" : 10,
"restoreState" : 10,
"isEOF" : 0,
"invalidates" : 0,
"docsExamined" : 1000,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 1000,
"executionTimeMillisEstimate" : 0,
"works" : 1000,
"advanced" : 1000,
"needTime" : 0,
"needYield" : 0,
"saveState" : 10,
"restoreState" : 10,
"isEOF" : 0,
"invalidates" : 0,
"keyPattern" : {
"lastUpdatedTime" : 1
},
"indexName" : "lastUpdatedTime_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"lastUpdatedTime" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"lastUpdatedTime" : [
"(true, new Date(1475712000000))"
]
},
"keysExamined" : 1000,
"seeks" : 1,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
}
}

Related

mongodb query slow in different version and env

I have two env mongodbs,
the difference between them is:
test mongodb version: 3.2.20 , prod mongodb version : 4.0.18
and test env query plan first stage is Limit , however the other is Sort.
in my test env, it's very quick and totalDocsExamined == limit
they both hit the index:
{
"v" : 1,
"key" : {
"appIds" : 1,
"ctime" : -1,
"background" : 1
},
"name" : "appIds_1_ctime_-1_background_1",
"ns" : "newsmine.newstoapp"
}
query: db.newstoapp.find({"appIds":{"$in":[999]}}).sort({"ctime":-1}).limit(10).explain('executionStats')
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "newsmine.newstoapp",
"indexFilterSet" : false,
"parsedQuery" : {
"appIds" : {
"$in" : [
999
]
}
},
"winningPlan" : {
"stage" : "LIMIT",
"limitAmount" : 10,
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"appIds" : 1,
"ctime" : -1,
"background" : 1
},
"indexName" : "appIds_1_ctime_-1_background_1",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"appIds" : [
"[999.0, 999.0]"
],
"ctime" : [
"[MaxKey, MinKey]"
],
"background" : [
"[MinKey, MaxKey]"
]
}
}
}
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 10,
"executionTimeMillis" : 0,
"totalKeysExamined" : 10,
"totalDocsExamined" : 10,
"executionStages" : {
"stage" : "LIMIT",
"nReturned" : 10,
"executionTimeMillisEstimate" : 0,
"works" : 11,
"advanced" : 10,
"needTime" : 0,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 1,
"invalidates" : 0,
"limitAmount" : 10,
"inputStage" : {
"stage" : "FETCH",
"nReturned" : 10,
"executionTimeMillisEstimate" : 0,
"works" : 10,
"advanced" : 10,
"needTime" : 0,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 0,
"invalidates" : 0,
"docsExamined" : 10,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 10,
"executionTimeMillisEstimate" : 0,
"works" : 10,
"advanced" : 10,
"needTime" : 0,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 0,
"invalidates" : 0,
"keyPattern" : {
"appIds" : 1,
"ctime" : -1,
"background" : 1
},
"indexName" : "appIds_1_ctime_-1_background_1",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"appIds" : [
"[999.0, 999.0]"
],
"ctime" : [
"[MaxKey, MinKey]"
],
"background" : [
"[MinKey, MaxKey]"
]
},
"keysExamined" : 10,
"dupsTested" : 10,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
}
},
"serverInfo" : {
"host" : "",
"port" : ,
"version" : "3.2.20",
"gitVersion" : "a7a144f40b70bfe290906eb33ff2714933544af8"
},
"ok" : 1
}
in my prod env, it's getting slow query
query: datamongo:PRIMARY> db.newstoapp.find({"appIds":{"$in":[1460]}}).sort({"ctime":-1}).limit(10).explain('executionStats')
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "newsmine.newstoapp",
"indexFilterSet" : false,
"parsedQuery" : {
"appIds" : {
"$eq" : 1460
}
},
"winningPlan" : {
"stage" : "SORT",
"sortPattern" : {
"ctime" : -1
},
"limitAmount" : 10,
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"appIds" : 1,
"ctime" : -1,
"background" : 1
},
"indexName" : "appIds_1_ctime_-1_background_1",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"appIds" : [
"[1460.0, 1460.0]"
],
"ctime" : [
"[MaxKey, MinKey]"
],
"background" : [
"[MinKey, MaxKey]"
]
}
}
}
}
},
"rejectedPlans" : [
{
"stage" : "SORT",
"sortPattern" : {
"ctime" : -1
},
"limitAmount" : 10,
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"appIds" : 1
},
"indexName" : "appIds_1",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"appIds" : [
"[1460.0, 1460.0]"
]
}
}
}
}
},
{
"stage" : "LIMIT",
"limitAmount" : 10,
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"appIds" : {
"$eq" : 1460
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"ctime" : 1
},
"indexName" : "ctime_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "backward",
"indexBounds" : {
"ctime" : [
"[MaxKey, MinKey]"
]
}
}
}
}
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 10,
"executionTimeMillis" : 40,
"totalKeysExamined" : 405,
"totalDocsExamined" : 405,
"executionStages" : {
"stage" : "SORT",
"nReturned" : 10,
"executionTimeMillisEstimate" : 3,
"works" : 418,
"advanced" : 10,
"needTime" : 407,
"needYield" : 0,
"saveState" : 9,
"restoreState" : 9,
"isEOF" : 1,
"invalidates" : 0,
"sortPattern" : {
"ctime" : -1
},
"memUsage" : 8471,
"memLimit" : 33554432,
"limitAmount" : 10,
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"nReturned" : 405,
"executionTimeMillisEstimate" : 3,
"works" : 407,
"advanced" : 405,
"needTime" : 1,
"needYield" : 0,
"saveState" : 9,
"restoreState" : 9,
"isEOF" : 1,
"invalidates" : 0,
"inputStage" : {
"stage" : "FETCH",
"nReturned" : 405,
"executionTimeMillisEstimate" : 3,
"works" : 406,
"advanced" : 405,
"needTime" : 0,
"needYield" : 0,
"saveState" : 9,
"restoreState" : 9,
"isEOF" : 1,
"invalidates" : 0,
"docsExamined" : 405,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 405,
"executionTimeMillisEstimate" : 1,
"works" : 406,
"advanced" : 405,
"needTime" : 0,
"needYield" : 0,
"saveState" : 9,
"restoreState" : 9,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"appIds" : 1,
"ctime" : -1,
"background" : 1
},
"indexName" : "appIds_1_ctime_-1_background_1",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"appIds" : [
"[1460.0, 1460.0]"
],
"ctime" : [
"[MaxKey, MinKey]"
],
"background" : [
"[MinKey, MaxKey]"
]
},
"keysExamined" : 405,
"seeks" : 1,
"dupsTested" : 405,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
}
}
},
"serverInfo" : {
"host" : "",
"port" : ,
"version" : "4.0.18",
"gitVersion" : "6883bdfb8b8cff32176b1fd176df04da9165fd67"
},
"ok" : 1,
"operationTime" : Timestamp(1629988625, 146),
"$clusterTime" : {
"clusterTime" : Timestamp(1629988625, 146),
"signature" : {
"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
"keyId" : NumberLong(0)
}
}
}
I followed Joe's suggestion and changed my index to put ctime before appIds.
But it does not work very well: there are still slow-log entries for my new query, and it's hard to find out why it is slow.
2021-10-19T22:38:16.918+0800 I COMMAND [conn2434281164] command newsmine.newstoapp command: find { find: "newstoapp", filter: { appIds: { $elemMatch: { $in: [ 2433 ] } }, ctime: { $gte: 0 } }, sort: { ctime: -1 }, hint: { ctime: -1, appIds: 1 }, skip: 0, limit: 50, batchSize: 50, $readPreference: { mode: "secondaryPreferred" }, $db: "newsmine" } planSummary: IXSCAN { ctime: -1, appIds: 1 } keysExamined:1471582 docsExamined:50 cursorExhausted:1 numYields:11496 nreturned:50 reslen:34043 locks:{ Global: { acquireCount: { r: 11497 } }, Database: { acquireCount: { r: 11497 } }, Collection: { acquireCount: { r: 11497 } } } storage:{ data: { bytesRead: 44958, timeReadingMicros: 618 } } protocol:op_query 7038ms
The cause of the slowness is that MongoDB 4.0.18 has a blocking sort stage, so all matching documents must be found, retrieved, and sorted in memory before returning the requested batch.
In prior versions of MongoDB it was found that under certain conditions using a multi-key index to support a sort would provide incorrect results.
I never fully understood these conditions or why the results were incorrect, so if you are able to find those details, please edit or comment.
Prior to MongoDB 3.4 the index metadata contained a boolean value to indicate whether or not the index was multi-key (indexed a field that contained an array for at least one document).
MongoDB 3.4 introduced a new index version that also keeps track of which fields in the index are multi-key.
MongoDB 3.6 introduced a change to sorting to avoid the situations where results would be incorrect. This is why your query has a sort stage and is taking longer.
There are a couple things you could try to get back to the previous behavior without a blocking sort:
Drop and rebuild the index.
The existing index is version 1, which does not track multi-key paths. When rebuilding, the index should be created at version 2, which does track these, and may permit the query executor to use the index for sorting.
Create a new index with ctime before appIds.
A multi-key index has an entry in the index for each value in the indexed array. This may cause the query planner to assume it will disrupt sorting on a following key.
An index on {ctime:-1, appIds:1, background:1} would place the sort key ahead of the multi-key field, and while this may require reading more of the index, it may also permit the query executor to use the index for sorting.

MongoDB sort and limit performance issue

I'm facing with a strange issue. The explain of the query shows usage of index with a fast execution time in all stages but the last stage of LIMIT breaks it all. I have execution time of 60s and more!
The DB is used for marketing tool and we collect data on the campaign activity. We have 100k+ records in DB per each hour and I want to select all the 100k in batches of 5k (I tried to reduce it to 1k also) in order to make statistics aggregation.
Just for tests, I will add an example of the query explanation.
If I reduce the created_at date range to 10 minutes and I set the limit to 1000, it works fast.
If I set the created_at date range to 30 minutes and I set the limit to 1000, it gets stuck again.
I have 8 CPU and 64 memory
Storage is with 6000 IOPS
The table include 900 million records in total
I have the following indexes:
created_at: 1
_id: 1
{created_at: 1, _id: 1} - not used in query
Slow query for 30 minutes
> db.logs.explain('allPlansExecution').aggregate([{"$match":{"created_at":{"$gte":ISODate('2021-06-02T20:00:00.000+00:00'),"$lte":ISODate('2021-06-02T20:30:00.000+00:00')}}},{"$sort":{"_id":1}},{"$limit":1000}], { allowDiskUse: true });
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "webpush.campaign_action_logs",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"created_at" : {
"$lte" : ISODate("2021-06-02T20:30:00Z")
}
},
{
"created_at" : {
"$gte" : ISODate("2021-06-02T20:00:00Z")
}
}
]
},
"optimizedPipeline" : true,
"winningPlan" : {
"stage" : "SORT",
"sortPattern" : {
"_id" : 1
},
"memLimit" : 104857600,
"limitAmount" : 100,
"type" : "simple",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"created_at" : 1
},
"indexName" : "created_at_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"created_at" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"created_at" : [
"[new Date(1622664000000), new Date(1622665800000)]"
]
}
}
}
},
"rejectedPlans" : [
{
"stage" : "LIMIT",
"limitAmount" : 100,
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"created_at" : {
"$lte" : ISODate("2021-06-02T20:30:00Z")
}
},
{
"created_at" : {
"$gte" : ISODate("2021-06-02T20:00:00Z")
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"_id" : 1
},
"indexName" : "_id_",
"isMultiKey" : false,
"multiKeyPaths" : {
"_id" : [ ]
},
"isUnique" : true,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"_id" : [
"[MinKey, MaxKey]"
]
}
}
}
}
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 100,
"executionTimeMillis" : 183411,
"totalKeysExamined" : 257959,
"totalDocsExamined" : 257959,
"executionStages" : {
"stage" : "SORT",
"nReturned" : 100,
"executionTimeMillisEstimate" : 625,
"works" : 258061,
"advanced" : 100,
"needTime" : 257960,
"needYield" : 0,
"saveState" : 9571,
"restoreState" : 9571,
"isEOF" : 1,
"sortPattern" : {
"_id" : 1
},
"memLimit" : 104857600,
"limitAmount" : 100,
"type" : "simple",
"totalDataSizeSorted" : 187871472,
"usedDisk" : false,
"inputStage" : {
"stage" : "FETCH",
"nReturned" : 257959,
"executionTimeMillisEstimate" : 533,
"works" : 257960,
"advanced" : 257959,
"needTime" : 0,
"needYield" : 0,
"saveState" : 9571,
"restoreState" : 9571,
"isEOF" : 1,
"docsExamined" : 257959,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 257959,
"executionTimeMillisEstimate" : 265,
"works" : 257960,
"advanced" : 257959,
"needTime" : 0,
"needYield" : 0,
"saveState" : 9571,
"restoreState" : 9571,
"isEOF" : 1,
"keyPattern" : {
"created_at" : 1
},
"indexName" : "created_at_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"created_at" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"created_at" : [
"[new Date(1622664000000), new Date(1622665800000)]"
]
},
"keysExamined" : 257959,
"seeks" : 1,
"dupsTested" : 0,
"dupsDropped" : 0
}
}
},
"allPlansExecution" : [
{
"nReturned" : 100,
"executionTimeMillisEstimate" : 625,
"totalKeysExamined" : 257959,
"totalDocsExamined" : 257959,
"executionStages" : {
"stage" : "SORT",
"nReturned" : 100,
"executionTimeMillisEstimate" : 625,
"works" : 258060,
"advanced" : 100,
"needTime" : 257960,
"needYield" : 0,
"saveState" : 9571,
"restoreState" : 9571,
"isEOF" : 0,
"sortPattern" : {
"_id" : 1
},
"memLimit" : 104857600,
"limitAmount" : 100,
"type" : "simple",
"totalDataSizeSorted" : 187871472,
"usedDisk" : false,
"inputStage" : {
"stage" : "FETCH",
"nReturned" : 257959,
"executionTimeMillisEstimate" : 533,
"works" : 257960,
"advanced" : 257959,
"needTime" : 0,
"needYield" : 0,
"saveState" : 9571,
"restoreState" : 9571,
"isEOF" : 1,
"docsExamined" : 257959,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 257959,
"executionTimeMillisEstimate" : 265,
"works" : 257960,
"advanced" : 257959,
"needTime" : 0,
"needYield" : 0,
"saveState" : 9571,
"restoreState" : 9571,
"isEOF" : 1,
"keyPattern" : {
"created_at" : 1
},
"indexName" : "created_at_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"created_at" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"created_at" : [
"[new Date(1622664000000), new Date(1622665800000)]"
]
},
"keysExamined" : 257959,
"seeks" : 1,
"dupsTested" : 0,
"dupsDropped" : 0
}
}
}
},
{
"nReturned" : 70,
"executionTimeMillisEstimate" : 178092,
"totalKeysExamined" : 258060,
"totalDocsExamined" : 258060,
"executionStages" : {
"stage" : "LIMIT",
"nReturned" : 70,
"executionTimeMillisEstimate" : 178092,
"works" : 258060,
"advanced" : 70,
"needTime" : 257990,
"needYield" : 0,
"saveState" : 9571,
"restoreState" : 9571,
"isEOF" : 0,
"limitAmount" : 100,
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"created_at" : {
"$lte" : ISODate("2021-06-02T20:30:00Z")
}
},
{
"created_at" : {
"$gte" : ISODate("2021-06-02T20:00:00Z")
}
}
]
},
"nReturned" : 70,
"executionTimeMillisEstimate" : 178057,
"works" : 258060,
"advanced" : 70,
"needTime" : 257990,
"needYield" : 0,
"saveState" : 9571,
"restoreState" : 9571,
"isEOF" : 0,
"docsExamined" : 258060,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 258060,
"executionTimeMillisEstimate" : 645,
"works" : 258060,
"advanced" : 258060,
"needTime" : 0,
"needYield" : 0,
"saveState" : 9571,
"restoreState" : 9571,
"isEOF" : 0,
"keyPattern" : {
"_id" : 1
},
"indexName" : "_id_",
"isMultiKey" : false,
"multiKeyPaths" : {
"_id" : [ ]
},
"isUnique" : true,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"_id" : [
"[MinKey, MaxKey]"
]
},
"keysExamined" : 258060,
"seeks" : 1,
"dupsTested" : 0,
"dupsDropped" : 0
}
}
}
}
]
},
"serverInfo" : {
"host" : "ip-10-0-3-171",
"port" : 27017,
"version" : "4.4.6",
"gitVersion" : "72e66213c2c3eab37d9358d5e78ad7f5c1d0d0d7"
},
"ok" : 1
}
Faster query for 10 minutes:
> db.logs.explain('allPlansExecution').aggregate([{"$match":{"created_at":{"$gte":ISODate('2021-06-02T20:00:00.000+00:00'),"$lte":ISODate('2021-06-02T20:10:00.000+00:00')}}},{"$sort":{"_id":1}},{"$limit":1000}], { allowDiskUse: true });
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "webpush.campaign_action_logs",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"created_at" : {
"$lte" : ISODate("2021-06-02T20:10:00Z")
}
},
{
"created_at" : {
"$gte" : ISODate("2021-06-02T20:00:00Z")
}
}
]
},
"optimizedPipeline" : true,
"winningPlan" : {
"stage" : "SORT",
"sortPattern" : {
"_id" : 1
},
"memLimit" : 104857600,
"limitAmount" : 1000,
"type" : "simple",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"created_at" : 1
},
"indexName" : "created_at_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"created_at" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"created_at" : [
"[new Date(1622664000000), new Date(1622664600000)]"
]
}
}
}
},
"rejectedPlans" : [
{
"stage" : "LIMIT",
"limitAmount" : 1000,
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"created_at" : {
"$lte" : ISODate("2021-06-02T20:10:00Z")
}
},
{
"created_at" : {
"$gte" : ISODate("2021-06-02T20:00:00Z")
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"_id" : 1
},
"indexName" : "_id_",
"isMultiKey" : false,
"multiKeyPaths" : {
"_id" : [ ]
},
"isUnique" : true,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"_id" : [
"[MinKey, MaxKey]"
]
}
}
}
}
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 1000,
"executionTimeMillis" : 1502,
"totalKeysExamined" : 58027,
"totalDocsExamined" : 58027,
"executionStages" : {
"stage" : "SORT",
"nReturned" : 1000,
"executionTimeMillisEstimate" : 122,
"works" : 59029,
"advanced" : 1000,
"needTime" : 58028,
"needYield" : 0,
"saveState" : 122,
"restoreState" : 122,
"isEOF" : 1,
"sortPattern" : {
"_id" : 1
},
"memLimit" : 104857600,
"limitAmount" : 1000,
"type" : "simple",
"totalDataSizeSorted" : 42213931,
"usedDisk" : false,
"inputStage" : {
"stage" : "FETCH",
"nReturned" : 58027,
"executionTimeMillisEstimate" : 96,
"works" : 58028,
"advanced" : 58027,
"needTime" : 0,
"needYield" : 0,
"saveState" : 122,
"restoreState" : 122,
"isEOF" : 1,
"docsExamined" : 58027,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 58027,
"executionTimeMillisEstimate" : 40,
"works" : 58028,
"advanced" : 58027,
"needTime" : 0,
"needYield" : 0,
"saveState" : 122,
"restoreState" : 122,
"isEOF" : 1,
"keyPattern" : {
"created_at" : 1
},
"indexName" : "created_at_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"created_at" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"created_at" : [
"[new Date(1622664000000), new Date(1622664600000)]"
]
},
"keysExamined" : 58027,
"seeks" : 1,
"dupsTested" : 0,
"dupsDropped" : 0
}
}
},
"allPlansExecution" : [
{
"nReturned" : 101,
"executionTimeMillisEstimate" : 122,
"totalKeysExamined" : 58027,
"totalDocsExamined" : 58027,
"executionStages" : {
"stage" : "SORT",
"nReturned" : 101,
"executionTimeMillisEstimate" : 122,
"works" : 58129,
"advanced" : 101,
"needTime" : 58028,
"needYield" : 0,
"saveState" : 121,
"restoreState" : 121,
"isEOF" : 0,
"sortPattern" : {
"_id" : 1
},
"memLimit" : 104857600,
"limitAmount" : 1000,
"type" : "simple",
"totalDataSizeSorted" : 42213931,
"usedDisk" : false,
"inputStage" : {
"stage" : "FETCH",
"nReturned" : 58027,
"executionTimeMillisEstimate" : 96,
"works" : 58028,
"advanced" : 58027,
"needTime" : 0,
"needYield" : 0,
"saveState" : 121,
"restoreState" : 121,
"isEOF" : 1,
"docsExamined" : 58027,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 58027,
"executionTimeMillisEstimate" : 40,
"works" : 58028,
"advanced" : 58027,
"needTime" : 0,
"needYield" : 0,
"saveState" : 121,
"restoreState" : 121,
"isEOF" : 1,
"keyPattern" : {
"created_at" : 1
},
"indexName" : "created_at_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"created_at" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"created_at" : [
"[new Date(1622664000000), new Date(1622664600000)]"
]
},
"keysExamined" : 58027,
"seeks" : 1,
"dupsTested" : 0,
"dupsDropped" : 0
}
}
}
},
{
"nReturned" : 3,
"executionTimeMillisEstimate" : 935,
"totalKeysExamined" : 58129,
"totalDocsExamined" : 58129,
"executionStages" : {
"stage" : "LIMIT",
"nReturned" : 3,
"executionTimeMillisEstimate" : 935,
"works" : 58129,
"advanced" : 3,
"needTime" : 58126,
"needYield" : 0,
"saveState" : 122,
"restoreState" : 122,
"isEOF" : 0,
"limitAmount" : 1000,
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"created_at" : {
"$lte" : ISODate("2021-06-02T20:10:00Z")
}
},
{
"created_at" : {
"$gte" : ISODate("2021-06-02T20:00:00Z")
}
}
]
},
"nReturned" : 3,
"executionTimeMillisEstimate" : 935,
"works" : 58129,
"advanced" : 3,
"needTime" : 58126,
"needYield" : 0,
"saveState" : 122,
"restoreState" : 122,
"isEOF" : 0,
"docsExamined" : 58129,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 58129,
"executionTimeMillisEstimate" : 17,
"works" : 58129,
"advanced" : 58129,
"needTime" : 0,
"needYield" : 0,
"saveState" : 122,
"restoreState" : 122,
"isEOF" : 0,
"keyPattern" : {
"_id" : 1
},
"indexName" : "_id_",
"isMultiKey" : false,
"multiKeyPaths" : {
"_id" : [ ]
},
"isUnique" : true,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"_id" : [
"[MinKey, MaxKey]"
]
},
"keysExamined" : 58129,
"seeks" : 1,
"dupsTested" : 0,
"dupsDropped" : 0
}
}
}
}
]
},
"serverInfo" : {
"host" : "ip-10-0-3-171",
"port" : 27017,
"version" : "4.4.6",
"gitVersion" : "72e66213c2c3eab37d9358d5e78ad7f5c1d0d0d7"
},
"ok" : 1
}
Questions:
If the issue is the amount of docs scanned and we used an index, how can I solve it? I do not think that looping over 5000k results in sorted order over 300k docs should cause an issue, but maybe I'm wrong.
Why the LIMIT stage (with another stage of FETCH) causing the issue?
Thank you very much for all experts in advance!
The issue is the sort. Because you are using skip/limit on a sorted set, it must load the entire set from disk, then sort it in memory (possibly spilling to disk) in order to apply the skip/limit.
This means that if you have 100k documents matching the time range, you load all 100k to get the first batch, and then load 100k again to get the second batch.
An index in mongodb can support a sort if the index includes the field that is sorted on, and any fields that precede that key in the index creation spec are matched with an equality predicate.
Fields matched with inequality or ranges can be listed after the sort key.
The existing index on { _id:1 } is inefficient because it is non-selective and therefore requires reading the entire collection from disk.
The index on { created_at:1 } requires loading only those documents that match the query, but they must then be sorted in memory.
To support this query, create an index on { _id:1, created_at:1 }. This index should significantly improve the performance because it can both eliminate the in-memory sort, and only require the query executor to load from disk those documents that match the query. This also has the benefit that the query executor can terminate as soon as the limit is satisfied.

Mongo db query time more than expected

I am running standalone mongodb server with version 3.4. I am using following query on my collection which contains around 1.8 million document out of which around 1 million document are in "ARCHIVED" status.
db.tender_listing.find({ "tender_id" : { "$gt" : "d"} , "workflow_status" : { "$in" : [ "ARCHIVED"]}}).limit(4000).sort({tender_id:1}).hint({workflow_status:1, tender_id:1}).explain('executionStats')
Each query stage has executionTimeMillisEstimate of not more than 100ms but the total executionTimeMillis is 30992.
For what operation query is taking this much extra time? Also how can I optimise same?
Following is the output
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "ofbTenders.tender_listing",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"workflow_status" : {
"$eq" : "ARCHIVED"
}
},
{
"tender_id" : {
"$gt" : "d"
}
}
]
},
"winningPlan" : {
"stage" : "LIMIT",
"limitAmount" : 4000,
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"workflow_status" : 1,
"tender_id" : 1
},
"indexName" : "workflow_status_1_tender_id_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"workflow_status" : [
"[\"ARCHIVED\", \"ARCHIVED\"]"
],
"tender_id" : [
"(\"d\", {})"
]
}
}
}
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 4000,
"executionTimeMillis" : 30992,
"totalKeysExamined" : 4000,
"totalDocsExamined" : 4000,
"executionStages" : {
"stage" : "LIMIT",
"nReturned" : 4000,
"executionTimeMillisEstimate" : 90,
"works" : 6129,
"advanced" : 4000,
"needTime" : 0,
"needYield" : 2128,
"saveState" : 2128,
"restoreState" : 2128,
"isEOF" : 1,
"invalidates" : 0,
"limitAmount" : 4000,
"inputStage" : {
"stage" : "FETCH",
"nReturned" : 4000,
"executionTimeMillisEstimate" : 80,
"works" : 6128,
"advanced" : 4000,
"needTime" : 0,
"needYield" : 2128,
"saveState" : 2128,
"restoreState" : 2128,
"isEOF" : 0,
"invalidates" : 0,
"docsExamined" : 4000,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 4000,
"executionTimeMillisEstimate" : 10,
"works" : 4000,
"advanced" : 4000,
"needTime" : 0,
"needYield" : 0,
"saveState" : 2128,
"restoreState" : 2128,
"isEOF" : 0,
"invalidates" : 0,
"keyPattern" : {
"workflow_status" : 1,
"tender_id" : 1
},
"indexName" : "workflow_status_1_tender_id_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"workflow_status" : [
"[\"ARCHIVED\", \"ARCHIVED\"]"
],
"tender_id" : [
"(\"d\", {})"
]
},
"keysExamined" : 4000,
"seeks" : 1,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
}
},
"serverInfo" : {
"host" : "ofb59-Latitude-3450",
"port" : 27017,
"version" : "3.4.4",
"gitVersion" : "888390515874a9debd1b6c5d36559ca86b44babd"
},
"ok" : 1
}
I am not sure, but you can try the options below.
1) instead of using $in for workflow you can give directly "workflow_status" : "ARCHIVED"
2) change order of fields in find first workflow status and then tender_id.
3) run query execution plan without hint. Let MongoDB decide which index to use.

Difficulty optimizing Mongo distinct query to use indexes

I am having difficulty persuading Mongo to run a distinct query that looks like it should be covered by the indexes without fetching a large number of documents in the collection.
My documents have the general form:
{
_tenantId: 'someString',
_productCategory: 'some string from a smallish set'
...
}
I have an index on (_tenantId, _productCategory).
I want to find out what the set of distinct product categories is for a given tenant, so the query is:
db.products.distinct( '_productCategory', { _tenantId: '463171c3-d15f-4699-893d-3046327f8e1f'})
This runs rather slowly (several seconds for a collection of around half a million products against a local DB, which is Mongo 3.2.9). Against our pre-production SaaS-based Mongo (which is probably more memory constrained than my local instance, which has free run of my machine) it takes several tens of seconds for the same data.
Explaining the query yields:
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "engage-prod.products",
"indexFilterSet" : false,
"parsedQuery" : {
"_tenantId" : {
"$eq" : "463171c3-d15f-4699-893d-3046327f8e1f"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"_tenantId" : 1,
"_productCategory" : 1
},
"indexName" : "_tenantId_1__productCategory_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"_tenantId" : [
"[\"463171c3-d15f-4699-893d-3046327f8e1f\", \"463171c3-d15f-4699-893d-3046327f8e1f\"]"
],
"_productCategory" : [
"[MinKey, MaxKey]"
]
}
}
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 406871,
"executionTimeMillis" : 358,
"totalKeysExamined" : 406871,
"totalDocsExamined" : 406871,
"executionStages" : {
"stage" : "FETCH",
"nReturned" : 406871,
"executionTimeMillisEstimate" : 80,
"works" : 406872,
"advanced" : 406871,
"needTime" : 0,
"needYield" : 0,
"saveState" : 3178,
"restoreState" : 3178,
"isEOF" : 1,
"invalidates" : 0,
"docsExamined" : 406871,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 406871,
"executionTimeMillisEstimate" : 40,
"works" : 406872,
"advanced" : 406871,
"needTime" : 0,
"needYield" : 0,
"saveState" : 3178,
"restoreState" : 3178,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"_tenantId" : 1,
"_productCategory" : 1
},
"indexName" : "_tenantId_1__productCategory_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"_tenantId" : [
"[\"463171c3-d15f-4699-893d-3046327f8e1f\", \"463171c3-d15f-4699-893d-3046327f8e1f\"]"
],
"_productCategory" : [
"[MinKey, MaxKey]"
]
},
"keysExamined" : 406871,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
},
"serverInfo" : {
"host" : "Stevens-MacBook-Pro.local",
"port" : 27017,
"version" : "3.2.9",
"gitVersion" : "22ec9e93b40c85fc7cae7d56e7d6a02fd811088c"
},
"ok" : 1
}
Note that even though it runs an IXSCAN it still returns over 400K documents (nReturned).
If I create a compound field _productTenantAndCategory containing a lexical concatenation of the tenant ID and the product category (with a : separator) and index that, so that it is a single-field index, then the query:
db.products.explain('executionStats').distinct( '_productTenantAndCategory', { _productTenantAndCategory: {$gte: '463171c3-d15f-4699-893d-3046327f8e1f',$lt: '463171c3-d15f-4699-893d-3046327f8e1g'}})
works entirely within the index and yields:
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "engage-prod.products",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"_productTenantAndCategory" : {
"$lt" : "463171c3-d15f-4699-893d-3046327f8e1g"
}
},
{
"_productTenantAndCategory" : {
"$gte" : "463171c3-d15f-4699-893d-3046327f8e1f"
}
}
]
},
"winningPlan" : {
"stage" : "PROJECTION",
"transformBy" : {
"_id" : 0,
"_productTenantAndCategory" : 1
},
"inputStage" : {
"stage" : "DISTINCT_SCAN",
"keyPattern" : {
"_productTenantAndCategory" : 1
},
"indexName" : "_productTenantAndCategory_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"_productTenantAndCategory" : [
"[\"463171c3-d15f-4699-893d-3046327f8e1f\", \"463171c3-d15f-4699-893d-3046327f8e1g\")"
]
}
}
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 62,
"executionTimeMillis" : 0,
"totalKeysExamined" : 63,
"totalDocsExamined" : 0,
"executionStages" : {
"stage" : "PROJECTION",
"nReturned" : 62,
"executionTimeMillisEstimate" : 0,
"works" : 63,
"advanced" : 62,
"needTime" : 0,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 1,
"invalidates" : 0,
"transformBy" : {
"_id" : 0,
"_productTenantAndCategory" : 1
},
"inputStage" : {
"stage" : "DISTINCT_SCAN",
"nReturned" : 62,
"executionTimeMillisEstimate" : 0,
"works" : 63,
"advanced" : 62,
"needTime" : 0,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"_productTenantAndCategory" : 1
},
"indexName" : "_productTenantAndCategory_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"_productTenantAndCategory" : [
"[\"463171c3-d15f-4699-893d-3046327f8e1f\", \"463171c3-d15f-4699-893d-3046327f8e1g\")"
]
},
"keysExamined" : 63
}
}
},
"serverInfo" : {
"host" : "Stevens-MacBook-Pro.local",
"port" : 27017,
"version" : "3.2.9",
"gitVersion" : "22ec9e93b40c85fc7cae7d56e7d6a02fd811088c"
},
"ok" : 1
}
Having to build single-field indexes with manually compounded keys for all the aggregation queries I need is not a very desirable path to follow. Since all the information is present in the compound index I started with, why can't Mongo execute the original distinct query covered by that index? Is there anything I can do to overcome this by way of query optimization?
Note: this is actually a sub-problem of a slightly more complex one involving an aggregation pipeline to count the number of occurrences of each category, but I am restricting my question for now to the simpler distinct query, since it seems to capture the essence of the failure to use an index that should cover things (which I was also seeing in the aggregation pipeline case) while being a simpler query overall.

MongoDB possibly scanning documents for an operation that could be covered by an index

I have a collection with a locked field in each document.
I have the following index:
{
locked : 1
}
when I perform this explain over a count operation
db.scheduled.find({locked: false}).explain({executionStats:1})
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "connectivity_recruiter.scheduled",
"indexFilterSet" : false,
"parsedQuery" : {
"locked" : {
"$eq" : false
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"locked" : 1
},
"indexName" : "locked_1",
"isMultiKey" : false,
"direction" : "forward",
"indexBounds" : {
"locked" : [
"[false, false]"
]
}
}
},
.....
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 53045,
"executionTimeMillis" : 299,
"totalKeysExamined" : 53045,
"totalDocsExamined" : 53045,
"executionStages" : {
"stage" : "FETCH",
"nReturned" : 53045,
"executionTimeMillisEstimate" : 180,
"works" : 53046,
"advanced" : 53045,
"needTime" : 0,
"needFetch" : 0,
"saveState" : 417,
"restoreState" : 417,
"isEOF" : 1,
"invalidates" : 0,
"docsExamined" : 53045,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 53045,
"executionTimeMillisEstimate" : 70,
"works" : 53046,
"advanced" : 53045,
"needTime" : 0,
"needFetch" : 0,
"saveState" : 417,
"restoreState" : 417,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"locked" : 1
},
"indexName" : "locked_1",
"isMultiKey" : false,
"direction" : "forward",
"indexBounds" : {
"locked" : [
"[false, false]"
]
},
"keysExamined" : 53045,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0,
"matchTested" : 0
}
},
...........
}
totalDocsExamined seems to indicate that all documents are being scanned in order to count them, while this operation could be performed by using the index alone.
What is happening? Is this normal? Is a full scan of the collection going on?
Thanks
All the returned docs were examined; the index was used only to filter, not to retrieve the documents.
If you look at your explain output, you'll notice that the number of docs returned equals the number of documents examined.
Why is that? Your index only contains one field, while you're fetching the entire document, so what MongoDB does is query the index for the keys and then go to the collection to fetch each document.
The only situation where no document needs to be examined is a covered query, when the index contains all the projected fields.
See more at this link: https://docs.mongodb.com/manual/core/query-optimization/#covered-query