I am new to MongoDB, and the query below performs really slowly on a collection of over 2 million records.
Query
// Count records per (productName, model, version, uid) combination and
// return the groups ordered by model.
db.testCollection.aggregate({
// Stage 1: keep documents whose "active" field is not equal to false.
$match: {
active: {
$ne: false
}
}
}, {
// Stage 2: one group per distinct productName/model/version/uid tuple;
// "total" is the number of documents that fell into the group.
$group: {
_id: {
productName: "$productName",
model: "$model",
version: "$version",
uid: "$uid"
},
total: {
$sum: 1
}
}
}, {
// Stage 3: copy the grouped key parts out of _id into top-level fields
// next to "total".
$project: {
total: 1,
model: "$_id.model",
version: "$_id.version",
uid: "$_id.uid",
productName: "$_id.productName"
}
}, {
// Stage 4: order the grouped results by model, ascending.
$sort: {
model: 1
}
})
explain()
{
"stages" : [
{
"$cursor" : {
"query" : {
"active" : {
"$ne" : false
}
},
"fields" : {
"version" : 1,
"productName" : 1,
"model" : 1,
"uid" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "fms2.device",
"indexFilterSet" : false,
"parsedQuery" : {
"$nor" : [
{
"active" : {
"$eq" : false
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"active" : 1
},
"indexName" : "active",
"isMultiKey" : false,
"multiKeyPaths" : {
"active" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"active" : [
"[MinKey, false)",
"(false, MaxKey]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : {
"productName" : "$productName",
"model" : "$model",
"version" : "$version",
"uid" : "$uid"
},
"total" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : true,
"total" : true,
"model" : "$_id.model",
"version" : "$_id.version",
"uid" : "$_id.uid",
"productName" : "$_id.productName"
}
},
{
"$sort" : {
"sortKey" : {
"model" : 1
}
}
}
],
"ok" : 1
}
Is there a way to optimize this query further? I had a look at https://docs.mongodb.com/manual/core/aggregation-pipeline-optimization/ as well, but most of the suggestions there are not applicable to this query.
I am not sure if it matters, but the result of this aggregation ends up with only 20-30 records.
Related
{"messageId": "123124", "writtenAt":"2017-04-26T15:16:36.200Z", "updatedAt":"2999-12-31T23:59:59.999Z"}
{"messageId": "123124", "writtenAt":"2017-04-26T15:21:30.230Z", "updatedAt":"2999-12-31T23:59:59.999Z"}
The structure of the collection is shown above. Aside from the MongoDB `_id`, each document has an id called 'messageId'; a collection can contain multiple entries with the same 'messageId' but different 'writtenAt' values. There is a compound index: messageId (desc), writtenAt (desc).
Now I want to group by messageId so that I only get the latest entry (max writtenAt value) for each one. I have the following query, but it takes very long and I haven't gotten a result yet (I stopped it after more than 10 minutes; the collection has over 1.3 million records):
// For every messageId, return the document with the highest writtenAt among
// the documents that pass the updatedAt filter.
db.messages.aggregate(
// Stage 1: restrict to documents with updatedAt >= the given date.
[{ "$match": { "updatedAt": { "$gte": { "$date": "2021-02-26T06:59:51.738Z" } } } },
// Stage 2: order by messageId, then writtenAt, both descending.
{ "$sort": { "messageId": -1, "writtenAt": -1 } },
// Stage 3: per messageId, keep the first document seen in that order.
{ "$group": { "_id": "$messageId", "doc": { "$first": "$$ROOT" } } },
// Stage 4: promote the saved document back to the top level.
{ "$replaceRoot" : { "newRoot" : "$doc"}}
// allowDiskUse lets pipeline stages write temporary data to disk.
], {allowDiskUse: true});
If I add an explain with executionStats, I can see it's picking up the index:
[
{
"$cursor" : {
"query" : {
"updatedAt" : {
"$gte" : ISODate("2021-02-26T06:59:51.738+0000")
}
},
"sort" : {
"messageId" : -1.0,
"writtenAt" : -1.0
},
"queryPlanner" : {
"plannerVersion" : 1.0,
"namespace" : "db.messages",
"indexFilterSet" : false,
"parsedQuery" : {
"updatedAt" : {
"$gte" : ISODate("2021-02-26T06:59:51.738+0000")
}
},
"queryHash" : "3141BBC5",
"planCacheKey" : "6858F892",
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"updatedAt" : {
"$gte" : ISODate("2021-02-26T06:59:51.738+0000")
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"messageId" : -1.0,
"writtenAt" : -1.0
},
"indexName" : "idx_messageId",
"isMultiKey" : false,
"multiKeyPaths" : {
"messageId" : [
],
"writtenAt" : [
]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2.0,
"direction" : "forward",
"indexBounds" : {
"messageId" : [
"[MaxKey, MinKey]"
],
"writtenAt" : [
"[MaxKey, MinKey]"
]
}
}
},
"rejectedPlans" : [
]
}
}
},
{
"$group" : {
"_id" : "$messageId",
"doc" : {
"$first" : "$$ROOT"
}
}
},
{
"$replaceRoot" : {
"newRoot" : "$doc"
}
}
]
Any idea how I can improve this? After retrieving the latest messages by messageId, I am planning to do some pagination slicing afterwards.
***Removed
I have the MongoDB aggregation query
// Text-search for "STORAGE TYPE", count records per
// (doc_type, title, player_name, player_type, country_code) group, then
// return the 10 groups with the highest counts.
db.data.aggregate([{ "$match" : { "$text" : { "$search" : "STORAGE TYPE" } } },
// "player_type" is the constant string "INSTITUTION" for every group key.
{ "$group" :
{ "_id" :{"doc_type": "$doc_type" ,"title" : "$title", "player_name" : "$player_name", "player_type" : "INSTITUTION", "country_code" :"$country_code" },
"number_records" : { "$sum" : 1}
}
},
// NOTE(review): after $group the grouped fields live under _id, so this
// filter addresses a top-level doc_type rather than _id.doc_type - confirm.
{"$match" : {"doc_type": "PATENT"} },
{"$sort":{"number_records" : -1}},
{"$limit" : 10}],
// NOTE(review): the key is spelled "allowDiskuse" here; the documented
// option name is "allowDiskUse" - confirm.
{"allowDiskuse" : true}
)
When I tried to execute the above code, it keeps on buffering for a long time, I am not getting any output. Can anyone help me?
When I used command explain(), it shows the following code:
{
"stages" : [
{
"$cursor" : {
"query" : {
"$and" : [
{
"$text" : {
"$search" : "STORAGE TYPE"
}
},
{
"doc_type" : "PATENT"
}
]
},
"fields" : {
"country_code" : 1,
"doc_type" : 1,
"player_name" : 1,
"title" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "datadocuments.data",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"doc_type" : {
"$eq" : "PATENT"
}
},
{
"$text" : {
"$search" : "STORAGE TYPE",
"$language" : "english",
"$caseSensitive" : false,
"$diacriticSensitive" : false
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"doc_type" : {
"$eq" : "PATENT"
}
},
"inputStage" : {
"stage" : "TEXT",
"indexPrefix" : {
},
"indexName" : "title",
"parsedTextQuery" : {
"terms" : [
"storag",
"type"
],
"negatedTerms" : [ ],
"phrases" : [ ],
"negatedPhrases" : [ ]
},
"textIndexVersion" : 3,
"inputStage" : {
"stage" : "TEXT_MATCH",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "OR",
"inputStages" : [
{
"stage" : "IXSCAN",
"keyPattern" : {
"_fts" : "text",
"_ftsx" : 1
},
"indexName" : "title",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "backward",
"indexBounds" : {
}
},
{
"stage" : "IXSCAN",
"keyPattern" : {
"_fts" : "text",
"_ftsx" : 1
},
"indexName" : "title",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "backward",
"indexBounds" : {
}
}
]
}
}
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : {
"doc_type" : "$doc_type",
"title" : "$title",
"player_name" : "$player_name",
"player_type" : {
"$const" : "INSTITUTION"
},
"country_code" : "$country_code"
},
"number_records" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$sort" : {
"sortKey" : {
"number_records" : -1
},
"limit" : NumberLong("10")
}
}
],
"ok" : 1
}
I couldn't figure out the mistake; is there any problem in aggregation, if not, how to increase the performance?
Your error comes from your second $match stage: at that point in the pipeline, doc_type no longer exists as a top-level field; after $group it is available as _id.doc_type instead. However, you had better merge this stage into the first one, which improves performance by reducing the number of documents passed to the $group stage.
Your improved query will be :
// Text-search for "STORAGE TYPE" restricted to PATENT documents, count
// records per (doc_type, title, player_name, player_type, country_code)
// group, and return the 10 groups with the highest counts.
db.data.aggregate(
    [
        // Filter on doc_type in the same $match as $text (which must be the
        // first pipeline stage) so fewer documents reach $group.
        { "$match": { "$text": { "$search": "STORAGE TYPE" }, "doc_type": "PATENT" } },
        // "player_type" is the constant string "INSTITUTION" in every key.
        {
            "$group": {
                "_id": {
                    "doc_type": "$doc_type",
                    "title": "$title",
                    "player_name": "$player_name",
                    "player_type": "INSTITUTION",
                    "country_code": "$country_code"
                },
                "number_records": { "$sum": 1 }
            }
        },
        // Largest groups first, then keep the top 10.
        { "$sort": { "number_records": -1 } },
        { "$limit": 10 }
    ],
    // Option names are case-sensitive: the documented key is allowDiskUse.
    { "allowDiskUse": true }
)
I have two MongoDB queries. The only difference between them is the value of the merchantId field, yet the two queries give me different winning plans.
First Query
// Explain the plan for: merchantId 940, created within
// [1527465600000, 1527551999000], REQUESTOR not "MOTO", and paymentMode not
// "UPI" or bankCode not "GTEZ"; sorted by _id descending, limited to 200.
db.transactions.find({"created":{"$gte":1527465600000,"$lte":1527551999000},"merchantId":940,"additionalInformation.REQUESTOR":{"$ne":"MOTO"},"$or":[{"paymentMode":{"$ne":"UPI"}},{"bankCode":{"$ne":"GTEZ"}}]}).sort({ _id: -1 }).limit(200).explain()
Output of above query
{
"queryPlanner" : {
"plannerVersion" : 1,
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"$or" : [
{
"$not" : {
"bankCode" : {
"$eq" : "GTEZ"
}
}
},
{
"$not" : {
"paymentMode" : {
"$eq" : "UPI"
}
}
}
]
},
{
"merchantId" : {
"$eq" : 940
}
},
{
"created" : {
"$lte" : 1527551999000
}
},
{
"created" : {
"$gte" : 1527465600000
}
},
{
"$not" : {
"additionalInformation.REQUESTOR" : {
"$eq" : "MOTO"
}
}
}
]
},
"winningPlan" : {
"stage" : "SORT",
"sortPattern" : {
"_id" : -1
},
"limitAmount" : 200,
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"$not" : {
"bankCode" : {
"$eq" : "GTEZ"
}
}
},
{
"$not" : {
"paymentMode" : {
"$eq" : "UPI"
}
}
}
]
},
{
"$not" : {
"additionalInformation.REQUESTOR" : {
"$eq" : "MOTO"
}
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"created" : 1,
"merchantId" : 1
},
"indexName" : "created_1_merchantId_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"created" : [
"[1527465600000.0, 1527551999000.0]"
],
"merchantId" : [
"[940.0, 940.0]"
]
}
}
}
}
}
},
"serverInfo" : {
},
"ok" : 1
}
Second Query
// Explain the plan for: merchantId 1429, created within
// [1527465600000, 1527551999000], REQUESTOR not "MOTO", and paymentMode not
// "UPI" or bankCode not "GTEZ"; sorted by _id descending, limited to 200.
db.transactions.find({"created":{"$gte":1527465600000,"$lte":1527551999000},"merchantId":1429,"additionalInformation.REQUESTOR":{"$ne":"MOTO"},"$or":[{"paymentMode":{"$ne":"UPI"}},{"bankCode":{"$ne":"GTEZ"}}]}).sort({ _id: -1 }).limit(200).explain()
Output of above query
{
"queryPlanner" : {
"plannerVersion" : 1,
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"$or" : [
{
"$not" : {
"bankCode" : {
"$eq" : "GTEZ"
}
}
},
{
"$not" : {
"paymentMode" : {
"$eq" : "UPI"
}
}
}
]
},
{
"merchantId" : {
"$eq" : 1429
}
},
{
"created" : {
"$lte" : 1527551999000
}
},
{
"created" : {
"$gte" : 1527465600000
}
},
{
"$not" : {
"additionalInformation.REQUESTOR" : {
"$eq" : "MOTO"
}
}
}
]
},
"winningPlan" : {
"stage" : "LIMIT",
"limitAmount" : 200,
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"$not" : {
"bankCode" : {
"$eq" : "GTEZ"
}
}
},
{
"$not" : {
"paymentMode" : {
"$eq" : "UPI"
}
}
}
]
},
{
"merchantId" : {
"$eq" : 1429
}
},
{
"created" : {
"$lte" : 1527551999000
}
},
{
"created" : {
"$gte" : 1527465600000
}
},
{
"$not" : {
"additionalInformation.REQUESTOR" : {
"$eq" : "MOTO"
}
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"_id" : 1
},
"indexName" : "_id_",
"isMultiKey" : false,
"isUnique" : true,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "backward",
"indexBounds" : {
"_id" : [
"[MaxKey, MinKey]"
]
}
}
}
}
},
"serverInfo" : {
},
"ok" : 1
}
As you can see, the only parameter that differs is merchantId, yet explain gives a different winning plan, and the IXSCAN stages show different indexes being used. In the first query the created_1_merchantId_1 index is used, and in the second query the `_id_` index is used. The first query takes 40 seconds to return results, while the second takes 1 second. Quick help would be highly appreciated.
I am new to MongoDB and am trying to get a distinct count of users. The fields Id and Status are not individually indexed, but there is a compound index on both fields. My current query looks something like this; the match conditions change depending on the requirements.
// Raise the shell's batch size so up to 1,000,000 results are shown at once.
DBQuery.shellBatchSize = 1000000;
// Count the documents per Id among those whose Status is "A".
db.getCollection('username').aggregate([
{$match:
{ Status: "A"
} },
{"$group" : {_id:"$Id", count:{$sum:1}}}
]);
Is there any way we can optimize this query further, or run it in parallel on the collection, so that we get results faster?
Regards
You can tune your aggregation pipelines by passing the option `explain: true` to the aggregate method.
// Ask the server to describe how it would execute the pipeline
// (per-Id counts of documents with Status "A") instead of running it.
db.getCollection('username').aggregate(
    [
        { $match: { Status: "A" } },
        { $group: { _id: "$Id", count: { $sum: 1 } } }
    ],
    { explain: true }
);
This will then output the following to work with
{
"stages" : [
{
"$cursor" : {
"query" : {
"Status" : "A"
},
"fields" : {
"Id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.usernames",
"indexFilterSet" : false,
"parsedQuery" : {
"Status" : {
"$eq" : "A"
}
},
"winningPlan" : {
"stage" : "EOF"
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$Id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
So, to speed up our query, we need an index to help the match part of the pipeline. Let's create an index on Status:
> db.usernames.createIndex({Status:1})
{
"createdCollectionAutomatically" : true,
"numIndexesBefore" : 1,
"numIndexesAfter" : 2,
"ok" : 1
}
If we now run the explain again we'll get the following results
{
"stages" : [
{
"$cursor" : {
"query" : {
"Status" : "A"
},
"fields" : {
"Id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.usernames",
"indexFilterSet" : false,
"parsedQuery" : {
"Status" : {
"$eq" : "A"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"Status" : 1
},
"indexName" : "Status_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"Status" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"Status" : [
"[\"A\", \"A\"]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$Id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
We can now see straight away that this is using an index.
https://docs.mongodb.com/manual/reference/explain-results/
My aggregation is pretty slow. I've already made it a little faster (from 3000 ms to 200 ms) by putting the $match stage before the $unwind stage. Is there any other way to improve my aggregation? In the end there will be just one result (the last one based on timestamp). The $unwind part is the longest operation, if I'm right, yet I really do need it.
// Average the per-CPU load of host 'baklap4' in buckets of 300 timestamp
// units and return only the bucket with the latest timestamp.
db.CpuInfo.aggregate([
// Stage 1: filter first so only this host's documents at or after the cutoff
// are unwound.
{"$match":
{
"timestamp": {"$gte":1464764400},
'hostname': 'baklap4'
}
},
// Stage 2: emit one document per element of the cpuList array.
{ "$unwind": "$cpuList" },
// Stage 3: bucket key = timestamp rounded down to a multiple of 300
// (presumably epoch seconds, i.e. 5-minute buckets - TODO confirm).
{ "$group":
{ "_id":
{ "interval":
{ "$subtract": [
"$timestamp",
{ "$mod": [ "$timestamp", 60 * 5 ] }
]}
},
"avgCPULoad": { "$avg": "$cpuList.load" },
"timestamp": { "$max": "$timestamp" }
}
},
// Stage 4: drop the bucket key; keep the average load and the newest timestamp.
{ "$project": { "_id": 0, "avgCPULoad": 1, "timestamp": 1 } },
// Stages 5-6: newest bucket first, then keep just that one.
{$sort: {'timestamp': -1}},
{$limit: 1}
])
The items in my collection are all similar to this:
{
"_id": ObjectId("574d6175da461e77030041b7"),
"hostname": "VPS",
"timestamp": NumberLong(1460040691),
"cpuCores": NumberLong(2),
"cpuList": [
{
"name": "cpu1",
"load": 3.4
},
{
"name": "cpu2",
"load": 0.7
}
]
}
I've added the explain option to my aggregation and this is the result:
{
"waitedMS" : NumberLong(0),
"stages" : [
{
"$cursor" : {
"query" : {
"timestamp" : {
"$gte" : 1464732000
},
"hostname" : "baklap4"
},
"fields" : {
"cpuList" : 1,
"timestamp" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "prototyping.CpuInfo",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"hostname" : {
"$eq" : "baklap4"
}
},
{
"timestamp" : {
"$gte" : 1464732000
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"hostname" : {
"$eq" : "baklap4"
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"timestamp" : NumberLong(1)
},
"indexName" : "timestamp_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"timestamp" : [
"[1464732000.0, inf.0]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$unwind" : {
"path" : "$cpuList"
}
},
{
"$group" : {
"_id" : {
"interval" : {
"$subtract" : [
"$timestamp",
{
"$mod" : [
"$timestamp",
{
"$const" : 300
}
]
}
]
}
},
"avgCPULoad" : {
"$avg" : "$cpuList.load"
},
"timestamp" : {
"$max" : "$timestamp"
}
}
},
{
"$project" : {
"_id" : false,
"timestamp" : true,
"avgCPULoad" : true
}
},
{
"$sort" : {
"sortKey" : {
"timestamp" : -1
},
"limit" : NumberLong(1)
}
}
],
"ok" : 1
}
When I look up the indexes on my collection, I see that timestamp and _id are indexed:
db.CpuInfo.getIndexes()
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "prototyping.CpuInfo"
},
{
"v" : 1,
"key" : {
"timestamp" : NumberLong(1)
},
"name" : "timestamp_1",
"ns" : "prototyping.CpuInfo",
"sparse" : false
}
]