My aggregation is pretty slow. I've already made it a little faster (from 3000 ms to 200 ms) by placing the $match stage before the $unwind stage. Is there any other way to improve my aggregation? In the end there will be just one result (the latest one based on timestamp). If I'm right, the $unwind stage is the slowest operation, yet I really do need it.
db.CpuInfo.aggregate([
{"$match":
{
"timestamp": {"$gte":1464764400},
'hostname': 'baklap4'
}
},
{ "$unwind": "$cpuList" },
{ "$group":
{ "_id":
{ "interval":
{ "$subtract": [
"$timestamp",
{ "$mod": [ "$timestamp", 60 * 5 ] }
]}
},
"avgCPULoad": { "$avg": "$cpuList.load" },
"timestamp": { "$max": "$timestamp" }
}
},
{ "$project": { "_id": 0, "avgCPULoad": 1, "timestamp": 1 } },
{$sort: {'timestamp': -1}},
{$limit: 1}
])
The items in my collection are all similar to this:
{
"_id": ObjectId("574d6175da461e77030041b7"),
"hostname": "VPS",
"timestamp": NumberLong(1460040691),
"cpuCores": NumberLong(2),
"cpuList": [
{
"name": "cpu1",
"load": 3.4
},
{
"name": "cpu2",
"load": 0.7
}
]
}
I've added the explain option to my aggregation and this is the result:
{
"waitedMS" : NumberLong(0),
"stages" : [
{
"$cursor" : {
"query" : {
"timestamp" : {
"$gte" : 1464732000
},
"hostname" : "baklap4"
},
"fields" : {
"cpuList" : 1,
"timestamp" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "prototyping.CpuInfo",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"hostname" : {
"$eq" : "baklap4"
}
},
{
"timestamp" : {
"$gte" : 1464732000
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"hostname" : {
"$eq" : "baklap4"
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"timestamp" : NumberLong(1)
},
"indexName" : "timestamp_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"timestamp" : [
"[1464732000.0, inf.0]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$unwind" : {
"path" : "$cpuList"
}
},
{
"$group" : {
"_id" : {
"interval" : {
"$subtract" : [
"$timestamp",
{
"$mod" : [
"$timestamp",
{
"$const" : 300
}
]
}
]
}
},
"avgCPULoad" : {
"$avg" : "$cpuList.load"
},
"timestamp" : {
"$max" : "$timestamp"
}
}
},
{
"$project" : {
"_id" : false,
"timestamp" : true,
"avgCPULoad" : true
}
},
{
"$sort" : {
"sortKey" : {
"timestamp" : -1
},
"limit" : NumberLong(1)
}
}
],
"ok" : 1
}
When I look at my collection's indexes, I see that timestamp and _id are indexed:
db.CpuInfo.getIndexes()
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "prototyping.CpuInfo"
},
{
"v" : 1,
"key" : {
"timestamp" : NumberLong(1)
},
"name" : "timestamp_1",
"ns" : "prototyping.CpuInfo",
"sparse" : false
}
]
Related
I have an aggregation query in MongoDB:
[{
$group: {
_id: '$status',
status: {
$sum: 1
}
}
}]
It is running on a collection that has ~80 million documents. The status field is indexed, yet the query is very slow and runs for around 60 seconds or more.
I did an explain() on the query, but still got almost nowhere:
{
"explainVersion" : "1",
"stages" : [
{
"$cursor" : {
"queryPlanner" : {
"namespace" : "loa.document",
"indexFilterSet" : false,
"parsedQuery" : {
},
"queryHash" : "B9878693",
"planCacheKey" : "8EAA28C6",
"maxIndexedOrSolutionsReached" : false,
"maxIndexedAndSolutionsReached" : false,
"maxScansToExplodeReached" : false,
"winningPlan" : {
"stage" : "PROJECTION_SIMPLE",
"transformBy" : {
"status" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "COLLSCAN",
"direction" : "forward"
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$status",
"status" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"serverInfo" : {
"host" : "rack-compute-2",
"port" : 27017,
"version" : "5.0.6",
"gitVersion" : "212a8dbb47f07427dae194a9c75baec1d81d9259"
},
"serverParameters" : {
"internalQueryFacetBufferSizeBytes" : 104857600,
"internalQueryFacetMaxOutputDocSizeBytes" : 104857600,
"internalLookupStageIntermediateDocumentMaxSizeBytes" : 104857600,
"internalDocumentSourceGroupMaxMemoryBytes" : 104857600,
"internalQueryMaxBlockingSortMemoryUsageBytes" : 104857600,
"internalQueryProhibitBlockingMergeOnMongoS" : 0,
"internalQueryMaxAddToSetBytes" : 104857600,
"internalDocumentSourceSetWindowFieldsMaxMemoryBytes" : 104857600
},
"command" : {
"aggregate" : "document",
"pipeline" : [
{
"$group" : {
"_id" : "$status",
"status" : {
"$sum" : 1
}
}
}
],
"explain" : true,
"cursor" : {
},
"lsid" : {
"id" : UUID("a07e17fe-65ff-4d38-966f-7517b7a5d3f2")
},
"$db" : "loa"
},
"ok" : 1
}
I see that it does a full COLLSCAN, I just can't understand why.
I plan on supporting a couple hundred million (or even a billion) documents in that collection, but this problem hijacks my plans for seemingly no reason.
You can advise the query planner to use the index as follows:
db.test.explain("executionStats").aggregate(
[
{$group:{ _id:"$status" ,status:{$sum:1} }}
],
{hint:"status_1"}
)
Make sure the index name in the hint is the same as the one that was created
(db.test.getIndexes() will show you the exact index name).
{"messageId": "123124", "writtenAt":"2017-04-26T15:16:36.200Z", "updatedAt":"2999-12-31T23:59:59.999Z"}
{"messageId": "123124", "writtenAt":"2017-04-26T15:21:30.230Z", "updatedAt":"2999-12-31T23:59:59.999Z"}
The structure of the collection is shown above. Aside from the Mongo _id, each document has an id called 'messageId'. The collection can contain multiple entries with the same 'messageId' but different 'writtenAt' values. There is a compound index: messageId (desc), writtenAt (desc).
Now I want to group by messageId so that I only get the latest entry (max writtenAt value) for each one. I have the following query, but it takes very long; I haven't even gotten a result yet (I stopped it after more than 10 minutes; the collection has over 1.3 million records):
db.messages.aggregate(
[{ "$match": { "updatedAt": { "$gte": { "$date": "2021-02-26T06:59:51.738Z" } } } },
{ "$sort": { "messageId": -1, "writtenAt": -1 } },
{ "$group": { "_id": "$messageId", "doc": { "$first": "$$ROOT" } } },
{ "$replaceRoot" : { "newRoot" : "$doc"}}
], {allowDiskUse: true});
If I add an explain with executionStats, I can see it's picking up the index:
[
{
"$cursor" : {
"query" : {
"updatedAt" : {
"$gte" : ISODate("2021-02-26T06:59:51.738+0000")
}
},
"sort" : {
"messageId" : -1.0,
"writtenAt" : -1.0
},
"queryPlanner" : {
"plannerVersion" : 1.0,
"namespace" : "db.messages",
"indexFilterSet" : false,
"parsedQuery" : {
"updatedAt" : {
"$gte" : ISODate("2021-02-26T06:59:51.738+0000")
}
},
"queryHash" : "3141BBC5",
"planCacheKey" : "6858F892",
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"updatedAt" : {
"$gte" : ISODate("2021-02-26T06:59:51.738+0000")
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"messageId" : -1.0,
"writtenAt" : -1.0
},
"indexName" : "idx_messageId",
"isMultiKey" : false,
"multiKeyPaths" : {
"messageId" : [
],
"writtenAt" : [
]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2.0,
"direction" : "forward",
"indexBounds" : {
"messageId" : [
"[MaxKey, MinKey]"
],
"writtenAt" : [
"[MaxKey, MinKey]"
]
}
}
},
"rejectedPlans" : [
]
}
}
},
{
"$group" : {
"_id" : "$messageId",
"doc" : {
"$first" : "$$ROOT"
}
}
},
{
"$replaceRoot" : {
"newRoot" : "$doc"
}
}
]
Any idea how I can improve this? After retrieving the latest messages by messageId, I plan to do some pagination slicing.
***Removed
I have the following piece of Java Spring MongoDB code:
startTime = System.currentTimeMillis();
AggregationResults<MyClass> list = mongoTemplate.aggregate(Aggregation.newAggregation(operations),
"Post", MyClass.class);
System.out.println("Time taken for query execution -> "
+ (System.currentTimeMillis() - startTime));
When I test this code using JMeter, the first execution shows:
Time taken for query execution -> 3275 ('list' has 16 records)
On the 2nd and subsequent requests it looks like:
Time taken for query execution -> 355 ('list' has 16 records)
The time difference is huge. How can I improve the first call?
When I call Aggregation.newAggregation(operations).toString() I get the following query output. Running the following aggregation query in the shell always takes around 0.350 sec.
{
"aggregate": "__collection__",
"pipeline": [
{
"$match": {
"$and": [
{
"postType": "AUTOMATIC"
}
]
}
},
{
"$project": {
"orders.id": 1,
"postedTotals": 1
}
},
{
"$unwind": "$orders"
},
{
"$group": {
"_id": "$orders.userId",
"ae": {
"$addToSet": "$orders.userId"
}
}
},
{
"$sort": {
"ae": 1
}
}
]
}
.explain().aggregate( shows the following:
/* 1 */
{
"stages" : [
{
"$cursor" : {
"query" : {
"$and" : [
{
"postType" : "AUTOMATIC"
}
]
},
"fields" : {
"headerPostedTotals" : 1,
"orders.UserId" : 1,
"_id" : 1
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "post",
"indexFilterSet" : false,
"parsedQuery" : {
"postType" : {
"$eq" : "AUTOMATIC"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"postType" : 1,
"orders.UserId" : 1,
"orders.flightStartDateForQuery" : 1,
"orders.flightEndDateForQuery" : 1,
"postRunDate" : -1
},
"indexName" : "default_filter_index",
"isMultiKey" : true,
"multiKeyPaths" : {
"postType" : [],
"orders.UserId" : [
"orders"
],
"orders.flightStartDateForQuery" : [
"orders"
],
"orders.flightEndDateForQuery" : [
"orders"
],
"postRunDate" : []
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"postType" : [
"[\"AUTOMATIC\", \"AUTOMATIC\"]"
],
"orders.UserId" : [
"[MinKey, MaxKey]"
],
"orders.flightStartDateForQuery" : [
"[MinKey, MaxKey]"
],
"orders.flightEndDateForQuery" : [
"[MinKey, MaxKey]"
],
"postRunDate" : [
"[MaxKey, MinKey]"
]
}
}
},
"rejectedPlans" : []
}
}
},
{
"$project" : {
"_id" : true,
"headerPostedTotals" : true,
"orders" : {
"UserId" : true
}
}
},
{
"$unwind" : {
"path" : "$orders"
}
},
{
"$group" : {
"_id" : "$orders.UserId",
"aes" : {
"$addToSet" : "$orders.UserId"
}
}
},
{
"$sort" : {
"sortKey" : {
"aes" : 1
}
}
}
],
"ok" : 1.0
}
I have a collection containing many product documents that have this structure. Each document represents a cartesian product record with a resulting product price.
{
"name": "PRD_SV_HB2_SVH",
"criterias": [
{
"type": "PREMIUM_REGION",
"value": "COD_RP_KZH"
},
{
"type": "ACCIDENT",
"value": "COD_UZ_EIN"
},
{
"type": "AGE_GROUP",
"value": "COD_LA_G36"
},
{
"type": "PRICE_MODEL",
"value": "COD_TM_HO2"
},
{
"type": "PRICE_TABLE",
"value": "PRT_SU_HB2_V001_2009010"
},
{
"type": "DEDUCTIBLE",
"value": "COD_SB_HO4"
}
],
"price": {
"pricingElements": {
"BASE_PRICE": {
"currency": "CHF",
"amount": 67.8
}
}
},
"priceType": "STANDARD",
"_class": "a.b.c.Product"
}
When querying the collection for a unique cartesian product record I use the following query:
db.product.find({ "name": "PRD_SV_HB2_SVH", "$and": [
{ "criterias": { "$elemMatch": { "value": "COD_LA_G36" } } },
{ "criterias": { "$elemMatch": { "value": "COD_SB_HO4" } } },
{ "criterias": { "$elemMatch": { "value": "COD_UZ_EIN" } } },
{ "criterias": { "$elemMatch": { "value": "COD_RP_KZH" } } },
{ "criterias": { "$elemMatch": { "value": "COD_TM_HO2" } } },
{ "criterias": { "$elemMatch": { "value": "PRT_SU_HB2_V001_2009010" } } }
]
})
The query takes more than 2 seconds to produce a result, which is not satisfactory. When I run explain on the same query, I can see that MongoDB uses the index name, but it does not use the dedicated index name_value for this query.
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "productEngine.product",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_LA_G36"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_SB_HO4"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_UZ_EIN"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_RP_KZH"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_TM_HO2"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "PRT_SU_HB2_V001_2009010"
}
}
}
},
{
"name" : {
"$eq" : "PRD_SV_HB2_SVH"
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_LA_G36"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_SB_HO4"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_UZ_EIN"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_RP_KZH"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_TM_HO2"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "PRT_SU_HB2_V001_2009010"
}
}
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"name" : 1
},
"indexName" : "name",
"isMultiKey" : false,
"multiKeyPaths" : {
"name" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"name" : [
"[\"PRD_SV_HB2_SVH\", \"PRD_SV_HB2_SVH\"]"
]
}
}
},
"rejectedPlans" : [
{
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_LA_G36"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_SB_HO4"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_UZ_EIN"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_RP_KZH"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_TM_HO2"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "PRT_SU_HB2_V001_2009010"
}
}
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"name" : 1,
"criteria.value" : 1
},
"indexName" : "name_value",
"isMultiKey" : false,
"multiKeyPaths" : {
"name" : [ ],
"criteria.value" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"name" : [
"[\"PRD_SV_HB2_SVH\", \"PRD_SV_HB2_SVH\"]"
],
"criteria.value" : [
"[MinKey, MaxKey]"
]
}
}
},
{
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_LA_G36"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_SB_HO4"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_UZ_EIN"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_RP_KZH"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "COD_TM_HO2"
}
}
}
},
{
"criterias" : {
"$elemMatch" : {
"value" : {
"$eq" : "PRT_SU_HB2_V001_2009010"
}
}
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"name" : 1,
"priceType" : 1,
"criteria.value" : 1
},
"indexName" : "name_priceType_value",
"isMultiKey" : false,
"multiKeyPaths" : {
"name" : [ ],
"priceType" : [ ],
"criteria.value" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"name" : [
"[\"PRD_SV_HB2_SVH\", \"PRD_SV_HB2_SVH\"]"
],
"priceType" : [
"[MinKey, MaxKey]"
],
"criteria.value" : [
"[MinKey, MaxKey]"
]
}
}
}
]
},
"serverInfo" : {
"host" : "1a63040d1b73",
"port" : 27018,
"version" : "3.4.10",
"gitVersion" : "078f28920cb24de0dd479b5ea6c66c644f6326e9"
},
"ok" : 1
}
I currently created the name_value compound index like this:
{"name":1, "criteria.value":1}
Is this the correct way to create a compound index on nested document fields, or am I missing something here? Why isn't it using the name_value index?
I am new to Mongo, and the query below performs really slowly on a collection of over 2 million records.
Query
db.testCollection.aggregate({
$match: {
active: {
$ne: false
}
}
}, {
$group: {
_id: {
productName: "$productName",
model: "$model",
version: "$version",
uid: "$uid"
},
total: {
$sum: 1
}
}
}, {
$project: {
total: 1,
model: "$_id.model",
version: "$_id.version",
uid: "$_id.uid",
productName: "$_id.productName"
}
}, {
$sort: {
model: 1
}
})
explain()
{
"stages" : [
{
"$cursor" : {
"query" : {
"active" : {
"$ne" : false
}
},
"fields" : {
"version" : 1,
"productName" : 1,
"model" : 1,
"uid" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "fms2.device",
"indexFilterSet" : false,
"parsedQuery" : {
"$nor" : [
{
"active" : {
"$eq" : false
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"active" : 1
},
"indexName" : "active",
"isMultiKey" : false,
"multiKeyPaths" : {
"active" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"active" : [
"[MinKey, false)",
"(false, MaxKey]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : {
"productName" : "$productName",
"model" : "$model",
"version" : "$version",
"uid" : "$uid"
},
"total" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : true,
"total" : true,
"model" : "$_id.model",
"version" : "$_id.version",
"uid" : "$_id.uid",
"productName" : "$_id.productName"
}
},
{
"$sort" : {
"sortKey" : {
"model" : 1
}
}
}
],
"ok" : 1
}
Is there a way to optimize this query further? I had a look at https://docs.mongodb.com/manual/core/aggregation-pipeline-optimization/ as well, but most of the suggestions there are not applicable to this query.
Not sure if it matters, but the result of this aggregation ends up with only 20-30 records.