Realtime Mongo aggregation is slow

I am developing the dashboard of our application and I was wondering if it is normal for MongoDB to be so slow for some aggregate queries. I am pasting below a simplified version of one of our queries:
db.data_items.aggregate([
  {'$match': {
    'organization_subscription_waves.wave_id': ObjectId('5617fe6abecf0500f9c6e125')
  }},
  {'$group':
    {'_id': '$data_type_value_type', 'count': {'$sum': 1}}
  }
])
The result is this:
{ "_id" : "PHOTO", "count" : 76 }
{ "_id" : "MULTI_SELECT", "count" : 1607 }
{ "_id" : "TIME", "count" : 659 }
{ "_id" : "MULTIPLE_CHOICE", "count" : 78321 }
{ "_id" : "DATE", "count" : 649 }
{ "_id" : "NUMBER", "count" : 2679 }
and takes over two minutes, which I believe is too long for such a simple aggregation. I am wondering if there is anything I can do to improve the query's efficiency, or whether I have to do some offline optimization to improve performance.
Among others, I have this index on the collection:
{
"v" : 1,
"name" : "organization_subscription_waves.wave_id_1",
"key" : {
"organization_subscription_waves.wave_id" : 1
},
"ns" : "gigwalk_apps_1.data_items"
},
Yet I am not sure whether it is being used. explain() gives the following output:
{
"stages" : [
{
"$cursor" : {
"query" : {
"organization_subscription_waves.wave_id" : ObjectId("5617fe6abecf0500f9c6e125")
},
"fields" : {
"data_type_value_type" : 1,
"_id" : 0
},
"plan" : {
"cursor" : "BtreeCursor ",
"isMultiKey" : false,
"scanAndOrder" : false,
"indexBounds" : {
"organization_subscription_waves.wave_id" : [
[
ObjectId("5617fe6abecf0500f9c6e125"),
ObjectId("5617fe6abecf0500f9c6e125")
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor ",
"isMultiKey" : false,
"scanAndOrder" : false,
"indexBounds" : {
"organization_subscription_waves.wave_id" : [
[
ObjectId("5617fe6abecf0500f9c6e125"),
ObjectId("5617fe6abecf0500f9c6e125")
]
]
}
}
]
}
}
},
{
"$group" : {
"_id" : "$data_type_value_type",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
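The explain output above also shows that only data_type_value_type survives the index scan, so every matching document still has to be fetched just to read that one field. One option worth testing (not part of the original post) is a compound index that includes the grouped field as well, so that, depending on the server version, the $match + $group can be served from index keys alone; a sketch:
db.data_items.createIndex({
  // hypothetical covering index: the matched field first, then the field $group reads
  "organization_subscription_waves.wave_id": 1,
  "data_type_value_type": 1
})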

Related

MongoDB $group + $sum aggregation is very slow

I have an aggregation query in MongoDB:
[{
$group: {
_id: '$status',
status: {
$sum: 1
}
}
}]
It is running on a collection that has ~80 million documents. The status field is indexed, yet the query is very slow and runs for around 60 seconds or more.
I did an explain() on the query, but still got almost nowhere:
{
"explainVersion" : "1",
"stages" : [
{
"$cursor" : {
"queryPlanner" : {
"namespace" : "loa.document",
"indexFilterSet" : false,
"parsedQuery" : {
},
"queryHash" : "B9878693",
"planCacheKey" : "8EAA28C6",
"maxIndexedOrSolutionsReached" : false,
"maxIndexedAndSolutionsReached" : false,
"maxScansToExplodeReached" : false,
"winningPlan" : {
"stage" : "PROJECTION_SIMPLE",
"transformBy" : {
"status" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "COLLSCAN",
"direction" : "forward"
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$status",
"status" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"serverInfo" : {
"host" : "rack-compute-2",
"port" : 27017,
"version" : "5.0.6",
"gitVersion" : "212a8dbb47f07427dae194a9c75baec1d81d9259"
},
"serverParameters" : {
"internalQueryFacetBufferSizeBytes" : 104857600,
"internalQueryFacetMaxOutputDocSizeBytes" : 104857600,
"internalLookupStageIntermediateDocumentMaxSizeBytes" : 104857600,
"internalDocumentSourceGroupMaxMemoryBytes" : 104857600,
"internalQueryMaxBlockingSortMemoryUsageBytes" : 104857600,
"internalQueryProhibitBlockingMergeOnMongoS" : 0,
"internalQueryMaxAddToSetBytes" : 104857600,
"internalDocumentSourceSetWindowFieldsMaxMemoryBytes" : 104857600
},
"command" : {
"aggregate" : "document",
"pipeline" : [
{
"$group" : {
"_id" : "$status",
"status" : {
"$sum" : 1
}
}
}
],
"explain" : true,
"cursor" : {
},
"lsid" : {
"id" : UUID("a07e17fe-65ff-4d38-966f-7517b7a5d3f2")
},
"$db" : "loa"
},
"ok" : 1
}
I see that it does a full COLLSCAN; I just can't understand why.
I plan on supporting a couple hundred million (or even a billion) documents in that collection, but this problem derails my plans for seemingly no reason.
You can advise the query planner to use the index as follows:
db.test.explain("executionStats").aggregate(
[
{$group:{ _id:"$status" ,status:{$sum:1} }}
],
{hint:"status_1"}
)
Make sure the index name in the hint is the same as the one created
(db.test.getIndexes() will show you the exact index name).
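For reference, if the existing index was created without an explicit name, the shell derives the default name from the key pattern, which is what the hint above refers to; a minimal sketch:
// the default name for this key pattern is "status_1"
db.test.createIndex({ status: 1 })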

Mongo aggregation performance

I am new to Mongo, and the query below performs really slowly on a record set of over 2 million records
Query
db.testCollection.aggregate([
  {
    $match: {
      active: { $ne: false }
    }
  },
  {
    $group: {
      _id: {
        productName: "$productName",
        model: "$model",
        version: "$version",
        uid: "$uid"
      },
      total: { $sum: 1 }
    }
  },
  {
    $project: {
      total: 1,
      model: "$_id.model",
      version: "$_id.version",
      uid: "$_id.uid",
      productName: "$_id.productName"
    }
  },
  {
    $sort: { model: 1 }
  }
])
explain()
{
"stages" : [
{
"$cursor" : {
"query" : {
"active" : {
"$ne" : false
}
},
"fields" : {
"version" : 1,
"productName" : 1,
"model" : 1,
"uid" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "fms2.device",
"indexFilterSet" : false,
"parsedQuery" : {
"$nor" : [
{
"active" : {
"$eq" : false
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"active" : 1
},
"indexName" : "active",
"isMultiKey" : false,
"multiKeyPaths" : {
"active" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"active" : [
"[MinKey, false)",
"(false, MaxKey]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : {
"productName" : "$productName",
"model" : "$model",
"version" : "$version",
"uid" : "$uid"
},
"total" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : true,
"total" : true,
"model" : "$_id.model",
"version" : "$_id.version",
"uid" : "$_id.uid",
"productName" : "$_id.productName"
}
},
{
"$sort" : {
"sortKey" : {
"model" : 1
}
}
}
],
"ok" : 1
}
Is there a way to optimize this query further? I had a look at https://docs.mongodb.com/manual/core/aggregation-pipeline-optimization/ as well, but most of the suggestions there are not applicable to this query.
Not sure if it matters, but the result of this aggregation ends up with only 20-30 records.
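One idea not mentioned in the thread: the winning plan above is an IXSCAN on active followed by a FETCH of roughly two million documents just to read the four grouped fields. A compound index that also contains those fields may let the planner avoid the FETCH (whether the scan ends up fully covered depends on the server version and the $ne bounds); a sketch using the field names from the query:
// hypothetical index covering the $match field plus the $group fields
db.testCollection.createIndex({
  active: 1,
  productName: 1,
  model: 1,
  version: 1,
  uid: 1
})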

MongoDB performs slow query on sum() based on monthly groups

This is what I have tried so far with the aggregation query:
db.getCollection('storage').aggregate([
{
"$match": {
"user_id": 2
}
},
{
"$project": {
"formattedDate": {
"$dateToString": { "format": "%Y-%m", "date": "$created_on" }
},
"size": "$size"
}
},
{ "$group": {
"_id" : "$formattedDate",
"size" : { "$sum": "$size" }
} }
])
This is the result:
/* 1 */
{
"_id" : "2018-02",
"size" : NumberLong(10860595386)
}
/* 2 */
{
"_id" : "2017-12",
"size" : NumberLong(524288)
}
/* 3 */
{
"_id" : "2018-01",
"size" : NumberLong(21587971)
}
And this is the document structure:
{
"_id" : ObjectId("5a59efedd006b9036159e708"),
"user_id" : NumberLong(2),
"is_transferred" : false,
"is_active" : false,
"process_id" : NumberLong(0),
"ratio" : 0.000125759169459343,
"type_id" : 201,
"size" : NumberLong(1687911),
"is_processed" : false,
"created_on" : ISODate("2018-01-13T11:39:25.000Z"),
"processed_on" : ISODate("1970-01-01T00:00:00.000Z")
}
And last, the explain result:
/* 1 */
{
"stages" : [
{
"$cursor" : {
"query" : {
"user_id" : 2.0
},
"fields" : {
"created_on" : 1,
"size" : 1,
"_id" : 1
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "data.storage",
"indexFilterSet" : false,
"parsedQuery" : {
"user_id" : {
"$eq" : 2.0
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1
},
"indexName" : "user_id",
"isMultiKey" : false,
"multiKeyPaths" : {
"user_id" : []
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"[2.0, 2.0]"
]
}
}
},
"rejectedPlans" : []
}
}
},
{
"$project" : {
"_id" : true,
"formattedDate" : {
"$dateToString" : {
"format" : "%Y-%m",
"date" : "$created_on"
}
},
"size" : "$size"
}
},
{
"$group" : {
"_id" : "$formattedDate",
"size" : {
"$sum" : "$size"
}
}
}
],
"ok" : 1.0
}
The problem:
I can navigate and get all results almost instantly, in about 0.002 s. However, when I specify user_id and sum the sizes grouped by month, the result takes between 0.300 s and 0.560 s. I do several similar tasks in one request, so it ends up taking more than a second to finish.
What I tried so far:
I've added an index for user_id
I've added an index for created_on
I used more $match conditions; however, this made it even worse.
This collection currently has almost 200,000 documents in it, and approximately 150,000 of them belong to user_id = 2.
How can I minimize the response time for this query?
Note: MongoDB 3.4.10 used.
Pratha,
try adding a sort on the "created_on" and "size" fields as the first stage in the aggregation pipeline.
db.getCollection('storage').aggregate([
{
"$sort": {
"created_on": 1, "size": 1
}
}, ....
Before that, add a compound index:
db.getCollection('storage').createIndex({created_on:1,size:1})
If you sort the data before the $group stage, it will improve the efficiency of accumulating the totals.
Note about sort aggregation stage:
The $sort stage has a limit of 100 megabytes of RAM. By default, if the stage exceeds this limit, $sort will produce an error. To allow for the handling of large datasets, set the allowDiskUse option to true to enable $sort operations to write to temporary files.
P.S.
Get rid of the $match stage on user_id to test the performance, or add user_id to the compound index as well.
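Putting those suggestions together, the pipeline might look like the sketch below; the allowDiskUse option is only needed if the $sort exceeds the 100 MB limit mentioned above:
db.getCollection('storage').aggregate([
  // suggested $sort added as the first stage, backed by the compound index
  { "$sort": { "created_on": 1, "size": 1 } },
  { "$match": { "user_id": 2 } },
  {
    "$project": {
      "formattedDate": { "$dateToString": { "format": "%Y-%m", "date": "$created_on" } },
      "size": "$size"
    }
  },
  { "$group": { "_id": "$formattedDate", "size": { "$sum": "$size" } } }
], { allowDiskUse: true })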

Tune Up Mongo Query

I am new to Mongo and am trying to get a distinct count of users. The fields Id and Status are not individually indexed, but there is a composite index on both fields. My current query is something like this, where the match conditions change depending on the requirements.
DBQuery.shellBatchSize = 1000000;
db.getCollection('username').aggregate([
{$match:
{ Status: "A"
} },
{"$group" : {_id:"$Id", count:{$sum:1}}}
]);
Is there any way we can optimize this query further, or add parallel runs on the collection, so that we can get results faster?
Regards
You can tune your aggregation pipelines by passing the explain: true option to the aggregate method.
db.getCollection('username').aggregate([
{$match: { Status: "A" } },
{"$group" : {_id:"$Id", count:{$sum:1}}}],
{ explain: true });
This will then output the following to work with:
{
"stages" : [
{
"$cursor" : {
"query" : {
"Status" : "A"
},
"fields" : {
"Id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.usernames",
"indexFilterSet" : false,
"parsedQuery" : {
"Status" : {
"$eq" : "A"
}
},
"winningPlan" : {
"stage" : "EOF"
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$Id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
So, to speed up our query, we need an index to help the match part of the pipeline. Let's create an index on Status:
> db.usernames.createIndex({Status:1})
{
"createdCollectionAutomatically" : true,
"numIndexesBefore" : 1,
"numIndexesAfter" : 2,
"ok" : 1
}
If we now run the explain again, we'll get the following results:
{
"stages" : [
{
"$cursor" : {
"query" : {
"Status" : "A"
},
"fields" : {
"Id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.usernames",
"indexFilterSet" : false,
"parsedQuery" : {
"Status" : {
"$eq" : "A"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"Status" : 1
},
"indexName" : "Status_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"Status" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"Status" : [
"[\"A\", \"A\"]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$Id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
We can now see straight away that this is using an index.
https://docs.mongodb.com/manual/reference/explain-results/

execStats is always empty in MongoDB "aggregate" commands profiling results

I am trying to profile the performance of an aggregation pipeline, specifically checking whether indices are used, how many objects are scanned, etc.
I'm setting the DB to full profiling:
db.setProfilingLevel(2)
But then, in the db's system.profile collection, the execStats field in the result record for the aggregation command is always empty.
Here is the full result for the command:
{
"op" : "command",
"ns" : "mydb.$cmd",
"command" : {
"aggregate" : "mycolection",
"pipeline" : [{
"$match" : {
"date" : {
"$gte" : "2013-11-26"
}
}
}, {
"$sort" : {
"user_id" : 1
}
}, {
"$project" : {
"user_id" : 1,
"_id" : 0
}
}, {
"$group" : {
"_id" : "$user_id",
"agg_val" : {
"$sum" : 1
}
}
}],
"allowDiskUse" : true
},
"keyUpdates" : 0,
"numYield" : 16,
"lockStats" : {
"timeLockedMicros" : {
"r" : NumberLong(3143653),
"w" : NumberLong(0)
},
"timeAcquiringMicros" : {
"r" : NumberLong(140),
"w" : NumberLong(3)
}
},
"responseLength" : 4990,
"millis" : 3237,
"execStats" : { },
"ts" : ISODate("2014-11-26T16:20:59.576Z"),
"client" : "127.0.0.1",
"allUsers" : [],
"user" : ""
}
Support for execStats for the aggregate command was added in MongoDB 3.4.
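On 3.4 and later, an alternative to the profiler is to run the pipeline through explain at executionStats verbosity in the shell (assuming a shell/server version where this form supports aggregate); a sketch using the collection and pipeline from the question:
// returns nReturned, totalKeysExamined, totalDocsExamined, etc.
// for the $cursor stage that feeds the rest of the pipeline
db.mycolection.explain("executionStats").aggregate([
  { "$match": { "date": { "$gte": "2013-11-26" } } },
  { "$sort": { "user_id": 1 } },
  { "$project": { "user_id": 1, "_id": 0 } },
  { "$group": { "_id": "$user_id", "agg_val": { "$sum": 1 } } }
], { allowDiskUse: true })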