Realtime Mongo aggregation is slow

I am developing the dashboard of our application and I was wondering if it is normal for MongoDB to be so slow for some aggregate queries. I am pasting below a simplified version of one of our queries:
db.data_items.aggregate([
  {'$match': {
    'organization_subscription_waves.wave_id': ObjectId('5617fe6abecf0500f9c6e125')
  }},
  {'$group':
    {'_id': '$data_type_value_type', 'count': {'$sum': 1}}
  }
])
The result is this:
{ "_id" : "PHOTO", "count" : 76 }
{ "_id" : "MULTI_SELECT", "count" : 1607 }
{ "_id" : "TIME", "count" : 659 }
{ "_id" : "MULTIPLE_CHOICE", "count" : 78321 }
{ "_id" : "DATE", "count" : 649 }
{ "_id" : "NUMBER", "count" : 2679 }
and takes over two minutes, which I believe is too long for such a simple aggregation. I am wondering if there is anything I can do to improve the query's efficiency, or whether I have to do some offline optimization to improve performance.
Among others, I have this index on the collection:
{
"v" : 1,
"name" : "organization_subscription_waves.wave_id_1",
"key" : {
"organization_subscription_waves.wave_id" : 1
},
"ns" : "gigwalk_apps_1.data_items"
},
Yet I am not sure whether it is being used. explain() gives the following output:
{
"stages" : [
{
"$cursor" : {
"query" : {
"organization_subscription_waves.wave_id" : ObjectId("5617fe6abecf0500f9c6e125")
},
"fields" : {
"data_type_value_type" : 1,
"_id" : 0
},
"plan" : {
"cursor" : "BtreeCursor ",
"isMultiKey" : false,
"scanAndOrder" : false,
"indexBounds" : {
"organization_subscription_waves.wave_id" : [
[
ObjectId("5617fe6abecf0500f9c6e125"),
ObjectId("5617fe6abecf0500f9c6e125")
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor ",
"isMultiKey" : false,
"scanAndOrder" : false,
"indexBounds" : {
"organization_subscription_waves.wave_id" : [
[
ObjectId("5617fe6abecf0500f9c6e125"),
ObjectId("5617fe6abecf0500f9c6e125")
]
]
}
}
]
}
}
},
{
"$group" : {
"_id" : "$data_type_value_type",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
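The explain output above also shows that only data_type_value_type survives the index scan, so every matching document still has to be fetched just to read that one field. One option worth testing (not part of the original post) is a compound index that includes the grouped field as well, so that, depending on the server version, the $match + $group can be served from index keys alone; a sketch:
db.data_items.createIndex({
  // hypothetical covering index: the matched field first, then the field $group reads
  "organization_subscription_waves.wave_id": 1,
  "data_type_value_type": 1
})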

Related

MongoDB $group + $sum aggregation is very slow

I have an aggregation query in MongoDB:
[{
$group: {
_id: '$status',
status: {
$sum: 1
}
}
}]
It is running on a collection that has ~80 million documents. The status field is indexed, yet the query is very slow and runs for around 60 seconds or more.
I did an explain() on the query, but still got almost nowhere:
{
"explainVersion" : "1",
"stages" : [
{
"$cursor" : {
"queryPlanner" : {
"namespace" : "loa.document",
"indexFilterSet" : false,
"parsedQuery" : {
},
"queryHash" : "B9878693",
"planCacheKey" : "8EAA28C6",
"maxIndexedOrSolutionsReached" : false,
"maxIndexedAndSolutionsReached" : false,
"maxScansToExplodeReached" : false,
"winningPlan" : {
"stage" : "PROJECTION_SIMPLE",
"transformBy" : {
"status" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "COLLSCAN",
"direction" : "forward"
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$status",
"status" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"serverInfo" : {
"host" : "rack-compute-2",
"port" : 27017,
"version" : "5.0.6",
"gitVersion" : "212a8dbb47f07427dae194a9c75baec1d81d9259"
},
"serverParameters" : {
"internalQueryFacetBufferSizeBytes" : 104857600,
"internalQueryFacetMaxOutputDocSizeBytes" : 104857600,
"internalLookupStageIntermediateDocumentMaxSizeBytes" : 104857600,
"internalDocumentSourceGroupMaxMemoryBytes" : 104857600,
"internalQueryMaxBlockingSortMemoryUsageBytes" : 104857600,
"internalQueryProhibitBlockingMergeOnMongoS" : 0,
"internalQueryMaxAddToSetBytes" : 104857600,
"internalDocumentSourceSetWindowFieldsMaxMemoryBytes" : 104857600
},
"command" : {
"aggregate" : "document",
"pipeline" : [
{
"$group" : {
"_id" : "$status",
"status" : {
"$sum" : 1
}
}
}
],
"explain" : true,
"cursor" : {
},
"lsid" : {
"id" : UUID("a07e17fe-65ff-4d38-966f-7517b7a5d3f2")
},
"$db" : "loa"
},
"ok" : 1
}
I see that it does a full COLLSCAN; I just can't understand why.
I plan on supporting a couple hundred million (or even a billion) documents in that collection, but this problem derails my plans for seemingly no reason.
You can advise the query planner to use the index as follows:
db.test.explain("executionStats").aggregate(
[
{$group:{ _id:"$status" ,status:{$sum:1} }}
],
{hint:"status_1"}
)
Make sure the index name in the hint is the same as the one created
(db.test.getIndexes() will show you the exact index name).
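For reference, if the existing index was created without an explicit name, the shell derives the default name from the key pattern, which is what the hint above refers to; a minimal sketch:
// the default name for this key pattern is "status_1"
db.test.createIndex({ status: 1 })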

Mongo aggregation performance

I am new to Mongo, and the query below performs really slowly on a record set of over 2 million records
Query
db.testCollection.aggregate([
  {
    $match: {
      active: { $ne: false }
    }
  },
  {
    $group: {
      _id: {
        productName: "$productName",
        model: "$model",
        version: "$version",
        uid: "$uid"
      },
      total: { $sum: 1 }
    }
  },
  {
    $project: {
      total: 1,
      model: "$_id.model",
      version: "$_id.version",
      uid: "$_id.uid",
      productName: "$_id.productName"
    }
  },
  {
    $sort: { model: 1 }
  }
])
explain()
{
"stages" : [
{
"$cursor" : {
"query" : {
"active" : {
"$ne" : false
}
},
"fields" : {
"version" : 1,
"productName" : 1,
"model" : 1,
"uid" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "fms2.device",
"indexFilterSet" : false,
"parsedQuery" : {
"$nor" : [
{
"active" : {
"$eq" : false
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"active" : 1
},
"indexName" : "active",
"isMultiKey" : false,
"multiKeyPaths" : {
"active" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"active" : [
"[MinKey, false)",
"(false, MaxKey]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : {
"productName" : "$productName",
"model" : "$model",
"version" : "$version",
"uid" : "$uid"
},
"total" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : true,
"total" : true,
"model" : "$_id.model",
"version" : "$_id.version",
"uid" : "$_id.uid",
"productName" : "$_id.productName"
}
},
{
"$sort" : {
"sortKey" : {
"model" : 1
}
}
}
],
"ok" : 1
}
Is there a way to optimize this query further? I had a look at https://docs.mongodb.com/manual/core/aggregation-pipeline-optimization/ as well, but most of the suggestions there are not applicable to this query.
Not sure if it matters, but the result of this aggregation ends up with only 20-30 records.
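One idea not mentioned in the thread: the winning plan above is an IXSCAN on active followed by a FETCH of roughly two million documents just to read the four grouped fields. A compound index that also contains those fields may let the planner avoid the FETCH (whether the scan ends up fully covered depends on the server version and the $ne bounds); a sketch using the field names from the query:
// hypothetical index covering the $match field plus the $group fields
db.testCollection.createIndex({
  active: 1,
  productName: 1,
  model: 1,
  version: 1,
  uid: 1
})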

MongoDB performs slow query on sum() based on monthly groups

This is what I have tried so far with the aggregation query:
db.getCollection('storage').aggregate([
{
"$match": {
"user_id": 2
}
},
{
"$project": {
"formattedDate": {
"$dateToString": { "format": "%Y-%m", "date": "$created_on" }
},
"size": "$size"
}
},
{ "$group": {
"_id" : "$formattedDate",
"size" : { "$sum": "$size" }
} }
])
This is the result:
/* 1 */
{
"_id" : "2018-02",
"size" : NumberLong(10860595386)
}
/* 2 */
{
"_id" : "2017-12",
"size" : NumberLong(524288)
}
/* 3 */
{
"_id" : "2018-01",
"size" : NumberLong(21587971)
}
And this is the document structure:
{
"_id" : ObjectId("5a59efedd006b9036159e708"),
"user_id" : NumberLong(2),
"is_transferred" : false,
"is_active" : false,
"process_id" : NumberLong(0),
"ratio" : 0.000125759169459343,
"type_id" : 201,
"size" : NumberLong(1687911),
"is_processed" : false,
"created_on" : ISODate("2018-01-13T11:39:25.000Z"),
"processed_on" : ISODate("1970-01-01T00:00:00.000Z")
}
And last, the explain result:
/* 1 */
{
"stages" : [
{
"$cursor" : {
"query" : {
"user_id" : 2.0
},
"fields" : {
"created_on" : 1,
"size" : 1,
"_id" : 1
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "data.storage",
"indexFilterSet" : false,
"parsedQuery" : {
"user_id" : {
"$eq" : 2.0
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1
},
"indexName" : "user_id",
"isMultiKey" : false,
"multiKeyPaths" : {
"user_id" : []
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"[2.0, 2.0]"
]
}
}
},
"rejectedPlans" : []
}
}
},
{
"$project" : {
"_id" : true,
"formattedDate" : {
"$dateToString" : {
"format" : "%Y-%m",
"date" : "$created_on"
}
},
"size" : "$size"
}
},
{
"$group" : {
"_id" : "$formattedDate",
"size" : {
"$sum" : "$size"
}
}
}
],
"ok" : 1.0
}
The problem:
I can navigate and get all results almost instantly, in about 0.002 s. However, when I specify user_id and sum the sizes grouped by month, the result takes between 0.300 s and 0.560 s. I do several similar tasks in one request, so it ends up taking more than a second to finish.
What I tried so far:
I've added an index for user_id
I've added an index for created_on
I used more $match conditions; however, this made it even worse.
This collection currently has almost 200,000 documents in it, and approximately 150,000 of them belong to user_id = 2.
How can I minimize the response time for this query?
Note: MongoDB 3.4.10 used.
Pratha,
try adding a sort on the "created_on" and "size" fields as the first stage in the aggregation pipeline.
db.getCollection('storage').aggregate([
{
"$sort": {
"created_on": 1, "size": 1
}
}, ....
Before that, add a compound index:
db.getCollection('storage').createIndex({created_on:1,size:1})
If you sort the data before the $group stage, it will improve the efficiency of accumulating the totals.
Note about sort aggregation stage:
The $sort stage has a limit of 100 megabytes of RAM. By default, if the stage exceeds this limit, $sort will produce an error. To allow for the handling of large datasets, set the allowDiskUse option to true to enable $sort operations to write to temporary files.
P.S.
Get rid of the $match stage on user_id to test the performance, or add user_id to the compound index as well.
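Putting those suggestions together, the pipeline might look like the sketch below; the allowDiskUse option is only needed if the $sort exceeds the 100 MB limit mentioned above:
db.getCollection('storage').aggregate([
  // suggested $sort added as the first stage, backed by the compound index
  { "$sort": { "created_on": 1, "size": 1 } },
  { "$match": { "user_id": 2 } },
  {
    "$project": {
      "formattedDate": { "$dateToString": { "format": "%Y-%m", "date": "$created_on" } },
      "size": "$size"
    }
  },
  { "$group": { "_id": "$formattedDate", "size": { "$sum": "$size" } } }
], { allowDiskUse: true })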

Tune Up Mongo Query

I am new to Mongo and am trying to get a distinct count of users. The fields Id and Status are not individually indexed, but there is a composite index on both fields. My current query is something like this, where the match conditions change depending on the requirements.
DBQuery.shellBatchSize = 1000000;
db.getCollection('username').aggregate([
{$match:
{ Status: "A"
} },
{"$group" : {_id:"$Id", count:{$sum:1}}}
]);
Is there any way we can optimize this query further, or add parallel runs on the collection, so that we can get results faster?
Regards
You can tune your aggregation pipelines by passing the explain: true option to the aggregate method.
db.getCollection('username').aggregate([
{$match: { Status: "A" } },
{"$group" : {_id:"$Id", count:{$sum:1}}}],
{ explain: true });
This will then output the following to work with:
{
"stages" : [
{
"$cursor" : {
"query" : {
"Status" : "A"
},
"fields" : {
"Id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.usernames",
"indexFilterSet" : false,
"parsedQuery" : {
"Status" : {
"$eq" : "A"
}
},
"winningPlan" : {
"stage" : "EOF"
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$Id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
So, to speed up our query, we need an index to help the match part of the pipeline. Let's create an index on Status:
> db.usernames.createIndex({Status:1})
{
"createdCollectionAutomatically" : true,
"numIndexesBefore" : 1,
"numIndexesAfter" : 2,
"ok" : 1
}
If we now run the explain again, we'll get the following results:
{
"stages" : [
{
"$cursor" : {
"query" : {
"Status" : "A"
},
"fields" : {
"Id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.usernames",
"indexFilterSet" : false,
"parsedQuery" : {
"Status" : {
"$eq" : "A"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"Status" : 1
},
"indexName" : "Status_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"Status" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"Status" : [
"[\"A\", \"A\"]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$Id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
We can now see straight away that this is using an index.
https://docs.mongodb.com/manual/reference/explain-results/

execStats is always empty in MongoDB "aggregate" commands profiling results

I am trying to profile the performance of an aggregation pipeline, specifically checking whether indices are used, how many objects are scanned, etc.
I'm setting the DB to full profiling:
db.setProfilingLevel(2)
But then, in the db's system.profile collection, the execStats field in the result record for the aggregation command is always empty.
Here is the full result for the command:
{
"op" : "command",
"ns" : "mydb.$cmd",
"command" : {
"aggregate" : "mycolection",
"pipeline" : [{
"$match" : {
"date" : {
"$gte" : "2013-11-26"
}
}
}, {
"$sort" : {
"user_id" : 1
}
}, {
"$project" : {
"user_id" : 1,
"_id" : 0
}
}, {
"$group" : {
"_id" : "$user_id",
"agg_val" : {
"$sum" : 1
}
}
}],
"allowDiskUse" : true
},
"keyUpdates" : 0,
"numYield" : 16,
"lockStats" : {
"timeLockedMicros" : {
"r" : NumberLong(3143653),
"w" : NumberLong(0)
},
"timeAcquiringMicros" : {
"r" : NumberLong(140),
"w" : NumberLong(3)
}
},
"responseLength" : 4990,
"millis" : 3237,
"execStats" : { },
"ts" : ISODate("2014-11-26T16:20:59.576Z"),
"client" : "127.0.0.1",
"allUsers" : [],
"user" : ""
}
Support for execStats for the aggregate command was added in MongoDB 3.4.
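On 3.4 and later, an alternative to the profiler is to run the pipeline through explain at executionStats verbosity in the shell (assuming a shell/server version where this form supports aggregate); a sketch using the collection and pipeline from the question:
// returns nReturned, totalKeysExamined, totalDocsExamined, etc.
// for the $cursor stage that feeds the rest of the pipeline
db.mycolection.explain("executionStats").aggregate([
  { "$match": { "date": { "$gte": "2013-11-26" } } },
  { "$sort": { "user_id": 1 } },
  { "$project": { "user_id": 1, "_id": 0 } },
  { "$group": { "_id": "$user_id", "agg_val": { "$sum": 1 } } }
], { allowDiskUse: true })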