Understanding MongoDB aggregate performance - mongodb

I'm running the standard Homebrew installation of Mongo DB, version 2.4.6, and I've got a database with a collection called 'items', which has 600k documents within it.
I've written the following query to find the top five brands for the collection of items:
db.items.aggregate([
{ $group: { _id: '$brand', size: { $sum: 1}}},
{ $sort: {"size": -1}},
{ $limit: 5}
])
which returns the result I expected, but to be frank, takes much longer to complete than I ever would have imagined. Here is the profile data:
{
"op" : "command",
"ns" : "insights-development.$cmd",
"command" : {
"aggregate" : "items",
"pipeline" : [
{
"$group" : {
"_id" : "$brand",
"size" : {
"$sum" : 1
}
}
},
{
"$sort" : {
"size" : -1
}
},
{
"$limit" : 5
}
]
},
"ntoreturn" : 1,
"keyUpdates" : 0,
"numYield" : 3,
"lockStats" : {
"timeLockedMicros" : {
"r" : NumberLong(3581974),
"w" : NumberLong(0)
},
"timeAcquiringMicros" : {
"r" : NumberLong(1314151),
"w" : NumberLong(10)
}
},
"responseLength" : 267,
"millis" : 2275,
"ts" : ISODate("2013-11-23T18:16:33.886Z"),
"client" : "127.0.0.1",
"allUsers" : [ ],
"user" : ""
}
Here is the output of db.items.stats():
{
"sharded" : false,
"primary" : "a59aff30810b066bbe31d1fae79596af",
"ns" : "insights-development.items",
"count" : 640590,
"size" : 454491840,
"avgObjSize" : 709.4894394230319,
"storageSize" : 576061440,
"numExtents" : 14,
"nindexes" : 10,
"lastExtentSize" : 156225536,
"paddingFactor" : 1,
"systemFlags" : 1,
"userFlags" : 0,
"totalIndexSize" : 165923744,
"indexSizes" : {
"_id_" : 17889088,
"demographic_1" : 14741328,
"brand_1" : 17946320,
"retailer_1" : 18690336,
"color_1" : 15738800,
"style_1" : 18951968,
"classification_1" : 15019312,
"placement_1" : 19107312,
"state_1" : 12394816,
"gender_1" : 15444464
},
"ok" : 1
}
I'm fairly new to MongoDB so I'm hoping someone can point out why this aggregation takes so long to run and if there is anything I can do to speed it up, as it seems to me that 600k isn't a huge number of documents for Mongo to run calculations on.

If you have an index on "brand" field, then adding a {$sort:{brand:1}} at the beginning of the pipeline may help performance. The reason you're not seeing good performance right now is likely due to the need to scan every document to group by brand. If there was an index, then it could be used to scan index only rather than all the documents. And sorting (which uses an index) can speed up grouping in some cases where having a result ordered by the field being grouped is beneficial.
If you created an index on brand and didn't see any improvement, try adding a $sort before you get rid of the index. If it happens that you already have an index where brand is the first field, you then don't need to add another index on brand - the compound index will automatically be used.

Related

Optimise MongoDB aggregate query

I have a collection with millions of documents, each document represent an event: {_id, product, timestamp}
In my query, I need to group by product and take the top 10 for example.
"aggregate" : "product_events",
"pipeline" : [
{
"$match" : {
"timeEvent" : {
"$gt" : ISODate("2017-07-17T00:00:00Z")
}
}
},
{
"$group" : {
"_id" : "$product",
"count" : {
"$sum" : 1
}
}
},
{
"$sort" : {
"count" : -1
}
},
{
"$limit" : 10
}
]
My query is very slow now (10 seconds), I am wondering if there is a way to store data differently to optimise this query?
db.product_events.explain("executionStats").aggregate([ {"$match" :
{"timeEvent" : {"$gt" : ISODate("2017-07-17T00:00:00Z")}}},{"$group" :
{"_id" : "$product","count" : {"$sum" : 1}}}, {"$project": {"_id": 1,
"count": 1}} , {"$sort" : {"count" : -1}},{"$limit" : 500}],
{"allowDiskUse": true})
{
"stages" : [
{
"$cursor" : {
"query" : {
"timeEvent" : {
"$gt" : ISODate("2017-07-17T00:00:00Z")
}
},
"fields" : {
"product" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "mydb.product_events",
"indexFilterSet" : false,
"parsedQuery" : {
"timeEvent" : {
"$gt" : ISODate("2017-07-17T00:00:00Z")
}
},
"winningPlan" : {
"stage" : "COLLSCAN",
"filter" : {
"timeEvent" : {
"$gt" : ISODate("2017-07-17T00:00:00Z")
}
},
"direction" : "forward"
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 2127315,
"executionTimeMillis" : 940,
"totalKeysExamined" : 0,
"totalDocsExamined" : 2127315,
"executionStages" : {
"stage" : "COLLSCAN",
"filter" : {
"timeEvent" : {
"$gt" : ISODate("2017-07-17T00:00:00Z")
}
},
"nReturned" : 2127315,
"executionTimeMillisEstimate" : 810,
"works" : 2127317,
"advanced" : 2127315,
"needTime" : 1,
"needYield" : 0,
"saveState" : 16620,
"restoreState" : 16620,
"isEOF" : 1,
"invalidates" : 0,
"direction" : "forward",
"docsExamined" : 2127315
}
}
}
},
{
"$group" : {
"_id" : "$product",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : true,
"count" : true
}
},
{
"$sort" : {
"sortKey" : {
"count" : -1
},
"limit" : NumberLong(500)
}
}
],
"ok" : 1
}
Below my indexes
db.product_events.getIndexes()
[
{
"v" : 2,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "mydb.product_events"
},
{
"v" : 2,
"key" : {
"product" : 1,
"timeEvent" : -1
},
"name" : "product_1_timeEvent_-1",
"ns" : "mydb.product_events"
}
]
Creating indexes on the fields of a collection helps optimise the process of retrieving data from database collections.
Indexes are generally created on fields by which data is filtered according to specific criteria.
Data in indexed fields is sorted in a specific order, and while fetching data, once a match is found, scanning of further documents stops, which makes the process of fetching data faster.
According to the description in the question above, to optimise the performance of the aggregate query, please try creating an index on the timeEvent field, as the timeEvent field is used as a filter expression in the $match stage of the aggregation pipeline.
The documentation on compound indexes states the following.
db.products.createIndex( { "item": 1, "stock": 1 } )
The order of the fields listed in a compound index is important. The
index will contain references to documents sorted first by the values
of the item field and, within each value of the item field, sorted by
values of the stock field.
In addition to supporting queries that match on all the index fields,
compound indexes can support queries that match on the prefix of the
index fields. That is, the index supports queries on the item field as
well as both item and stock fields.
Your product_1_timeEvent_-1 index looks like this:
{
"product" : 1,
"timeEvent" : -1
}
which is why it cannot be used to support a query that only filters on timeEvent.
Options you have to get that sorted:
Flip the order of the fields in your index
Remove the product field from your index
Create an additional index with only the timeEvent field in it.
(Include some additional filter on the product field so the existing index gets used)
And keep in mind that any creation/deletion/modification of an index may impact other queries, too. So make sure you test your changes properly.

mongodb insert really slow

I use MongoDB to manage device log data. Right now, it has over one million documents. Each document contains more than 30 fields, including embedded fields. Now it's really slow when I insert new documents. An insert costs more than 1000ms. From the slow query ops, I get logs like this:
{
"op" : "insert",
"ns" : "xxx.LogDeviceReport",
"query" : {
"_id" : ObjectId("xxxx"),
"deviceId" : ObjectId("xxxx"),
"en" : "xxxxxx",
"status" : 1,
'other fields, more than 30 fields...'
...
...
},
"ninserted" : 1,
"keyUpdates" : 0,
"writeConflicts" : 0,
"numYield" : 0,
"locks" : {
"Global" : {
"acquireCount" : {
"w" : NumberLong(2)
}
},
"MMAPV1Journal" : {
"acquireCount" : {
"w" : NumberLong(3)
}
},
"Database" : {
"acquireCount" : {
"w" : NumberLong(2)
}
},
"Collection" : {
"acquireCount" : {
"W" : NumberLong(1)
},
"acquireWaitCount" : {
"W" : NumberLong(1)
},
"timeAcquiringMicros" : {
"W" : NumberLong(1477481)
}
},
"oplog" : {
"acquireCount" : {
"w" : NumberLong(1)
}
}
},
"millis" : 977,
"execStats" : {
},
"ts" : ISODate("2016-08-02T22:01:01.270Z"),
"client" : "xxx.xxx.xxxx",
"allUsers" : [
{
"user" : "xxx",
"db" : "xxx"
}
],
"user" : "xx#xx"
}
I checked the index, like this:
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "xxx.LogDeviceReport"
},
{
"v" : 1,
"key" : {
"time" : 1
},
"name" : "time_1",
"ns" : "xxx.LogDeviceReport",
"expireAfterSeconds" : 604800,
"background" : true
}
]
Only an _id index and a ttl index by time, no any other indexes.
I guess the 'query' slows the operation. The MongoDB docs say that only the _id will be checked for uniqueness, but in the logs, all the fields appear in the 'query' — does that matter?
If that's not the reason, what makes it so slow? Can anyone help me?
If you are using mongodb 3+ you can consider using WiredTiger as storage engine than MMAPV1 which is being used in your case.
I have personally seen a 4x improvement when I inserted up to 156000 documents in a single go.
MMAPV1 took around 40 min and when I switched to WiredTiger same task was completed in 10 min.
Please check this link from MongoDB blog for more information
Note :: This is only from MongoDB 3.0 +

MongoDB find slow on subarray query

I have a MongoDB collection as follows which contains almost a million entries:
{
_id: 'object id',
link: 'a url',
channels: [ array of ids ]
pubDate: Date
}
I have the following query that I perform pretty often:
db.articles.find({ $and: [ { pubDate: { $gte: new Date(<some date>) } }, { channels: ObjectId(<some object id>) } ] })
The query is extremely slow even though I have certain indexes in place. Recently, I ran an explain on it and here is the result:
{
"cursor" : "BtreeCursor pubDate_-1_channels_1",
"isMultiKey" : true,
"n" : 2926,
"nscannedObjects" : 4245,
"nscanned" : 52611,
"nscannedObjectsAllPlans" : 8125,
"nscannedAllPlans" : 56491,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 5,
"nChunkSkips" : 0,
"millis" : 5378,
"indexBounds" : {
"pubDate" : [
[
ISODate("0NaN-NaN-NaNTNaN:NaN:NaNZ"),
ISODate("2016-03-04T21:00:00Z")
]
],
"channels" : [
[
ObjectId("54239b9477456cf777dd0d31"),
ObjectId("54239b9477456cf777dd0d31")
]
]
}
}
Looks like it is using the correct index but still taking more than 5 seconds to run.
Am I missing something here? Is something wrong with my index?
Here are the indexes on the collection btw:
[
{
"v" : 1,
"name" : "_id_",
"key" : {
"_id" : 1
},
"ns" : "dbname.articles"
},
{
"v" : 1,
"name" : "pubDate_-1_channels_1",
"key" : {
"pubDate" : -1,
"channels" : 1
},
"ns" : "dbname.articles",
"background" : true
},
{
"v" : 1,
"name" : "pubDate_-1",
"key" : {
"pubDate" : -1
},
"ns" : "dbname.articles",
"background" : true
},
{
"v" : 1,
"name" : "link_1",
"key" : {
"link" : 1
},
"ns" : "dbname.articles",
"background" : true
}
]
Here is what I see when I run stats on the collection:
{
"ns" : "dbname.articles",
"count" : 2402741,
"size" : 2838416144,
"avgObjSize" : 1181.3242226274076,
"storageSize" : 3311443968,
"numExtents" : 21,
"nindexes" : 4,
"lastExtentSize" : 862072832,
"paddingFactor" : 1.000000000020535,
"systemFlags" : 0,
"userFlags" : 0,
"totalIndexSize" : 775150208,
"indexSizes" : {
"_id_" : 100834608,
"pubDate_-1_channels_1" : 180812240,
"pubDate_-1" : 96378688,
"link_1" : 397124672
},
"ok" : 1
}
So, according to db.my_collection.stats(), indexed fields takes 0.77gb ("totalIndexSize" : 775150208 bytes), and your collection takes 3.31gb ("storageSize" : 3311443968 bytes). You mentioned that your instance uses 1.5gb of RAM.
MongoDB can therefore keep all indexes in memory, but does not have enough memory to hold all the documents. So, when it needs to do a query on documents that are not loaded in memory, it is slower. I bet that if you run the same query twice, it would take much less time since the necessary documents would already be loaded in memory.
I would recommend trying with 5gb of RAM. Do a couple of queries so that all the documents are loaded in memory, and then compare the speeds.
Your index { pubDate : -1, channels : 1 } looks good to me.
I would try { channels : 1 , pubDate : -1 } though.
The reason for this suggestion is the following:
Notice that indexes have order.
The index you are using is pubDate_-1_channels_1, which will be different from the index channels_1_pubDate_-1 (in reversed order).
Depending on the number of channels you have, I would expect that one index should be more efficient than another for your query.
See the manual on prefixes for details.

MongoDB Aggregation seems very slow

I have a mongodb instance running with the following stats:
{
"db" : "s",
"collections" : 4,
"objects" : 1.23932e+008,
"avgObjSize" : 239.9999891553412400,
"dataSize" : 29743673136.0000000000000000,
"storageSize" : 32916655936.0000000000000000,
"numExtents" : 39,
"indexes" : 3,
"indexSize" : 7737839984.0000000000000000,
"fileSize" : 45009076224.0000000000000000,
"nsSizeMB" : 16,
"dataFileVersion" : {
"major" : 4,
"minor" : 5
},
"extentFreeList" : {
"num" : 0,
"totalSize" : 0
},
"ok" : 1.0000000000000000
}
I'm trying to run the following query:
db.getCollection('tick_data').aggregate(
[
{$group: {_id: "$ccy",min:{$first: "$date_time"},max:{$last: "$date_time"}}}
]
)
And I have the following index set-up in the collection:
{
"ccy" : 1,
"date_time" : 1
}
The query takes 510 seconds to run, which feels like it's extremely slow even though the collection is fairly large (~120 million documents). Is there a simple way for me to make this faster?
Every document has the structure:
{
"_id" : ObjectId("56095bd7b2fc3e36d8d6ed52"),
"bid_volume" : "6.00",
"date_time" : ISODate("2007-01-01T00:00:07.904Z"),
"ccy" : "USDNOK",
"bid" : 6.2271700000000001,
"ask_volume" : "6.00",
"ask" : 6.2357699999999996
}
Results of explain:
{
"stages" : [
{
"$cursor" : {
"query" : {},
"fields" : {
"ccy" : 1,
"date_time" : 1,
"_id" : 0
},
"plan" : {
"cursor" : "BasicCursor",
"isMultiKey" : false,
"scanAndOrder" : false,
"allPlans" : [
{
"cursor" : "BasicCursor",
"isMultiKey" : false,
"scanAndOrder" : false
}
]
}
}
},
{
"$group" : {
"_id" : "$ccy",
"min" : {
"$first" : "$date_time"
},
"max" : {
"$last" : "$date_time"
}
}
}
],
"ok" : 1.0000000000000000
}
Thanks
As mentioned already by @Blakes Seven, $group cannot use indexes. See this topic.
Thus, your query is already optimal. A possible way to optimise this usecase is to pre-calculate and persist the data in a side collection.
You could try this data structure :
{
"_id" : ObjectId("560a5139b56a71ea60890201"),
"ccy" : "USDNOK",
"date_time_first" : ISODate("2007-01-01T00:00:07.904Z"),
"date_time_last" : ISODate("2007-09-09T00:00:07.904Z")
}
Querying this can be done in milliseconds instead of 500+ seconds and you can benefit from indexes.
Then of course, each time you add, update or delete a document from the main collection, you would need to update the side collection.
Depending on how badly you need the data to be "fresh", you could also choose to skip this "live update process" and regenerate entirely the side collection only once a day with a batch and keep in mind that your data may not be "fresh".
Another problem you could fix : Your server definitely needs more RAM & CPU. Your working set probably doesn't fit in RAM, especially with this kind of aggregations.
Also, you can probably make good use of an SSD, and I would STRONGLY recommend using a 3-node replica set instead of a single instance for production.
In the end I wrote a function which takes 0.002 seconds to run.
function() {
var results = {}
var ccys = db.tick_data.distinct("ccy");
ccys.forEach(function(ccy)
{
var max_results = []
var min_results = []
db.tick_data.find({"ccy":ccy},{"date_time":1,"_id":0}).sort({"date_time":1}).limit(1).forEach(function(v){min_results.push(v.date_time)})
db.tick_data.find({"ccy":ccy},{"date_time":1,"_id":0}).sort({"date_time":-1}).limit(1).forEach(function(v){max_results.push(v.date_time)})
var max = max_results[0]
var min = min_results[0]
results[ccy]={"max_date_time":max,"min_date_time":min}
}
)
return results
}

What index to be added in MongoDB to support $elemMatch query on embedded document

Suppose we have a following document
{
embedded:[
{
email:"abc#abc.com",
active:true
},
{
email:"def#abc.com",
active:false
}]
}
What indexing should be used to support $elemMatch query on email and active field of embedded doc.
Update on question :-
db.foo.aggregate([{"$match":{"embedded":{"$elemMatch":{"email":"abc#abc.com","active":true}}}},{"$group":{_id:null,"total":{"$sum":1}}}],{explain:true});
on querying this i am getting following output of explain on aggregate :-
{
"stages" : [
{
"$cursor" : {
"query" : {
"embedded" : {
"$elemMatch" : {
"email" : "abc#abc.com",
"active" : true
}
}
},
"fields" : {
"_id" : 0,
"$noFieldsNeeded" : 1
},
"planError" : "InternalError No plan available to provide stats"
}
},
{
"$group" : {
"_id" : {
"$const" : null
},
"total" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
I think mongodb internally not using index for this query.
Thanx in advance :)
Update on output of db.foo.stats()
db.foo.stats()
{
"ns" : "test.foo",
"count" : 2,
"size" : 480,
"avgObjSize" : 240,
"storageSize" : 8192,
"numExtents" : 1,
"nindexes" : 3,
"lastExtentSize" : 8192,
"paddingFactor" : 1,
"systemFlags" : 0,
"userFlags" : 1,
"totalIndexSize" : 24528,
"indexSizes" : {
"_id_" : 8176,
"embedded.email_1_embedded.active_1" : 8176,
"name_1" : 8176
},
"ok" : 1
}
db.foo.getIndexes();
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "test.foo"
},
{
"v" : 1,
"key" : {
"embedded.email" : 1,
"embedded.active" : 1
},
"name" : "embedded.email_1_embedded.active_1",
"ns" : "test.foo"
},
{
"v" : 1,
"key" : {
"name" : 1
},
"name" : "name_1",
"ns" : "test.foo"
}
]
Should you decide to stick to that data model and your queries, here's how to create indexes that match the query:
You can simply index "embedded.email", or use a compound key of embedded indexes, i.e. something like
> db.foo.ensureIndex({"embedded.email" : 1 });
- or -
> db.foo.ensureIndex({"embedded.email" : 1, "embedded.active" : 1});
Indexing boolean fields is often not too useful, since their selectivity is low.