MongoDB aggregation group by month in big collection - optimize pipeline - mongodb

I'm aware that this question has been asked before on SO, but I can't seem to find out how to handle aggregation grouping in bigger collections. I have a set of 10+ million records, and I just can't get any speed out of it.
Running MongoDB v 3.2.
Having a field __createDateUtc (ISODate) in the schema, I'm trying the following pipeline:
db.transactions.aggregate([
    {
        $project: {
            __createDateUtc: 1
        }
    },
    {
        $group: {
            '_id': { $year: '$__createDateUtc' },
            'count': {$sum: 1},
        }
    },
    {
        $limit: 10
    },
])
This takes 20+ seconds to run. Could it be made faster? This is a fairly simple pipeline, so really, is there any other strategy that might help in this situation?

I did some benchmarking with four different ways of getting the results that I wanted. The results are discouraging.
Again, with a schema looking like:
{
    "_id" : ObjectId("5d665491fd5852755236a5dc"),
    ...
    "__createDateUtc" : ISODate("2019-08-28T10:16:49Z"),
    "__createDate" : {
        "year" : 2019,
        "month" : 8,
        "day" : 28,
        "yearMonth" : 201908,
        "yearMonthDay" : 20190829
    }
}
The results:
// Group by __createDate.yearMonth
db.transactions.aggregate([
    { $group: {
        '_id': '$__createDate.yearMonth',
        'count': {$sum: 1},
    } },
    { $limit: 10 },
    { $sort: {'_id': -1 } }
])
// 20 169 ms

// Group by year and month
db.transactions.aggregate([
    { $group: {
        '_id': { year: '$__createDate.year', month: '$__createDate.month' },
        'count': {$sum: 1},
    } },
    { $limit: 10 },
    { $sort: {'_id': -1 } }
])
// 23 777 ms

// Group by calculating year and month from ISODate
db.transactions.aggregate([
    { $group: {
        '_id': { year: { $year: '$__createDateUtc' }, month: { $month: '$__createDateUtc' } },
        'count': {$sum: 1},
    } },
    { $limit: 10 },
    { $sort: {'_id': -1 } }
])
// 16 444 ms
// Last stupid method to just run many queries with count
var years = [2017, 2018, 2019];
var results = {};
years.forEach(year => {
    results[year] = {};
    for (var i = 1; i < 13; i++) {
        var count = db.transactions.find({'__createDate.year': year, '__createDate.month': i}).count();
        if (count > 0) results[year][i] = count;
    }
});
// 10 701 ms
As you can see, the last method of just running multiple counts is by far the fastest, especially since it actually fetches a lot more data than the other three methods.
This just seems backwards to me. I know MongoDB is not a search engine, but still, aggregation is just not fast here at all. It makes me want to sync the data to Elasticsearch and try to aggregate within ES instead.

Related

SQL to Mongo Aggregation

Hi, I want to convert my SQL query to a Mongo aggregation.
SELECT c.year, c.minor_category, COUNT(c.minor_category) FROM Crime AS c
GROUP BY c.year, c.minor_category HAVING c.minor_category = (
    SELECT cc.minor_category FROM Crime AS cc WHERE cc.year = c.year GROUP BY
    cc.minor_category ORDER BY COUNT(*) DESC, cc.minor_category LIMIT 1)
I tried to do something like this:
db.crimes.aggregate({
    $group: {
        "_id": {
            year: "$year",
            minor_category: "$minor_category",
            count: {$sum: "$minor_category"}
        }
    },
},
{
    $match: {
        minor_category: ?
    }
})
But I'm stuck at the $match, which is the equivalent of HAVING; I don't know how to write subqueries in Mongo the way I did in my SQL query.
Can anybody help me?
OK, based on the confirmation above, the query below should work.
db.crime.aggregate([
    {"$group": {"_id": {"year": "$year", "minor": "$minor"}, "count": {"$sum": 1}}},
    {"$project": {"year": "$_id.year", "count": "$count", "minor": "$_id.minor", "document": "$$ROOT"}},
    {"$sort": {"year": 1, "count": -1}},
    {"$group": {"_id": {"year": "$year"}, "orig": {"$first": "$document"}}},
    {"$project": {"_id": 0, "year": "$orig._id.year", "minor": "$orig._id.minor", "count": "$orig.count"}}
])
This translates into the following MongoDB query:
db.crime.aggregate({
    $group: { // group by year and minor_category
        _id: {
            "year": "$year",
            "minor_category": "$minor_category"
        },
        "count": { $sum: 1 } // count all documents per group
    }
}, {
    $sort: {
        "count": -1, // sort descending by count
        "minor_category": 1 // and ascending by minor_category
    }
}, {
    $group: { // now we get the highest element per year
        _id: "$_id.year", // so group by year
        "minor_category": { $first: "$_id.minor_category" }, // and get the first (we've sorted the data) value
        "count": { $first: "$count" } // same here
    }
}, {
    $project: { // remove the _id field and add the others in the right order (if needed)
        "_id": 0,
        "year": "$_id",
        "minor_category": "$minor_category",
        "count": "$count"
    }
})

Aggregation pipeline slow with large collection

I have a single collection with over 200 million documents containing dimensions (things I want to filter on or group by) and metrics (things I want to sum or get averages from). I'm currently running into some performance issues and I'm hoping to gain some advice on how I could optimize/scale MongoDB, or suggestions on alternative solutions. I'm running the latest stable MongoDB version using WiredTiger. The documents basically look like the following:
{
    "dimensions": {
        "account_id": ObjectId("590889944befcf34204dbef2"),
        "url": "https://test.com",
        "date": ISODate("2018-03-04T23:00:00.000+0000")
    },
    "metrics": {
        "cost": 155,
        "likes": 200
    }
}
I have three indexes on this collection, as various aggregations are being run against it (sketched in shell form after the list):
account_id
date
account_id and date
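In shell form, these correspond roughly to the following definitions (the dimensions.* field paths are assumed from the document shape above):
db.large_collection.createIndex({ "dimensions.account_id": 1 })
db.large_collection.createIndex({ "dimensions.date": 1 })
db.large_collection.createIndex({ "dimensions.account_id": 1, "dimensions.date": 1 })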
The following aggregation query fetches 3 months of data, summing cost and likes and grouping by week/year:
db.large_collection.aggregate(
    [
        {
            $match: { "dimensions.date": { $gte: new Date(1512082800000), $lte: new Date(1522447200000) } }
        },
        {
            $match: { "dimensions.account_id": { $in: [ "590889944befcf34204dbefc", "590889944befcf34204dbf1f", "590889944befcf34204dbf21" ] } }
        },
        {
            $group: {
                cost: { $sum: "$metrics.cost" },
                likes: { $sum: "$metrics.likes" },
                _id: {
                    year: { $year: { date: "$dimensions.date", timezone: "Europe/Amsterdam" } },
                    week: { $isoWeek: { date: "$dimensions.date", timezone: "Europe/Amsterdam" } }
                }
            }
        },
        {
            $project: {
                cost: 1,
                likes: 1
            }
        }
    ],
    {
        cursor: {
            batchSize: 50
        },
        allowDiskUse: true
    }
);
This query takes about 25-30 seconds to complete and I'm looking to reduce this to 5-10 seconds at most. It's currently a single MongoDB node, no shards or anything. The explain output can be found here: https://pastebin.com/raw/fNnPrZh0 and the executionStats here: https://pastebin.com/raw/WA7BNpgA. As you can see, MongoDB is using indexes, but there are still 1.3 million documents that need to be read. I currently suspect I'm facing some I/O bottlenecks.
Does anyone have an idea how I could improve this aggregation pipeline? Would sharding help at all? Is MongoDB the right tool here?
The following could improve performance, if and only if precomputing dimensions within each record is an option.
If this type of query represents an important portion of the queries on this collection, then including additional fields to make these queries faster could be a viable alternative.
This hasn't been benchmarked.
One of the costly parts of this query probably comes from working with dates.
First, during the $group stage, when computing for each matching record the year and the ISO week associated with a specific time zone.
Then, to a lesser extent, during the initial filtering, when keeping dates from the last 3 months.
The idea would be to store in each record the year and the ISO week; for the given example this would be { "year" : 2018, "week" : 10 }. This way the _id key in the $group stage wouldn't need any computation (which would otherwise represent 1.3M complex date operations).
In a similar fashion, we could also store in each record the associated month, which would be { "month" : "201803" } for the given example. This way the first match could be on the months 201802 to 201805 before applying a more precise and costlier filter on the exact timestamps. This would replace the initial, costlier Date filtering on 200M records with a simple Int filtering.
Let's create a new collection with these new pre-computed fields (in a real scenario, these fields would be included during the initial insert of the records):
db.large_collection.aggregate([
    { $addFields: {
        "prec.year": { $year: { date: "$dimensions.date", timezone: "Europe/Amsterdam" } },
        "prec.week": { $isoWeek: { date: "$dimensions.date", timezone: "Europe/Amsterdam" } },
        "prec.month": { $dateToString: { format: "%Y%m", date: "$dimensions.date", timezone: "Europe/Amsterdam" } }
    }},
    { "$out": "large_collection_precomputed" }
])
which will store these documents:
{
"dimensions" : { "account_id" : ObjectId("590889944befcf34204dbef2"), "url" : "https://test.com", "date" : ISODate("2018-03-04T23:00:00Z") },
"metrics" : { "cost" : 155, "likes" : 200 },
"prec" : { "year" : 2018, "week" : 10, "month" : "201803" }
}
And let's query:
db.large_collection_precomputed.aggregate([
    // Initial coarse filtering on dates (months) (on 200M documents):
    { $match: { "prec.month": { $gte: "201802", $lte: "201805" } } },
    { $match: {
        "dimensions.account_id": { $in: [
            ObjectId("590889944befcf34204dbf1f"), ObjectId("590889944befcf34204dbef2")
        ]}
    }},
    // Exact filtering on dates (costlier, but only on ~1.5M documents).
    { $match: { "dimensions.date": { $gte: new Date(1512082800000), $lte: new Date(1522447200000) } } },
    { $group: {
        // The _id is now extremely fast to compute:
        _id: { year: "$prec.year", week: "$prec.week" },
        cost: { $sum: "$metrics.cost" },
        likes: { $sum: "$metrics.likes" }
    }},
    ...
])
In this case we would use indexes on account_id and month.
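For example, a compound index putting the equality/$in field (account_id) before the range field (month) could be created along these lines (a sketch, not benchmarked):
db.large_collection_precomputed.createIndex({ "dimensions.account_id": 1, "prec.month": 1 })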
Note: here, months are stored as Strings ("201803") since I'm not sure how to cast them to Int within an aggregation query, but it would be best to store them as Int when the records are inserted.
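If the String representation is a concern, one way to sidestep the cast is to build the integer with plain arithmetic operators instead of $dateToString (a sketch, not benchmarked, mirroring the $addFields stage above):
db.large_collection.aggregate([
    { $addFields: {
        // e.g. 2018 * 100 + 3 = 201803, stored as a number rather than the string "201803"
        "prec.month": { $add: [
            { $multiply: [ { $year: { date: "$dimensions.date", timezone: "Europe/Amsterdam" } }, 100 ] },
            { $month: { date: "$dimensions.date", timezone: "Europe/Amsterdam" } }
        ]}
    }},
    { "$out": "large_collection_precomputed" }
])
The initial $match would then compare against numbers, e.g. { "prec.month": { $gte: 201802, $lte: 201805 } }.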
As a side effect, this will obviously make the collection heavier on disk and in RAM.

MongoDB aggregate query to SpringDataMongoDB

I have the MongoDB aggregate query below and would like to have its equivalent Spring Data MongoDB query.
MongoDB Aggregate Query :
db.response.aggregate(
// Pipeline
[
// Stage 1 : Group by Emotion & Month
{
$group: {
_id: {
emotion: "$emotion",
category: "$category"
},
count: {
$sum: 1
},
point: {
$first: '$point'
}
}
},
// Stage 2 : Total Points
{
$addFields: {
"totalPoint": {
$multiply: ["$point", "$count"]
}
}
},
// Stage3 : Group By Category - Overall Response Total & totalFeedbacks
{
$group: {
_id: '$_id.category',
totalFeedbacks: {
$sum: "$count"
},
overallResponseTotal: {
$sum: "$totalPoint"
}
}
},
// Stage4 - Overall Response Total & totalFeedbacks
{
$project: {
_id: 1,
overallResponseTotal: '$overallResponseTotal',
maxTotalFrom: {
"$multiply": ["$totalFeedbacks", 3.0]
},
percent: {
"$multiply": [{
"$divide": ["$overallResponseTotal", "$maxTotalFrom"]
}, 100.0]
}
}
},
// Stage4 - Percentage Monthwise
{
$project: {
_id: 1,
overallResponseTotal: 1,
maxTotalFrom: 1,
percent: {
"$multiply": [{
"$divide": ["$overallResponseTotal", "$maxTotalFrom"]
}, 100.0]
}
}
}
]
);
I have tried its equivalent in Spring Data but got stuck at Stage 2, on how to convert $addFields to Java code. I searched for it on multiple sites but couldn't find anything useful. Please see my equivalent Java code for Stage 1 below.
//Stage 1 -Group By Emotion and Category and return it's count
GroupOperation groupEmotionAndCategory = Aggregation.group("emotion","category").count().as("count").first("point")
.as("point");
Aggregation aggregation = Aggregation.newAggregation(groupEmotionAndCategory);
AggregationResults<CategoryWiseEmotion> output = mongoTemplate.aggregate(aggregation, Response.class, CategoryWiseEmotion.class);
Any help will be highly appreciated.
$addFields is not yet supported by Spring Data MongoDB.
One workaround is to pass the raw aggregation pipeline to Spring.
But since you have a limited number of fields after stage 1, you could also downgrade stage 2 to a projection:
{
    $project: {
        // _id is included by default
        "count": 1, // include count
        "point": 1, // include point
        "totalPoint": {
            $multiply: ["$point", "$count"] // compute totalPoint
        }
    }
}
I haven't tested it myself, but this projection should translate to something like:
ProjectionOperation p = project("count", "point").and("point").multiply(Fields.field("count")).as("totalPoint");
Then you can translate stages 3, 4 and 5 similarly, build the whole pipeline with Aggregation.newAggregation(), and run it with mongoTemplate.aggregate().

MongoDB Aggregate for a sum on a per week basis for all prior weeks

I've got a series of docs in MongoDB. An example doc would be
{
createdAt: Mon Oct 12 2015 09:45:20 GMT-0700 (PDT),
year: 2015,
week: 41
}
Imagine these span all weeks of the year and there can be many in the same week. I want to aggregate them in such a way that each resulting value is the sum of that week and all prior weeks, counting the total docs.
So if there were something like 10 in the first week of the year and 20 in the second, the result could be something like
[{ week: 1, total: 10, weekTotal: 10},
{ week: 2, total: 30, weekTotal: 20}]
Creating an aggregation to find the weekTotal is easy enough, including a projection to show the first part:
db.collection.aggregate([
{
$project: {
"createdAt": 1,
year: {$year: "$createdAt"},
week: {$week: "$createdAt"},
_id: 0
}
},
{
$group: {
_id: {year: "$year", week: "$week"},
weekTotal : { $sum : 1 }
}
},
]);
But getting past this to sum based on that week and those weeks preceding is proving tricky.
The aggregation framework is not able to do this as all operations can only effectively look at one document or grouping boundary at a time. In order to do this on the "server" you need something with access to a global variable to keep the "running total", and that means mapReduce instead:
db.collection.mapReduce(
    function() {
        Date.prototype.getWeekNumber = function() {
            var d = new Date(+this);
            d.setHours(0, 0, 0);
            d.setDate(d.getDate() + 4 - (d.getDay() || 7));
            return Math.ceil((((d - new Date(d.getFullYear(), 0, 1)) / 8.64e7) + 1) / 7);
        };
        emit({ year: this.createdAt.getFullYear(), week: this.createdAt.getWeekNumber() }, 1);
    },
    function(key, values) {
        return Array.sum(values);
    },
    {
        out: { inline: 1 },
        scope: { total: 0 },
        finalize: function(key, value) {
            total += value;
            return { total: total, weekTotal: value };
        }
    }
)
If you can live with the operation occurring on the "client", then you need to loop through the aggregation result and similarly sum up the totals:
var total = 0;
db.collection.aggregate([
    { "$group": {
        "_id": {
            "year": { "$year": "$createdAt" },
            "week": { "$week": "$createdAt" }
        },
        "weekTotal": { "$sum": 1 }
    }},
    { "$sort": { "_id": 1 } }
]).map(function(doc) {
    total += doc.weekTotal;
    doc.total = total;
    return doc;
});
It's all a matter of whether it makes more sense to you for this to happen on the server or on the client. But since the aggregation pipeline has no such "globals", you probably should not be looking at this for any further processing without outputting to another collection anyway.

How do I use aggregation operators in a $match in MongoDB (for example $year or $dayOfMonth)?

I have a collection full of documents with a created_date attribute. I'd like to send these documents through an aggregation pipeline to do some work on them. Ideally I would like to filter them using a $match before I do any other work on them, so that I can take advantage of indexes; however, I can't figure out how to use the new $year/$month/$dayOfMonth operators in my $match expression.
There are a few examples floating around of how to use the operators in a $project operation, but I'm concerned that by placing a $project as the first step in my pipeline I lose access to my indexes (the MongoDB documentation indicates that the first stage must be a $match in order to take advantage of indexes).
Sample data:
{
    post_body: 'This is the body of test post 1',
    created_date: ISODate('2012-09-29T05:23:41Z'),
    comments: 48
}
{
    post_body: 'This is the body of test post 2',
    created_date: ISODate('2012-09-24T12:34:13Z'),
    comments: 10
}
{
    post_body: 'This is the body of test post 3',
    created_date: ISODate('2012-08-16T12:34:13Z'),
    comments: 10
}
I'd like to run this through an aggregation pipeline to get the total comments on all posts made in September:
{
    aggregate: 'posts',
    pipeline: [
        {$match:
            /* Can I use the $year/$month operators here to match Sept 2012?
                $year:created_date : 2012,
                $month:created_date : 9
            */
            /* or does this have to be
                created_date :
                    {$gte:{$date:'2012-09-01T04:00:00Z'},
                     $lt: {$date:'2012-10-01T04:00:00Z'} }
            */
        },
        {$group:
            {_id: '0',
             totalComments: {$sum: '$comments'}
            }
        }
    ]
}
This works but the match loses access to any indexes for more complicated queries:
{
    aggregate: 'posts',
    pipeline: [
        {$project:
            {
                month: {$month: '$created_date'},
                year: {$year: '$created_date'}
            }
        },
        {$match:
            {
                month: 9,
                year: 2012
            }
        },
        {$group:
            {_id: '0',
             totalComments: {$sum: '$comments'}
            }
        }
    ]
}
As you already found, you cannot $match on fields that are not in the document (it works exactly the same way that find does), and if you use $project first then you will lose the ability to use indexes.
What you can do instead is combine your efforts as follows:
{
    aggregate: 'posts',
    pipeline: [
        {$match: {
            created_date: {
                $gte: {$date: '2012-09-01T04:00:00Z'},
                $lt:  {$date: '2012-10-01T04:00:00Z'}
            }
        }},
        {$group:
            {_id: '0',
             totalComments: {$sum: '$comments'}
            }
        }
    ]
}
The above only gives you the aggregation for September; if you wanted to aggregate over multiple months, you can do, for example:
{
    aggregate: 'posts',
    pipeline: [
        {$match: {
            created_date: {
                $gte: {$date: '2012-07-01T04:00:00Z'},
                $lt:  {$date: '2012-10-01T04:00:00Z'}
            }
        }},
        {$project: {
            comments: 1,
            new_created: {
                "yr": {"$year": "$created_date"},
                "mo": {"$month": "$created_date"}
            }
        }},
        {$group:
            {_id: "$new_created",
             totalComments: {$sum: '$comments'}
            }
        }
    ]
}
and you'll get back something like:
{
    "result" : [
        {
            "_id" : { "yr" : 2012, "mo" : 7 },
            "totalComments" : 5
        },
        {
            "_id" : { "yr" : 2012, "mo" : 8 },
            "totalComments" : 19
        },
        {
            "_id" : { "yr" : 2012, "mo" : 9 },
            "totalComments" : 21
        }
    ],
    "ok" : 1
}
Let's look at building some pipelines that involve operations that are already familiar to us. So, we're going to look at the following stages:
match - this is the filtering stage, similar to find.
project
sort
skip
limit
We might ask ourselves why these stages are necessary, given that this functionality is already provided in the MongoDB query language; the reason is that we need these stages to support the more complex, analytics-oriented functionality included with the aggregation framework. The query below is simply equivalent to a find:
db.companies.aggregate([{
$match: {
founded_year: 2004
}
}, ])
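For comparison, the equivalent plain query would be:
db.companies.find({ founded_year: 2004 })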
Let's introduce a project stage in this aggregation pipeline:
db.companies.aggregate([{
$match: {
founded_year: 2004
}
}, {
$project: {
_id: 0,
name: 1,
founded_year: 1
}
}])
We use the aggregate method to run the aggregation framework. An aggregation pipeline is merely an array of documents, each of which should stipulate a particular stage operator. So, in the above case we have an aggregation pipeline with two stages, with the $match stage passing documents one at a time to the $project stage.
Let's extend this with a limit stage:
db.companies.aggregate([{
$match: {
founded_year: 2004
}
}, {
$limit: 5
}, {
$project: {
_id: 0,
name: 1
}
}])
This gets the matching documents and limits them to five before projecting out the fields, so the projection works on only 5 documents. Now assume we were to do something like this instead:
db.companies.aggregate([{
$match: {
founded_year: 2004
}
}, {
$project: {
_id: 0,
name: 1
}
}, {
$limit: 5
}])
This gets the matching documents, projects that large number of documents, and only then limits to five, so the projection works on a large number of documents before limiting to 5. The lesson is that we should limit the documents passed to the next stage to those that are absolutely necessary. Now, let's look at the sort stage:
db.companies.aggregate([{
$match: {
founded_year: 2004
}
}, {
$sort: {
name: 1
}
}, {
$limit: 5
}, {
$project: {
_id: 0,
name: 1
}
}])
This will sort all documents by name and return only 5 of them. Now assume we were to do something like this instead:
db.companies.aggregate([{
$match: {
founded_year: 2004
}
}, {
$limit: 5
}, {
$sort: {
name: 1
}
}, {
$project: {
_id: 0,
name: 1
}
}])
This will take the first 5 documents and sort only those. Let's add the skip stage:
db.companies.aggregate([{
$match: {
founded_year: 2004
}
}, {
$sort: {
name: 1
}
}, {
$skip: 10
}, {
$limit: 5
}, {
$project: {
_id: 0,
name: 1
}
}, ])
This will sort all the documents, skip the initial 10, and return the next 5 to us. We should try to include $match stages as early as possible in the pipeline. To filter documents using a $match stage, we use the same syntax for constructing query documents (filters) as we do for find().
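For instance, the same filter can be expressed in both forms (a small illustrative sketch against the same companies collection; the year range is arbitrary):
// As a plain query:
db.companies.find({ founded_year: { $gte: 2004, $lte: 2006 } })
// As the first stage of a pipeline:
db.companies.aggregate([{
    $match: {
        founded_year: { $gte: 2004, $lte: 2006 }
    }
}])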
Try this;
db.createCollection("so");
db.so.remove();
db.so.insert([
{
post_body: 'This is the body of test post 1',
created_date: ISODate('2012-09-29T05:23:41Z'),
comments: 48
},
{
post_body: 'This is the body of test post 2',
created_date: ISODate('2012-09-24T12:34:13Z'),
comments: 10
},
{
post_body: 'This is the body of test post 3',
created_date: ISODate('2012-08-16T12:34:13Z'),
comments: 10
}
]);
//db.so.find();
db.so.ensureIndex({"created_date":1});
db.runCommand({
aggregate:"so",
pipeline:[
{
$match: { // filter only those posts in september
created_date: { $gte: ISODate('2012-09-01'), $lt: ISODate('2012-10-01') }
}
},
{
$group: {
_id: null, // no shared key
comments: { $sum: "$comments" } // total comments for all the posts in the pipeline
}
},
]
//,explain:true
});
The result is:
{ "result" : [ { "_id" : null, "comments" : 58 } ], "ok" : 1 }
So you could also modify your previous example to do this, although I'm not sure why you'd want to, unless you plan on doing something else with month and year in the pipeline:
{
aggregate: 'posts',
pipeline: [
{$match: { created_date: { $gte: ISODate('2012-09-01'), $lt: ISODate('2012-10-01') } } },
{$project:
{
month : {$month:'$created_date'},
year : {$year:'$created_date'}
}
},
{$match:
{
month:9,
year: 2012
}
},
{$group:
{_id: '0',
totalComments:{$sum:'$comments'}
}
}
]
}