Weighted Average rating through mongodb - mongodb

Is it possible to do a query to sort by "weighted average"
There is 5 values from 1-5 possible. Weighted average is
(n5*5 + n4*4 + n3*3 + n2*2 + n1*1) / (n5+n4+n3+n2+n1)
Where n5 would be the count of objects with rating: 5
I have the following example. If you find better structure to store I am happy to hear.
{
"_id" : "wPg4jzJsEFXNxR5Wf",
"caveId" : "56424a93819e7419112c883e",
"data" : [
{
"value" : 1
},
{
"value" : 3
},
{
"value" : 4
},
{
"value" : 2
}
]
}
{
"_id" : "oSrtv33MgnkJFvNan",
"caveId" : "56424a93819e7419112c949f",
"data" : [
{
"value" : 1
},
{
"value" : 4
},
{
"value" : 4
},
{
"value" : 2
}
]
}
{
"_id" : "gJRMMQPwDwjFrL7zz",
"caveId" : "56424a93819e7419112c8727",
"data" : [
{
"value" : 5
},
{
"value" : 1
},
{
"value" : 4
}
]
}
Example of _ID: oSrtv33MgnkJFvNan (Second one)
(2*4 + 1*2 + 1*1)/(2+1+1) = 2.75
Then I would want to sort all the documents by that value.
Order would be
gJRMMQPwDwjFrL7zz: value: 3.33
oSrtv33MgnkJFvNan: value 2.75
wPg4jzJsEFXNxR5Wf: value 2.5

Well the answer is really both "yes" and "no" in respect to can MongoDB sort data from calculation like this. It can of course do it, but possibly not in a practical way for your purpose.
The two tools MongoDB has to do any sort of calculation are the aggregation framework and mapReduce. The former currently lacks the operators to really handle this in a practical way. The second can be "tricked" into sorting, as an artifact of how mapReduce works, by putting the component to be sorted in the grouping key (even if there is no actual grouping).
So you can basically apply the math with something like this:
db.data.mapReduce(
function() {
var vals = this.data.map(function(el){ return el.value }),
uniq = {};
vals.forEach(function(el) {
if (!uniq.hasOwnProperty(el)) {
uniq[el] = 1;
} else {
uniq[el]++;
}
});
var weight = Array.sum(Object.keys(uniq).map(function(key) {
return uniq[key] * key
})) / Array.sum(Object.keys(uniq).map(function(key) {
return uniq[key];
}))
var id = this._id;
delete this._id;
emit({ "weight": weight, "orig": id },this);
},
function() {},
{ "out": { "inline": 1 } }
)
Which gives you this output:
{
"results" : [
{
"_id" : {
"weight" : 2.5,
"orig" : "wPg4jzJsEFXNxR5Wf"
},
"value" : {
"caveId" : "56424a93819e7419112c883e",
"data" : [
{
"value" : 1
},
{
"value" : 3
},
{
"value" : 4
},
{
"value" : 2
}
]
}
},
{
"_id" : {
"weight" : 2.75,
"orig" : "oSrtv33MgnkJFvNan"
},
"value" : {
"caveId" : "56424a93819e7419112c949f",
"data" : [
{
"value" : 1
},
{
"value" : 4
},
{
"value" : 4
},
{
"value" : 2
}
]
}
},
{
"_id" : {
"weight" : 3.3333333333333335,
"orig" : "gJRMMQPwDwjFrL7zz"
},
"value" : {
"caveId" : "56424a93819e7419112c8727",
"data" : [
{
"value" : 5
},
{
"value" : 1
},
{
"value" : 4
}
]
}
}
]
}
So all the results are sorted, but of course the restriction applies that mapReduce can only produce "inline" output that is under the 16MB BSON limit, or alternately write the results out to another collection.
Even with new features being added to the aggregation framework that can assist here ( from current development series 3.1.x ) this would still require some juggling with $unwind in order to get the "sum" of elements in any way ( no such feature as a "reduce" function yet ), which does not make it a stable or practical alternative.
So you can do it with mapReduce, but for my money I would have another process that calculates this to run periodicallly ( or triggered on updates ) and update a standard "weight" field on the document, that could then be used directly for sorting.
Having a value in place in your documents is always the most performant option.
For the curious, you can grab a development branch release of MongoDB ( 3.1.x series ), or any release after that and apply an aggregation pipeline like this:
db.data.aggregate([
{ "$project": {
"caveId": 1,
"data": 1,
"conv": {
"$setUnion": [
{ "$map": {
"input": "$data",
"as": "el",
"in": "$$el.value"
}},
[]
]
},
"orig": {
"$map": {
"input": "$data",
"as": "el",
"in": "$$el.value"
}
}
}},
{ "$project": {
"caveId": 1,
"data": 1,
"conv": 1,
"orig": 1,
"counts": { "$map": {
"input": "$conv",
"as": "el",
"in": {
"$size": {
"$filter": {
"input": "$orig",
"as": "o",
"cond": {
"$eq": [ "$$o", "$$el" ]
}
}
}
}
}}
}},
{ "$unwind": { "path": "$conv", "includeArrayIndex": true } },
{ "$group": {
"_id": "$_id",
"caveId": { "$first": "$caveId" },
"data": { "$first": "$data" },
"counts": { "$first": "$counts" },
"mult": {
"$sum": {
"$multiply": [
"$conv.value",
{ "$arrayElemAt": [ "$counts", "$conv.index" ] }
]
}
}
}},
{ "$unwind": "$counts" },
{ "$group": {
"_id": "$_id",
"caveId": { "$first": "$caveId" },
"data": { "$first": "$data" },
"count": { "$sum": "$counts" },
"mult": { "$first": "$mult" }
}},
{ "$project": {
"data": 1,
"weight": { "$divide": [ "$mult", "$count" ] }
}},
{ "$sort": { "weight": 1 } }
])
But even with helpers like $filter and "includeArrayIndex" in $unwind and the $arrayElemAt operator using that index later to match up the distinct elements with their counts, the usage of $unwind in any way makes this a non-performant solution.
It may become practical in the future if operators like $map can produce index values needed for pairing and with the introduction of any methods to similarly do an "in-line sum" operation or other math on array results without processing $unwind. But as of writing this does not exist, even in development.

Related

Aggregate and Reduce Nested Documents and Arrays

EDIT:
Our use case:
We get continues reports from servers about visitors. We pre-aggregate the data on the servers for a few seconds aber after that insert these "reports" into MongoDB.
In our dashboard we would like to query the different browsers, OSes, geolocation (country etc.) based on time ranges.
So like: Within the last 7 days, there were 1000 visitors using Chrome, 500 from Germany, 200 from England and so on.
I'm pretty stuck with a MongoDB query we need for our dashboard.
We have following report entries:
{
"_id" : ObjectId("59b9d08e402025326e1a0f30"),
"channel_perm_id" : "c361049fb4144b0e81b71c0b6cfdc296",
"source_id" : "insomnia",
"start_timestamp" : ISODate("2017-09-14T00:42:54.510Z"),
"end_timestamp" : ISODate("2017-09-14T00:42:54.510Z"),
"timestamp" : ISODate("2017-09-14T00:42:54.510Z"),
"resource_uri" : "b755d62a-8c0a-4e8a-945f-41782c13535b",
"sources_info" : {
"browsers" : [
{
"name" : "Chrome",
"count" : NumberLong(2)
}
],
"operating_systems" : [
{
"name" : "Mac OS X",
"count" : NumberLong(2)
}
],
"continent_ids" : [
{
"name" : "EU",
"count" : NumberLong(1)
}
],
"country_ids" : [
{
"name" : "DE",
"count" : NumberLong(1)
}
],
"city_ids" : [
{
"name" : "Solingen",
"count" : NumberLong(1)
}
]
},
"unique_sources" : NumberLong(1),
"requests" : NumberLong(1),
"cache_hits" : NumberLong(0),
"cache_misses" : NumberLong(1),
"cache_hit_size" : NumberLong(0),
"cache_refill_size" : NumberLong("170000000000")
}
Now, we need to aggregate these reports based on timestamp.
So far, so easy:
db.channel_report.aggregate([{
$group: {
_id: {
$dateToString: {
format: "%Y",
date: "$timestamp"
}
},
sources_info: {
$push: "$sources_info"
}
},
}];
But now it gets difficult for me. As you might already noticed, the sources_info object is the problem.
Instead of just "pushing" all sources info into array per group, we need to actually accumulate it.
So, if we have something like this:
{
sources_info: [
{
browsers: [
{
name: "Chrome,
count: 1
}
]
},
{
browsers: [
{
name: "Chrome,
count: 1
}
]
}
]
}
The array should be reduced to this:
{
sources_info:
{
browsers: [
{
name: "Chrome,
count: 2
}
]
}
}
We migrated from MySQL to MongoDB for analytics, but I have no clue how to model this behaviour in Mongo. Regarding the docs I almost think it is not possible, at least not with the current data structure.
Is there a nice solution for this? Or maybe even a different kind of data structure?
Cheers,
Chris from StriveCDN
The basic problem you have is that you are using "named keys" where you probably really should be instead using values to a consistent attribute path. This means instead of keys like "browsers", this probably should simply be "type": "browser" and so on on each entry.
The reasoning for this should become apparent on the general approaches to aggregating the data. It also really helps in querying in general. But the approaches basically involve coercing your initial data format into this kind of structure in order to aggregate it first.
With most recent releases ( MongoDB 3.4.4 and greater ), we can work with your named keys via $objectToArray and manipulate as follows:
db.channel_report.aggregate([
{ "$project": {
"timestamp": 1,
"sources": {
"$reduce": {
"input": {
"$map": {
"input": { "$objectToArray": "$sources_info" },
"as": "s",
"in": {
"$map": {
"input": "$$s.v",
"as": "v",
"in": {
"type": "$$s.k",
"name": "$$v.name",
"count": "$$v.count"
}
}
}
}
},
"initialValue": [],
"in": { "$concatArrays": ["$$value", "$$this"] }
}
}
}},
{ "$unwind": "$sources" },
{ "$group": {
"_id": {
"year": { "$year": "$timestamp" },
"type": "$sources.type",
"name": "$sources.name"
},
"count": { "$sum": "$sources.count" }
}},
{ "$group": {
"_id": { "year": "$_id.year", "type": "$_id.type" },
"v": { "$push": { "name": "$_id.name", "count": "$count" } }
}},
{ "$group": {
"_id": "$_id.year",
"sources_info": {
"$push": { "k": "$_id.type", "v": "$v" }
}
}},
{ "$addFields": {
"sources_info": { "$arrayToObject": "$sources_info" }
}}
])
Taking that back a notch to MongoDB 3.4 ( which should be default on most hosted services by now ) you could alternately declare each key name manually:
db.channel_report.aggregate([
{ "$project": {
"timestamp": 1,
"sources": {
"$concatArrays": [
{ "$map": {
"input": "$sources_info.browsers",
"in": {
"type": "browsers",
"name": "$$this.name",
"count": "$$this.count"
}
}},
{ "$map": {
"input": "$sources_info.operating_systems",
"in": {
"type": "operating_systems",
"name": "$$this.name",
"count": "$$this.count"
}
}},
{ "$map": {
"input": "$sources_info.continent_ids",
"in": {
"type": "continent_ids",
"name": "$$this.name",
"count": "$$this.count"
}
}},
{ "$map": {
"input": "$sources_info.country_ids",
"in": {
"type": "country_ids",
"name": "$$this.name",
"count": "$$this.count"
}
}},
{ "$map": {
"input": "$sources_info.city_ids",
"in": {
"type": "city_ids",
"name": "$$this.name",
"count": "$$this.count"
}
}}
]
}
}},
{ "$unwind": "$sources" },
{ "$group": {
"_id": {
"year": { "$year": "$timestamp" },
"type": "$sources.type",
"name": "$sources.name"
},
"count": { "$sum": "$sources.count" }
}},
{ "$group": {
"_id": { "year": "$_id.year", "type": "$_id.type" },
"v": { "$push": { "name": "$_id.name", "count": "$count" } }
}},
{ "$group": {
"_id": "$_id.year",
"sources": {
"$push": { "k": "$_id.type", "v": "$v" }
}
}},
{ "$project": {
"sources_info": {
"browsers": {
"$arrayElemAt": [
"$sources.v",
{ "$indexOfArray": [ "$sources.k", "browsers" ] }
]
},
"operating_systems": {
"$arrayElemAt": [
"$sources.v",
{ "$indexOfArray": [ "$sources.k", "operating_systems" ] }
]
},
"continent_ids": {
"$arrayElemAt": [
"$sources.v",
{ "$indexOfArray": [ "$sources.k", "continent_ids" ] }
]
},
"country_ids": {
"$arrayElemAt": [
"$sources.v",
{ "$indexOfArray": [ "$sources.k", "country_ids" ] }
]
},
"city_ids": {
"$arrayElemAt": [
"$sources.v",
{ "$indexOfArray": [ "$sources.k", "city_ids" ] }
]
}
}
}}
])
We can even wind that back to MongoDB 3.2 by using $map and $filter in place of $indexOfArray, but the general approach is the main thing to explain.
Concatenate arrays
The main thing that needs to happen is to take the data from the many different arrays with named keys and make a "single array" with a "type" property representing each key name. This is arguably how the data should be stored in the first place, and the first aggregation stage of either approach comes out like this:
/* 1 */
{
"_id" : ObjectId("59b9d08e402025326e1a0f30"),
"timestamp" : ISODate("2017-09-14T00:42:54.510Z"),
"sources" : [
{
"type" : "browsers",
"name" : "Chrome",
"count" : NumberLong(2)
},
{
"type" : "operating_systems",
"name" : "Mac OS X",
"count" : NumberLong(2)
},
{
"type" : "continent_ids",
"name" : "EU",
"count" : NumberLong(1)
},
{
"type" : "country_ids",
"name" : "DE",
"count" : NumberLong(1)
},
{
"type" : "city_ids",
"name" : "Solingen",
"count" : NumberLong(1)
}
]
}
Unwind and Group
Part of the data you want to accumulate on actually includes those "type" and "name" properties from "within" the array. Whenever you need to accumulate across documents from "within an array", the process you use is $unwind in order to be able to access those values as part of the grouping key.
What this means is that after using $unwind on the combined array, you then want to $group on both of those keys and the reduced "timestamp" detail in order to $sum the "count" values.
Since you then have "sub-levels" of detail ( i.e each name of browser within browsers ) then you use additional $group pipeline stages, gradually decreasing the granularity of the grouping keys and using $push to accumulate the details into arrays.
In either case, omitting the very last stage of output the accumulated structure comes out as:
/* 1 */
{
"_id" : 2017,
"sources_info" : [
{
"k" : "continent_ids",
"v" : [
{
"name" : "EU",
"count" : NumberLong(1)
}
]
},
{
"k" : "city_ids",
"v" : [
{
"name" : "Solingen",
"count" : NumberLong(1)
}
]
},
{
"k" : "country_ids",
"v" : [
{
"name" : "DE",
"count" : NumberLong(1)
}
]
},
{
"k" : "browsers",
"v" : [
{
"name" : "Chrome",
"count" : NumberLong(2)
}
]
},
{
"k" : "operating_systems",
"v" : [
{
"name" : "Mac OS X",
"count" : NumberLong(2)
}
]
}
]
}
This really is the final state of the data, though not represented in the same form as it was originally found. It is arguably complete at this point as any further processing is merely cosmetic to output as named keys again.
Output to named keys
As shown the varied approaches are either looking up the array entries by the matching key name, or by using $arrayToObject to transform the array content back into an object with named keys.
An alternate is also to simply do that very last manipulation in code, as shown by this .map() example of manipulating the cursor result in the shell:
db.channel_report.aggregate([
{ "$project": {
"timestamp": 1,
"sources": {
"$reduce": {
"input": {
"$map": {
"input": { "$objectToArray": "$sources_info" },
"as": "s",
"in": {
"$map": {
"input": "$$s.v",
"as": "v",
"in": {
"type": "$$s.k",
"name": "$$v.name",
"count": "$$v.count"
}
}
}
}
},
"initialValue": [],
"in": { "$concatArrays": ["$$value", "$$this"] }
}
}
}},
{ "$unwind": "$sources" },
{ "$group": {
"_id": {
"year": { "$year": "$timestamp" },
"type": "$sources.type",
"name": "$sources.name"
},
"count": { "$sum": "$sources.count" }
}},
{ "$group": {
"_id": { "year": "$_id.year", "type": "$_id.type" },
"v": { "$push": { "name": "$_id.name", "count": "$count" } }
}},
{ "$group": {
"_id": "$_id.year",
"sources_info": {
"$push": { "k": "$_id.type", "v": "$v" }
}
}},
/*
{ "$addFields": {
"sources_info": { "$arrayToObject": "$sources_info" }
}}
*/
]).map( d => Object.assign(d,{
"sources_info": d.sources_info.reduce((acc,curr) =>
Object.assign(acc,{ [curr.k]: curr.v }),{})
}))
Which of course applies to either aggregation pipeline approach.
And of course even $concatArrays can be replaced with $setUnion as long as all the entries have a unique identifying combination of "name" and "type" ( as they appear to be ), and that means with application of modifying the final output by processing the cursor instead you can apply the technique even as far back as MongoDB 2.6.
Final Output
And the final output ( actually aggregated of course, but the question only samples one document ) accumulates for all the sub-keys and reconstructs from the last sample output as shown as:
{
"_id" : 2017,
"sources_info" : {
"continent_ids" : [
{
"name" : "EU",
"count" : NumberLong(1)
}
],
"city_ids" : [
{
"name" : "Solingen",
"count" : NumberLong(1)
}
],
"country_ids" : [
{
"name" : "DE",
"count" : NumberLong(1)
}
],
"browsers" : [
{
"name" : "Chrome",
"count" : NumberLong(2)
}
],
"operating_systems" : [
{
"name" : "Mac OS X",
"count" : NumberLong(2)
}
]
}
}
Where every array entry under each key of sources_info is reduced down to it's cumulative count for every other entry sharing the same "name".

Group by day with Multiple Date Fields

I have documents stored into MongoDB like this :
{
"_id" : "XBpNKbdGSgGfnC2MJ",
"po" : 72134185,
"machine" : 40940,
"location" : "02A01",
"inDate" : ISODate("2017-07-19T06:10:13.059Z"),
"requestDate" : ISODate("2017-07-19T06:17:04.901Z"),
"outDate" : ISODate("2017-07-19T06:30:34Z")
}
And I want give the sum, by day, of inDate and outDate.
I can retrieve of both side the sum of documents by inDate day and, on other side, the sum of documents by outDate, but I would like the sum of each.
Currently, I use this pipeline :
$group: {
_id: {
yearA: { $year: '$inDate' },
monthA: { $month: '$inDate' },
dayA: { $dayOfMonth: '$inDate' },
},
count: { $sum: 1 },
},
and I give :
{ "_id" : { "year" : 2017, "month" : 7, "day" : 24 }, "count" : 1 }
{ "_id" : { "year" : 2017, "month" : 7, "day" : 21 }, "count" : 11 }
{ "_id" : { "year" : 2017, "month" : 7, "day" : 19 }, "count" : 20 }
But I would like, if it's possible :
{ "_id" : { "year" : 2017, "month" : 7, "day" : 24 }, "countIn" : 1, "countOut" : 4 }
{ "_id" : { "year" : 2017, "month" : 7, "day" : 21 }, "countIn" : 11, "countOut" : 23 }
{ "_id" : { "year" : 2017, "month" : 7, "day" : 19 }, "countIn" : 20, "countOut" : 18 }
Any idea ?
Many thanks :-)
You can also split the documents at the source, by essentially combining each value into an array of entries by "type" for "in" and "out". You can do this simply using $map and $cond to select the fields, then $unwind the array and then determine which field to "count" again by inspecting with $cond:
collection.aggregate([
{ "$project": {
"dates": {
"$filter": {
"input": {
"$map": {
"input": [ "in", "out" ],
"as": "type",
"in": {
"type": "$$type",
"date": {
"$cond": {
"if": { "$eq": [ "$$type", "in" ] },
"then": "$inDate",
"else": "$outDate"
}
}
}
}
},
"as": "dates",
"cond": { "$ne": [ "$$dates.date", null ] }
}
}
}},
{ "$unwind": "$dates" },
{ "$group": {
"_id": {
"year": { "$year": "$dates.date" },
"month": { "$month": "$dates.date" },
"day": { "$dayOfMonth": "$dates.date" }
},
"countIn": {
"$sum": {
"$cond": {
"if": { "$eq": [ "$dates.type", "in" ] },
"then": 1,
"else": 0
}
}
},
"countOut": {
"$sum": {
"$cond": {
"if": { "$eq": [ "$dates.type", "out" ] },
"then": 1,
"else": 0
}
}
}
}}
])
That's a safe way to do this that does not risk breaking the BSON limit, no matter what size of data you send at it.
Personally I would rather run as separate processes and "combine" the aggregated results separately, but that would depend on the environment you are running in, which is not mentioned in the question.
For an example of "parallel" execution, you can structure in Meteor somewhere along these lines:
import { Meteor } from 'meteor/meteor';
import { Source } from '../imports/source';
import { Target } from '../imports/target';
Meteor.startup(async () => {
// code to run on server at startup
await Source.remove({});
await Target.remove({});
console.log('Removed');
Source.insert({
"_id" : "XBpNKbdGSgGfnC2MJ",
"po" : 72134185,
"machine" : 40940,
"location" : "02A01",
"inDate" : new Date("2017-07-19T06:10:13.059Z"),
"requestDate" : new Date("2017-07-19T06:17:04.901Z"),
"outDate" : new Date("2017-07-19T06:30:34Z")
});
console.log('Inserted');
await Promise.all(
["In","Out"].map( f => new Promise((resolve,reject) => {
let cursor = Source.rawCollection().aggregate([
{ "$match": { [`${f.toLowerCase()}Date`]: { "$exists": true } } },
{ "$group": {
"_id": {
"year": { "$year": `$${f.toLowerCase()}Date` },
"month": { "$month": `$${f.toLowerCase()}Date` },
"day": { "$dayOfYear": `$${f.toLowerCase()}Date` }
},
[`count${f}`]: { "$sum": 1 }
}}
]);
cursor.on('data', async (data) => {
cursor.pause();
data.date = data._id;
delete data._id;
await Target.upsert(
{ date: data.date },
{ "$set": data }
);
cursor.resume();
});
cursor.on('end', () => resolve('done'));
cursor.on('error', (err) => reject(err));
}))
);
console.log('Mapped');
let targets = await Target.find().fetch();
console.log(targets);
});
Which is essentially going to output to the target collection as was mentioned in comments like:
{
"_id" : "XdPGMkY24AcvTnKq7",
"date" : {
"year" : 2017,
"month" : 7,
"day" : 200
},
"countIn" : 1,
"countOut" : 1
}
Riiiight. I came up with the following query. Admittedly, I have seen simpler and nicer ones in my life but it certainly gets the job done:
db.getCollection('test').aggregate
(
{
$facet: // split aggregation into two pipelines
{
"in": [
{ "$match": { "inDate": { "$ne": null } } }, // get rid of null values
{ $group: { "_id": { "y": { "$year": "$inDate" }, "m": { "$month": "$inDate" }, "d": { "$dayOfMonth": "$inDate" } }, "cIn": { $sum : 1 } } }, // compute sum per inDate
],
"out": [
{ "$match": { "outDate": { "$ne": null } } }, // get rid of null values
{ $group: { "_id": { "y": { "$year": "$outDate" }, "m": { "$month": "$outDate" }, "d": { "$dayOfMonth": "$outDate" } }, "cOut": { $sum : 1 } } }, // compute sum per outDate
]
}
},
{ $project: { "result": { $setUnion: [ "$in", "$out" ] } } }, // merge results into new array
{ $unwind: "$result" }, // unwind array into individual documents
{ $replaceRoot: { newRoot: "$result" } }, // get rid of the additional field level
{ $group: { _id: { year: "$_id.y", "month": "$_id.m", "day": "$_id.d" }, "countIn": { $sum: "$cIn" }, "countOut": { $sum: "$cOut" } } } // group into final result
)
As always with MongoDB aggregations you can get an idea of what's going on by simply reducing the projection stages step by step starting from the end of the query.
EDIT:
As you can see in the comments below there was a bit of a discussion around document size limits and the general applicability of this solution.
So let's look at those aspects in greater detail and let's also compare the performance of the $facet based solution to the one based on $map (suggested by #NeilLunn to avoid potential document size issues).
I created 2 million test records that have random dates assigned to both the "inDate" and the "outDate" field:
{
"_id" : ObjectId("597857e0fa37b3f66959571a"),
"inDate" : ISODate("2016-07-29T22:00:00.000Z"),
"outDate" : ISODate("1988-07-14T22:00:00.000Z")
}
The data range covered was from 01.01.1970 all the way to 01.01.2050, that's a total of 29220 distinct days. Given the random distribution of the 2 million test records across this time range both queries can be expected to return the full 29220 possible results (which both did).
Then I ran both queries five times after restarting my single MongoDB instance freshly and the results in milliseconds I got looked like this:
$facet: 5663, 5400, 5380, 5460, 5520
$map: 9648, 9134, 9058, 9085, 9132
I also measured the size of the single document returned by the facet stage which was 3.19MB so reasonably far away from the MongoDB document size limit (16MB at the time of writing) which, however, only applies to the result document anyway and wouldn't be a problem during pipeline processing.
Bottom line: If you want performance, use the solution suggested here. Be careful about the document size limit, though, in particular if your use case is not the exact one described in the question above (e.g. when you need to collect even more/bigger data). Also, I am not sure if in a sharded scenario both solutions still expose the same performance characteristics...

Group and count over a start and end range

If I have data in the following format:
[
{
_id: 1,
startDate: ISODate("2017-01-1T00:00:00.000Z"),
endDate: ISODate("2017-02-25T00:00:00.000Z"),
type: 'CAR'
},
{
_id: 2,
startDate: ISODate("2017-02-17T00:00:00.000Z"),
endDate: ISODate("2017-03-22T00:00:00.000Z"),
type: 'HGV'
}
]
Is it possible to retrieve data grouped by 'type', but also with a count of the type for each of month in a given date range e.g. between 2017/1/1 to 2017/4/1 would return:
[
{
_id: 'CAR',
monthCounts: [
/*January*/
{
from: ISODate("2017-01-1T00:00:00.000Z"),
to: ISODate("2017-01-31T23:59:59.999Z"),
count: 1
},
/*February*/
{
from: ISODate("2017-02-1T00:00:00.000Z"),
to: ISODate("2017-02-28T23:59:59.999Z"),
count: 1
},
/*March*/
{
from: ISODate("2017-03-1T00:00:00.000Z"),
to: ISODate("2017-03-31T23:59:59.999Z"),
count: 0
},
]
},
{
_id: 'HGV',
monthCounts: [
{
from: ISODate("2017-01-1T00:00:00.000Z"),
to: ISODate("2017-01-31T23:59:59.999Z"),
count: 0
},
{
from: ISODate("2017-02-1T00:00:00.000Z"),
to: ISODate("2017-02-28T23:59:59.999Z"),
count: 1
},
{
from: ISODate("2017-03-1T00:00:00.000Z"),
to: ISODate("2017-03-31T23:59:59.999Z"),
count: 1
},
]
}
]
The returned format is not really important, but what I am trying to achieve is in a single query to retrieve a number of counts for the same grouping (one per month). The input could be simply a start and end date to report from or more likely it could be an array of the date ranges to group by.
The algorithm for this is to basically "iterate" values between the interval of the two values. MongoDB has a couple of ways to deal with this, being what has always been present with mapReduce() and with new features available to the aggregate() method.
I'm going expand on your selection to deliberately show an overlapping month since your examples did not have one. This will result in the "HGV" values appearing in "three" months of output.
{
"_id" : 1,
"startDate" : ISODate("2017-01-01T00:00:00Z"),
"endDate" : ISODate("2017-02-25T00:00:00Z"),
"type" : "CAR"
}
{
"_id" : 2,
"startDate" : ISODate("2017-02-17T00:00:00Z"),
"endDate" : ISODate("2017-03-22T00:00:00Z"),
"type" : "HGV"
}
{
"_id" : 3,
"startDate" : ISODate("2017-02-17T00:00:00Z"),
"endDate" : ISODate("2017-04-22T00:00:00Z"),
"type" : "HGV"
}
Aggregate - Requires MongoDB 3.4
db.cars.aggregate([
{ "$addFields": {
"range": {
"$reduce": {
"input": { "$map": {
"input": { "$range": [
{ "$trunc": {
"$divide": [
{ "$subtract": [ "$startDate", new Date(0) ] },
1000
]
}},
{ "$trunc": {
"$divide": [
{ "$subtract": [ "$endDate", new Date(0) ] },
1000
]
}},
60 * 60 * 24
]},
"as": "el",
"in": {
"$let": {
"vars": {
"date": {
"$add": [
{ "$multiply": [ "$$el", 1000 ] },
new Date(0)
]
},
"month": {
}
},
"in": {
"$add": [
{ "$multiply": [ { "$year": "$$date" }, 100 ] },
{ "$month": "$$date" }
]
}
}
}
}},
"initialValue": [],
"in": {
"$cond": {
"if": { "$in": [ "$$this", "$$value" ] },
"then": "$$value",
"else": { "$concatArrays": [ "$$value", ["$$this"] ] }
}
}
}
}
}},
{ "$unwind": "$range" },
{ "$group": {
"_id": {
"type": "$type",
"month": "$range"
},
"count": { "$sum": 1 }
}},
{ "$sort": { "_id": 1 } },
{ "$group": {
"_id": "$_id.type",
"monthCounts": {
"$push": { "month": "$_id.month", "count": "$count" }
}
}}
])
The key to making this work is the $range operator which takes values for a "start" and and "end" as well as an "interval" to apply. The result is an array of values taken from the "start" and incremented until the "end" is reached.
We use this with startDate and endDate to generate the possible dates in between those values. You will note that we need to do some math here since the $range only takes a 32-bit integer, but we can take the milliseconds away from the timestamp values so that is okay.
Because we want "months", the operations applied extract the month and year values from the generated range. We actually generate the range as the "days" in between since "months" are difficult to deal with in math. The subsequent $reduce operation takes only the "distinct months" from the date range.
The result therefore of the first aggregation pipeline stage is a new field in the document which is an "array" of all the distinct months covered between startDate and endDate. This gives an "iterator" for the rest of the operation.
By "iterator" I mean than when we apply $unwind we get a copy of the original document for every distinct month covered in the interval. This then allows the following two $group stages to first apply a grouping to the common key of "month" and "type" in order to "total" the counts via $sum, and next $group makes the key just the "type" and puts the results in an array via $push.
This gives the result on the above data:
{
"_id" : "HGV",
"monthCounts" : [
{
"month" : 201702,
"count" : 2
},
{
"month" : 201703,
"count" : 2
},
{
"month" : 201704,
"count" : 1
}
]
}
{
"_id" : "CAR",
"monthCounts" : [
{
"month" : 201701,
"count" : 1
},
{
"month" : 201702,
"count" : 1
}
]
}
Note that the coverage of "months" is only present where there is actual data. Whilst possible to produce zero values over a range, it requires quite a bit of wrangling to do so and is not very practical. If you want zero values then it is better to add that in post processing in the client once the results have been retrieved.
If you really have your heart set on the zero values, then you should separately query for $min and $max values, and pass these in to "brute force" the pipeline into generating the copies for each supplied possible range value.
So this time the "range" is made externally to all documents, and you then use a $cond statement into the accumulator to see if the current data is within the grouped range produced. Also since the generation is "external", we really don't need the MongoDB 3.4 operator of $range, so this can be applied to earlier versions as well:
// Get min and max separately
var ranges = db.cars.aggregate(
{ "$group": {
"_id": null,
"startRange": { "$min": "$startDate" },
"endRange": { "$max": "$endDate" }
}}
).toArray()[0]
// Make the range array externally from all possible values
var range = [];
for ( var d = new Date(ranges.startRange.valueOf()); d <= ranges.endRange; d.setUTCMonth(d.getUTCMonth()+1)) {
var v = ( d.getUTCFullYear() * 100 ) + d.getUTCMonth()+1;
range.push(v);
}
// Run conditional aggregation
db.cars.aggregate([
{ "$addFields": { "range": range } },
{ "$unwind": "$range" },
{ "$group": {
"_id": {
"type": "$type",
"month": "$range"
},
"count": {
"$sum": {
"$cond": {
"if": {
"$and": [
{ "$gte": [
"$range",
{ "$add": [
{ "$multiply": [ { "$year": "$startDate" }, 100 ] },
{ "$month": "$startDate" }
]}
]},
{ "$lte": [
"$range",
{ "$add": [
{ "$multiply": [ { "$year": "$endDate" }, 100 ] },
{ "$month": "$endDate" }
]}
]}
]
},
"then": 1,
"else": 0
}
}
}
}},
{ "$sort": { "_id": 1 } },
{ "$group": {
"_id": "$_id.type",
"monthCounts": {
"$push": { "month": "$_id.month", "count": "$count" }
}
}}
])
Which produces the consistent zero fills for all possible months on all groupings:
{
"_id" : "HGV",
"monthCounts" : [
{
"month" : 201701,
"count" : 0
},
{
"month" : 201702,
"count" : 2
},
{
"month" : 201703,
"count" : 2
},
{
"month" : 201704,
"count" : 1
}
]
}
{
"_id" : "CAR",
"monthCounts" : [
{
"month" : 201701,
"count" : 1
},
{
"month" : 201702,
"count" : 1
},
{
"month" : 201703,
"count" : 0
},
{
"month" : 201704,
"count" : 0
}
]
}
MapReduce
All versions of MongoDB support mapReduce, and the simple case of the "iterator" as mentioned above is handled by a for loop in the mapper. We can get output as generated up to the first $group from above by simply doing:
db.cars.mapReduce(
function () {
for ( var d = this.startDate; d <= this.endDate;
d.setUTCMonth(d.getUTCMonth()+1) )
{
var m = new Date(0);
m.setUTCFullYear(d.getUTCFullYear());
m.setUTCMonth(d.getUTCMonth());
emit({ id: this.type, date: m},1);
}
},
function(key,values) {
return Array.sum(values);
},
{ "out": { "inline": 1 } }
)
Which produces:
{
"_id" : {
"id" : "CAR",
"date" : ISODate("2017-01-01T00:00:00Z")
},
"value" : 1
},
{
"_id" : {
"id" : "CAR",
"date" : ISODate("2017-02-01T00:00:00Z")
},
"value" : 1
},
{
"_id" : {
"id" : "HGV",
"date" : ISODate("2017-02-01T00:00:00Z")
},
"value" : 2
},
{
"_id" : {
"id" : "HGV",
"date" : ISODate("2017-03-01T00:00:00Z")
},
"value" : 2
},
{
"_id" : {
"id" : "HGV",
"date" : ISODate("2017-04-01T00:00:00Z")
},
"value" : 1
}
So it does not have the second grouping to compound to arrays, but we did produce the same basic aggregated output.

Limit results in a Mongo Aggregation [duplicate]

I want to group all the documents according to a field but to restrict the number of documents grouped for each value.
Each message has a conversation_ID. I need to get 10 or lesser number of messages for each conversation_ID.
I am able to group according to the following command but can't figure out how to restrict the
number of grouped documents apart from slicing the results
Message.aggregate({'$group':{_id:'$conversation_ID',msgs:{'$push':{msgid:'$_id'}}}})
How to limit the length of msgs array for each conversation_ID to 10?
Modern
From MongoDB 3.6 there is a "novel" approach to this by using $lookup to perform a "self join" in much the same way as the original cursor processing demonstrated below.
Since in this release you can specify a "pipeline" argument to $lookup as a source for the "join", this essentially means you can use $match and $limit to gather and "limit" the entries for the array:
db.messages.aggregate([
{ "$group": { "_id": "$conversation_ID" } },
{ "$lookup": {
"from": "messages",
"let": { "conversation": "$_id" },
"pipeline": [
{ "$match": { "$expr": { "$eq": [ "$conversation_ID", "$$conversation" ] } }},
{ "$limit": 10 },
{ "$project": { "_id": 1 } }
],
"as": "msgs"
}}
])
You can optionally add additional projection after the $lookup in order to make the array items simply the values rather than documents with an _id key, but the basic result is there by simply doing the above.
There is still the outstanding SERVER-9277 which actually requests a "limit to push" directly, but using $lookup in this way is a viable alternative in the interim.
NOTE: There also is $slice which was introduced after writing the original answer and mentioned by "outstanding JIRA issue" in the original content. Whilst you can get the same result with small result sets, it does involve still "pushing everything" into the array and then later limiting the final array output to the desired length.
So that's the main distinction and why it's generally not practical to $slice for large results. But of course can be alternately used in cases where it is.
There are a few more details on mongodb group values by multiple fields about either alternate usage.
Original
As stated earlier, this is not impossible but certainly a horrible problem.
Actually if your main concern is that your resulting arrays are going to be exceptionally large, then you best approach is to submit for each distinct "conversation_ID" as an individual query and then combine your results. In very MongoDB 2.6 syntax which might need some tweaking depending on what your language implementation actually is:
var results = [];
db.messages.aggregate([
{ "$group": {
"_id": "$conversation_ID"
}}
]).forEach(function(doc) {
db.messages.aggregate([
{ "$match": { "conversation_ID": doc._id } },
{ "$limit": 10 },
{ "$group": {
"_id": "$conversation_ID",
"msgs": { "$push": "$_id" }
}}
]).forEach(function(res) {
results.push( res );
});
});
But it all depends on whether that is what you are trying to avoid. So on to the real answer:
The first issue here is that there is no function to "limit" the number of items that are "pushed" into an array. It is certainly something we would like, but the functionality does not presently exist.
The second issue is that even when pushing all items into an array, you cannot use $slice, or any similar operator in the aggregation pipeline. So there is no present way to get just the "top 10" results from a produced array with a simple operation.
But you can actually produce a set of operations to effectively "slice" on your grouping boundaries. It is fairly involved, and for example here I will reduce the array elements "sliced" to "six" only. The main reason here is to demonstrate the process and show how to do this without being destructive with arrays that do not contain the total you want to "slice" to.
Given a sample of documents:
{ "_id" : 1, "conversation_ID" : 123 }
{ "_id" : 2, "conversation_ID" : 123 }
{ "_id" : 3, "conversation_ID" : 123 }
{ "_id" : 4, "conversation_ID" : 123 }
{ "_id" : 5, "conversation_ID" : 123 }
{ "_id" : 6, "conversation_ID" : 123 }
{ "_id" : 7, "conversation_ID" : 123 }
{ "_id" : 8, "conversation_ID" : 123 }
{ "_id" : 9, "conversation_ID" : 123 }
{ "_id" : 10, "conversation_ID" : 123 }
{ "_id" : 11, "conversation_ID" : 123 }
{ "_id" : 12, "conversation_ID" : 456 }
{ "_id" : 13, "conversation_ID" : 456 }
{ "_id" : 14, "conversation_ID" : 456 }
{ "_id" : 15, "conversation_ID" : 456 }
{ "_id" : 16, "conversation_ID" : 456 }
You can see there that when grouping by your conditions you will get one array with ten elements and another with "five". What you want to do here reduce both to the top "six" without "destroying" the array that only will match to "five" elements.
And the following query:
db.messages.aggregate([
{ "$group": {
"_id": "$conversation_ID",
"first": { "$first": "$_id" },
"msgs": { "$push": "$_id" },
}},
{ "$unwind": "$msgs" },
{ "$project": {
"msgs": 1,
"first": 1,
"seen": { "$eq": [ "$first", "$msgs" ] }
}},
{ "$sort": { "seen": 1 }},
{ "$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$msgs", false ]
}
},
"first": { "$first": "$first" },
"second": { "$first": "$msgs" }
}},
{ "$unwind": "$msgs" },
{ "$project": {
"msgs": 1,
"first": 1,
"second": 1,
"seen": { "$eq": [ "$second", "$msgs" ] }
}},
{ "$sort": { "seen": 1 }},
{ "$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$msgs", false ]
}
},
"first": { "$first": "$first" },
"second": { "$first": "$second" },
"third": { "$first": "$msgs" }
}},
{ "$unwind": "$msgs" },
{ "$project": {
"msgs": 1,
"first": 1,
"second": 1,
"third": 1,
"seen": { "$eq": [ "$third", "$msgs" ] },
}},
{ "$sort": { "seen": 1 }},
{ "$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$msgs", false ]
}
},
"first": { "$first": "$first" },
"second": { "$first": "$second" },
"third": { "$first": "$third" },
"forth": { "$first": "$msgs" }
}},
{ "$unwind": "$msgs" },
{ "$project": {
"msgs": 1,
"first": 1,
"second": 1,
"third": 1,
"forth": 1,
"seen": { "$eq": [ "$forth", "$msgs" ] }
}},
{ "$sort": { "seen": 1 }},
{ "$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$msgs", false ]
}
},
"first": { "$first": "$first" },
"second": { "$first": "$second" },
"third": { "$first": "$third" },
"forth": { "$first": "$forth" },
"fifth": { "$first": "$msgs" }
}},
{ "$unwind": "$msgs" },
{ "$project": {
"msgs": 1,
"first": 1,
"second": 1,
"third": 1,
"forth": 1,
"fifth": 1,
"seen": { "$eq": [ "$fifth", "$msgs" ] }
}},
{ "$sort": { "seen": 1 }},
{ "$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$msgs", false ]
}
},
"first": { "$first": "$first" },
"second": { "$first": "$second" },
"third": { "$first": "$third" },
"forth": { "$first": "$forth" },
"fifth": { "$first": "$fifth" },
"sixth": { "$first": "$msgs" },
}},
{ "$project": {
"first": 1,
"second": 1,
"third": 1,
"forth": 1,
"fifth": 1,
"sixth": 1,
"pos": { "$const": [ 1,2,3,4,5,6 ] }
}},
{ "$unwind": "$pos" },
{ "$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [
{ "$eq": [ "$pos", 1 ] },
"$first",
{ "$cond": [
{ "$eq": [ "$pos", 2 ] },
"$second",
{ "$cond": [
{ "$eq": [ "$pos", 3 ] },
"$third",
{ "$cond": [
{ "$eq": [ "$pos", 4 ] },
"$forth",
{ "$cond": [
{ "$eq": [ "$pos", 5 ] },
"$fifth",
{ "$cond": [
{ "$eq": [ "$pos", 6 ] },
"$sixth",
false
]}
]}
]}
]}
]}
]
}
}
}},
{ "$unwind": "$msgs" },
{ "$match": { "msgs": { "$ne": false } }},
{ "$group": {
"_id": "$_id",
"msgs": { "$push": "$msgs" }
}}
])
You get the top results in the array, up to six entries:
{ "_id" : 123, "msgs" : [ 1, 2, 3, 4, 5, 6 ] }
{ "_id" : 456, "msgs" : [ 12, 13, 14, 15 ] }
As you can see here, loads of fun.
After you have initially grouped you basically want to "pop" the $first value off of the stack for the array results. To make this process simplified a little, we actually do this in the initial operation. So the process becomes:
$unwind the array
Compare to the values already seen with an $eq equality match
$sort the results to "float" false unseen values to the top ( this still retains order )
$group back again and "pop" the $first unseen value as the next member on the stack. Also this uses the $cond operator to replace "seen" values in the array stack with false to help in the evaluation.
The final action with $cond is there to make sure that future iterations are not just adding the last value of the array over and over where the "slice" count is greater than the array members.
That whole process needs to be repeated for as many items as you wish to "slice". Since we already found the "first" item in the initial grouping, that means n-1 iterations for the desired slice result.
The final steps are really just an optional illustration of converting everything back into arrays for the result as finally shown. So really just conditionally pushing items or false back by their matching position and finally "filtering" out all the false values so the end arrays have "six" and "five" members respectively.
So there is not a standard operator to accommodate this, and you cannot just "limit" the push to 5 or 10 or whatever items in the array. But if you really have to do it, then this is your best approach.
You could possibly approach this with mapReduce and forsake the aggregation framework all together. The approach I would take ( within reasonable limits ) would be to effectively have an in-memory hash-map on the server and accumulate arrays to that, while using JavaScript slice to "limit" the results:
db.messages.mapReduce(
function () {
if ( !stash.hasOwnProperty(this.conversation_ID) ) {
stash[this.conversation_ID] = [];
}
if ( stash[this.conversation_ID.length < maxLen ) {
stash[this.conversation_ID].push( this._id );
emit( this.conversation_ID, 1 );
}
},
function(key,values) {
return 1; // really just want to keep the keys
},
{
"scope": { "stash": {}, "maxLen": 10 },
"finalize": function(key,value) {
return { "msgs": stash[key] };
},
"out": { "inline": 1 }
}
)
So that just basically builds up the "in-memory" object matching the emitted "keys" with an array never exceeding the maximum size you want to fetch from your results. Additionally this does not even bother to "emit" the item when the maximum stack is met.
The reduce part actually does nothing other than essentially just reduce to "key" and a single value. So just in case our reducer did not get called, as would be true if only 1 value existed for a key, the finalize function takes care of mapping the "stash" keys to the final output.
The effectiveness of this varies on the size of the output, and JavaScript evaluation is certainly not fast, but possibly faster than processing large arrays in a pipeline.
Vote up the JIRA issues to actually have a "slice" operator or even a "limit" on "$push" and "$addToSet", which would both be handy. Personally hoping that at least some modification can be made to the $map operator to expose the "current index" value when processing. That would effectively allow "slicing" and other operations.
Really you would want to code this up to "generate" all of the required iterations. If the answer here gets enough love and/or other time pending that I have in tuits, then I might add some code to demonstrate how to do this. It is already a reasonably long response.
Code to generate pipeline:
var key = "$conversation_ID";
var val = "$_id";
var maxLen = 10;
var stack = [];
var pipe = [];
var fproj = { "$project": { "pos": { "$const": [] } } };
for ( var x = 1; x <= maxLen; x++ ) {
fproj["$project"][""+x] = 1;
fproj["$project"]["pos"]["$const"].push( x );
var rec = {
"$cond": [ { "$eq": [ "$pos", x ] }, "$"+x ]
};
if ( stack.length == 0 ) {
rec["$cond"].push( false );
} else {
lval = stack.pop();
rec["$cond"].push( lval );
}
stack.push( rec );
if ( x == 1) {
pipe.push({ "$group": {
"_id": key,
"1": { "$first": val },
"msgs": { "$push": val }
}});
} else {
pipe.push({ "$unwind": "$msgs" });
var proj = {
"$project": {
"msgs": 1
}
};
proj["$project"]["seen"] = { "$eq": [ "$"+(x-1), "$msgs" ] };
var grp = {
"$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$msgs", false ]
}
}
}
};
for ( n=x; n >= 1; n-- ) {
if ( n != x )
proj["$project"][""+n] = 1;
grp["$group"][""+n] = ( n == x ) ? { "$first": "$msgs" } : { "$first": "$"+n };
}
pipe.push( proj );
pipe.push({ "$sort": { "seen": 1 } });
pipe.push(grp);
}
}
pipe.push(fproj);
pipe.push({ "$unwind": "$pos" });
pipe.push({
"$group": {
"_id": "$_id",
"msgs": { "$push": stack[0] }
}
});
pipe.push({ "$unwind": "$msgs" });
pipe.push({ "$match": { "msgs": { "$ne": false } }});
pipe.push({
"$group": {
"_id": "$_id",
"msgs": { "$push": "$msgs" }
}
});
That builds the basic iterative approach up to maxLen with the steps from $unwind to $group. Also embedded in there are details of the final projections required and the "nested" conditional statement. The last is basically the approach taken on this question:
Does MongoDB's $in clause guarantee order?
Starting Mongo 4.4, the $group stage has a new aggregation operator $accumulator allowing custom accumulations of documents as they get grouped, via javascript user defined functions.
Thus, in order to only select n messages (for instance 2) for each conversation:
// { "conversationId" : 3, "messageId" : 14 }
// { "conversationId" : 5, "messageId" : 34 }
// { "conversationId" : 3, "messageId" : 39 }
// { "conversationId" : 3, "messageId" : 47 }
db.collection.aggregate([
{ $group: {
_id: "$conversationId",
messages: {
$accumulator: {
accumulateArgs: ["$messageId"],
init: function() { return [] },
accumulate:
function(messages, message) { return messages.concat(message).slice(0, 2); },
merge:
function(messages1, messages2) { return messages1.concat(messages2).slice(0, 2); },
lang: "js"
}
}
}}
])
// { "_id" : 5, "messages" : [ 34 ] }
// { "_id" : 3, "messages" : [ 14, 39 ] }
The accumulator:
accumulates on the field messageId (accumulateArgs)
is initialised to an empty array (init)
accumulates messageId items in an array and only keeps a maximum of 2 (accumulate and merge)
Starting in Mongo 5.2, it's a perfect use case for the new $topN aggregation accumulator:
// { "conversationId" : 3, "messageId" : 14 }
// { "conversationId" : 5, "messageId" : 34 }
// { "conversationId" : 3, "messageId" : 39 }
// { "conversationId" : 3, "messageId" : 47 }
db.collection.aggregate([
{ $group: {
_id: "$conversationId",
messages: { $topN: { n: 2, output: "$messageId", sortBy: { _id: 1 } } }
}}
])
// { "_id" : 5, "messages" : [ 34 ] }
// { "_id" : 3, "messages" : [ 14, 39 ] }
This applies a $topN group accumulation that:
takes for each group the top 2 (n: 2) elements
and for each grouped record extracts the field value (output: "$messageId")
the choice of the "top 2" is defined by sortBy: { _id: 1 } (that I chose to be _id since you didn't specify an order).
The $slice operator is not an aggregation operator so you can't do this (like I suggested in this answer, before the edit):
db.messages.aggregate([
{ $group : {_id:'$conversation_ID',msgs: { $push: { msgid:'$_id' }}}},
{ $project : { _id : 1, msgs : { $slice : 10 }}}]);
Neil's answer is very detailed, but you can use a slightly different approach (if it fits your use case). You can aggregate your results and output them to a new collection:
db.messages.aggregate([
{ $group : {_id:'$conversation_ID',msgs: { $push: { msgid:'$_id' }}}},
{ $out : "msgs_agg" }
]);
The $out operator will write the results of the aggregation to a new collection. You can then use a regular find query project your results with the $slice operator:
db.msgs_agg.find({}, { msgs : { $slice : 10 }});
For this test documents:
> db.messages.find().pretty();
{ "_id" : 1, "conversation_ID" : 123 }
{ "_id" : 2, "conversation_ID" : 123 }
{ "_id" : 3, "conversation_ID" : 123 }
{ "_id" : 4, "conversation_ID" : 123 }
{ "_id" : 5, "conversation_ID" : 123 }
{ "_id" : 7, "conversation_ID" : 1234 }
{ "_id" : 8, "conversation_ID" : 1234 }
{ "_id" : 9, "conversation_ID" : 1234 }
The result will be:
> db.msgs_agg.find({}, { msgs : { $slice : 10 }});
{ "_id" : 1234, "msgs" : [ { "msgid" : 7 }, { "msgid" : 8 }, { "msgid" : 9 } ] }
{ "_id" : 123, "msgs" : [ { "msgid" : 1 }, { "msgid" : 2 }, { "msgid" : 3 },
{ "msgid" : 4 }, { "msgid" : 5 } ] }
Edit
I assume this would mean duplicating the whole messages collection.
Isn't that overkill?
Well, obviously this approach won't scale with huge collections. But, since you're considering using large aggregation pipelines or large map-reduce jobs you probably won't use this for "real-time" requests.
There are many cons of this approach: 16 MB BSON limit if you're creating huge documents with aggregation, wasting disk space / memory with duplication, increased disk IO...
The pros of this approach: its simple to implement and thus easy to change. If your collection is rarely updated you can use this "out" collection like a cache. This way you wouldn't have to perform the aggregation operation multiple times and you could then even support "real-time" client requests on the "out" collection. To refresh your data, you can periodically do aggregation (e.g. in a background job that runs nightly).
Like it was said in the comments this isn't an easy problem and there isn't a perfect solution for this (yet!). I showed you another approach you can use, it's up to you to benchmark and decide what's most appropriate for your use case.
I hope this will work as you wanted:
db.messages.aggregate([
{ $group : {_id:'$conversation_ID',msgs: { $push: { msgid:'$_id' }}}},
{ $project : { _id : 1, msgs : { $slice : ["$msgid",0,10] }}}
]);

MongoDB aggregate count based on multiple query fields - (Multiple field count)

My collection will look this,
{
"_id" : ObjectId("55c8bd1d85b83e06dc54c0eb"),
"name" : "xxx",
"salary" : 10000,
"type" : "type1"
}
{
"_id" : ObjectId("55c8bd1d85b83e06dc54c0eb"),
"name" : "aaa",
"salary" : 10000,
"type" : "type2"
}
{
"_id" : ObjectId("55c8bd1d85b83e06dc54c0eb"),
"name" : "ccc",
"salary" : 10000,
"type" : "type2"
}
My query params will be coming as,
{salary=10000, type=type2}
so based on the query I need to fetch the count of above query params
The result should be something like this,
{ category: 'type1', count: 500 } { category: 'type2', count: 200 } { category: 'name', count: 100 }
Now I am getting count by hitting three different queries and constructing the result (or) server side iteration I can get the result.
Can anyone suggest or provide me good way to get above result
Your quesstion is not very clearly presented, but what it seems you wanted to do here was count the occurances of the data in the fields, optionally filtering those fields by the values that matches the criteria.
Here the $cond operator allows you to tranform a logical condition into a value:
db.collection.aggregate([
{ "$group": {
"_id": null,
"name": { "$sum": 1 },
"salary": {
"$sum": {
"$cond": [
{ "$gte": [ "$salary", 1000 ] },
1,
0
]
}
},
"type": {
"$sum": {
"$cond": [
{ "$eq": [ "$type", "type2" ] },
1,
0
]
}
}
}}
])
All values are in the same document, and it does not really make any sense to split them up here as this is additional work in the pipeline.
{ "_id" : null, "name" : 3, "salary" : 3, "type" : 2 }
Otherwise in the long form, which is not very performant due to needing to make a copy of each document for every key looks like this:
db.collection.aggregate([
{ "$project": {
"name": 1,
"salary": 1,
"type": 1,
"category": { "$literal": ["name","salary","type"] }
}},
{ "$unwind": "$category" },
{ "$group": {
"_id": "$category",
"count": {
"$sum": {
"$cond": [
{ "$and": [
{ "$eq": [ "$category", "name"] },
{ "$ifNull": [ "$name", false ] }
]},
1,
{ "$cond": [
{ "$and": [
{ "$eq": [ "$category", "salary" ] },
{ "$gte": [ "$salary", 1000 ] }
]},
1,
{ "$cond": [
{ "$and": [
{ "$eq": [ "$category", "type" ] },
{ "$eq": [ "$type", "type2" ] }
]},
1,
0
]}
]}
]
}
}
}}
])
And it's output:
{ "_id" : "type", "count" : 2 }
{ "_id" : "salary", "count" : 3 }
{ "_id" : "name", "count" : 3 }
If your documents do not have uniform key names or otherwise cannot specify each key in your pipeline condition, then apply with mapReduce instead:
db.collection.mapReduce(
function() {
var doc = this;
delete doc._id;
Object.keys(this).forEach(function(key) {
var value = (( key == "salary") && ( doc[key] < 1000 ))
? 0
: (( key == "type" ) && ( doc[key] != "type2" ))
? 0
: 1;
emit(key,value);
});
},
function(key,values) {
return Array.sum(values);
},
{
"out": { "inline": 1 }
}
);
And it's output:
"results" : [
{
"_id" : "name",
"value" : 3
},
{
"_id" : "salary",
"value" : 3
},
{
"_id" : "type",
"value" : 2
}
]
Which is basically the same thing with a conditional count, except that you only specify the "reverse" of the conditions you want and only for the fields you want to filter conditions on. And of course this output format is simple to emit as separate documents.
The same approach applies where to test the condition is met on the fields you want conditions for and return 1 where the condition is met or 0 where it is not for the summing the count.
You can use aggregation as following query:
db.collection.aggregate({
$match: {
salary: 10000,
//add any other condition here
}
}, {
$group: {
_id: "$type",
"count": {
$sum: 1
}
}
}, {
$project: {
"category": "$_id",
"count": 1,
_id: 0
}
}