I need to sum the values for 2018-06-01 through 2018-06-30 for each document in the collection. Each key in "days" is a different date and value. What should the mongo aggregate command look like? Result should look something like {
_id: Product_123 ,
June_Sum:
value}
That's really not a great structure for the sort of operation you now want to do. The whole point of keeping data in such a format is that you "increment" it as you go.
For example:
// Today's UTC date as "YYYY-MM-DD": truncate the epoch milliseconds to a
// whole day and keep the date part of the ISO string.
var now = Date.now();
var today = new Date(now - (now % (1000 * 60 * 60 * 24))).toISOString().slice(0, 10);
var product = "Product_123";

// One counter document per product per month ("YYYY-MM").
var counterFilter = {
  "month": today.slice(0, 7),
  "product": product
};

// Bump both the per-day key and the running total in the same write.
var counterIncrements = {};
counterIncrements["dates." + today] = 1;
counterIncrements["totals"] = 1;

db.counters.updateOne(counterFilter, { "$inc": counterIncrements }, { "upsert": true })
In that way the subsequent updates with $inc apply to both the "key" used for the "date" and also increment the "totals" property of the matched document. So after a few iterations you would end up with something like:
{
"_id" : ObjectId("5af395c53945a933add62173"),
"product": "Product_123",
"month": "2018-05",
"dates" : {
"2018-05-10" : 2,
"2018-05-09" : 1
},
"totals" : 3
}
If you're not actually doing that then you "should" be since it's the intended usage pattern for such a structure.
Without keeping a "totals" or similar entry within the document(s) storing these keys, the only methods left for "aggregation" in processing are to effectively coerce the "keys" into an "array" form.
MongoDB 3.6 with $objectToArray
// Sum the values stored under date-named keys of "days" that fall within
// June 2018, grouped per product. "$objectToArray" turns each key/value pair
// into { k, v } so the keys can be compared as strings.
db.collection.aggregate([
  // Only consider documents with entries within the range
  { "$match": {
    "$expr": {
      "$anyElementTrue": {
        "$map": {
          "input": { "$objectToArray": "$days" },
          "in": {
            "$and": [
              { "$gte": [ "$$this.k", "2018-06-01" ] },
              { "$lt": [ "$$this.k", "2018-07-01" ] }
            ]
          }
        }
      }
    }
  }},
  // Aggregate for the month
  { "$group": {
    "_id": "$product",    // <-- or whatever your key for the value is
    "total": {
      // Outer $sum accumulates across documents, inner $sum adds up the
      // array produced for a single document
      "$sum": {
        "$sum": {
          "$map": {
            "input": { "$objectToArray": "$days" },
            "in": {
              // Keys outside the range contribute 0
              "$cond": {
                "if": {
                  "$and": [
                    { "$gte": [ "$$this.k", "2018-06-01" ] },
                    { "$lt": [ "$$this.k", "2018-07-01" ] }
                  ]
                },
                "then": "$$this.v",
                "else": 0
              }
            }
          }
        }
      }
    }
  }}
])
Other versions with mapReduce
db.collection.mapReduce(
  // Mapper: emit one value per in-range day, keyed by product
  // (taking the same presumption on your un-named key for "product")
  function() {
    var doc = this;
    Object.keys(doc.days).forEach(function(day) {
      if (day >= "2018-06-01" && day < "2018-07-01") {
        emit(doc.product, doc.days[day]);
      }
    });
  },
  // Reducer: add up everything emitted for a product
  function(key, values) {
    return Array.sum(values);
  },
  {
    "out": { "inline": 1 },
    // Pre-select only documents having at least one key inside the range
    "query": {
      "$where": function() {
        return Object.keys(this.days).some(function(day) {
          return day >= "2018-06-01" && day < "2018-07-01";
        });
      }
    }
  }
)
Both are pretty horrible since you need to calculate whether the "keys" fall within the required range even to select the documents and even then still filter through the keys in those documents again in order to decide whether to accumulate for it or not.
Also note here that if your "Product_123" is also the "name of a key" in the document and NOT a "value", then you're performing even more "gymnastics" to simply convert that "key" into a "value" form, which is how databases do things and the whole point of the unnecessary coercion going on here.
Better Option
So as opposed to the handling as originally shown where you "should" be accumulating "as you go" with every write to the document(s) at hand, the better option than needing "processing" in order to coerce into an array format is to simply put the data into an array in the first place:
{
"_id" : ObjectId("5af395c53945a933add62173"),
"product": "Product_123",
"month": "2018-05",
"dates" : [
{ "day": "2018-05-09", "value": 1 },
{ "day": "2018-05-10", "value": 2 }
],
"totals" : 3
}
These are infinitely better for purposes of query and further analysis:
// Total the "value" of every "dates" entry whose "day" falls in May 2018.
db.counters.aggregate([
  { "$match": {
    // "month": "2018-05"    // <-- or really just that, since it's there
    // $elemMatch wraps the per-element condition so that both bounds must
    // hold for the same array element
    "dates": {
      "$elemMatch": {
        "day": { "$gte": "2018-05-01", "$lt": "2018-06-01" }
      }
    }
  }},
  { "$group": {
    "_id": null,
    "total": {
      // Outer $sum accumulates across documents, inner $sum adds up the
      // numbers extracted from a single document's array
      "$sum": {
        "$sum": {
          // $sum ignores non-numeric operands, so the filtered
          // sub-documents must be mapped to their numeric "value"
          "$map": {
            "input": {
              "$filter": {
                "input": "$dates",
                "cond": {
                  "$and": [
                    { "$gte": [ "$$this.day", "2018-05-01" ] },
                    { "$lt": [ "$$this.day", "2018-06-01" ] }
                  ]
                }
              }
            },
            "in": "$$this.value"
          }
        }
      }
    }
  }}
])
Which is of course really efficient, and kind of deliberately avoiding the "total" field that is already there for demonstration only. But of course you keep the "running accumulation" on writes by doing:
// Increment today's array element and the running total in one write.
// "dates.day" in the filter is what lets the positional "$" operator find
// the element to increment; "totals" matches the field name stored in the
// documents.
db.counters.updateOne(
  { "product": product, "month": today.substr(0,7), "dates.day": today },
  { "$inc": { "dates.$.value": 1, "totals": 1 } }
)
Which is really simple. Adding upserts adds a "little" more complexity:
// A "batch" of operations with bulkWrite. Operations run in the listed order
// (bulkWrite is ordered by default), and the filters are written so that
// only one of the three changes the stored counts for a given call.
db.counters.bulkWrite([
  // Incrementing the matched element
  { "updateOne": {
    "filter": {
      "product": product,
      "month": today.substr(0,7),
      "dates.day": today
    },
    "update": {
      "$inc": { "dates.$.value": 1, "totals": 1 }
    }
  }},
  // Pushing a new "un-matched" element
  { "updateOne": {
    "filter": {
      "product": product,
      "month": today.substr(0,7),
      "dates.day": { "$ne": today }
    },
    "update": {
      "$push": { "dates": { "day": today, "value": 1 } },
      "$inc": { "totals": 1 }
    }
  }},
  // "Upserting" a new document where none matched
  { "updateOne": {
    "filter": {
      "product": product,
      "month": today.substr(0,7)
    },
    "update": {
      // Only applied when the upsert actually inserts a document
      "$setOnInsert": {
        "dates": [{ "day": today, "value": 1 }],
        "totals": 1
      }
    },
    "upsert": true
  }}
])
But generally you're getting the "best of both worlds" by having something simple to accumulate "as you go" as well as something that's easy and efficient to query and do other analysis on later.
The overall moral of the story is to "choose the right structure" for what you actually want to do. Don't put things into "keys" which are clearly intended to be used as "values", since it's an anti-pattern which just adds complexity and inefficiency to the rest of your purposes, even if it seemed right for a "single" purpose when you originally stored it that way.
NOTE Also not really advocating storing "strings" for "dates" in any way here. As noted the better approach is to use "values" where you really mean "values" you intend to use. When storing date data as a "value" it is always far more efficient and practical to store as a BSON Date, and NOT a "string".
Related
I have a collection of documents that look like this
{
_id: 1,
weight: 2,
height: 3,
fruit: "Orange",
bald: "Yes"
},
{
_id: 2,
weight: 4,
height: 5,
fruit: "Apple",
bald: "No"
}
I need to get a result that aggregates the entire collection into this.
{
avgWeight: 3,
avgHeight: 4,
orangeCount: 1,
appleCount: 1,
baldCount: 1
}
I think I could map/reduce this, or I could query the averages and counts separately. The only values fruit could ever have are Apple and Orange. What other ways would you go about doing this? I've been away from MongoDB for a while now and maybe there are new amazing ways to do this I'm not aware of?
Aggregation Framework
The aggregation framework will do far better for you than what mapReduce can do, and the basic method is compatible with every release back to 2.2 when the aggregation framework was released.
If you have MongoDB 3.6 you can do
// Collection-wide averages plus a "<fruit>Count" key per distinct fruit.
// NOTE(review): the averages divide by the document count, so this assumes
// every document carries numeric "weight" and "height" - confirm.
db.fruit.aggregate([
  // Per fruit: keep sums and a count rather than averages. Averaging the
  // per-fruit averages later would be wrong whenever the fruits do not all
  // have the same number of documents.
  { "$group": {
    "_id": "$fruit",
    "totalWeight": { "$sum": "$weight" },
    "totalHeight": { "$sum": "$height" },
    "baldCount": {
      "$sum": { "$cond": [{ "$eq": ["$bald", "Yes"] }, 1, 0] }
    },
    "count": { "$sum": 1 }
  }},
  // Roll everything up into a single document
  { "$group": {
    "_id": null,
    "data": {
      "$push": {
        "k": {
          "$concat": [
            { "$toLower": "$_id" },
            "Count"
          ]
        },
        "v": "$count"
      }
    },
    "totalWeight": { "$sum": "$totalWeight" },
    "totalHeight": { "$sum": "$totalHeight" },
    "totalCount": { "$sum": "$count" },
    "baldCount": { "$sum": "$baldCount" }
  }},
  // Turn the k/v array into named keys and compute the overall averages
  { "$replaceRoot": {
    "newRoot": {
      "$mergeObjects": [
        { "$arrayToObject": "$data" },
        {
          "avgWeight": { "$divide": ["$totalWeight", "$totalCount"] },
          "avgHeight": { "$divide": ["$totalHeight", "$totalCount"] },
          "baldCount": "$baldCount"
        }
      ]
    }
  }}
])
As a slight alternate, you can apply the $mergeObjects in the $group here instead:
// Same result as the $push form, but the "<fruit>Count" keys are merged
// into an object while grouping instead of in the final stage.
// NOTE(review): the averages divide by the document count, so this assumes
// every document carries numeric "weight" and "height" - confirm.
db.fruit.aggregate([
  // Per fruit: keep sums and a count rather than averages, since an
  // average of per-fruit averages is wrong for unevenly sized groups.
  { "$group": {
    "_id": "$fruit",
    "totalWeight": { "$sum": "$weight" },
    "totalHeight": { "$sum": "$height" },
    "baldCount": {
      "$sum": { "$cond": [{ "$eq": ["$bald", "Yes"] }, 1, 0] }
    },
    "count": { "$sum": 1 }
  }},
  { "$group": {
    "_id": null,
    "data": {
      // Each input document contributes a one-key object; $mergeObjects
      // as an accumulator combines them all
      "$mergeObjects": {
        "$arrayToObject": [[{
          "k": {
            "$concat": [
              { "$toLower": "$_id" },
              "Count"
            ]
          },
          "v": "$count"
        }]]
      }
    },
    "totalWeight": { "$sum": "$totalWeight" },
    "totalHeight": { "$sum": "$totalHeight" },
    "totalCount": { "$sum": "$count" },
    "baldCount": { "$sum": "$baldCount" }
  }},
  { "$replaceRoot": {
    "newRoot": {
      "$mergeObjects": [
        "$data",
        {
          "avgWeight": { "$divide": ["$totalWeight", "$totalCount"] },
          "avgHeight": { "$divide": ["$totalHeight", "$totalCount"] },
          "baldCount": "$baldCount"
        }
      ]
    }
  }}
])
But there are reasons why I personally don't think that is the better approach, and that mostly leads to the next concept.
So even if you don't have a "latest" MongoDB release, you can simply reshape the output since that is all the last pipeline stage actually using the MongoDB 3.6 features is doing:
// Variant for servers without the 3.6 operators: the pipeline returns the
// k/v array and the reshaping into named keys happens in the client.
// NOTE(review): the averages divide by the document count, so this assumes
// every document carries numeric "weight" and "height" - confirm.
db.fruit.aggregate([
  // Per fruit: keep sums and a count rather than averages, since an
  // average of per-fruit averages is wrong for unevenly sized groups.
  { "$group": {
    "_id": "$fruit",
    "totalWeight": { "$sum": "$weight" },
    "totalHeight": { "$sum": "$height" },
    "baldCount": {
      "$sum": { "$cond": [{ "$eq": ["$bald", "Yes"] }, 1, 0] }
    },
    "count": { "$sum": 1 }
  }},
  { "$group": {
    "_id": null,
    "data": {
      "$push": {
        "k": {
          "$concat": [
            { "$toLower": "$_id" },
            "Count"
          ]
        },
        "v": "$count"
      }
    },
    "totalWeight": { "$sum": "$totalWeight" },
    "totalHeight": { "$sum": "$totalHeight" },
    "totalCount": { "$sum": "$count" },
    "baldCount": { "$sum": "$baldCount" }
  }},
  // Overall averages from the rolled-up totals
  { "$project": {
    "data": 1,
    "avgWeight": { "$divide": ["$totalWeight", "$totalCount"] },
    "avgHeight": { "$divide": ["$totalHeight", "$totalCount"] },
    "baldCount": 1
  }},
  /*
  { "$replaceRoot": {
    "newRoot": {
      "$mergeObjects": [
        { "$arrayToObject": "$data" },
        {
          "avgWeight": "$avgWeight",
          "avgHeight": "$avgHeight",
          "baldCount": "$baldCount"
        }
      ]
    }
  }}
  */
]).map( d =>
  // Client-side equivalent of the commented-out stage: fold the k/v array
  // into an object and merge in the remaining fields
  Object.assign(
    d.data.reduce((acc,curr) => Object.assign(acc,{ [curr.k]: curr.v }), {}),
    { avgWeight: d.avgWeight, avgHeight: d.avgHeight, baldCount: d.baldCount }
  )
)
And of course you can even just "hardcode" the keys:
// Single-pass variant with one hardcoded conditional counter per known
// fruit value.
db.fruit.aggregate([
  { $group: {
    _id: null,
    // Each counter adds 1 for a matching document and 0 otherwise
    appleCount: { $sum: { $cond: [ { $eq: [ "$fruit", "Apple" ] }, 1, 0 ] } },
    orangeCount: { $sum: { $cond: [ { $eq: [ "$fruit", "Orange" ] }, 1, 0 ] } },
    avgWeight: { $avg: "$weight" },
    avgHeight: { $avg: "$height" },
    baldCount: { $sum: { $cond: [ { $eq: [ "$bald", "Yes" ] }, 1, 0 ] } }
  } }
])
But it would not be recommended as your data might just change some day, and if there is a value to "group on" then it's better to actually use it than coercing with conditions.
In any form you return the same result:
{
"appleCount" : 1,
"orangeCount" : 1,
"avgWeight" : 3,
"avgHeight" : 4,
"baldCount" : 1
}
We do this with "two" $group stages, being once for accumulating "per fruit" and then secondly to compact all fruit to an array using $push under "k" and "v" values to keep their "key" and their "count". We do a little transformation on the "key" here using $toLower and $concat to join the strings. This is optional at this stage but easier in general.
The "alternate" for 3.6 is simply applying $mergeObjects within this earlier stage instead of $push since we already accumulated these keys. It's just really moving the $arrayToObject to a different stage in the pipeline. It's not really necessary and does not really have any specific advantage. If anything it just removes the flexible option as demonstrated with the "client transform" discussed later.
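The "average" figures are accumulated per fruit in the first stage and the "bald" is counted using $cond to test the strings and feed a number to $sum. As the array is "rolled up" we can do all those accumulations again to total for everything. Note that an "average of averages" only equals the overall average when every fruit has the same number of documents, so where that is not guaranteed carry sums and counts through the stages and divide at the end.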
As mentioned, the only part that actually relies on "new features" is all within the $replaceRoot stage which re-writes the "root" document. That's why this is optional as you can simply do these transformations after the same "already aggregated" data is returned from the database.
All we really do here is take that array with the "k" and "v" entries and turn it into an "object" with named keys via $arrayToObject and apply $mergeObjects on that result with the other keys we already produced at the "root". This transforms that array to be part of the main document returned in result.
The exact same transformation is applied using the JavaScript Array.reduce() and Object.assign() methods in the mongo shell compatible code. It's a very simple thing to apply and the Cursor.map() is generally a feature of most language implementations, so you can do these transforms before you start using the cursor results.
With ES6 compatible JavaScript environments ( not the shell ), we can shorten that syntax a little more:
.map(({ data, ...d }) => ({ ...data.reduce((o,[k,v]) => ({ ...o, [k]: v }), {}), ...d }) )
So it truly is a "one line" function, and that's a general reason why transformations like these are often better in the client code than the server anyway.
As a note on the usage of $cond, it is noted that using it for "hardcoded" evaluation is not really a good idea for several reasons. So it really does not make much sense to "force" that evaluation. Even with the data you present the "bald" would be better expressed as a Boolean value than a "string". If you change "Yes/No" to be true/false then even that "one" valid usage becomes:
"baldCount": { "$sum": { "$cond": ["$bald", 1, 0 ] } }
Which removes the need to "test" a condition on a string match since it's already true/false. MongoDB 4.0 adds another enhancement using $toInt to "coerce" the Boolean to an integer:
"baldCount": { "$sum": { "$toInt": "$bald" } }
That removes the need for $cond altogether, as would simply recording 1 or 0 but that change might cause a loss of clarity in the data, so it is still probably reasonable to have that sort of "coercion" there, but not really optimal anywhere else.
Even with the "dynamic" form using "two" $group stages for accumulation, the main work is still done in the first stage. It simply leaves the remaining accumulation on n result documents for the number of possible unique values of the grouping key. In this case "two", so even though it's an additional instruction there is no real overhead for the gain of having flexible code.
MapReduce
If you really have your heart set on at least "trying" a mapReduce, then it's really a single pass with a finalize function just to make the averages:
db.fruit.mapReduce(
  // Mapper: everything goes to the single key null. The emitted value has
  // the same shape the reducer returns.
  function() {
    var fruitTally = {};
    fruitTally[this.fruit.toLowerCase() + "Count"] = 1;
    emit(null, {
      "key": fruitTally,
      "totalWeight": this.weight,
      "totalHeight": this.height,
      "totalCount": 1,
      "baldCount": (this.bald === "Yes") ? 1 : 0
    });
  },
  // Reducer: add up the per-fruit tallies and every numeric total
  function(key, values) {
    var merged = {
      key: { },
      totalWeight: 0,
      totalHeight: 0,
      totalCount: 0,
      baldCount: 0
    };
    values.forEach(function(emitted) {
      // Per-fruit counters live under "key" and start from zero
      for ( var label in emitted.key ) {
        if ( !merged.key.hasOwnProperty(label) )
          merged.key[label] = 0;
        merged.key[label] += emitted.key[label];
      }
      // Every other field is a plain running total
      Object.keys(emitted).forEach(function(field) {
        if ( field != 'key' )
          merged[field] += emitted[field];
      });
    });
    return merged;
  },
  {
    "out": { "inline": 1 },
    // Finalize: flatten the per-fruit counters into the result and turn
    // the totals into averages
    "finalize": function(key, value) {
      var result = value.key;
      result.avgWeight = value.totalWeight / value.totalCount;
      result.avgHeight = value.totalHeight / value.totalCount;
      result.baldCount = value.baldCount;
      return result;
    }
  }
)
Since we already ran through the process for the aggregate() method then the general points should be pretty familiar since we are basically doing much the same thing here.
The main differences are for an "average" you actually need the full totals and counts and of course you get a bit more control over accumulating via an "Object" with JavaScript code.
The results are basically the same, just with the standard mapReduce "bent" on how it presents them:
{
"_id" : null,
"value" : {
"orangeCount" : 1,
"appleCount" : 1,
"avgWeight" : 3,
"avgHeight" : 4,
"baldCount" : 1
}
}
Summary
The general catch being of course that MapReduce using interpreted JavaScript in order to execute has a much higher cost and slower execution than the native coded operations of the aggregation framework. There once may have been an option to use MapReduce for this kind of output on "larger" result sets, but since MongoDB 2.6 introduced "cursor" output for the aggregation framework then the scales have been firmly tipped in favor of the newer option.
Fact is that most "legacy" reasons for employing MapReduce are basically superseded by its younger sibling as the aggregation framework gains new operations which remove the need for the JavaScript execution environment. It would be a fair comment to say that support for JavaScript execution is generally "dwindling", and legacy options which used this from the beginning are being gradually removed from the product.
// Collection-wide averages and counts in two stages: flag each document
// with 0/1 per category, then sum the flags.
db.demo.aggregate(
  // Pipeline
  [
    // Stage 1: a plain 1 is used for the "then" branch; wrapping it in
    // $sum adds nothing and is not accepted in $project on older servers
    {
      $project: {
        weight: 1,
        height: 1,
        Orange: {
          $cond: {
            if: { $eq: ["$fruit", 'Orange'] },
            then: 1,
            else: 0
          }
        },
        Apple: {
          $cond: {
            if: { $eq: ["$fruit", 'Apple'] },
            then: 1,
            else: 0
          }
        },
        bald: {
          $cond: {
            if: { $eq: ["$bald", 'Yes'] },
            then: 1,
            else: 0
          }
        }
      }
    },
    // Stage 2: one result document for the whole collection
    {
      $group: {
        _id: null,
        avgWeight: { $avg: '$weight' },
        avgHeight: { $avg: '$height' },
        orangeCount: { $sum: '$Orange' },
        appleCount: { $sum: '$Apple' },
        baldCount: { $sum: '$bald' }
      }
    }
  ]
);
I have an aggregation that looks like this:
/**
 * Count, per user and per calendar day, the tickets referenced by the
 * user's `${type}Tickets` array that were created from four days ago up to
 * the end of today.
 *
 * @param {object} req - request; `req.user.organization._id` selects the users.
 * @param {string} type - prefix of the ticket array to join, presumably
 *   'assigned' or 'resolved' (the schema's two ticket arrays) - confirm.
 * @returns the result of `this.aggregate(...)`; each output document is
 *   `{ _id: <assignee id>, data: [{ groupDate, ticketCount }] }`.
 */
userSchema.statics.getCounts = function (req, type) {
  const organizationId = req.user.organization._id;
  const joinedField = `${type}_tickets`;
  const createdAt = `$${joinedField}.createdAt`;

  const windowStart = new Date(moment().subtract(4, 'd').startOf('day').utc());
  const windowEnd = new Date(moment().endOf('day').utc());

  // Truncates the creation timestamp to its calendar day
  const groupDate = {
    $dateFromParts: {
      year: { $year: createdAt },
      month: { $month: createdAt },
      day: { $dayOfMonth: createdAt },
    },
  };

  const pipeline = [
    { $match: { organization: organizationId } },
    {
      $lookup: {
        from: 'tickets',
        localField: `${type}Tickets`,
        foreignField: '_id',
        as: joinedField,
      },
    },
    { $unwind: `$${joinedField}` },
    { $match: { [`${joinedField}.createdAt`]: { $gte: windowStart, $lt: windowEnd } } },
    {
      $group: {
        _id: { groupDate, userId: `$${joinedField}.assignee_id` },
        ticketCount: { $sum: 1 },
      },
    },
    { $sort: { '_id.groupDate': -1 } },
    // Collapse to one document per user holding the per-day counts
    {
      $group: {
        _id: '$_id.userId',
        data: { $push: { groupDate: '$_id.groupDate', ticketCount: '$ticketCount' } },
      },
    },
  ];

  return this.aggregate(pipeline);
};
Which outputs data like this:
[
{
_id: 5aeb6b71709f43359e0888bb,
data: [
{ "groupDate": "2018-05-07T00:00:00.000Z", ticketCount: 4 }
]
}
]
Ideally though, I would have data like this:
[
{
_id: 5aeb6b71709f43359e0888bb,
data: [
{ "groupDate": "2018-05-07T00:00:00.000Z", assignedCount: 4, resolvedCount: 8 }
]
}
]
The difference being that the object for the user would output both the total number of assigned tickets and the total number of resolved tickets for each date.
My userSchema is like this:
// Builds a fresh definition for one indexed reference to a Ticket document.
const ticketReference = () => ({
  type: mongoose.Schema.ObjectId,
  ref: 'Ticket',
  index: true,
});

// User document: name fields plus two arrays of Ticket references.
const userSchema = new Schema(
  {
    firstName: String,
    lastName: String,
    assignedTickets: [ticketReference()],
    resolvedTickets: [ticketReference()],
  },
  { timestamps: true }
);
An example user doc is like this:
{
"_id": "5aeb6b71709f43359e0888bb",
"assignedTickets": ["5aeb6ba7709f43359e0888bd", "5aeb6bf3709f43359e0888c2", "5aec7e0adcdd76b57af9e889"],
"resolvedTickets": ["5aeb6bc2709f43359e0888be", "5aeb6bc2709f43359e0888bf"],
"firstName": "Name",
"lastName": "Surname",
}
An example ticket doc is like this:
{
"_id": "5aeb6ba7709f43359e0888bd",
"ticket_id": 120292,
"type": "assigned",
"status": "Pending",
"assignee_email": "email#gmail.com",
"assignee_id": "5aeb6b71709f43359e0888bb",
"createdAt": "2018-05-02T20:05:59.147Z",
"updatedAt": "2018-05-03T20:05:59.147Z",
}
I've tried adding multiple lookups and group stages, but I keep getting an empty array. If I only do one lookup and one group, I get the correct counts for the searched on field, but I'd like to have both fields in one query. Is it possible to have the query group on two lookups?
In short you seem to be coming to terms with setting up your models in mongoose and have gone overboard with references. In reality you really should not keep the arrays within the "User" documents. This is actually an "anti-pattern" which was just something mongoose used initially as a convention for keeping "references" for population where it did not understand how to translate the references from being kept in the "child" to the "parent" instead.
You actually have that data in each "Ticket" and the natural form of $lookup is to use that "foreignField" in reference to the detail from the local collection. In this case the "assignee_id" on the tickets will suffice for looking at matching back to the "_id" of the "User". Though you don't state it, your "status" should be an indicator of whether the data is actually either "assigned" as when in "Pending" state or "resolved" when it is not.
For the sake of simplicity we are going to consider the state "resolved" if it is anything other than "Pending" in value, but extending on the logic from the example for actual needs is not the problem here.
Basically then we resolve to a single $lookup operation by actually using the natural "foreign key" as opposed to keeping separate arrays.
MongoDB 3.6 and greater
Ideally you would use features from MongoDB 3.6 with sub-pipeline processing here:
// Better date calculations: a four-day window aligned to UTC midnight.
const oneDay = 24 * 60 * 60 * 1000;
var now = Date.now();
// Exclusive upper bound: the first UTC midnight after "now"
var end = new Date((Math.floor(now / oneDay) + 1) * oneDay);
// Inclusive lower bound: four days before the upper bound
var start = new Date(end.getTime() - 4 * oneDay);
// Per user: counts of assigned ("Pending") and resolved (any other status)
// tickets for each day in [start, end), newest day first.
User.aggregate([
  { "$match": { "organization": req.user.organization._id } },
  { "$lookup": {
    "from": Ticket.collection.name,
    "let": { "id": "$_id" },
    "pipeline": [
      // Join on the ticket's own "assignee_id" and restrict the date range
      // before any grouping is done
      { "$match": {
        "createdAt": { "$gte": start, "$lt": end },
        "$expr": {
          "$eq": [ "$$id", "$assignee_id" ]
        }
      }},
      // Group on the output key rather than the raw status. Several
      // non-"Pending" statuses would otherwise each produce their own
      // "resolvedCount" entry, and $arrayToObject keeps only the last one.
      { "$group": {
        "_id": {
          "key": {
            "$cond": [
              { "$eq": ["$status", "Pending"] },
              "assignedCount",
              "resolvedCount"
            ]
          },
          "date": {
            "$dateFromParts": {
              "year": { "$year": "$createdAt" },
              "month": { "$month": "$createdAt" },
              "day": { "$dayOfMonth": "$createdAt" }
            }
          }
        },
        "count": { "$sum": 1 }
      }},
      { "$group": {
        "_id": "$_id.date",
        "data": {
          "$push": { "k": "$_id.key", "v": "$count" }
        }
      }},
      { "$sort": { "_id": -1 } },
      // Zero defaults first, so a day with only one kind of ticket still
      // reports both counters
      { "$replaceRoot": {
        "newRoot": {
          "$mergeObjects": [
            { "groupDate": "$_id", "assignedCount": 0, "resolvedCount": 0 },
            { "$arrayToObject": "$data" }
          ]
        }
      }}
    ],
    "as": "data"
  }},
  { "$project": { "data": 1 } }
])
From MongoDB 3.2 and upwards
Or where you lack those features we use a different pipeline process and a little data transformation after the results are returned from the server:
// Variant without sub-pipeline $lookup: join, unwind and filter, group in
// the main pipeline, then finish the reshaping in client code.
User.aggregate([
  { "$match": { "organization": req.user.organization._id } },
  { "$lookup": {
    "from": Ticket.collection.name,
    "localField": "_id",
    "foreignField": "assignee_id",
    "as": "data"
  }},
  { "$unwind": "$data" },
  { "$match": {
    "data.createdAt": { "$gte": start, "$lt": end }
  }},
  { "$group": {
    "_id": {
      "userId": "$_id",
      // Date math: round the timestamp down to the start of its day
      "date": {
        "$add": [
          { "$subtract": [
            { "$subtract": [ "$data.createdAt", new Date(0) ] },
            { "$mod": [
              { "$subtract": [ "$data.createdAt", new Date(0) ] },
              oneDay
            ]}
          ]},
          new Date(0)
        ]
      },
      // Group on the output key rather than the raw status. Several
      // non-"Pending" statuses would otherwise each produce their own
      // "resolvedCount" entry and overwrite one another in the client.
      "key": {
        "$cond": [
          { "$eq": [ "$data.status", "Pending" ] },
          "assignedCount",
          "resolvedCount"
        ]
      }
    },
    "count": { "$sum": 1 }
  }},
  { "$group": {
    "_id": {
      "userId": "$_id.userId",
      "date": "$_id.date"
    },
    "data": {
      "$push": { "k": "$_id.key", "v": "$count" }
    }
  }},
  { "$sort": { "_id.userId": 1, "_id.date": -1 } },
  { "$group": {
    "_id": "$_id.userId",
    "data": {
      "$push": {
        "groupDate": "$_id.date",
        "data": "$data"
      }
    }
  }}
])
.then( results =>
  results.map( ({ data, ...d }) =>
    ({
      ...d,
      // Fold each day's k/v pairs over zero defaults so both counters are
      // always present
      data: data.map(di =>
        ({
          groupDate: di.groupDate,
          assignedCount: 0,
          resolvedCount: 0,
          ...di.data.reduce((acc,curr) => ({ ...acc, [curr.k]: curr.v }),{})
        })
      )
    })
  )
)
Which just really goes to show that even with the fancy features in modern releases, you really don't need them because there pretty much has always been ways to work around this. Even the JavaScript parts just had slightly longer winded versions before the current "object spread" syntax was available.
So that is really the direction you need to go in. What you certainly don't want is using "multiple" $lookup stages or even applying $filter conditions on what could potentially be large arrays. Also both forms here do their best to "filter down" the number of items "joined" from the foreign collection so as not to cause a breach of the BSON limit.
Particularly the "pre 3.6" version actually has a trick where $lookup + $unwind + $match occur in succession which you can see in the explain output. All stages actually combine into "one" stage there which solely returns only the items which match the conditions in the $match from the foreign collection. Keeping things "unwound" until we reduce further avoids BSON limit problems, as does the new form with MongoDB 3.6 where the "sub-pipeline" does all the document reduction and grouping before any results are returned.
Your one document sample would return like this:
{
"_id" : ObjectId("5aeb6b71709f43359e0888bb"),
"data" : [
{
"groupDate" : ISODate("2018-05-02T00:00:00Z"),
"assignedCount" : 1,
"resolvedCount" : 0
}
]
}
Once I expand the date selection to include that date, which of course the date selection can also be improved and corrected from your original form.
So it seems to make sense that your relationships are actually defined that way but it's just that you recorded them "twice". You don't need to and even if that's not the definition then you should actually instead record on the "child" rather than an array in the parent. We can juggle and merge the parent arrays, but that's counterproductive to actually establishing the data relations correctly and using them correctly as well.
How about something like this?
// Per user and day: counts of assigned and resolved tickets, joining both
// reference arrays kept on the user document.
db.users.aggregate([
  {
    $lookup:{ // lookup assigned tickets
      from:'tickets',
      localField:'assignedTickets',
      foreignField:'_id',
      as:'assigned',
    }
  },
  {
    $lookup:{ // lookup resolved tickets
      from:'tickets',
      localField:'resolvedTickets',
      foreignField:'_id',
      as:'resolved',
    }
  },
  {
    $project:{
      "tickets":{ // merge all tickets into one single array
        $concatArrays:[
          "$assigned",
          "$resolved"
        ]
      }
    }
  },
  {
    $unwind:'$tickets' // flatten the 'tickets' array into separate documents
  },
  {
    $group:{ // group by 'createdAt' and 'assignee_id'
      _id:{
        groupDate:{
          $dateFromParts:{
            year:{ $year:'$tickets.createdAt' },
            month:{ $month:'$tickets.createdAt' },
            day:{ $dayOfMonth:'$tickets.createdAt' },
          },
        },
        userId:'$tickets.assignee_id',
      },
      assignedCount:{ // get the count of assigned tickets
        $sum:{
          $cond:[
            { $eq:[ '$tickets.type', 'assigned' ] }, // 'type' is 'assigned'
            1, // if matching count 1
            0 // else 0
          ]
        }
      },
      resolvedCount:{ // get the count of resolved tickets
        $sum:{
          $cond:[
            { $eq:[ '$tickets.type', 'resolved' ] }, // 'type' is 'resolved'
            1, // if matching count 1
            0 // else 0
          ]
        }
      },
    },
  },
  {
    $sort:{ // sort by 'groupDate' descending
      '_id.groupDate':-1
    },
  },
  {
    $group:{
      _id:'$_id.userId', // group again but only by userId
      data:{
        $push:{ // create an array
          groupDate:'$_id.groupDate',
          // the counts are already final per (user, day), so they are
          // copied as-is rather than wrapped in another $sum
          assignedCount:'$assignedCount',
          resolvedCount:'$resolvedCount'
        }
      }
    }
  }
])
Given the following document containing 3 nested documents...
{ "_id": ObjectId("56116d8e4a0000c9006b57ac"), "name": "Stock 1", "items": [
{ "price": 1.50, "description": "Item 1", "count": 10 },
{ "price": 1.70, "description": "Item 2", "count": 13 },
{ "price": 1.10, "description": "Item 3", "count": 20 }
]
}
... I need to select the sub-document with the lowest price closer to a given amount (here below I assume 1.05):
// Pick the cheapest item priced at or above 1.05 across all stock documents.
db.stocks.aggregate([
  // One document per array element
  { "$unwind": "$items" },
  // Cheapest first
  { "$sort": { "items.price": 1 } },
  // Drop everything below the requested amount
  { "$match": { "items.price": { "$gte": 1.05 } } },
  // Keep only the first (cheapest remaining) item
  { "$group": {
    "_id": 0,
    "item": { "$first": "$items" }
  } },
  { "$project": {
    "_id": "$item._id",
    "price": "$item.price",
    "description": "$item.description"
  } }
]);
This works as expected and here is the result:
"result" : [
{
"price" : 1.10,
"description" : "Item 3",
"count" : 20
}
],
"ok" : 1
Alongside returning the item with the lowest price closer to a given amount, I need to decrement count by 1. For instance, here below is the result I'm looking for:
"result" : [
{
"price" : 1.10,
"description" : "Item 3",
"count" : 19
}
],
"ok" : 1
It depends on whether you actually want to "update" the result or simply "return" the result with a decremented value. In the former case you will of course need to go back to the document and "decrement" the value for the returned result.
Also want to note that what you "think" is efficient here is actually not. Doing the "filter" of elements "post sort" or even "post unwind" really makes no difference at all to how the $first accumulator works in terms of performance.
The better approach is to basically "pre filter" the values from the array where possible. This reduces the document size in the aggregation pipeline, and the number of array elements to be processed by $unwind:
// Cheapest item priced at or above 1.05, filtering the array before it is
// unwound so fewer elements reach $unwind.
db.stocks.aggregate([
  // Only documents that have at least one qualifying item
  { "$match": {
    "items.price": { "$gte": 1.05 }
  }},
  // Replace non-qualifying items with false, then strip the false values.
  // Note that $setDifference treats the array as a set.
  { "$project": {
    "items": {
      "$setDifference": [
        { "$map": {
          "input": "$items",
          "as": "item",
          "in": {
            "$cond": [
              { "$gte": [ "$$item.price", 1.05 ] },
              "$$item",
              false
            ]
          }
        }},
        [false]
      ]
    }
  }},
  { "$unwind": "$items"},
  { "$sort": { "items.price":1 } },
  { "$group": {
    "_id": 0,
    "item": { "$first": "$items" }
  }},
  { "$project": {
    "_id": "$item._id",
    "price": "$item.price",
    "description": "$item.description"
  }}
]);
Of course that does require a MongoDB version 2.6 or greater server to have the available operators, and going by your output you may have an earlier version. If that is the case then at least lose the $match as it does not do anything of value and would be detrimental to performance.
Where a $match is useful, is in the document selection before you do anything, as what you always want to avoid is processing documents that do not even possibly meet the conditions you want from within the array or anywhere else. So you should always $match or use a similar query stage first.
At any rate, if all you wanted was a "projected result" then just use $subtract in the output:
// Final projection stage: returns the selected item's fields with "count"
// shown one lower. This only changes the output; nothing is written back.
{ "$project": {
"_id": "$item._id",
"price": "$item.price",
"description": "$item.description",
"count": { "$subtract": [ "$item.count", 1 ] }
}}
If you wanted however to "update" the result, then you would be iterating the array ( it's still an array even with one result ) to update the matched item and "decrement" the count via $inc:
// Find the cheapest item priced at or above 1.05, then decrement its stored
// count.
var result = db.stocks.aggregate([
  { "$match": {
    "items.price": { "$gte": 1.05 }
  }},
  // Replace non-qualifying items with false, then strip the false values
  { "$project": {
    "items": {
      "$setDifference": [
        { "$map": {
          "input": "$items",
          "as": "item",
          "in": {
            "$cond": [
              { "$gte": [ "$$item.price", 1.05 ] },
              "$$item",
              false
            ]
          }
        }},
        [false]
      ]
    }
  }},
  { "$unwind": "$items"},
  { "$sort": { "items.price":1 } },
  { "$group": {
    "_id": 0,
    "item": { "$first": "$items" }
  }},
  { "$project": {
    "_id": "$item._id",
    "price": "$item.price",
    "description": "$item.description"
  }}
]);
// The stored array field is "items", so both the match and the positional
// update must use that path.
// NOTE(review): this relies on each array element having its own "_id" -
// confirm against the real documents.
result.forEach(function(item) {
  db.stocks.update({ "items._id": item._id},{ "$inc": { "items.$.count": -1 }})
})
And on a MongoDB 2.4 shell, your same aggregate query applies ( but please make the changes ) however the result contains another field called result inside it with the array, so add the level:
// MongoDB 2.4 shell: the documents sit under the "result" property of the
// returned object. The stored array field is "items", so the match and the
// positional update both use that path.
result.result.forEach(function(item) {
  db.stocks.update({ "items._id": item._id},{ "$inc": { "items.$.count": -1 }})
})
So either just $project for display only, or use the returned result to effect an .update() on the data as required.
I’ve solved this problem but I'm looking for a better way to do it on the mongodb server rather than the client.
I have one collection of Orders with a placement datetime (iso date) and a product.
{ _id:1, datetime:“T1”, product:”Apple”}
{ _id:2, datetime:“T2”, product:”Orange”}
{ _id:3, datetime:“T3”, product:”Pear”}
{ _id:4, datetime:“T4”, product:”Pear”}
{ _id:5, datetime:“T5”, product:”Apple”}
Goal: For a given time (or set of times) show the last order for EACH product in the set of my products before that time. Products are finite and known.
eg. query for time T6 will return:
{ _id:2, datetime:“T2”, product:”Orange”}
{ _id:4, datetime:“T4”, product:”Pear”}
{ _id:5, datetime:“T5”, product:”Apple”}
T4 will return:
{ _id:1, datetime:“T1”, product:”Apple”}
{ _id:2, datetime:“T2”, product:”Orange”}
{ _id:4, datetime:“T4”, product:”Pear”}
i’ve implemented this by creating a composite index on orders [datetime:descending, product:ascending]
Then on the java client:
findLastOrdersForTimes(times) {
for (time: times) {
for (product: products) {
db.orders.findOne({ product: product, datetime: { $lt: time } })
}
}
}
Now that is pretty fast since it hits the index and only fetching the data i need. However I need to query for many time points (100000+) which will be a lot of calls over the network. Also my orders table will be very large. So how can I do this on the server in one hit, i.e return a collection of time->array products? If it was oracle, id create a stored proc with a cursor that loops back in time and collects the results for every time point and breaks when it gets to the last product after the last time point. I’ve looked at the aggregation framework and mapreduce but can’t see how to achieve this kind of loop. Any pointers?
If you truly want the last order for each product, then the aggregation framework comes in:
// Latest "datetime" per product, limited to the products of interest.
db.times.aggregate([
  { $match: { product: { $in: products } } },
  { $group: {
    _id: "$product",
    datetime: { $max: "$datetime" }
  } }
])
Example with an array of products:
// Product names to filter on; this array is what the "$in" condition of the $match stage receives.
var products = ['Apple', 'Orange', 'Pear'];
{ "_id" : "Pear", "datetime" : "T4" }
{ "_id" : "Orange", "datetime" : "T2" }
{ "_id" : "Apple", "datetime" : "T5" }
Or if the _id from the original document is important to you, use the $sort with $last instead:
// Last document (original _id and datetime) per product.
db.times.aggregate([
    // Restrict to the products of interest
    {
        "$match": {
            "product": { "$in": products }
        }
    },
    // Ascending time order, so that $last below picks the most recent entry
    { "$sort": { "datetime": 1 } },
    // Collapse to one result per product, keeping the final document's fields
    {
        "$group": {
            "_id": "$product",
            "id": { "$last": "$_id" },
            "datetime": { "$last": "$datetime" }
        }
    }
])
And that is what you most likely really want to do in either of those last cases. But the index you really want there is on "product":
// Ascending index on "product" to support the $match on that field.
// createIndex() is the current shell helper; ensureIndex() is its deprecated alias.
db.times.createIndex({ "product": 1 })
So even if you need to iterate that with an additional $match condition using $lt for a certain point in time, that is still the better approach. Otherwise, you can modify the "grouping" to include the "datetime" as well, while keeping a set of values in the $match.
It seems better at any rate, so perhaps this helps at least to modify your thinking.
If I'm reading your notes correctly, you seem to simply be looking to turn this on its head and find the last product for each point in time. So the statement is not much different:
// Last product recorded at each of the requested points in time.
db.times.aggregate([
    // Only the time points being asked about
    {
        "$match": {
            "datetime": { "$in": ["T4","T5"] }
        }
    },
    // Fix the document order so that $last below is deterministic
    { "$sort": { "product": 1, "datetime": 1 } },
    // One result per time point, keeping the final document's _id and product
    {
        "$group": {
            "_id": "$datetime",
            "id": { "$last": "$_id" },
            "product": { "$last": "$product" }
        }
    }
])
In theory, that is how it would look, based on how you present the question. I have the feeling, though, that you are abstracting this, and that "datetime" possibly holds actual timestamps as date object types.
So you might not be aware of the date aggregation operators you can apply, for example to get the boundary of each hour:
// Last document within each hour, bucketed with the date aggregation operators.
db.times.aggregate([
    // $last is only meaningful on ordered input, so put the documents
    // in ascending time order before grouping
    { "$sort": { "datetime": 1 } },
    { "$group": {
        // Year + day of year + hour together identify one hour bucket
        "_id": {
            "year": { "$year": "$datetime" },
            "dayOfYear": { "$dayOfYear": "$datetime" },
            "hour": { "$hour": "$datetime" }
        },
        // Fields of the latest document that falls inside the bucket
        "id": { "$last": "$_id" },
        "datetime": { "$last": "$datetime" },
        "product": { "$last": "$product" }
    }}
])
Or even using date math instead of the operators, if working with an epoch-based timestamp:
// Last document within each hour, bucketed with plain date math.
db.times.aggregate([
    // $last is only meaningful on ordered input, so put the documents
    // in ascending time order before grouping
    { "$sort": { "datetime": 1 } },
    { "$group": {
        // Subtracting the epoch date from "datetime" yields milliseconds since
        // the epoch; removing the remainder of a division by one hour
        // (1000*60*60 ms) rounds that value down to the start of the hour.
        "_id": {
            "$subtract": [
                { "$subtract": [ "$datetime", new Date("1970-01-01") ] },
                { "$mod": [
                    { "$subtract": [ "$datetime", new Date("1970-01-01") ] },
                    1000*60*60
                ]}
            ]
        },
        // Fields of the latest document that falls inside the bucket
        "id": { "$last": "$_id" },
        "datetime": { "$last": "$datetime" },
        "product": { "$last": "$product" }
    }}
])
Of course you can add a range query for dates in the $match with $gt and $lt operators to keep the data within the range you are particularly looking at.
Your overall solution is probably a combination of ideas, but as I said, your question seems to be about matching the last entries on certain time boundaries, so the last examples, possibly in combination with filtering certain products, are what you need rather than looping .findOne() requests.
I have a document called user.monthly, in which we store 'day' : no. of clicks.
Here I have given 2 samples for different date
For month January
{
name : "devid",
date : ISODate("2014-01-21T11:32:42.392Z"),
daily: {'1':12,'9':13,'30':13}
}
For month February
{
name : "devid",
date : ISODate("2014-02-21T11:32:42.392Z"),
daily: {'3':12,'12':13,'25':13}
}
How can I aggregate this and get total clicks for January and February ?
Please help me to resolve my problem.
Your current schema is not helping you here as the "daily" field ( which we presume is your clicks per type or something like that ) is represented as a sub-document, which means that you need to explicitly name the path to each field in order to do something with it.
A better approach would be to put this information in an array:
{
"name" : "devid",
"date" : ISODate("2014-02-21T11:32:42.392Z"),
"daily": [
{ "type": "3", "clicks": 12 },
{ "type": "12", "clicks": 13 },
{ "type": "25", "clicks": 13 }
]
}
Then you have an aggregation statement that goes like this:
// Click totals per year, month and "type", using the array form of "daily".
db.collection.aggregate([
    // Restrict to January and February 2014 (the upper bound is exclusive)
    {
        "$match": {
            "date": {
                "$gte": new Date("2014-01-01"),
                "$lt": new Date("2014-03-01")
            }
        }
    },

    // Emit one document per element of the "daily" array
    { "$unwind": "$daily" },

    // Add up the clicks for every year / month / type combination
    {
        "$group": {
            "_id": {
                "year": { "$year": "$date" },
                "month": { "$month": "$date" },
                "type": "$daily.type"
            },
            "clicks": { "$sum": "$daily.clicks" }
        }
    },

    // Order the output by year, then month, then type
    {
        "$sort": {
            "_id.year": 1,
            "_id.month": 1,
            "_id.type": 1
        }
    }
])
That form is pretty simple. Or even if you do not care about the type as a grouping and just want the month totals:
// Click totals per year and month only, using the array form of "daily".
db.collection.aggregate([
    // Restrict to January and February 2014 (the upper bound is exclusive)
    {
        "$match": {
            "date": {
                "$gte": new Date("2014-01-01"),
                "$lt": new Date("2014-03-01")
            }
        }
    },

    // Emit one document per element of the "daily" array
    { "$unwind": "$daily" },

    // Add up every "clicks" value that falls in the same year and month
    {
        "$group": {
            "_id": {
                "year": { "$year": "$date" },
                "month": { "$month": "$date" }
            },
            "clicks": { "$sum": "$daily.clicks" }
        }
    },

    // Order the output chronologically
    { "$sort": { "_id.year": 1, "_id.month": 1 } }
])
But with the sub-document form you currently have, this becomes ugly:
// Month totals when "daily" is a sub-document keyed by day.
db.collection.aggregate([
    // Restrict to January and February 2014 (the upper bound is exclusive)
    {
        "$match": {
            "date": {
                "$gte": new Date("2014-01-01"),
                "$lt": new Date("2014-03-01")
            }
        }
    },

    // Every key under "daily" has to be named explicitly; $ifNull substitutes 0
    // for a key that a given document does not have
    {
        "$group": {
            "_id": {
                "year": { "$year": "$date" },
                "month": { "$month": "$date" }
            },
            "clicks": {
                "$sum": {
                    "$add": [
                        { "$ifNull": ["$daily.1", 0] },
                        { "$ifNull": ["$daily.3", 0] },
                        { "$ifNull": ["$daily.9", 0] },
                        { "$ifNull": ["$daily.12", 0] },
                        { "$ifNull": ["$daily.25", 0] },
                        { "$ifNull": ["$daily.30", 0] }
                    ]
                }
            }
        }
    }
])
That shows that you have no other option here than to specify what is essentially every possible field under daily (so the list is probably much larger). We then also have to account for the fact that a key may not exist in a given document, and return a default value in that case.
For example, your first document has no key "daily.3" so without the $ifNull check the returned value would be null and invalidate the whole $sum process so that the total would be "0".
Grouping on those keys as in the first aggregate example gets even worse:
// Click totals per year, month and "type" when "daily" is a sub-document.
db.collection.aggregate([
    // Restrict to January and February 2014 (the upper bound is exclusive)
    {
        "$match": {
            "date": {
                "$gte": new Date("2014-01-01"),
                "$lt": new Date("2014-03-01")
            }
        }
    },

    // Attach a literal array listing every key that can occur under "daily"
    {
        "$project": {
            "date": 1,
            "daily": 1,
            "type": { "$literal": ["1", "3", "9", "12", "25", "30" ] }
        }
    },

    // One copy of each document per listed key
    { "$unwind": "$type" },

    // While grouping, pick the "daily" value that belongs to the current "type";
    // the $cond chain tests each key in turn and falls through to "30"
    {
        "$group": {
            "_id": {
                "year": { "$year": "$date" },
                "month": { "$month": "$date" },
                "type": "$type"
            },
            "clicks": {
                "$sum": {
                    "$cond": [
                        { "$eq": [ "$type", "1" ] },
                        "$daily.1",
                        { "$cond": [
                            { "$eq": [ "$type", "3" ] },
                            "$daily.3",
                            { "$cond": [
                                { "$eq": [ "$type", "9" ] },
                                "$daily.9",
                                { "$cond": [
                                    { "$eq": [ "$type", "12" ] },
                                    "$daily.12",
                                    { "$cond": [
                                        { "$eq": [ "$type", "25" ] },
                                        "$daily.25",
                                        "$daily.30"
                                    ]}
                                ]}
                            ]}
                        ]}
                    ]
                }
            }
        }
    },

    // Order the output by year, then month, then type
    {
        "$sort": {
            "_id.year": 1,
            "_id.month": 1,
            "_id.type": 1
        }
    }
])
Which is creating one big conditional evaluation using $cond to match out the values to the "type" which we projected all possible values in an array using the $literal operator.
If you do not have MongoDB 2.6 or greater you can always do this in place of the $literal operator statement:
"type": { "$cond": [1, ["1", "3", "9", "12", "25", "30" ], 0] }
Where essentially the true evaluation from $cond returns a "literal" declared value, which is how you specify an array. There is also the hidden $const operator that is not documented, but now exposed as $literal.
As you can see the structure here is doing you no favors, so the best option is to change it. But if you cannot and otherwise find the aggregation concept for this too hard to handle, then mapReduce offers an approach, but the processing will be much slower:
// Click totals per year / month / "daily" key, computed with mapReduce.
db.collection.mapReduce(
    // Map: emit one (key, clicks) pair for every entry under "daily".
    function () {
        for ( var k in this.daily ) {
            emit(
                {
                    // UTC accessors keep the year/month buckets aligned with the
                    // UTC query bounds below and with the $year/$month operators;
                    // the local-time accessors would follow the server time zone.
                    year: this.date.getUTCFullYear(),
                    // getUTCMonth() is zero-based, hence the + 1
                    month: this.date.getUTCMonth() + 1,
                    type: k
                },
                this.daily[k]
            );
        }
    },
    // Reduce: add up all values emitted for the same key.
    function(key,values) {
        return Array.sum( values );
    },
    {
        // Restrict the input to January and February 2014 (the upper bound is exclusive)
        "query": {
            "date": {
                "$gte": new Date("2014-01-01"), "$lt": new Date("2014-03-01")
            }
        },
        // Return the results inline instead of writing them to a collection
        "out": { "inline": 1 }
    }
)
The general lesson here is that you will get the cleanest and fastest results by altering the document format and using the aggregation framework. But all the ways to do this are listed here.