I'm just learning mongodb aggregation framework
The data is in the format below:
{
"questionType": "multiple",
"multipleOptions": ["first", "second", "third", "forth"],
"answers": ["first", "second", "second", "first", "first", "forth"]
},
{
"questionType": "multiple",
"multipleOptions": ["awful", "bad", "soso", "good", "excellent"],
"answers": ["bad", "bad", "good", "soso", "bad", "excellent", "awful", "soso"]
}
I want to aggregate these to something like this:
{
"result": { "first": 3, "second": 2, "forth": 1 }
},
{
"result": { "awful": 1, "bad": 3, "soso": 2, "good": 1, "excellent": 1 }
}
Or like this (no difference):
{
"result": [["first", 3], ["second", 2], ["forth", 1]]
},
{
"result": [["awful", 1], ["bad", 3], ["soso", 2], ["good", 1], ["excellent", 1]]
}
Is there a way to do this in a $project stage?
This can be done with a cohort of array operators working in conjunction to produce the desired effect.
You essentially need an operation that creates an array of key/value pairs of the counts you need. This will then be converted to a hash map. The array of key value pairs is essentially a map which is constructed by looping through the multipleOptions array and checking the size of the elements that match in the answers array.
TLDR;
The final pipeline you need to run follows:
// For every document, count how often each entry of `multipleOptions`
// appears in `answers` and expose the tallies as one `result` object.
db.collection.aggregate([
  {
    $project: {
      result: {
        // Turns [{ k, v }, ...] into { k: v, ... }.
        $arrayToObject: {
          $map: {
            // Walk the option positions 0 .. size - 1.
            input: { $range: [0, { $size: "$multipleOptions" }] },
            as: "pos",
            in: {
              // Key: the option stored at this position.
              k: { $arrayElemAt: ["$multipleOptions", "$$pos"] },
              // Value: number of answers equal to that option.
              v: {
                $size: {
                  $filter: {
                    input: "$answers",
                    as: "given",
                    cond: {
                      $eq: [
                        "$$given",
                        { $arrayElemAt: ["$multipleOptions", "$$pos"] }
                      ]
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
]);
To demonstrate this step by step, let's create an additional field in an aggregate operation; this field will be an array of the counts of the corresponding array elements. We need something like
{
"questionType": "multiple",
"multipleOptions": ["awful", "bad", "soso", "good", "excellent"],
"answersCount": [1, 3, 2, 1, 1],
"answers": ["bad", "bad", "good", "soso", "bad", "excellent", "awful", "soso"]
}
To get this we need a way to loop through the multipleOptions and for each option, iterate the answers array, filter it and count the number of elements in the filtered array. The pseudo-algorithm follows:
answersCount = []
for each elem in ["awful", "bad", "soso", "good", "excellent"]:
filteredAnswers = [<answers array containing only elem>]
count = filteredAnswers.length
answersCount.push(count)
In mongo, the filtering part can be done using $filter on the answers array and elements can be referenced with $arrayElemAt
{
"$filter": {
"input": "$answers",
"as": "ans",
"cond": {
"$eq": [
"$$ans",
{ "$arrayElemAt": [ "$multipleOptions", "$$idx" ] }
]
}
}
}
The counts are derived using $size on the above expression
{
"$size": {
"$filter": {
"input": "$answers",
"as": "ans",
"cond": {
"$eq": [
"$$ans",
{ "$arrayElemAt": [ "$multipleOptions", "$$idx" ] }
]
}
}
}
}
For getting the outer loop, we can use $range and $map as
{
"$map": {
"input": { "$range": [ 0, { "$size": "$multipleOptions" } ] },
"as": "idx",
"in": {
"$let": {
"vars": {
"v": {
"$size": {
"$filter": {
"input": "$answers",
"as": "ans",
"cond": {
"$eq": [
"$$ans",
{ "$arrayElemAt": [ "$multipleOptions", "$$idx" ] }
]
}
}
}
}
},
"in": "$$v"
}
}
}
}
This will produce the answersCount in the following aggregate operation
// Adds `answersCount`: an array, parallel to `multipleOptions`, holding the
// number of times each option occurs in `answers`.
db.collection.aggregate([
  {
    $addFields: {
      answersCount: {
        $map: {
          // Outer loop: one iteration per option position.
          input: { $range: [0, { $size: "$multipleOptions" }] },
          as: "pos",
          // Inner step: keep only the answers equal to the option at this
          // position, then count what is left.
          in: {
            $size: {
              $filter: {
                input: "$answers",
                as: "given",
                cond: {
                  $eq: [
                    "$$given",
                    { $arrayElemAt: ["$multipleOptions", "$$pos"] }
                  ]
                }
              }
            }
          }
        }
      }
    }
  }
]);
To then get to the desired output, you need the answersCount to be an array of key/value pairs i.e.
{
"answersCount": [
{ "k": "awful", "v": 1},
{ "k": "bad", "v": 3},
{ "k": "soso", "v": 2},
{ "k": "good", "v": 1},
{ "k": "excellent", "v": 1}
],
}
and when you apply $arrayToObject on the above expression i.e.
{ "$arrayToObject": {
"answersCount": [
{ "k": "awful", "v": 1},
{ "k": "bad", "v": 3},
{ "k": "soso", "v": 2},
{ "k": "good", "v": 1},
{ "k": "excellent", "v": 1}
],
} }
you get
{
"awful" : 1,
"bad" : 3,
"soso" : 2,
"good" : 1,
"excellent" : 1
}
This is a good use case for "multi-stage grouping." Let's begin with an $unwind of answers:
// First stage only: emit one document per element of `answers`.
// The returned cursor is kept in `c`.
c = db.foo.aggregate([{ $unwind: "$answers" }]);
{
"_id" : 0,
"questionType" : "multiple",
"multipleOptions" : [
"first",
"second",
"third",
"forth"
],
"answers" : "first"
}
{
"_id" : 0,
"questionType" : "multiple",
"multipleOptions" : [
"first",
"second",
"third",
"forth"
],
"answers" : "second"
}
{
"_id" : 0,
"questionType" : "multiple",
"multipleOptions" : [
"first",
"second",
"third",
"forth"
],
"answers" : "second"
}
// ...
Now we have answers and _id as peers ready to group:
db.foo.aggregate([
  // One document per individual answer.
  { $unwind: "$answers" },
  // Tally every distinct (source document, answer) pair.
  {
    $group: {
      _id: { Xid: "$_id", answer: "$answers" },
      n: { $sum: 1 }
    }
  }
]);
{ "_id" : { "Xid" : 1, "answer" : "awful" }, "n" : 1 }
{ "_id" : { "Xid" : 1, "answer" : "excellent" }, "n" : 1 }
{ "_id" : { "Xid" : 1, "answer" : "soso" }, "n" : 2 }
{ "_id" : { "Xid" : 1, "answer" : "bad" }, "n" : 3 }
{ "_id" : { "Xid" : 0, "answer" : "forth" }, "n" : 1 }
Now we group again, this time by the _id.Xid and then use $push to construct the output array of results:
db.foo.aggregate([
  // One document per individual answer.
  { $unwind: "$answers" },
  // Tally every distinct (source document, answer) pair.
  {
    $group: {
      _id: { Xid: "$_id", answer: "$answers" },
      n: { $sum: 1 }
    }
  },
  // Regroup by the source document and collect its tallies into `result`.
  {
    $group: {
      _id: "$_id.Xid",
      result: { $push: { answer: "$_id.answer", n: "$n" } }
    }
  }
]);
{
"_id" : 0,
"result" : [
{
"answer" : "forth",
"n" : 1
},
{
"answer" : "second",
"n" : 2
},
{
"answer" : "first",
"n" : 3
}
]
}
{
"_id" : 1,
"result" : [
{
"answer" : "awful",
"n" : 1
},
{
"answer" : "excellent",
"n" : 1
},
{
"answer" : "soso",
"n" : 2
},
{
"answer" : "bad",
"n" : 3
},
{
"answer" : "good",
"n" : 1
}
]
}
So in spirit we have a solution but to really press the point, we will use the $arrayToObject function to turn the array of options from the values of the answer key to keys in their own right. To do so, we will name the $push object args k and v to properly drive the function:
db.foo.aggregate([
  // One document per individual answer.
  { $unwind: "$answers" },
  // Tally every distinct (source document, answer) pair.
  {
    $group: {
      _id: { Xid: "$_id", answer: "$answers" },
      n: { $sum: 1 }
    }
  },
  // Regroup by the source document; the pushed objects use the k / v field
  // names that $arrayToObject expects.
  {
    $group: {
      _id: "$_id.Xid",
      pairs: { $push: { k: "$_id.answer", v: "$n" } }
    }
  },
  // Turn the k / v pairs into a single object keyed by answer.
  {
    $project: {
      _id: true,
      result: { $arrayToObject: "$pairs" }
    }
  }
]);
which yields:
{ "_id" : 0, "result" : { "forth" : 1, "second" : 2, "first" : 3 } }
{
"_id" : 1,
"result" : {
"awful" : 1,
"excellent" : 1,
"soso" : 2,
"bad" : 3,
"good" : 1
}
}
Suppose I have the following documents.
{ "_id" : 1, "score" : [ -1, 3 ] }
{ "_id" : 2, "score" : [ 1, 5 ] }
{ "_id" : 3, "score" : [ 5, 5 ] }
{"_id" : 4, "score" : [ 2, 1, 5 ]}
If I want to find the documents in which all elements of the score array are between 3 and 6 inclusive, how can I create a query with {$gte:3} and {$lte:6}?
So, it should return only:
{ "_id" : 3, "score" : [ 5, 5 ] }
The general case here is that you probably "should" generate the "range" of values that fall in between and then test that the array does NOT contain any element outside of those possible numbers:
var start = 3,
    end = 6;
// Every integer from start to end inclusive. The list is derived from the
// element index so that `start` itself is left untouched: the query below
// still needs the original lower bound (incrementing `start` here would leave
// it at end + 1 and make the $elemMatch range impossible to satisfy).
var range = Array(end - start + 1).fill(1).map((e, i) => start + i);
db.scores.find({
  "score": {
    // Positive range condition: at least one element lies in [start, end].
    "$elemMatch": { "$gte": start, "$lte": end },
    // Reject arrays holding any value that is not listed in `range`.
    "$not": {
      "$elemMatch": { "$nin": range }
    }
  }
})
Would return:
{ "_id" : 3, "score" : [ 5, 5 ] }
Since that is the only document whose array contains "only" values that fall within that range.
IF you have a restriction where the items have too many possible values to list the entire "range", then you either use a $not on the "range condition":
db.scores.find({
  score: {
    // At least one element lies within 3..6 ...
    $elemMatch: { $gte: 3, $lte: 6 },
    // ... and no element fails that same range condition.
    $not: {
      $elemMatch: { $not: { $gte: 3, $lte: 6 } }
    }
  }
})
Or alternately process with an additional calculated condition,
Either using $expr:
db.scores.find({
  // Positive range condition on the array field.
  score: { $elemMatch: { $gte: 3, $lte: 6 } },
  // Calculated condition: every element must pass both bounds.
  $expr: {
    $allElementsTrue: {
      $map: {
        input: "$score",
        in: { $and: [{ $gte: ["$$this", 3] }, { $lte: ["$$this", 6] }] }
      }
    }
  }
})
Or $redact prior to MongoDB 3.6
db.scores.aggregate([
  // Positive range condition on the array field.
  { $match: { score: { $elemMatch: { $gte: 3, $lte: 6 } } } },
  // Keep a document only when every element passes both bounds.
  {
    $redact: {
      $cond: {
        if: {
          $allElementsTrue: {
            $map: {
              input: "$score",
              in: { $and: [{ $gte: ["$$this", 3] }, { $lte: ["$$this", 6] }] }
            }
          }
        },
        then: "$$KEEP",
        else: "$$PRUNE"
      }
    }
  }
])
Or even $where using a JavaScript condition for matching where your MongoDB does not support any of the above:
db.scores.find({
  // Positive range condition on the array field.
  score: { $elemMatch: { $gte: 3, $lte: 6 } },
  // JavaScript filter: true when no element fails the 3..6 test.
  $where: function () {
    return !this.score.some(function (value) {
      return !(value >= 3 && value <= 6);
    });
  }
})
In all cases you actually want that "positive" condition on the $elemMatch in order to find those elements "within the range" by ideally searching the index on that field. None of the other methods either employing the $not and $nin or via the $allElementsTrue aggregation condition or the Array.every() can actually look at the "index" in order to satisfy the condition. They are merely used in all cases as the "additional filter" from which to "rule out" any "potential" matched documents via the "range" from the eventual results.
Let's say I have a document in my aggregate pipeline that looks like this:
{
scores: [
{type: 'quiz', score: 75},
{type: 'quiz', score: 62},
{type: 'final', score: 34},
]
}
I'm using $project to transform it, and I'd like to get the sum of the quiz scores. Is there a way I could somehow chain my $filter and $sum?
I know that I could potentially use two $projects, but the way I currently have my pipeline set up would force me to keep re-projecting a ton of keys in my second $project, which I'd like to avoid.
You would need help from another operator that maps the scores embedded documents into an array of values that you can $sum. You can do that using the $map operator as in the following:
db.test.aggregate([
  {
    $project: {
      scores: 1,
      // Sum over all entries, where non-quiz entries contribute 0.
      quiz_total: {
        $sum: {
          $map: {
            input: "$scores",
            as: "entry",
            in: {
              $cond: {
                if: { $eq: ["$$entry.type", "quiz"] },
                then: "$$entry.score",
                else: 0
              }
            }
          }
        }
      }
    }
  }
])
Sample Output
{
"_id" : ObjectId("582ac9619602e37d2794acd3"),
"scores" : [
{
"type" : "quiz",
"score" : 75
},
{
"type" : "quiz",
"score" : 62
},
{
"type" : "final",
"score" : 34
}
],
"quiz_total" : 137
}
YES!
db.collection.aggregate([
  { "$project": {
    "totalQuiz": {
      // $sum ignores non-numeric values, so the filtered sub-documents must
      // first be mapped to their numeric `score` before they are summed.
      "$sum": {
        "$map": {
          // Keep only the quiz entries of the `scores` array.
          "input": {
            "$filter": {
              "input": "$scores",
              "as": "s",
              "cond": { "$eq": [ "$$s.type", "quiz" ] }
            }
          },
          "as": "quiz",
          "in": "$$quiz.score"
        }
      }
    }
  } }
])
I have documents like this
{ "_id" : ObjectId("5755d81e2935fe65f5d167aa"), "prices" : [ 23, 11, 2, 3, 4, 1, 232 ] },
{ "_id" : ObjectId("5755d81e2935fe65f5d167ab"), "prices" : [ 99, 3, 23, 23, 12 ] },
{ "_id" : ObjectId("5755d81e2935fe65f5d167ac"), "prices" : [ 999, 12, 3, 4, 4, 4, 4, 4, 123 ] },
{ "_id" : ObjectId("5755d81e2935fe65f5d167ad"), "prices" : [ 24, 3, 4, 5, 6, 7, 723 ] }
and I want to find the document whose 'prices' array contains the most occurrences of the number 4, which in my case is the third document. Is there any way to query it?
Starting from MongoDB 3.2, we can $project our documents and use the $size and the $filter operator to return the "count" of the number 4 in each array. From there we need to $group using that "value" and use the $push accumulator operator to return an array of the documents that have same "maximum". Next you $sort your documents by _id and use $limit to return the documents with the maximum occurrence of 4.
db.collection.aggregate([
  // Attach `counter`: how many elements of `prices` equal 4.
  {
    $project: {
      prices: 1,
      counter: {
        $size: {
          $filter: {
            input: "$prices",
            as: "price",
            cond: { $eq: ["$$price", 4] }
          }
        }
      }
    }
  },
  // Bucket the projected documents by their count.
  { $group: { _id: "$counter", docs: { $push: "$$ROOT" } } },
  // Highest count first, and keep only that bucket.
  { $sort: { _id: -1 } },
  { $limit: 1 }
])
I want to use the $max operator to select the max value,
and also keep the record that holds the max under the key "original_document".
How can I do this in MongoDB?
expect result
{ "_id" : "abc", "maxTotalAmount" : 100,
"maxQuantity" : 10,
"original_document": { "_id" : 4, "item" : "abc", "price" : 10, "quantity" : 10, "date" : ISODate("2014-02-15T08:00:00Z") } }
current result
{ "_id" : "abc", "maxTotalAmount" : 100, "maxQuantity" : 10 }
documents
{ "_id" : 1, "item" : "abc", "price" : 10, "quantity" : 2, "date" : ISODate("2014-01-01T08:00:00Z") }
{ "_id" : 4, "item" : "abc", "price" : 10, "quantity" : 10, "date" : ISODate("2014-02-15T08:00:00Z") }
aggregation
db.sales.aggregate([
  // Per item: the largest price * quantity and the largest quantity.
  {
    $group: {
      _id: "$item",
      maxTotalAmount: { $max: { $multiply: ["$price", "$quantity"] } },
      maxQuantity: { $max: "$quantity" }
    }
  }
])
When you want detail from the same grouping item then you use $sort and $first for the field(s) from the document you wish to preserve:
db.sales.aggregate([
  // Compute the per-document total once so it can be sorted on.
  {
    $project: {
      item: 1,
      TotalAmount: { $multiply: ["$price", "$quantity"] },
      quantity: 1
    }
  },
  // Largest total first, so $first below picks from that document.
  { $sort: { TotalAmount: -1 } },
  {
    $group: {
      _id: "$item",
      maxTotalAmount: { $max: "$TotalAmount" },
      maxQuantity: { $max: "$quantity" },
      // Details taken from the first (largest-total) document of the group.
      doc_id: { $first: "$_id" },
      doc_quantity: { $first: "$quantity" }
    }
  }
])
The aggregation "accumulators" cannot use embedded fields, and pushing all to an array makes little sense. But you can name like above and even rename with another $project or in your code if you want to.
Just to demonstrate how impractical this is to do otherwise, there is this example:
db.sales.aggregate([
  // Per item: the maxima plus an array holding every document of the group.
  {
    $group: {
      _id: "$item",
      maxTotalAmount: { $max: { $multiply: ["$price", "$quantity"] } },
      maxQuantity: { $max: "$quantity" },
      docs: {
        $push: {
          _id: "$_id",
          quantity: "$quantity",
          TotalAmount: { $multiply: ["$price", "$quantity"] }
        }
      }
    }
  },
  {
    $project: {
      maxTotalAmount: 1,
      maxQuantity: 1,
      // Replace every non-maximum entry with false, then strip the falses.
      maxTotalDocs: {
        $setDifference: [
          {
            $map: {
              input: "$docs",
              as: "entry",
              in: {
                $cond: {
                  if: { $eq: ["$maxTotalAmount", "$$entry.TotalAmount"] },
                  then: "$$entry",
                  else: false
                }
              }
            }
          },
          [false]
        ]
      }
    }
  }
])
Which is not a great idea since you are pushing every document within the grouping condition into an array, only to filter out the ones you want later. On any reasonable data size this is not practical and likely to break.
Please check the below :
db.qt.aggregate([
  // Compute the total for each document and carry the source fields along
  // under `original_document`, the key the expected result asks for.
  { "$project": {
    "maxTotalAmount": { "$multiply": [ "$price", "$quantity" ] },
    "original_document": {
      "_id": "$_id",
      "item": "$item",
      "price": "$price",
      "quantity": "$quantity",
      "date": "$date"
    }
  } },
  // Within each item, put the largest total first so $first picks it below.
  { "$sort": { "original_document.item": 1, "maxTotalAmount": -1 } },
  { "$group": {
    "_id": "$original_document.item",
    // Document that produced the largest total for this item.
    "original_document": { "$first": "$original_document" },
    "maxTotalAmount": { "$first": "$maxTotalAmount" },
    // Largest quantity across the whole group.
    "maxQuantity": { "$max": "$original_document.quantity" }
  } }
]);