How to group documents on index of array elements? - mongodb

I'm looking for a way to take data such as this
{ "_id" : 5, "count" : 1, "arr" : [ "aga", "dd", "a" ] },
{ "_id" : 6, "count" : 4, "arr" : [ "aga", "ysdf" ] },
{ "_id" : 7, "count" : 4, "arr" : [ "sad", "aga" ] }
I would like to sum the count based on the 1st item(index) of arr. In another aggregation I would like to do the same with the 1st and the 2nd item in the arr array.
I've tried using unwind, but that breaks up the data and the hierarchy is then lost.
I've also tried using
$group: {
_id: {
arr_0:'$arr.0'
},
total:{
$sum: '$count'
}
}
but the result is blank arrays

Actually you can't use the dot notation to group your documents by element at a specified index. To two that you have two options:
First the optimal way using the $arrayElemAt operator new in MongoDB 3.2. which return the element at a specified index in the array.
db.collection.aggregate([
{ "$group": {
"_id": { "$arrayElemAt": [ "$arr", 0 ] },
"count": { "$sum": 1 }
}}
])
From MongoDB version 3.0 backward you will need to de-normalise your array then in the first time $group by _id and use the $first operator to return the first item in the array. From there you will need to regroup your document using that value and use the $sum to get the sum. But this will only work for the first and last index because MongoDB also provides the $last operator.
db.collection.aggregate([
{ "$unwind": "$arr" },
{ "$group": {
"_id": "$_id",
"arr": { "$first": "$arr" }
}},
{ "$group": {
"_id": "$arr",
"count": { "$sum": 1 }
}}
])
which yields something like this:
{ "_id" : "sad", "count" : 1 }
{ "_id" : "aga", "count" : 2 }
To group using element at position p in your array you will get a better chance using the mapReduce function.
var mapFunction = function(){ emit(this.arr[0], 1); };
var reduceFunction = function(key, value) { return Array.sum(value); };
db.collection.mapReduce(mapFunction, reduceFunction, { "out": { "inline": 1 } } )
Which returns:
{
"results" : [
{
"_id" : "aga",
"value" : 2
},
{
"_id" : "sad",
"value" : 1
}
],
"timeMillis" : 27,
"counts" : {
"input" : 3,
"emit" : 3,
"reduce" : 1,
"output" : 2
},
"ok" : 1
}

Related

Multiple Nested Group Within Array

I'm having group of elements in MongoDB as given below:
/* 1 */
{
"_id" : ObjectId("58736c7f7d43c305461cdb9b"),
"Name" : "Kevin",
"pb_event" : [
{
"event_type" : "Birthday",
"event_date" : "2014-08-31"
},
{
"event_type" : "Anniversary",
"event_date" : "2014-08-31"
}
]
}
/* 2 */
{
"_id" : ObjectId("58736cfc7d43c305461cdba8"),
"Name" : "Peter",
"pb_event" : [
{
"event_type" : "Birthday",
"event_date" : "2014-08-31"
},
{
"event_type" : "Anniversary",
"event_date" : "2015-03-24"
}
]
}
/* 3 */
{
"_id" : ObjectId("58736cfc7d43c305461cdba9"),
"Name" : "Pole",
"pb_event" : [
{
"event_type" : "Birthday",
"event_date" : "2015-03-24"
},
{
"event_type" : "Work Anniversary",
"event_date" : "2015-03-24"
}
]
}
Now I want the result that has group on event_date then after group on event_type. event_type contain all names of the related user, then count of records in the respective array.
Expected Output
/* 1 */
{
"event_date" : "2014-08-31",
"data" : [
{
"event_type" : "Birthday",
"details" : [
{
"_id" : ObjectId("58736c7f7d43c305461cdb9b"),
"name" : "Kevin"
},
{
"_id" : ObjectId("58736cfc7d43c305461cdba8"),
"name" : "Peter"
}
],
"count" : 2
},
{
"event_type" : "Anniversary",
"details" : [
{
"_id" : ObjectId("58736c7f7d43c305461cdb9b"),
"name" : "Kevin"
}
],
"count" : 1
}
]
}
/* 2 */
{
"event_date" : "2015-03-24",
"data" : [
{
"event_type" : "Anniversary",
"details" : [
{
"_id" : ObjectId("58736cfc7d43c305461cdba8"),
"name" : "Peter"
}
],
"count" : 1
},
{
"event_type" : "Birthday",
"details" : [
{
"_id" : ObjectId("58736cfc7d43c305461cdba9"),
"name" : "Pole"
}
],
"count" : 1
},
{
"event_type" : "Work Anniversary",
"details" : [
{
"_id" : ObjectId("58736cfc7d43c305461cdba9"),
"name" : "Pole"
}
],
"count" : 1
}
]
}
Using the aggregation framework, you would need to run a pipeline that has the following stages so that you get the desired result:
db.collection.aggregate([
{ "$unwind": "$pb_event" },
{
"$group": {
"_id": {
"event_date": "$pb_event.event_date",
"event_type": "$pb_event.event_type"
},
"details": {
"$push": {
"_id": "$_id",
"name": "$Name"
}
},
"count": { "$sum": 1 }
}
},
{
"$group": {
"_id": "$_id.event_date",
"data": {
"$push": {
"event_type": "$_id.event_type",
"details": "$details",
"count": "$count"
}
}
}
},
{
"$project": {
"_id": 0,
"event_date": "$_id",
"data": 1
}
}
])
In the above pipeline, the first step is the $unwind operator
{ "$unwind": "$pb_event" }
which comes in quite handy when the data is stored as an array. When the unwind operator is applied on a list data field, it will generate a new record for each and every element of the list data field on which unwind is applied. It basically flattens the data.
This is a necessary operation for the next pipeline stage, the $group step where you group the flattened documents by the deconstructed pb_event array fields event_date and event_type:
{
"$group": {
"_id": {
"event_date": "$pb_event.event_date",
"event_type": "$pb_event.event_type"
},
"details": {
"$push": {
"_id": "$_id",
"name": "$Name"
}
},
"count": { "$sum": 1 }
}
},
The $group pipeline operator is similar to the SQL's GROUP BY clause. In SQL, you can't use GROUP BY unless you use any of the aggregation functions. The same way, you have to use an aggregation function in MongoDB (called an accumulator operator) as well. You can read more about the aggregation functions here.
In this $group operation, the logic to calculate the count aggregate i.e. the total number of documents in the group using the $sum accumulator operator. Within the same pipeline, you can aggregate a list of the name and _id subdocuments by using the $push operator which returns an array of expression values for each group.
The preceding $group pipeline
{
"$group": {
"_id": "$_id.event_date",
"data": {
"$push": {
"event_type": "$_id.event_type",
"details": "$details",
"count": "$count"
}
}
}
}
will further aggregate the results from the last pipeline by grouping on the event_date, which forms basis of the desired output by creating a new data list using $push and then the final $project pipeline stage
{
"$project": {
"_id": 0,
"event_date": "$_id",
"data": 1
}
}
reshapes the documents fields by renaming the _id field to event_date and retaining the other field.

mongodb count number of documents for every category

My collection looks like this:
{
"_id":ObjectId("5744b6cd9c408cea15964d18"),
"uuid":"bbde4bba-062b-4024-9bb0-8b12656afa7e",
"version":1,
"categories":["sport"]
},
{
"_id":ObjectId("5745d2bab047379469e10e27"),
"uuid":"bbde4bba-062b-4024-9bb0-8b12656afa7e",
"version":2,
"categories":["sport", "shopping"]
},
{
"_id":ObjectId("5744b6359c408cea15964d15"),
"uuid":"561c3705-ba6d-432b-98fb-254483fcbefa",
"version":1,
"categories":["politics"]
}
I want to count the number of documents for every category. To do this, I unwind the categories array:
db.collection.aggregate(
{$unwind: '$categories'},
{$group: {_id: '$categories', count: {$sum: 1}} }
)
Result:
{ "_id" : "sport", "count" : 2 }
{ "_id" : "shopping", "count" : 1 }
{ "_id" : "politics", "count" : 1 }
Now I want to count the number of documents for every category, but where document version is the latest version.
This is where I am stuck.
It's ugly but I think this gives you what you're after:
db.collection.aggregate(
{ $unwind : "$categories" },
{ $group :
{ "_id" : { "uuid" : "$uuid" },
"doc" : { $push : { "version" : "$version", "category" : "$categories" } },
"maxVersion" : { $max : "$version" }
}
},
{ $unwind : "$doc" },
{ $project : { "_id" : 0, "uuid" : "$id.uuid", "category" : "$doc.category", "isCurrentVersion" : { $eq : [ "$doc.version", "$maxVersion" ] } } },
{ $match : { "isCurrentVersion" : true }},
{ $group : { "_id" : "$category", "count" : { $sum : 1 } } }
)
You can do this by first grouping the denormalized documents (from the $unwind operator step) by two keys, i.e. the categories and version fields. This is necessary for the preceding pipeline step which orders the grouped documents and their accumulated counts by the version (desc) and categories (asc) keys respectively using the $sort operator.
Another grouping will be required to get the top documents in each categories group after ordering using the $first operator. The following shows this
db.collection.aggregate(
{ "$unwind": "$categories" },
{
"$group": {
"_id": {
'categories': '$categories',
'version': '$version'
},
"count": { "$sum": 1 }
}
},
{ "$sort": { "_id.version": -1, "_id.categories": 1 } },
{
"$group": {
"_id": "$_id.categories",
"count": { "$first": "$count" },
"version": { "$first": "$_id.version" }
}
}
)
Sample Output
{ "_id" : "shopping", "count" : 1, "version" : 2 }
{ "_id" : "sport", "count" : 1, "version" : 2 }
{ "_id" : "politics", "count" : 1, "version" : 1 }

Get percentages with MongoDB aggregate $group

I'd like to get percentages from a group pipeline in a MongoDB aggregate.
My data:
{
_id : 1,
name : 'hello',
type : 'big'
},
{
_id : 2,
name : 'bonjour',
type : 'big'
},
{
_id : 3,
name : 'hi',
type : 'short'
},
{
_id : 4,
name : 'salut',
type : 'short'
},
{
_id : 5,
name : 'ola',
type : 'short'
}
My request group by type, and count:
[{
$group : {
_id : {
type : '$type'
},
"count" : {
"$sum" : 1
}
}
}]
Result:
[
{
_id {
type : 'big',
},
count : 2
},
{
_id {
type : 'short',
},
count : 3
}
]
But I'd like to have count AND percentage, like that:
[
{
_id {
type : 'big',
},
count: 2,
percentage: 40%
},
{
_id {
type : 'short',
},
count: 3,
percentage: 60%
}
]
But I've no idea how to do that. I've tried $divide and other things, but without success. Could you please help me?
Well I think percentage should be string if the value contains %
First get you will need to count the number of document.
var nums = db.collection.count();
db.collection.aggregate(
[
{ "$group": { "_id": {"type": "$type"}, "count": { "$sum": 1 }}},
{ "$project": {
"count": 1,
"percentage": {
"$concat": [ { "$substr": [ { "$multiply": [ { "$divide": [ "$count", {"$literal": nums }] }, 100 ] }, 0,2 ] }, "", "%" ]}
}
}
]
)
Result
{ "_id" : { "type" : "short" }, "count" : 3, "percentage" : "60%" }
{ "_id" : { "type" : "big" }, "count" : 2, "percentage" : "40%" }
First find total number of documents in collections using count method and used that count variable to calculate percentage in aggregation like this :
var totalDocument = db.collectionName.count() //count total doc.
used totalDocument in aggregation as below :
db.collectionName.aggregate({"$group":{"_id":{"type":"$type"},"count":{"$sum":1}}},
{"$project":{"count":1,"percentage":{"$multiply":[{"$divide":[100,totalDocument]},"$count"]}}})
EDIT
If you need to this in single aggregation query then unwind used in aggregation but using unwind it creates Cartesian problem check below aggregation query :
db.collectionName.aggregate({"$group":{"_id":null,"count":{"$sum":1},"data":{"$push":"$$ROOT"}}},
{"$unwind":"$data"},
{"$group":{"_id":{"type":"$data.type"},"count":{"$sum":1},
"total":{"$first":"$count"}}},
{"$project":{"count":1,"percentage":{"$multiply":[{"$divide":[100,"$total"]},"$count"]}}}
).pretty()
I recconmed first find out toatal count and used that count in aggregation as per first query.

Mongodb counting array combinations

I have this kind of documents
[
{
....
tags : ["A","B"]
},
{
....
tags : ["A","B"]
},
{
....
tags : ["J","K"]
},
{
....
tags : ["A","B","C"]
}
]
With the Aggregation Framwork I'd like to group by array combinations to have something like this :
[
{
_id:["A","B"],
count : 2
},
{
_id:["J","K"],
count : 1
},
{
_id:["A","B","C"],
count : 1
},
]
Is it possible to do that?
Thank you
Not sure why you didn't even think this would work:
db.collection.aggregate([
{ "$group": {
"_id": "$tags",
"count": { "$sum": 1 }
}}
])
Returns:
{ "_id" : [ "A", "B", "C" ], "count" : 1 }
{ "_id" : [ "J", "K" ], "count" : 1 }
{ "_id" : [ "A", "B" ], "count" : 2 }
MongoDB "does not care" what you throw into the value of a "field" or "property". This applies to the "grouping key" of _id in the $group operator as well. Everything is a "document" and therefore a BSON value and is therefore valid.
Anything works. So long as it's what you want.

Obtaining $group result with group count

Assuming I have a collection called "posts" (in reality it is a more complex collection, posts is too simple) with the following structure:
> db.posts.find()
{ "_id" : ObjectId("50ad8d451d41c8fc58000003"), "title" : "Lorem ipsum", "author" :
"John Doe", "content" : "This is the content", "tags" : [ "SOME", "RANDOM", "TAGS" ] }
I expect this collection to span hundreds of thousands, perhaps millions, that I need to query for posts by tags and group the results by tag and display the results paginated. This is where the aggregation framework comes in. I plan to use the aggregate() method to query the collection:
db.posts.aggregate([
{ "$unwind" : "$tags" },
{ "$group" : {
_id: { tag: "$tags" },
count: { $sum: 1 }
} }
]);
The catch is that to create the paginator I would need to know the length of the output array. I know that to do that you can do:
db.posts.aggregate([
{ "$unwind" : "$tags" },
{ "$group" : {
_id: { tag: "$tags" },
count: { $sum: 1 }
} }
{ "$group" : {
_id: null,
total: { $sum: 1 }
} }
]);
But that would discard the output from previous pipeline (the first group). Is there a way that the two operations be combined while preserving each pipeline's output? I know that the output of the whole aggregate operation can be cast to an array in some language and have the contents counted but there may be a possibility that the pipeline output may exceed the 16Mb limit. Also, performing the same query just to obtain the count seems like a waste.
So is obtaining the document result and count at the same time possible? Any help is appreciated.
Use $project to save tag and count into tmp
Use $push or addToSet to store tmp into your data list.
Code:
db.test.aggregate(
{$unwind: '$tags'},
{$group:{_id: '$tags', count:{$sum:1}}},
{$project:{tmp:{tag:'$_id', count:'$count'}}},
{$group:{_id:null, total:{$sum:1}, data:{$addToSet:'$tmp'}}}
)
Output:
{
"result" : [
{
"_id" : null,
"total" : 5,
"data" : [
{
"tag" : "SOME",
"count" : 1
},
{
"tag" : "RANDOM",
"count" : 2
},
{
"tag" : "TAGS1",
"count" : 1
},
{
"tag" : "TAGS",
"count" : 1
},
{
"tag" : "SOME1",
"count" : 1
}
]
}
],
"ok" : 1
}
I'm not sure you need the aggregation framework for this other than counting all the tags eg:
db.posts.aggregate(
{ "unwind" : "$tags" },
{ "group" : {
_id: { tag: "$tags" },
count: { $sum: 1 }
} }
);
For paginating through per tag you can just use the normal query syntax - like so:
db.posts.find({tags: "RANDOM"}).skip(10).limit(10)