I'm trying to implement a nested group query in mongodb and I'm getting stuck trying to add the outer group by. Given the below (simplified) data document:
{
"timestamp" : ISODate(),
"category" : "movies",
"term" : "my movie"
}
I'm trying to achieve a list of all categories and within the categories there should be the top number of terms. I would like my output something like this:
[
{ category: "movies",
terms: [ { term: "movie 1", total: 5000 }, { term: "movie 2", total: 200 } ... ]
},
{ category: "sports",
terms: [ { term: "football 1", total: 4000 }, { term: "tennis 2", total: 250 } ... ]
},
]
My 'inner group' is as shown below, and will get the top 5 for all categories:
db.collection.aggregate([
{ $match : { "timestamp": { $gt: ISODate("2014-08-27") } } },
{ $group : { _id : "$term", total : { $sum : 1 } } },
{ $sort : { total : -1 } },
{ $limit: 5 }
]);
// Outputs:
{ "_id" : "movie 1", "total" : 943 }
{ "_id" : "movie 2", "total" : 752 }
How would I go about implementing the 'outer group'?
Additionally sometimes the above aggregate]ion returns a null value (not all documents have a term value). How do I go about ignoring the null values?
thanks in advance
You will need two groups in this case. The first group generates a stream of documents with one document per term and category:
{ $group : {
_id : {
category: "$category",
term: "$term",
},
total: { $sum : 1 }
}
}
A second group will then merge all documents with the same term into one, using the $push operator to merge the categories into an array:
{ $group : {
_id : "$_id.category",
terms: {
$push: {
term:"$_id.term",
total:"$total"
}
}
}
}
Query:
db.getCollection('orders').aggregate([
{$match:{
tipo: {$regex:"[A-Z]+"}
}
},
{$group:
{
_id:{
codigo:"1",
tipo:"$tipo",
},
total:{$sum:1}
}
},
{$group:
{
_id:"$_id.codigo",
tipos:
{
$push:
{
tipo:"$_id.tipo",
total:"$total"
}
},
totalGeneral:{$sum:"$total"}
}
}
]);
Response:
{
"_id" : "1",
"tipos" : [
{
"tipo" : "TIPO_01",
"total" : 13.0
},
{
"tipo" : "TIPO_02",
"total" : 2479.0
},
{
"tipo" : "TIPO_03",
"total" : 12445.0
},
{
"tipo" : "TIPO_04",
"total" : 12445.0
},
{
"tipo" : "TIPO_05",
"total" : 21.0
},
{
"tipo" : "TIPO_06",
"total" : 21590.0
},
{
"tipo" : "TIPO_07",
"total" : 1065.0
},
{
"tipo" : "TIPO_08",
"total" : 562.0
}
],
"totalGeneral" : 50620.0
}
Related
I want to group by and count follow_user.tags.tag_id per record, so no matter how many times the same tag_id show up on the same record, it only counts as 1.
My database structure looks like this:
{
"external_userid" : "EXID1",
"follow_user" : [
{
"userid" : "USERID1",
"tags" : [
{
"tag_id" : "TAG1"
}
]
},
{
"userid" : "USERID2",
"tags" : [
{
"tag_id" : "TAG1"
},
{
"tag_id" : "TAG2"
}
]
}
]
},
{
"external_userid" : "EXID2",
"follow_user" : [
{
"userid" : "USERID1",
"tags" : [
{
"tag_id" : "TAG2"
}
]
}
]
}
Here's my query:
[
{ "$unwind": "$follow_user" }, { "$unwind": "$follow_user.tags" },
{ "$group" : { "_id" : { "follow_user᎐tags᎐tag_id" : "$follow_user.tags.tag_id" }, "COUNT(_id)" : { "$sum" : 1 } } },
{ "$project" : { "total" : "$COUNT(_id)", "tagId" : "$_id.follow_user᎐tags᎐tag_id", "_id" : 0 } }
]
What I expected:
{
"total" : 1,
"tagId" : "TAG1"
},
{
"total" : 2,
"tagId" : "TAG2"
}
What I get:
{
"total" : 2,
"tagId" : "TAG1"
},
{
"total" : 2,
"tagId" : "TAG2"
}
$set - Create a new field follow_user_tags.
1.1. $setUnion - To distinct the value from the Result 1.1.1.
1.1.1. $reduce - Add the value of follow_user.tags.tag_id into array.
$unwind - Deconstruct follow_user_tags array field to multiple documents.
$group - Group by follow_user_tags and perform total count via $sum.
$project - Decorate output document.
db.collection.aggregate([
{
$set: {
follow_user_tags: {
$setUnion: {
"$reduce": {
"input": "$follow_user.tags",
"initialValue": [],
"in": {
"$concatArrays": [
"$$value",
"$$this.tag_id"
]
}
}
}
}
}
},
{
$unwind: "$follow_user_tags"
},
{
$group: {
_id: "$follow_user_tags",
total: {
$sum: 1
}
}
},
{
$project: {
_id: 0,
tagId: "$_id",
total: 1
}
}
])
Sample Mongo Playground
I have a bunch of docs that look like below:
{
"_id" : ObjectId("8f30b453c2ece001364dc04d"),
"SessionId" : "awkuTQjj53kgqAZ4J",
"StartDate" : ISODate("2020-02-24T11:51:36.918+0000"),
"EndDate" : ISODate("2020-02-24T11:51:36.918+0000"),
"List1" : "X",
"List2" : "Y",
"rating" : [
{
"ObjectId" : "5d09e98380c5d5eb89ac5069",
"List" : "List 2",
"Rate" : NumberInt(5),
"RatedDate" : ISODate("2020-02-24T11:55:47.774+0000")
},
{
"ObjectId" : "5d09e98380c5d5eb89ac5069",
"List" : "List 2",
"Rate" : NumberInt(4),
"RatedDate" : ISODate("2020-02-24T11:55:48.408+0000")
},
{
"ObjectId" : "5d09e98380c5d5eb89ac505b",
"List" : "List 2",
"Rate" : NumberInt(3),
"RatedDate" : ISODate("2020-02-24T11:55:49.520+0000")
},
{
"ObjectId" : "5d09e98380c5d5eb89ac505c",
"List" : "List 2",
"Rate" : NumberInt(3),
"RatedDate" : ISODate("2020-02-24T11:55:51.787+0000")
},
{
"ObjectId" : "5d09e98380c5d5eb89ac5057",
"List" : "List 1",
"Rate" : NumberInt(4),
"RatedDate" : ISODate("2020-02-24T11:55:53.865+0000")
},
{
"ObjectId" : "5d09e98380c5d5eb89ac5058",
"List" : "List 1",
"Rate" : NumberInt(4),
"RatedDate" : ISODate("2020-02-24T11:55:53.865+0000")
},
],
"Answers" : {
"SelectedList" : "1",
},
}
I need to sum up all the rating.Rate where rating.List:'List 1' and respectively sum up all rating.Rate where rating.List:'List 2', also exclude duplicate records (by rating.ObjectId) and count only the ones with latest rating.RatedDate. I suppose this is a group aggregation.
Also they should match the criteria
List1:'X' ,
Answers.selectedList:1
What I have written looks like below so far:
[
{
"$match" : {
"List1" : "X",
"Answers.SelectedList" : "1"
}
},
{
"$unwind" : {
"path" : "$rating"
}
},
{
"$group" : {
"_id" : null,
"sum" : {
"$sum" : "$Rate"
}
}
}
]
can you please help me?
I was a little confused around the List1/List2 however I think this will get you most of the way to your required aggregation query.
db.test.aggregate([
{
$match: {
"List1": "X",
"Answers.SelectedList": "1"
}
},
{
"$unwind" : "$rating"
},
{
$group:{
_id: {
id: "$rating.ObjectId",
list: "$rating.List"
},
maxRatedDate: { $max: "$rating.RatedDate" },
ratings: { $push: "$rating" }
}
},{
$addFields: {
ratings: {
$filter: {
input: "$ratings",
as: "item",
cond: { $eq: [ "$$item.RatedDate", "$maxRatedDate" ] }
}
}
}
},
{
$unwind: "$ratings"
},
{
$group:{
_id: "$ratings.List",
sum : {
$sum : "$ratings.Rate"
}
}
}
])
This will output the following
{ "_id" : "List 1", "sum" : 8 }
{ "_id" : "List 2", "sum" : 10 }
However, let's try to break it down.
To start with we've got a simple match, the same as yours in your question. this just limits the number of documents we pass back
$match: {
"List1": "X",
"Answers.SelectedList": "1"
}
Then we unwind all the array items so we get a document for each rating, this allows us to do some extra querying on the data.
{
"$unwind" : "$rating"
}
Next, we've got a group by, here we're a group on the ObjectId of the rating so we can later remove duplicates, we're also finding out in the group which rating we've group has the highest date so we can take that one later in a projection. we're then pushing all the rating back in the array for later.
$group:{
_id: {
id: "$rating.ObjectId",
list: "$rating.List"
},
maxRatedDate: { $max: "$rating.RatedDate" },
ratings: { $push: "$rating" }
}
Next we want to project the ratings array in to a single element in which it only contains the latest rating, for this we use a $filter on the array and filter them all out that don't match our max date we calculated in our previous step.
$addFields: {
ratings: {
$filter: {
input: "$ratings",
as: "item",
cond: { $eq: [ "$$item.RatedDate", "$maxRatedDate" ] }
}
}
}
The next two steps are fairly simple and are just unwinding the array again (we've only got one element, then grouping them to get the total sum for the lists.
{
$unwind: "$ratings"
},
{
$group:{
_id: "$ratings.List",
sum : {
$sum : "$ratings.Rate"
}
}
}
At this point you only need to provide the $group stage with the field that you're actually grouping on as the _id field and reference the fields properly as they are still inside of the rating array:
"$group" : {
"_id" : "$rating.List",
"sum" : {
"$sum" : "$rating.Rate"
}
}
`"ActivityScores" : {
"Spring" : [
{
"ActivityId" : "8fd38724-7e7d-4518-bd49-d38a8b4b3435",
"ActivityTime" : "2017-05-25T16:07:02.000-06:00"
}
],
"Winter" : [
{
"ActivityId" : "90d2a976-19d9-4ce0-aa88-d32c122d173b",
"ActivityTime" : "2017-02-14T22:50:00.000-06:00"
}
],
"Fall" : [
{
"ActivityId" : "84b8c41e-788f-4acd-abec-dc455285972b",
"ActivityTime" : "2016-11-15T22:37:02.000-06:00"
},
{
"ActivityId" : "157af880-d47b-42fc-8ecf-ecfc1bbb56b1",
"ActivityTime" : "2016-09-01T22:50:05.000-06:00"
}
]
},
"Grade" : "2",
"GradeTag" : "GRADE_2", `
I am looking for aggregation query to get Total of ActivityIds. I tried various combination of $group, $unwind, $size $addToset but none of them seems to be working . I need to find total activities using aggregation framework only. I don't want to go through each document using javascript or python to get the total counts. Is there any easy way around?
Thanks.We are on version 3.2.Finally below combination worked. ActivityScores was field to entity.SchoolYears in our Schema.Working Aggregation Pipeline for me.
db.studentcontentareadocument.aggregate(
[
{
$project: {
"SpringActivitiesPerDoc" : {
"$size" : "$entity.SchoolYears.ActivityScores.Spring"
},
"WinterActivitiesPerDoc" : {
"$size" : "$entity.SchoolYears.ActivityScores.Winter"
},
"FallActivitiesPerDoc" : {
"$size" : "$entity.SchoolYears.ActivityScores.Fall"
}
}
},
{
$project: {
"TotalActivitiesPerDoc" : {
"$add" : [
"$SpringActivitiesPerDoc",
"$WinterActivitiesPerDoc",
"$FallActivitiesPerDoc"
]
}
}
},
{
$group: {
"_id" : null,
"TotalActivities" : {
"$sum" : "$TotalActivitiesPerDoc"
}
}
},
{
$project: {
"_id" : 0,
"TotalSGPActivities" : "$TotalActivities"
}
}
],
{
cursor: {
batchSize: 50
},
allowDiskUse: true
}
);
I am learning MongoDB aggregation pipeline, i tried to get matched count from output of $unwind and $group. I am able to see the results for $unwind and $group. But I am not sure why I didn't get the matched count. Please help to get percentage field greater than 25.
Here's an example document:
{
"_id":ObjectId("599e9dbd8fbad926e712f902"),
"sample":"1",
"attribute":[
{
"functionName":"1",
"percentage":31.6
}
]
}
I tried this:
db.docs3.aggregate({
$unwind:'$attribute'
},
{
$group:{
_id:{
func:"$attribute.functionName",
percen:"$attribute.percentage"
}
}
})
And got this output:
{ "_id" : { "func" : "7", "percen" : 30 } }
{ "_id" : { "func" : "5", "percen" : 23.1 } }
{ "_id" : { "func" : "8", "percen" : 27.8 } }
{ "_id" : { "func" : "6", "percen" : 32.1 } }
{ "_id" : { "func" : "1", "percen" : 31.6 } }
{ "_id" : { "func" : "2", "percen" : 35 } }
{ "_id" : { "func" : "3", "percen" : 7.1 } }
{ "_id" : { "func" : "4", "percen" : 31.6 } }
I tried this:
db.docs3.aggregate({
$unwind:'$attribute'
},
{
$group:{
_id:{
func:"$attribute.functionName",
percen:"$attribute.percentage"
}
}
},
{
$match:{
"attribute.percentage":{
$gt:25
}
}
})
And I got an error.
I'm not sure whether you want a count of
Documents having at least one attribute with percentage > 25
Or
Attributes having percentage > 25
If you are interested in No. 1 then you do not need to unwind the attribute array in order to apply your match. The following will return a count of documents containing at least one attribute with percentage > 25:
db.getCollection('other').aggregate([
{ $match: { 'attribute.percentage': { $gt: 25 } } },
{ $group: { _id: null, count: { $sum: 1 } } }
])
If you are interested in No. 2 then the following will return a count of attributes with percentage > 25:
db.getCollection('other').aggregate([
{ $unwind: '$attribute' },
{ $match: { 'attribute.percentage': { $gt: 25 } } },
{ $group: { _id: null, count: { $sum: 1 } } }
])
For the following documents:
{
"_id" : ObjectId("599e9dbd8fbad926e712f902"),
"sample" : "1",
"attribute" : [
{
"functionName" : "1",
"percentage" : 31.6
}
]
}
{
"_id" : ObjectId("59a54104e7e9cc2109863beb"),
"sample" : "1",
"attribute" : [
{
"functionName" : "2",
"percentage" : 21.2
}
]
}
{
"_id" : ObjectId("59a542c4e7e9cc2109863c45"),
"sample" : "1",
"attribute" : [
{
"functionName" : "2",
"percentage" : 20.2
},
{
"functionName" : "2",
"percentage" : 28.2
},
{
"functionName" : "2",
"percentage" : 35.2
}
]
}
The first command returns count=2, the second command returns count=3.
Use the money sign to get the current value, and use the same field name you used in your $group stage
db.docs3.aggregate({
$unwind:'$attribute'
},
{
$group:{
_id:{
func:"$attribute.functionName",
percen:"$attribute.percentage"
}
}
},
{
$match:{
"$_id.percen":{
$gt:25
}
}
})
I'm attempting to write a query to return the top X terms across each category - e.g. top 5, top 10 etc. Each term has an associated category, and based up on some help from another stackoverflow question I've managed to get this:
db.collection.aggregate([
{
$group : {
_id : {
category: "$uri.category",
term: "$uri.term",
},
total: { $sum : 1 }
}
},
{ $sort : { total : -1 } },
{
$group : {
_id : "$_id.category",
terms: {
$push: {
term: "$_id.term",
total: "$total"
}
}
}
}
]);
The above query does work, and returns data that looks something like this:
[
{ category: "movies",
terms: [ { term: "movie 1", total: 5000 }, { term: "movie 2", total: 200 } ... ]
},
{ category: "sports",
terms: [ { term: "football 1", total: 4000 }, { term: "tennis 2", total: 250 } ... ]
},
]
However I'm trying to limit the terms array to a fixed number i.e. 5 or 10 - this will correspond to the X number of searches per category. I've been trying various options such as adding $slice within the $push to reduce the terms array down with no success.
Can this be achieved using the aggregate framework, or should I look at another approach?
As of MongoDb version 3.1.6 you can now slice on the $project stage:
{
$project: {
terms: {
$slice: ["$terms", 0, 10]
}
}
}
If you wanted to limit the number of items $pushed to 10.
Here's the issue:
https://jira.mongodb.org/browse/SERVER-6074
It seems as of Mongodb 2.6, the ability to limit the size of an array using $slice or $push with the .aggregate() function/command is unsupported.
Here's the feature request on the MongoDb issue tracker.
What I would do is output the aggregated result to an collection. Then update the collection.
Example:
Setup:
use test;
var rInt = function(x) {
return 1 + ~~(Math.random() * x);
};
var rObj = function() {
return {
"timestamp": new Date(),
"category": "movies" + rInt(5),
"term": "my movie" + rInt(20)
}
};
for (var i = 0, l = 100; i < l; i++) {
db.al.insert(rObj());
}
Aggregate query
db.al_out.drop();
db.al.aggregate([
{
$group : {
_id : {
category: "$category",
term: "$term",
},
total: { $sum : 1 }
}
},
{ $sort : { total : -1 } },
{
$group : {
_id : "$_id.category",
terms: {
$push: {
term: "$_id.term",
total: "$total"
}
}
}
}
,{ $out : "al_out" } // output the documents to `db.al_out`
]);
// limit the size of terms to 3 elements.
db.al_out.update( {}, {
$push : {
terms : {
$each : [],
$slice : 3
}
}
}, {
multi:true
});
Result:
db.al_out.find();
{ "_id" : "movies1", "terms" : [ { "term" : "my movie7", "total" : 3 }, { "term" : "my movie6", "total" : 3 }, { "term" : "my movie17", "total" : 2 } ] }
{ "_id" : "movies2", "terms" : [ { "term" : "my movie3", "total" : 4 }, { "term" : "my movie11", "total" : 2 }, { "term" : "my movie2", "total" : 2 } ] }
{ "_id" : "movies4", "terms" : [ { "term" : "my movie9", "total" : 3 }, { "term" : "my movie1", "total" : 3 }, { "term" : "my movie7", "total" : 2 } ] }
{ "_id" : "movies3", "terms" : [ { "term" : "my movie19", "total" : 5 }, { "term" : "my movie8", "total" : 4 }, { "term" : "my movie14", "total" : 4 } ] }
{ "_id" : "movies5", "terms" : [ { "term" : "my movie7", "total" : 6 }, { "term" : "my movie17", "total" : 4 }, { "term" : "my movie3", "total" : 2 } ] }
I would add a $limit stage after the $sort and before the $group:
{ $limit : 5 },
This should limit the number of documents that are then being pushed into the array to 5. This will also serve to limit the total number of documents maintained in memory in the sort, which should improve overall performance:
When a $sort immediately precedes a $limit in the pipeline, the $sort
operation only maintains the top n results as it progresses, where n
is the specified limit, and MongoDB only needs to store n items in
memory.
http://docs.mongodb.org/manual/reference/operator/aggregation/limit/