mongodb count number of documents for every category - mongodb

My collection looks like this:
{
"_id":ObjectId("5744b6cd9c408cea15964d18"),
"uuid":"bbde4bba-062b-4024-9bb0-8b12656afa7e",
"version":1,
"categories":["sport"]
},
{
"_id":ObjectId("5745d2bab047379469e10e27"),
"uuid":"bbde4bba-062b-4024-9bb0-8b12656afa7e",
"version":2,
"categories":["sport", "shopping"]
},
{
"_id":ObjectId("5744b6359c408cea15964d15"),
"uuid":"561c3705-ba6d-432b-98fb-254483fcbefa",
"version":1,
"categories":["politics"]
}
I want to count the number of documents for every category. To do this, I unwind the categories array:
db.collection.aggregate(
{$unwind: '$categories'},
{$group: {_id: '$categories', count: {$sum: 1}} }
)
Result:
{ "_id" : "sport", "count" : 2 }
{ "_id" : "shopping", "count" : 1 }
{ "_id" : "politics", "count" : 1 }
Now I want to count the number of documents for every category, but where document version is the latest version.
This is where I am stuck.

It's ugly but I think this gives you what you're after:
db.collection.aggregate(
{ $unwind : "$categories" },
{ $group :
{ "_id" : { "uuid" : "$uuid" },
"doc" : { $push : { "version" : "$version", "category" : "$categories" } },
"maxVersion" : { $max : "$version" }
}
},
{ $unwind : "$doc" },
{ $project : { "_id" : 0, "uuid" : "$id.uuid", "category" : "$doc.category", "isCurrentVersion" : { $eq : [ "$doc.version", "$maxVersion" ] } } },
{ $match : { "isCurrentVersion" : true }},
{ $group : { "_id" : "$category", "count" : { $sum : 1 } } }
)

You can do this by first grouping the denormalized documents (from the $unwind operator step) by two keys, i.e. the categories and version fields. This is necessary for the preceding pipeline step which orders the grouped documents and their accumulated counts by the version (desc) and categories (asc) keys respectively using the $sort operator.
Another grouping will be required to get the top documents in each categories group after ordering using the $first operator. The following shows this
db.collection.aggregate(
{ "$unwind": "$categories" },
{
"$group": {
"_id": {
'categories': '$categories',
'version': '$version'
},
"count": { "$sum": 1 }
}
},
{ "$sort": { "_id.version": -1, "_id.categories": 1 } },
{
"$group": {
"_id": "$_id.categories",
"count": { "$first": "$count" },
"version": { "$first": "$_id.version" }
}
}
)
Sample Output
{ "_id" : "shopping", "count" : 1, "version" : 2 }
{ "_id" : "sport", "count" : 1, "version" : 2 }
{ "_id" : "politics", "count" : 1, "version" : 1 }

Related

MongoDB - Group by and count value, but treat per record as one

I want to group by and count follow_user.tags.tag_id per record, so no matter how many times the same tag_id show up on the same record, it only counts as 1.
My database structure looks like this:
{
"external_userid" : "EXID1",
"follow_user" : [
{
"userid" : "USERID1",
"tags" : [
{
"tag_id" : "TAG1"
}
]
},
{
"userid" : "USERID2",
"tags" : [
{
"tag_id" : "TAG1"
},
{
"tag_id" : "TAG2"
}
]
}
]
},
{
"external_userid" : "EXID2",
"follow_user" : [
{
"userid" : "USERID1",
"tags" : [
{
"tag_id" : "TAG2"
}
]
}
]
}
Here's my query:
[
{ "$unwind": "$follow_user" }, { "$unwind": "$follow_user.tags" },
{ "$group" : { "_id" : { "follow_user᎐tags᎐tag_id" : "$follow_user.tags.tag_id" }, "COUNT(_id)" : { "$sum" : 1 } } },
{ "$project" : { "total" : "$COUNT(_id)", "tagId" : "$_id.follow_user᎐tags᎐tag_id", "_id" : 0 } }
]
What I expected:
{
"total" : 1,
"tagId" : "TAG1"
},
{
"total" : 2,
"tagId" : "TAG2"
}
What I get:
{
"total" : 2,
"tagId" : "TAG1"
},
{
"total" : 2,
"tagId" : "TAG2"
}
$set - Create a new field follow_user_tags.
1.1. $setUnion - To distinct the value from the Result 1.1.1.
1.1.1. $reduce - Add the value of follow_user.tags.tag_id into array.
$unwind - Deconstruct follow_user_tags array field to multiple documents.
$group - Group by follow_user_tags and perform total count via $sum.
$project - Decorate output document.
db.collection.aggregate([
{
$set: {
follow_user_tags: {
$setUnion: {
"$reduce": {
"input": "$follow_user.tags",
"initialValue": [],
"in": {
"$concatArrays": [
"$$value",
"$$this.tag_id"
]
}
}
}
}
}
},
{
$unwind: "$follow_user_tags"
},
{
$group: {
_id: "$follow_user_tags",
total: {
$sum: 1
}
}
},
{
$project: {
_id: 0,
tagId: "$_id",
total: 1
}
}
])
Sample Mongo Playground

MongoDB get user which are new today

I am trying to find a user list which is new for day-1. I have written the query to find the users who arrived till the day before yesterday and the list of users arrived yesterday. Now I want minus those data how can I do that in a single aggregate function.
Function to get the list before yesterday
db.chat_question_logs.aggregate([
{
$match : {"createdDate":{$lte: ISODate("2020-04-29T00:00:00Z")}}
},
{
"$project" :
{
_id : 0,
"userInfo.userId":1
}
},
{
"$group": {
"_id": {userId:"$userInfo.userId"},"count": {$sum : 1}}
}
])
similarly for the day-1 is as below
db.chat_question_logs.aggregate([
{
$match : {"createdDate":{$gte: ISODate("2020-04-30T00:00:00Z"),$lte: ISODate("2020-05-01T00:00:00Z")}}
},
{
"$project" :
{
_id : 0,
"userInfo.userId":1
}
},
{
"$group": {
"_id": {userId:"$userInfo.userId"},"count": {$sum : 1}}
}
])
Result JSON are as below
/* 1 */
{
"_id" : {
"userId" : "2350202241750776"
},
"count" : 1
},
/* 2 */
{
"_id" : {
"userId" : "26291570771793121"
},
"count" : 1
},
/* 3 */
{
"_id" : {
"userId" : "2742872209107866"
},
"count" : 5
},
/* 4 */
{
"_id" : {
"userId" : "23502022417507761212"
},
"count" : 1
},
/* 5 */
{
"_id" : {
"userId" : "2629157077179312"
},
"count" : 43
}
How can I find the difference.
It sounds like what you want is to get all users created yesterday (which is the 28th in this example).
db.chat_question_logs.aggregate([
{
$match : { $and: [
{ "createdDate":{$lt: ISODate("2020-04-29T00:00:00Z")} },
{ "createdDate": {$gte: ISODate("2020-04-28T00:00:00Z") }}
] }
},
{
"$project" :
{
_id : 0,
"userInfo.userId":1
}
},
{
"$group": {
"_id": {userId:"$userInfo.userId"},"count": {$sum : 1}}
}
])
Is this what you want?
Hi found the solution which is below
I used the group and first appearance of the Id and then filter record on date which I wanted.The query is as below
db.chat_question_logs.aggregate([
{
$group:
{
_id: "$userInfo.userId",
firstApprance: { $first: "$createdDate" }
}
},
{
$match : { "firstApprance": { $gte: new ISODate("2020-05-03"), $lt: new ISODate("2020-05-05") } }
}
])

Mongodb aggregate by day and delete duplicate value

I'm trying to clean a huge database.
Sample DB :
{
"_id" : ObjectId("59fc5249d5ab401d99f3de7f"),
"addedAt" : ISODate("2017-11-03T11:26:01.744Z"),
"__v" : 0,
"check" : 17602,
"lastCheck" : ISODate("2018-04-05T11:47:00.609Z"),
"tracking" : [
{
"timeCheck" : ISODate("2017-11-06T13:17:20.861Z"),
"_id" : ObjectId("5a0060e00f3c330012bafe39"),
"rank" : 2395,
},
{
"timeCheck" : ISODate("2017-11-06T13:22:31.254Z"),
"_id" : ObjectId("5a0062170f3c330012bafe77"),
"rank" : 2395,
},
{
"timeCheck" : ISODate("2017-11-06T13:27:40.551Z"),
"_id" : ObjectId("5a00634c0f3c330012bafebe"),
"rank" : 2379,
},
{
"timeCheck" : ISODate("2017-11-06T13:32:41.084Z"),
"_id" : ObjectId("5a0064790f3c330012baff03"),
"rank" : 2395,
},
{
"timeCheck" : ISODate("2017-11-06T13:37:51.012Z"),
"_id" : ObjectId("5a0065af0f3c330012baff32"),
"rank" : 2379,
},
{
"timeCheck" : ISODate("2017-11-07T13:37:51.012Z"),
"_id" : ObjectId("5a0065af0f3c330012baff34"),
"rank" : 2379,
}]
}
I have a lot of duplicate value but I need to clean only by day.
To obtain this for example :
{
"_id" : ObjectId("59fc5249d5ab401d99f3de7f"),
"addedAt" : ISODate("2017-11-03T11:26:01.744Z"),
"__v" : 0,
"check" : 17602,
"lastCheck" : ISODate("2018-04-05T11:47:00.609Z"),
"tracking" : [
{
"timeCheck" : ISODate("2017-11-06T13:17:20.861Z"),
"_id" : ObjectId("5a0060e00f3c330012bafe39"),
"rank" : 2395,
},
{
"timeCheck" : ISODate("2017-11-06T13:27:40.551Z"),
"_id" : ObjectId("5a00634c0f3c330012bafebe"),
"rank" : 2379,
},
{
"timeCheck" : ISODate("2017-11-07T13:37:51.012Z"),
"_id" : ObjectId("5a0065af0f3c330012baff34"),
"rank" : 2379,
}]
}
How can I aggregate by day and after delete last value duplicate?
I need to keep the values per day even if they are identical with another day.
The aggregation framework cannot update data at this stage. However, you can use the following aggregation pipeline in order to get the desired output and then use e.g. a bulk replace to update all your documents:
db.collection.aggregate({
$unwind: "$tracking" // flatten the "tracking" array into separate documents
}, {
$sort: {
"tracking.timeCheck": 1 // sort by timeCheck to allow us to use the $first operator in the next stage reliably
}
}, {
$group: {
_id: { // group by
"_id": "$_id", // "_id" and
"rank": "$tracking.rank", // "rank" and
"date": { // the "date" part of the "timeCheck" field
$dateFromParts : {
year: { $year: "$tracking.timeCheck" },
month: { $month: "$tracking.timeCheck" },
day: { $dayOfWeek: "$tracking.timeCheck" }
}
}
},
"doc": { $first: "$$ROOT" } // only keep the first document per group
}
}, {
$sort: {
"doc.tracking.timeCheck": 1 // restore ascending sort order - may or may not be needed...
}
}, {
$group: {
_id: "$_id._id", // merge everything again per "_id"
"addedAt": { $first: "$doc.addedAt" },
"__v": { $first: "$doc.__v" },
"check": { $first: "$doc.check" },
"lastCheck": { $first: "$doc.lastCheck" },
"tracking": { $push: "$doc.tracking" } // in order to join the tracking values into an array again
}
})

MongoDB $sum and $avg of sub documents

I need to get $sum and $avg of subdocuments, i would like to get $sum and $avg of Channels[0].. and other channels as well.
my data structure looks like this
{
_id : ... Location : 1,
Channels : [
{ _id: ...,
Value: 25
},
{
_id: ... ,
Value: 39
},
{
_id: ..,
Value: 12
}
]
}
In order to get the sum and average of the Channels.Value elements for each document in your collection you will need to use mongodb's Aggregation processing. Further, since Channels is an array you will need to use the $unwind operator to deconstruct the array.
Assuming that your collection is called example, here's how you could get both the document sum and average of the Channels.Values:
db.example.aggregate( [
{
"$unwind" : "$Channels"
},
{
"$group" : {
"_id" : "$_id",
"documentSum" : { "$sum" : "$Channels.Value" },
"documentAvg" : { "$avg" : "$Channels.Value" }
}
}
] )
The output from your post's data would be:
{
"_id" : SomeObjectIdValue,
"documentSum" : 76,
"documentAvg" : 25.333333333333332
}
If you have more than one document in your collection then you will see a result row for each document containing a Channels array.
Solution 1: Using two groups based this example:
previous question
db.records.aggregate(
[
{ $unwind: "$Channels" },
{ $group: {
_id: {
"loc" : "$Location",
"cId" : "$Channels.Id"
},
"value" : {$sum : "$Channels.Value" },
"average" : {$avg : "$Channels.Value"},
"maximun" : {$max : "$Channels.Value"},
"minimum" : {$min : "$Channels.Value"}
}},
{ $group: {
_id : "$_id.loc",
"ChannelsSumary" : { $push :
{ "channelId" : '$_id.cId',
"value" :'$value',
"average" : '$average',
"maximun" : '$maximun',
"minimum" : '$minimum'
}}
}
}
]
)
Solution 2:
there is property i didn't show on my original question that might of help "Channels.Id" independent from "Channels._Id"
db.records.aggregate( [
{
"$unwind" : "$Channels"
},
{
"$group" : {
"_id" : "$Channels.Id",
"documentSum" : { "$sum" : "$Channels.Value" },
"documentAvg" : { "$avg" : "$Channels.Value" }
}
}
] )

Aggregate in MongoDB return more fields

I'm currently experimenting with MongoDB. Using the Twitters Streaming API I collected a bunch of tweets (seemed a good way to learn to use MongoDB's aggregation options).
I have the following query
db.twitter.aggregate([
{ $group : { _id : '$status.user.screen_name', count: { $sum : 1 } } },
{ $sort : { count : -1, _id : 1 } },
{ $skip : 0 },
{ $limit : 5 },
]);
As expected this is te result:
{
"result" : [
{
"_id" : "VacaturesBreda",
"count" : 5
},
{
"_id" : "breda_nws",
"count" : 3
},
{
"_id" : "BredaDichtbij",
"count" : 2
},
{
"_id" : "JobbirdUTITBaan",
"count" : 2
},
{
"_id" : "vacatures_nr1",
"count" : 2
}
],
"ok" : 1
}
The question is how can I match on the user id_str and return the screen_name and for example the followers_count of the user. I tried to do this with { $project .... } but I kept ending up with an empty result set.
For those not familiar with the user object in Twitters JSON response here a part of it (just selected the first user in the db).
"user" : {
"id" : 2678963916,
"id_str" : "2678963916",
"name" : "JobbirdUT IT Banen",
"screen_name" : "JobbirdUTITBaan",
"location" : "Utrecht",
"url" : "http://www.jobbird.com",
"description" : "Blijf op de hoogte van de nieuwste IT/Automatisering vacatures in Utrecht, via http://Jobbird.com",
"protected" : false,
"verified" : false,
"followers_count" : 1,
"friends_count" : 1,
"listed_count" : 0,
"favourites_count" : 0,
"statuses_count" : 311,
"created_at" : "Fri Jul 25 07:35:48 +0000 2014",
...
},
Update: As requested a clear example on the proposed response (sorry for not adding it).
So instead of grouping on the screen_name grouping on the id_str. Why you might ask, it is possible to edit your screen_name but you are still the same user for Twitter (so the last screen_name should be returned:
db.twitter.aggregate([
{ $group : { _id : '$status.user.id_str', count: { $sum : 1 } } },
{ $sort : { count : -1, _id : 1 } },
{ $skip : 0 },
{ $limit : 5 },
]);
And as the response something like this:
{
"result" : [
{
"_id" : "123456789",
"screen_name": "awsome_screen_name",
"followers_count": 523,
"count" : 5
},
....
],
"ok" : 1
}
You are basically looking for an operator that does not specifically "aggregate" the content, and this is basically what the $first and $last operators do:
db.twitter.aggregate([
{ "$group": {
"_id": "$status.user.id_str",
"screen_name": { "$first": "$status.user.screen_name" },
"followers_count": { "$sum": "$status.user.followers_count" },
"count": { "$sum": 1 }
}},
{ "$sort": { "followers_count": -1, "count": -1 } },
{ "$limit": 5 }
])
Which picks the "first" occurrence of the field based on the grouping key. That is generally useful where there is duplicated related data in the documents to the grouping key.
An alternate approach is to include the fields in the grouping key. You can later restructure with $project:
db.twitter.aggregate([
{ "$group": {
"_id": {
"_id": "$status.user.id_str",
"screen_name": "$status.user.screen_name"
},
"followers_count": { "$sum": "$status.user.followers_count" },
"count": { "$sum": 1 }
}},
{ "$project": {
"_id": "$_id._id",
"screen_name": "$_id.screen_name"
"followers_count": 1,
"count": 1
}},
{ "$sort": { "followers_count": -1, "count": -1 } },
{ "$limit": 5 }
])
Which is useful where you are unsure of the related "uniqueness".