MongoDB: group, and then counting different values - mongodb

I have a list of books. I would like to get the number of genres that the authors wrote, also I would like to add to the results which are those genres. My database looks like this:
{"_id": ObjectID("1), "title": "Harry Potter", "year": NumberInt(2000), "author": "JK. Rowling",
"genres": "Fantasy"},
"_id": ObjectID("2"), "title": "Harry Potter 99", "year": NumberInt(2020), "author": "JK. Rowling",
"genres": "Drama"}, "_id": ObjectID("2"), "title": "Harry Potter", "year": NumberInt(2000), "author": "JK. Rowling",
"genres": "Drama"}, {...}
So, my code so far looks like this:
phase1 = {$group : {"_id" : "$author"}, "countgenres" : {$sum : 1}}
phase2 = {$addFields : "genres"}}
phase3 = {$sort : {"numgenres" : -1}}
steps = [phase1, phase2, phase3]
db.database.aggregate(steps)
This is not working for me, so I would like if someone could help me to write a correct code to do this. The result should look like this:
{
"_id" : "JK. Rowling",
"countgenres" : 4,
"genres" : [
"War",
"Fantasy",
"Drama",
"Crime"
]
}
Thanks.

you cannot do that directly in the $group stage. Instead of that, you have to use an $addFields stage and use $reduce and $setUnion, to concat arrays without doubles.
Then you can just add a field with your new array size, and do the $sort.
db.collection.aggregate([
{
$group: {
_id: "$author",
genres: {
$push: "$genres"
}
}
},
{
$addFields: {
genres: {
"$reduce": {
"input": "$genres",
"initialValue": [],
"in": {
$setUnion: [
"$$value",
"$$this"
]
}
}
}
}
},
{
$addFields: {
countGenres: {
$size: "$genres"
}
}
}
])
You can test here

Try this query:
db.collection.aggregate([
{
"$match": {
"author": "JK. Rowling"
}
},
{
"$group": {
"_id": "$author",
"genres": {
"$addToSet": "$genres"
}
}
},
{
$addFields: {
genres: {
"$reduce": {
"input": "$genres",
"initialValue": [],
"in": {
$setUnion: [
"$$value",
"$$this"
]
}
}
}
}
},
{
"$project": {
"countgenres": {
"$size": "$genres"
},
"genres": 1
}
}
])
First stage is $match by the author.
Then group and I've used $addToSet to avoid repeated values.
After that, with $addFields values are merged.
After the group, the field genres is output like this:
"genres": [
[
"War",
"Fantasy",
"Drama",
"Crime"
],
[
"War",
"Fantasy",
"Drama"
]
]
And to merge is neccessary to do $setUnion.
And, the last step is count the array size and output the values you want.
Example here

Related

MongoDB Aggregate keep results

Is there a way to keep results from a match under a new field, and under another new field some computed value?
I'm trying to extract a set of genres from a collection of movies, and also keep original results...
Document example:
{
"_id": "62e97ba6ec445b864fc3bc39",
"id": 19913,
"genres": [
"Comedy",
"Drama",
"Romance"
],
"imdb_id": "tt1022603",
"overview": "Tom, greeting-card writer and hopeless romantic...",
"title": "(500) Days of Summer",
"release_date": "2009-07-17",
}
Desired output:
{
result: [
... movies
]
categories: [
"Comedy",
"Drama",
"Romance"
]
}
What I have so far:
use('the_base');
function matchGenre(genre) {
return {
"$match": {
"genres": genre,
}
};
}
function limit(num) {
return {
"$limit": num
};
}
db.movie.aggregate([
matchGenre("Drama"),
limit(5),
{"$unwind": "$genres"},
{"$group": {
"_id": 0,
"gens": { "$addToSet": "$genres" }
}}
]);
My current result:
{
"_id": 0,
"gens": [
"Romance",
"Comedy",
"Thriller",
"Science Fiction",
"Fantasy",
"Drama",
"Crime",
"Action",
"Mystery",
"Adventure",
"Horror"
]
}
I would generally use facets.
Here's an example: https://mongoplayground.net/p/PbORyp4JaF5
db.collection.aggregate([
{
$facet: {
results: [
{
$match: {}
}
],
categories: [
{
$unwind: "$genres"
},
{
$sortByCount: "$genres"
}
],
release_date: [
{
$unwind: "$release_date"
},
{
$sortByCount: "$release_date"
}
]
}
}
])
I have taken the liberty to add an additional facet of release_date, and also ensure that there is a count present in each of the facets, as this is often helpful and required.

MongoDB: Get all $matched elements individually from an array

I'm trying to get all matched elements individually, here is the sample data and the query.
// json
[
{
"name": "Mr Cool",
"ican": [
{
"subcategory": [
{
"id": "5bffdba824488b182ec86f8d", "name": "Cricket"
},
{
"id": "5bffdba824488b182ec86f8c", "name": "Footbal"
}
],
"category": "5bffdba824488b182ec86f88",
"name": "Sports"
}
]
}
]
// query
db.collection.aggregate([
{
"$match": {
"ican.subcategory.name": { $in: ["Cricket","Football"] }
}
},
{
"$project": { "_id": 1, "name": 1, }
}
])
I'm getting the combined result, I need the individual match record. I tried $all and $elementMatch but getting the same response. how can I get the results as below. I'm using $aggregate because I will be using $geoNear pipeline for getting the nearby users.
// current result
[
{
"_id": ObjectId("5a934e000102030405000000"),
"name": "Mr Cool"
}
]
// expected result
[
{
"_id": ObjectId("5a934e000102030405000000"),
"name": "Mr Cool",
"subcategory: "Cricket"
},
{
"_id": ObjectId("5a934e000102030405000000"),
"name": "Mr Cool",
"subcategory: "Footbal"
}
]
Thank you
Try this Mongo Playground
db.col.aggregate([
{"$unwind" : "$ican"},
{"$unwind" : "$ican.subcategory"},
{"$match" : {"ican.subcategory.name": { "$in": ["Cricket","Football"] }}},
{"$group" : {"_id" : null,"data" : {"$push" : {"_id" : "$_id","name" : "$name","subcategory" : "$ican.subcategory.name"}}}},
{"$unwind" : "$data"},
{"$replaceRoot" : {"newRoot" : "$data"}}
])
You can use below aggregation without the $unwind and for better performance
db.collection.aggregate([
{ "$match": { "ican.subcategory.name": { "$in": ["Cricket","Football"] }}},
{ "$project": {
"ican": {
"$reduce": {
"input": "$ican",
"initialValue": [],
"in": {
"$concatArrays": [
{ "$filter": {
"input": {
"$map": {
"input": "$$this.subcategory",
"as": "s",
"in": { "name": "$name", "subcategory": "$$s.name" }
}
},
"as": "fil",
"cond": { "$in": ["$$fil.subcategory", ["Football"]] }
}},
"$$value"
]
}
}
}
}},
{ "$unwind": "$ican" },
{ "$replaceRoot": { "newRoot": "$ican" }}
])

How to get count of multiple fields based on value in mongodb?

Collection exists as below:
[
{"currentLocation": "Chennai", "baseLocation": "Bengaluru"},
{"currentLocation": "Chennai", "baseLocation": "Bengaluru"},
{"currentLocation": "Delhi", "baseLocation": "Bengaluru"},
{"currentLocation": "Chennai", "baseLocation": "Chennai"}
]
Expected Output:
[
{"city": "Chennai", "currentLocationCount": 3, "baseLocationCount": 1},
{"city": "Bengaluru", "currentLocationCount": 0, "baseLocationCount": 3},
{"city": "Delhi", "currentLocationCount": 1, "baseLocationCount": 0}
]
What I have tried is:
db.getCollection('users').aggregate([{
$group: {
"_id": "$baselocation",
baseLocationCount: {
$sum: 1
}
},
}, {
$project: {
"_id": 0,
"city": "$_id",
"baseLocationCount": 1
}
}])
Got result as:
[
{"city": "Chennai", "baseLocationCount": 1},
{"city": "Bengaluru", "baseLocationCount": "3"}
]
I'm not familiar with mongo, so any help?
MongoDB Version - 3.4
Neil Lunn and myself had a lovely argument over this topic the other day which you can read all about here: Group by day with Multiple Date Fields.
Here are two solutions to your precise problem.
The first one uses the $facet stage. Bear in mind, though, that it may not be suitable for large collections because $facet produces a single (potentially huge) document that might be bigger than the current MongoDB document size limit of 16MB (which only applies to the result document and wouldn't be a problem during pipeline processing anyway):
collection.aggregate(
{
$facet:
{
"current":
[
{
$group:
{
"_id": "$currentLocation",
"currentLocationCount": { $sum: 1 }
}
}
],
"base":
[
{
$group:
{
"_id": "$baseLocation",
"baseLocationCount": { $sum: 1 }
}
}
]
}
},
{ $project: { "result": { $setUnion: [ "$current", "$base" ] } } }, // merge results into new array
{ $unwind: "$result" }, // unwind array into individual documents
{ $replaceRoot: { newRoot: "$result" } }, // get rid of the additional field level
{ $group: { "_id": "$_id", "currentLocationCount": { $sum: "$currentLocationCount" }, "baseLocationCount": { $sum: "$baseLocationCount" } } }, // group into final result)
{ $project: { "_id": 0, "city": "$_id", "currentLocationCount": 1, "baseLocationCount": 1 } } // group into final result
)
The second one works based on the $map stage instead:
collection.aggregate(
{
"$project": {
"city": {
"$map": {
"input": [ "current", "base" ],
"as": "type",
"in": {
"type": "$$type",
"name": {
"$cond": {
"if": { "$eq": [ "$$type", "current" ] },
"then": "$currentLocation",
"else": "$baseLocation"
}
}
}
}
}
}
},
{ "$unwind": "$city" },
{
"$group": {
"_id": "$city.name",
"currentLocationCount": {
"$sum": {
"$cond": {
"if": { "$eq": [ "$city.type", "current" ] },
"then": 1,
"else": 0
}
}
},
"baseLocationCount": {
"$sum": {
"$cond": {
"if": { "$eq": [ "$city.type", "base" ] },
"then": 1,
"else": 0
}
}
}
}
}
)

MongoDB aggregate/grouping by key-value pairs

My data looks something like this:
{
"_id" : "9aa072e4-b706-47e6-9607-1a39e904a05a",
"customerId" : "2164289-4",
"channelStatuses" : {
"FOO" : {
"status" : "done"
},
"BAR" : {
"status" : "error"
}
},
"channel" : "BAR",
}
My aggregate/group looks like this:
{
"_id" : {
"customerId" : "$customerId",
"channel" : "$channel",
"status" : "$channelStatuses[$channel].status"
},
"count" : {
"$sum" : 1
}
}
So basically with the example data the group should give me a group grouped by:
{"customerId": "2164289-4", "channel": "BAR", "status": "error"}
But I cannot use []-indexing in a aggregate/group. What should I do instead?
You cannot get the result you want with the current structure using .aggregate(). You "could" change the structure to use an array rather than named keys, and the operation is actually quite simple.
So with a document like:
{
"_id" : "9aa072e4-b706-47e6-9607-1a39e904a05a",
"customerId" : "2164289-4",
"channelStatuses" : [
{
"channel": "FOO",
"status" : "done"
},
{
"channel": "BAR",
"status" : "error"
}
],
"channel" : "BAR",
}
You can then do in modern releases with $filter, $map and $arrayElemAt:
{ "$group": {
"_id": {
"customerId" : "$customerId",
"channel" : "$channel",
"status": {
"$arrayElemAt": [
{ "$map": {
"input": { "$filter": {
"input": "$chanelStatuses",
"as": "el",
"cond": { "$eq": [ "$$el.channel", "$channel" ] }
}},
"as": "el",
"in": "$$el.status"
}},
0
]
}
},
"count": { "$sum": 1 }
}}
Older versions of MongoDB are going to going to require $unwind to access the matched array element.
In MongoDB 2.6 then you can still "pre-filter" the array before unwind:
[
{ "$project": {
"customerId": 1,
"channel": 1,
"status": {
"$setDifference": [
{ "$map": {
"input": "$channelStatuses",
"as": "el",
"in": {
"$cond": [
{ "$eq": [ "$$el.channel", "$channel" ] },
"$$el.status",
false
]
}
}},
[false]
]
}
}},
{ "$unwind": "$status" },
{ "$group": {
"_id": {
"customerId": "$customerId",
"channel": "$channel",
"status": "$status"
},
"count": { "$sum": 1 }
}}
]
And anything prior to that you "filter" after $unwind instead:
[
{ "$unwind": "$channelStatuses" },
{ "$project": {
"customerId": 1,
"channel": 1,
"status": "$channelStatuses.status",
"same": { "$eq": [ "$channelStatuses.status", "$channel" ] }
}},
{ "$match": { "same": true } },
{ "$group": {
"_id": "$_id",
"customerId": { "$first": "$customerId" },
"channel": { "$first": "$channel" },
"status": { "$first": "$status" }
}},
{ "$group": {
"_id": {
"customerId": "$customerId",
"channel": "$channel",
"status": "$status"
},
"count": { "$sum": 1 }
}}
]
In a lesser version than MongoDB 2.6 you also need to $project the result of the equality test between the two fields and then $match on the result in a seperate stage. You might also note the "two" $group stages, since the first one removes any possible duplicates of the "channel" values after the filter via the $first accumulators. The following $group is exactly the same as in the previous listing.
But if you cannot change the structure and need "flexible" matching of keys where you cannot supply every name, then you must use mapReduce:
db.collection.mapReduce(
function() {
emit({
"customerId": this.customerId,
"channel": this.channel,
"status": this.channelStatuses[this.channel].status
},1);
},
function(key,values) {
return Array.sum(values);
},
{ "out": { "inline": 1 } }
)
Where of course you can use that sort of notation

MongoDB nested query using aggregate function

I have a collection "superpack", which has the nested objects. The sample document looks like below.
{
"_id" : ObjectId("56038c8cca689261baca93eb"),
"name": "Test sub",
"packs": [
{
"id": "55fbc7f6b0ce97a309b3cead",
"name": "Classic",
"packDispVal": "PACK",
"billingPts": [
{
"id": "55fbc7f6b0ce97a309b3ceab",
"name": "Classic 1 month",
"expiryVal": 1,
"amount": 20,
"topUps": [
{
"id": "55fbc7f6b0ce97a309b3cea9",
"name": "1 extra",
"amount": 8
},
{
"id": "55fbc7f6b0ce97a309b3ceaa",
"name": "2 extra",
"amount": 12
}
]
},
{
"id": "55fbc7f6b0ce97a309b3ceac",
"name": "Classic 2 month",
"expiryVal": 1,
"amount": 30,
"topUps": [
{
"id": "55fbc7f6b0ce97a309b3cea8",
"name": "3 extra",
"amount": 16
}
]
}
]
}
]
}
I need to query for the nested object topups with the id field and result should have only the selected topup object and its associated parent. I am expecting the output to like below, when i query it on topup id 55fbc7f6b0ce97a309b3cea9.
{
"_id" : ObjectId("56038c8cca689261baca93eb"),
"name": "Test sub",
"packs": [
{
"id": "55fbc7f6b0ce97a309b3cead",
"name": "Classic",
"packDispVal": "PACK",
"billingPts": [
{
"id": "55fbc7f6b0ce97a309b3ceab",
"name": "Classic 1 month",
"expiryVal": 1,
"amount": 20,
"topUps": [
{
"id": "55fbc7f6b0ce97a309b3cea9",
"name": "1 extra",
"amount": 8
}
]
}
]
}
]
}
I tried with the below aggregate query for the same. However its not returning any result. Can you please help me, what is wrong in the query?
db.superpack.aggregate( [{ $match: { "id": "55fbc7f6b0ce97a309b3cea9" } }, { $redact: {$cond: { if: { $eq: [ "$id", "55fbc7f6b0ce97a309b3cea9" ] }, "then": "$$KEEP", else: "$$PRUNE" }}} ])
Unfortunately $redact is not a viable option here based on the fact that with the recursive $$DESCEND it is basically looking for a field called "id" at all levels of the document. You cannot possibly ask to do this only at a specific level of embedding as it's all or nothing.
This means you need alternate methods of filtering the content rather than $redact. All "id" values are unique so their is no problem filtering via "set" operations.
So the most efficient way to do this is via the following:
db.docs.aggregate([
{ "$match": {
"packs.billingPts.topUps.id": "55fbc7f6b0ce97a309b3cea9"
}},
{ "$project": {
"packs": {
"$setDifference": [
{ "$map": {
"input": "$packs",
"as": "pack",
"in": {
"$let": {
"vars": {
"billingPts": {
"$setDifference": [
{ "$map": {
"input": "$$pack.billingPts",
"as": "billing",
"in": {
"$let": {
"vars": {
"topUps": {
"$setDifference": [
{ "$map": {
"input": "$$billing.topUps",
"as": "topUp",
"in": {
"$cond": [
{ "$eq": [ "$$topUp.id", "55fbc7f6b0ce97a309b3cea9" ] },
"$$topUp",
false
]
}
}},
[false]
]
}
},
"in": {
"$cond": [
{ "$ne": [{ "$size": "$$topUps"}, 0] },
{
"id": "$$billing.id",
"name": "$$billing.name",
"expiryVal": "$$billing.expiryVal",
"amount": "$$billing.amount",
"topUps": "$$topUps"
},
false
]
}
}
}
}},
[false]
]
}
},
"in": {
"$cond": [
{ "$ne": [{ "$size": "$$billingPts"}, 0 ] },
{
"id": "$$pack.id",
"name": "$$pack.name",
"packDispVal": "$$pack.packDispVal",
"billingPts": "$$billingPts"
},
false
]
}
}
}
}},
[false]
]
}
}}
])
Where after digging down to the innermost array that is being filtered, that then the size of each resulting array going outwards is tested to see if it is zero, and omitted from results where it is.
It's a long listing but it is the most efficient way since each array is filtered down first and within each document.
A not so efficient way is to pull apart with $unwind and the $group back the results:
db.docs.aggregate([
{ "$match": {
"packs.billingPts.topUps.id": "55fbc7f6b0ce97a309b3cea9"
}},
{ "$unwind": "$packs" },
{ "$unwind": "$packs.billingPts" },
{ "$unwind": "$packs.billingPts.topUps"},
{ "$match": {
"packs.billingPts.topUps.id": "55fbc7f6b0ce97a309b3cea9"
}},
{ "$group": {
"_id": {
"_id": "$_id",
"packs": {
"id": "$packs.id",
"name": "$packs.name",
"packDispVal": "$packs.packDispVal",
"billingPts": {
"id": "$packs.billingPts.id",
"name": "$packs.billingPts.name",
"expiryVal": "$packs.billingPts.expiryVal",
"amount": "$packs.billingPts.amount"
}
}
},
"topUps": { "$push": "$packs.billingPts.topUps" }
}},
{ "$group": {
"_id": {
"_id": "$_id._id",
"packs": {
"id": "$_id.packs.id",
"name": "$_id.packs.name",
"packDispVal": "$_id.packs.packDispVal"
}
},
"billingPts": {
"$push": {
"id": "$_id.packs.billingPts.id",
"name": "$_id.packs.billingPts.name",
"expiryVal": "$_id.packs.billingPts.expiryVal",
"amount": "$_id.packs.billingPts.amount",
"topUps": "$topUps"
}
}
}},
{ "$group": {
"_id": "$_id._id",
"packs": {
"$push": {
"id": "$_id.packs.id",
"name": "$_id.packs.name",
"packDispVal": "$_id.packs.packDispVal",
"billingPts": "$billingPts"
}
}
}}
])
The listing looks a lot more simple but of course there is a lot of overhead introduced by $unwind here. The process of grouping back is basically keeping a copy of everything outside of the current array level being reconstructed, and then push that content back into the array in the next stage, until you get back to the root _id.
Please note that unless you intend such a search to match more than one document or if you are going to have significant gains from reduced network traffic by effectively reducing down the response size from a very large document, then it would be advised to do neither of these but follow much of the same design as the first pipeline example but in client code.
Whilst the first example would be still okay performance wise, it's still a mouthful to send to the server and as a general listing, that is typically written with the same operations in a cleaner way in client code to process and filter the resulting structure.
{
"_id" : ObjectId("56038c8cca689261baca93eb"),
"packs" : [
{
"id" : "55fbc7f6b0ce97a309b3cead",
"name" : "Classic",
"packDispVal" : "PACK",
"billingPts" : [
{
"id" : "55fbc7f6b0ce97a309b3ceab",
"name" : "Classic 1 month",
"expiryVal" : 1,
"amount" : 20,
"topUps" : [
{
"id" : "55fbc7f6b0ce97a309b3cea9",
"name" : "1 extra",
"amount" : 8
}
]
}
]
}
]
}