Group by subdocument field using aggregation framework - mongodb

The structure is the following:
{
"_id" : "79f00e2f-5ff6-42e9-a341-3d50410168de",
"bookings" : [
{
"name" : "name1",
"email" : "george_bush#gov.us",
"startDate" : ISODate("2013-12-31T22:00:00Z"),
"endDate" : ISODate("2014-01-09T22:00:00Z")
},
{
"name" : "name2",
"email" : "george_bush#gov.us",
"startDate" : ISODate("2014-01-19T22:00:00Z"),
"endDate" : ISODate("2014-01-24T22:00:00Z")
}
],
"name" : "Hotel0",
"price" : 0,
"rating" : 2
}
Now, I want to generate a report telling me how many bookings were made, grouped by booking month (assume that only the booking start date matters) and also grouped by the hotel's rating.
I expect the answer to be like that:
{
{
rating: 0,
counts: {
month1: 10,
month2: 20,
...
month12: 7
}
}
{
rating: 1,
counts: {
month1: 5,
month2: 8,
...
month12: 9
}
}
...
{
rating: 6,
counts: {
month1: 22,
month2: 23,
...
month12: 24
}
}
}
I tried this with aggregation framework but I'm a little bit stuck.

The following query:
// Counts bookings for every (hotel rating, booking start month) pair.
db.book.aggregate([
// Emit one document per entry of the bookings array so each booking is counted separately.
{ $unwind: '$bookings' },
// Keep the rating and derive the month number from the booking's start date.
{ $project: { bookings: 1, rating: 1, month: { $month: '$bookings.startDate' } } },
// Tally the bookings that share the same rating and month.
{ $group: { _id: { rating: '$rating', month: '$month' }, count: { $sum: 1 } } }
]);
This gives you the count per rating/month pair, but it does not build a subdocument of months. In general, you cannot convert a value (such as the month number) into a key (such as month1) — though this is something you can probably handle quite easily in your application.
The above aggregation results in:
"result" : [
{
"_id" : {
"rating" : 2,
"month" : 1
},
"count" : 1
},
{
"_id" : {
"rating" : 2,
"month" : 12
},
"count" : 1
}
],
"ok" : 1

Related

Aggregate value of each hour by MongoDB

As the image shows, the table above represents my original data; the time field is irregular. Now I want to get the average value for every hour. I thought of using $match, $group and $project, or even a for loop, but I can't work out an accurate approach.
id: ObjectId,
value: Number,
time: Date()
I have a sample collection, hours:
{ "_id" : 1, "value" : 10, "dt" : ISODate("2019-10-17T00:01:32Z") }
{ "_id" : 2, "value" : 16, "dt" : ISODate("2019-10-17T00:02:12Z") }
{ "_id" : 3, "value" : 8, "dt" : ISODate("2019-10-17T01:04:09Z") }
{ "_id" : 4, "value" : 12, "dt" : ISODate("2019-10-17T02:14:21Z") }
{ "_id" : 5, "value" : 6, "dt" : ISODate("2019-10-17T02:54:02Z") }
{ "_id" : 6, "value" : 11, "dt" : ISODate("2019-10-17T04:06:31Z") }
The following aggregation query returns the average value by the hour (the hour is taken from the date field and shifted by one, so each result is labelled with the hour at which its interval ends):
// Averages `value` per hour bucket; each bucket is labelled with the hour at which it ends.
// NOTE: only the hour of day is used as the key, so readings from different days
// that fall in the same hour end up in the same bucket.
db.hours.aggregate( [
// Extract the hour of day from the timestamp.
{ $project: { value: 1, hr: { $hour: "$dt" } } } ,
// Shift by one so readings taken during hour N are reported under N + 1.
{ $addFields: { hour: { $add: [ "$hr", 1 ] } } },
// Count, total and average the readings that share a bucket.
{ $group: { _id: "$hour",
count: { $sum: 1 },
totalValue: { $sum: "$value" },
avgValue: { $avg: "$value" }
}
},
// Expose the bucket key as `hour` and drop `_id` from the output.
{ $project: { hour: "$_id", _id: 0, count: 1, totalValue: 1, avgValue: 1} }
] )
=>
{ "count" : 2, "totalValue" : 18, "avgValue" : 9, "hour" : 3 }
{ "count" : 1, "totalValue" : 8, "avgValue" : 8, "hour" : 2 }
{ "count" : 1, "totalValue" : 11, "avgValue" : 11, "hour" : 5 }
{ "count" : 2, "totalValue" : 26, "avgValue" : 13, "hour" : 1 }
Finally, I solved this issue. Below is my code.

how to show a specific field in array using $project in mongodb

I have a database that contains information about flights. I'm trying to find the category that has the least minutes of delays. I managed to find and show the number of the minimum minutes of the category but not the category itself.
I've tried to put ":true" after each field to show it
// Sums every delay category per carrier, then reports the smallest of those sums.
db.delayData.aggregate([
    {
        // One output document per carrier, holding the total of each delay category.
        $group: {
            "_id": "$carrier",
            "arr_sum": {
                $sum: "$arr_delay"
            },
            "carrier_sum": {
                $sum: "$carrier_delay"
            },
            "weather_sum": {
                $sum: "$weather_delay"
            },
            "nas_sum": {
                $sum: "$nas_delay"
            },
            "sec_sum": {
                $sum: "$security_delay"
            },
            "late_air_sum": {
                $sum: "$late_aircraft_delay"
            }
        }
    },
    {
        // $min over an array of expressions yields only the smallest value;
        // it does not tell which of the listed fields that value came from.
        $project: {
            "min_delay_category": {
                $min: ["$arr_sum", "$carrier_sum", "$weather_sum", "$nas_sum", "$sec_sum", "$late_air_sum"]
            }
        }
    }
]).pretty()
I want to have something like that:
{ "_id" : "VX", "min_delay_category" : 1449, "sec_sum"... }
I've tried to write:
..."$sec_sum":1,"$late_air_sum":1]
but the error message is:
"missing ] after element list"
when I wrote:
...{"sec_sum":1},{"late_air_sum":1}]
I don't get an error message, but it gives me the second-smallest result, not the smallest one.
for example:
{ "_id" : "VX", "min_delay_category" : 69081 }
but the true result for "VX" is 1449
The following query can get us the expected output:
// Finds, for each carrier, the delay category with the smallest total delay.
db.collection.aggregate([
// Stage I: gather every delay field under a single `category` subdocument.
{
$project:{
"carrier":1,
"category.arr_delay":"$arr_delay",
"category.carrier_delay":"$carrier_delay",
"category.weather_delay":"$weather_delay",
"category.nas_delay":"$nas_delay",
"category.security_delay":"$security_delay",
"category.late_aircraft_delay":"$late_aircraft_delay"
}
},
// Stage II: turn `category` into an array of { k: <delay type>, v: <delay> } pairs.
{
$project:{
"carrier":1,
"categories":{
$objectToArray:"$category"
}
}
},
// Stage III: one document per (source document, delay type) pair.
{
$unwind:"$categories"
},
// Stage IV: total delay for every (carrier, delay type) combination.
{
$group:{
"_id":{
"carrier":"$carrier",
"category":"$categories.k"
},
"carrier":{
$first:"$carrier"
},
"category":{
$first:"$categories.k"
},
"total_delay":{
$sum:"$categories.v"
}
}
},
// Stage V: smallest totals first, so $first in the next stage picks the minimum.
{
$sort:{
"total_delay":1
}
},
// Stage VI: per carrier, keep the first (smallest) total and its delay type.
{
$group:{
"_id": "$carrier",
"carrier":{
$first:"$carrier"
},
"category":{
$first:"$category"
},
"minimum_delay":{
$first:"$total_delay"
}
}
},
// Drop `_id`; the carrier is already present as its own field.
{
$project:{
"_id":0
}
}
]).pretty();
Data set:
{
"_id" : ObjectId("5d5b5058435c7584459b7bae"),
"year" : 2003,
"month" : 6,
"carrier" : "AA",
"carrier_name" : "American Airlines Inc.",
"airport" : "ABQ",
"airport_name" : "Albuquerque, NM: Albuquerque International Sunport",
"arr_flights" : 307,
"arr_del15" : 56,
"carrier_ct" : 14.68,
"weather_ct" : 10.79,
"nas_ct" : 19.09,
"security_ct" : 1.48,
"late_aircraft_ct" : 9.96,
"arr_cancelled" : 1,
"arr_diverted" : 1,
"arr_delay" : 2530,
"carrier_delay" : 510,
"weather_delay" : 621,
"nas_delay" : 676,
"security_delay" : 25,
"late_aircraft_delay" : 698,
"" : ""
},
{
"_id" : ObjectId("5d5b5058435c7584459b7bbe"),
"year" : 2003,
"month" : 6,
"carrier" : "AA",
"carrier_name" : "American Airlines Inc.",
"airport" : "ABQ",
"airport_name" : "Albuquerque, NM: Albuquerque International Sunport",
"arr_flights" : 307,
"arr_del15" : 56,
"carrier_ct" : 14.68,
"weather_ct" : 10.79,
"nas_ct" : 19.09,
"security_ct" : 1.48,
"late_aircraft_ct" : 9.96,
"arr_cancelled" : 1,
"arr_diverted" : 1,
"arr_delay" : 2530,
"carrier_delay" : 510,
"weather_delay" : 621,
"nas_delay" : 676,
"security_delay" : 2512,
"late_aircraft_delay" : 698,
"" : ""
}
Output:
{ "carrier" : "AA", "category" : "carrier_delay", "minimum_delay" : 1020 }
Aggregation stage details:
STAGE I: Projecting all delays as a part of category document
STAGE II: Converting category into an array of key-value pair
where 'k' is delay type and 'v' is a delay
STAGE III: Unwinding the prepared array
STAGE IV: Grouping on the basis of carrier and delay type(k) and summing up delay for each type
STAGE V: Sorting on total calculated delay in ascending order
STAGE VI: Grouping on carrier and fetching the first document
which holds the minimum delay

aggregation and display details in mongodb

I have been learning MongoDB, and while doing so I tried to use aggregation on my database collection. I filtered the employees with a $match stage and grouped them by age. My question is: is it possible to display the other key-value pairs of the documents once they pass the age criteria?
// Groups employees aged 23 or older by age.
db.employee.aggregate([
// Keep only employees whose age is at least 23.
{ $match: { age: { $gte: 23 } } },
{
// One result per distinct age: how many employees have it, and their distinct names.
$group: {
_id:'$age',
total: { $sum: 1 },
name: { $addToSet: '$name' }
}
}
])
and the output was like this
{ "_id" : 27, "total" : 2, "name" : [ "indhu", "logesh" ] }
{ "_id" : 26, "total" : 1, "name" : [ "keerthana" ] }
{ "_id" : 25, "total" : 1, "name" : [ "sneha" ] }
{ "_id" : 24, "total" : 1, "name" : [ "dhiva" ] }
{ "_id" : 23, "total" : 1, "name" : [ "elango" ] }
where _id denotes their age.

Why would I get a "-Infinity" result for $avg in an aggregate query?

My aggregate query for retrieving "average percents per month" returns a -Infinity average for some months. What would cause this?
The relevant properties in mycollection are mydate and mynumericfield; the latter stores a percentage value as a double.
// Average of mynumericfield, and document count, per calendar month.
db.mycollection.aggregate(
[
{
// Date.UTC months are zero-based: 8 is September, and 12 rolls over to January
// of the following year, so this window runs from 2014-09-01 through 2015-01-01
// (the upper bound is inclusive because of $lte).
$match: {
mydate: {
$gte: new Date(Date.UTC(2014, 8, 1)),
$lte: new Date(Date.UTC(2014, 12, 1)),
}
}
},
{
// One result per (month, year) pair taken from mydate.
$group : {
_id : { month: { $month: "$mydate" }, year: { $year: "$mydate" } },
average: { $avg: "$mynumericfield" },
count: { $sum: 1 }
}
}
]
)
Here's a sample of the result:
/* 1 */
{
"result" : [
{
"_id" : {
"month" : 9,
"year" : 2014
},
"average" : 84.2586640583996598,
"count" : 20959.0000000000000000
},
{
"_id" : {
"month" : 11,
"year" : 2014
},
"average" : 96.9326915103888638,
"count" : 20743.0000000000000000
},
{
"_id" : {
"month" : 10,
"year" : 2014
},
"average" : -Infinity,
"count" : 20939.0000000000000000
},
{
"_id" : {
"month" : 12,
"year" : 2014
},
"average" : -Infinity,
"count" : 20913.0000000000000000
}
],
"ok" : 1.0000000000000000
}
I've managed to somehow reproduce the -Infinity problem but on a smaller scale.
Let's take sample collection nums having only 4 documents in it:
{ "_id" : 0, "grp" : 1, "mynum" : -Infinity }
{ "_id" : 1, "grp" : 1, "mynum" : 5 }
{ "_id" : 3, "grp" : 2, "mynum" : 8 }
{ "_id" : 4, "grp" : 2, "mynum" : 89 }
Performing simple aggregation on this collection like:
> db.nums.aggregate([{$group:{"_id":"$grp", "average" : {$avg : "$mynum"}}}])
gives the following result:
{ "_id" : 2, "average" : 48.5 }
{ "_id" : 1, "average" : -Infinity }
which is identical in effects to what you have experienced.
Please try to find out whether in your collection there is a document which has mynumericfield with value -Infinity - maybe your situation is similar to the reproduced one:
> db.mycollection.find({mynumericfield : -Infinity})
I hope it might help you some way.

Combining columns into one in MongoDB Aggregate Framework

Is it possible to group by values across multiple columns?
Let's say I'm storing interactions between people by day, and keep track of from's and to's with a count as follows.
db.collection =
[
{ from : 'bob', to : 'mary', day : 1, count : 2 },
{ from : 'bob', to : 'steve', day : 2, count : 1 },
{ from : 'mary', to : 'bob', day : 1, count : 3 },
{ from : 'mary', to : 'steve', day : 3, count : 1 },
{ from : 'steve', to : 'bob', day : 2, count : 2 },
{ from : 'steve', to : 'mary', day : 1, count : 1 }
]
This allows me to get all interactions for, let's say, 'bob' with anyone by grouping on from: and summing count:.
Now I want to get all interaction for a user, so basically group by values across from: and to:. Essentially, sum up count: for each name, regardless whether it was in from: or to:
[UPDATE]
The desired output would be:
[
{ name : 'bob', count : 8 },
{ name : 'mary', count : 7 },
{ name : 'steve', count : 5 }
]
The easiest would be to create a new column names: and store from: and to: inside, then $unwind, but that seems wasteful.
Any hints?
Thanks
Is it possible to group by values across multiple columns?
Yes, it's possible in MongoDB to group values across different columns.
It's very straightforward to do it via MapReduce. But it's also possible to do it with the aggregation framework, even if you don't store an array of participants (if you had an array of names with both participants, then it's just an $unwind and a $group - quite simple, and I think more elegant than either MapReduce or the pipeline you'd have to use with the current schema).
Pipeline that works with your schema as is:
// Sums `count` per person across both the `from` and the `to` field.
// NOTE: a person who appears in only one of the two fields never finds a
// matching pair in the final comparison and is therefore absent from the result.
db.collection.aggregate( [
    // 1. Per sender: total sent, plus every (recipient, count) pair they produced.
    {
        "$group" : {
            "_id" : "$from",
            "sum" : {
                "$sum" : "$count"
            },
            "tos" : {
                "$push" : {
                    "to" : "$to",
                    "count" : "$count"
                }
            }
        }
    },
    // 2. Back to one document per interaction, now carrying the sender's total.
    { "$unwind" : "$tos" },
    // 3. Bundle the sender's name and total into `prev` so they survive the next $group.
    {
        "$project" : {
            "prev" : {
                "id" : "$_id",
                "sum" : "$sum"
            },
            "tos" : 1
        }
    },
    // 4. Per recipient: total received, plus the distinct sender summaries seen.
    {
        "$group" : {
            "_id" : "$tos.to",
            "count" : {
                "$sum" : "$tos.count"
            },
            "prev" : {
                "$addToSet" : "$prev"
            }
        }
    },
    // 5. One document per (recipient, sender summary) pair.
    { "$unwind" : "$prev" },
    // 6. Collapse everything into a single document holding two sets:
    //    `t` = { name, total received } and `f` = { name, total sent }.
    {
        "$group" : {
            "_id" : "1",
            "t" : {
                "$addToSet" : {
                    "id" : "$_id",
                    "c" : "$count"
                }
            },
            "f" : {
                "$addToSet" : {
                    "id" : "$prev.id",
                    "c" : "$prev.sum"
                }
            }
        }
    },
    // 7. Unwinding both sets yields every (t, f) combination.
    { "$unwind" : "$t" },
    { "$unwind" : "$f" },
    // 8. Keep the name only where both halves describe the same person,
    //    and add that person's received and sent totals together.
    {
        "$project" : {
            "name" : {
                "$cond" : [
                    {
                        "$eq" : [
                            "$t.id",
                            "$f.id"
                        ]
                    },
                    "$t.id",
                    "nobody"
                ]
            },
            "count" : {
                "$add" : [
                    "$t.c",
                    "$f.c"
                ]
            },
            "_id" : 0
        }
    },
    // 9. Discard the mismatched combinations.
    { "$match" : { "name" : { "$ne" : "nobody" } } }
]);
On your sample input the output is:
{
"result" : [
{
"name" : "bob",
"count" : 8
},
{
"name" : "mary",
"count" : 7
},
{
"name" : "steve",
"count" : 5
}
],
"ok" : 1
}
$unwind can be expensive. Wouldn't this be easier to query?
// Denormalised alternative: every interaction is stored twice, once under the
// sender (`name` + `to`) and once under the recipient (`name` + `from`), so a
// plain group on `name` that sums `count` gives each person's overall total.
db.collection =
[
    { name : 'bob',   to   : 'mary',  day : 1, count : 2 },
    { name : 'mary',  from : 'bob',   day : 1, count : 2 },
    { name : 'bob',   to   : 'steve', day : 2, count : 1 },
    { name : 'steve', from : 'bob',   day : 2, count : 1 },
    { name : 'mary',  to   : 'bob',   day : 1, count : 3 },
    { name : 'bob',   from : 'mary',  day : 1, count : 3 },
    { name : 'mary',  to   : 'steve', day : 3, count : 1 },
    { name : 'steve', from : 'mary',  day : 3, count : 1 },
    { name : 'steve', to   : 'bob',   day : 2, count : 2 },
    { name : 'bob',   from : 'steve', day : 2, count : 2 },
    { name : 'steve', to   : 'mary',  day : 1, count : 1 },
    { name : 'mary',  from : 'steve', day : 1, count : 1 }
];
[Update]
With your existing structure, here's how you can do it with Map-Reduce, though this isn't really suited to real-time results. It will be slower overall, but likely still more efficient than a massive $unwind operation in the aggregation framework:
// Rebuilds the sample collection `so`, then totals `count` per person with map-reduce.
db.so.drop();
db.so.insert(
[
{ from: 'bob', to: 'mary', day: 1, count: 2 },
{ from: 'bob', to: 'steve', day: 2, count: 1 },
{ from: 'mary', to: 'bob', day: 1, count: 3 },
{ from: 'mary', to: 'steve', day: 3, count: 1 },
{ from: 'steve', to: 'bob', day: 2, count: 2 },
{ from: 'steve', to: 'mary', day: 1, count: 1 }
]);
db.runCommand(
{
"mapreduce": "so", // don't need the collection name here if it's above
// Emit each interaction twice: once keyed by the sender, once by the recipient.
"map": function(){
emit(this.from, {count: this.count});
emit(this.to, {count: this.count});
},
// Add up all counts emitted for the same name.
"reduce": function (name, values) {
var result = { count: 0 };
values.forEach(function (v) {
result.count += v.count;
});
return result;
},
// An empty query selects every document in the collection.
query: {},
// Return the results in the command response.
out: { inline: 1 },
}
);
which produces;
{
"results" : [
{
"_id" : "bob",
"value" : {
"count" : 8
}
},
{
"_id" : "mary",
"value" : {
"count" : 7
}
},
{
"_id" : "steve",
"value" : {
"count" : 5
}
}
],
"timeMillis" : 1,
"counts" : {
"input" : 6,
"emit" : 12,
"reduce" : 3,
"output" : 3
},
"ok" : 1
}