I work with MongoDB and the aggregation framework. I have the following data in the database, and I need to sort and group it by year:
[{
"_id": "5df537d615a0cd001759f5e4",
"timeDoneA": {
"year":2020,
"day": 5,
"month": 12
},
"timeDoneB": {
"year": 2020,
"day": 4,
"month": 2
},
"timeDoneC": {
"year": 2020,
"day": 4,
"month": 2
},
},
{
"_id": "5df6595dab96a000174e29d7",
"timeDoneA": {
"year": 2020,
"day": 12,
"month": 12
},
"timeDoneB": {
"year": 2019,
"day": 15,
"month": 12
},
"timeDoneC": {
"year": 2019,
"day": 15,
"month": 12
},
}...etc],
I want to get something like this to sort data by years:
2019: {
timeDoneA: 0,
timeDoneB: 1,
timeDoneC: 1
},
2020: {
timeDoneA: 2,
timeDoneB: 1,
timeDoneC: 1
}
How can I do that with aggregation query?
You could use a $facet stage to collect the year values and remove unnecessary fields, then $unwind both of these, and finally use $group to collate the values.
A quick and dirty example:
// For every year that appears in any timeDoneA/B/C field, count how many
// documents have timeDoneA, timeDoneB and timeDoneC falling in that year.
db.aggtest.aggregate([
// $facet runs two sub-pipelines over the same input documents.
{$facet:{
// "years": a single document whose `list` is the union of all distinct
// year values seen in timeDoneA, timeDoneB and timeDoneC.
years:[
{$group:{ _id:null,
listC:{$addToSet: "$timeDoneC.year"},
listB:{$addToSet:"$timeDoneB.year"},
listA:{$addToSet:"$timeDoneA.year"}}},
{$project:{ _id:0,
list:{$setUnion:["$listA","$listB","$listC"]}}}],
// "done": one entry per input document, reduced to its three year values.
done:[{$project:{ _id:0,
timeDoneA:"$timeDoneA.year",
timeDoneB:"$timeDoneB.year",
timeDoneC:"$timeDoneC.year"}}]}},
// The three $unwinds below produce one document per
// (input document, distinct year) pair.
{$unwind:"$done"},
{$unwind:"$years"},
{$unwind:"$years.list"},
// Group by year; each field adds 1 when the document's year matches the
// group's year and 0 otherwise, so a year with no match still reports 0.
{$group:{ _id:"$years.list",
timeDoneA:{$sum:{$cond:[{$eq:["$done.timeDoneA","$years.list"]},1,0]}},
timeDoneB:{$sum:{$cond:[{$eq:["$done.timeDoneB","$years.list"]},1,0]}},
timeDoneC:{$sum:{$cond:[{$eq:["$done.timeDoneC","$years.list"]},1,0]}}}}])
Testing this on your sample data gives:
{ "_id" : 2020, "timeDoneA" : 2, "timeDoneB" : 1, "timeDoneC" : 1 }
{ "_id" : 2019, "timeDoneA" : 0, "timeDoneB" : 1, "timeDoneC" : 1 }
Related
I'm using the New York Times Articles & Comments (2020) dataset from Kaggle, and I want to compute the average number of articles published per weekday over the whole year, like so:
Mo: 50
Tu: 60
Wed: 40
...
So far I managed to query the articles count for each day in the dataset for the whole year and sort it:
db.nyt_art.aggregate(
[
{ $group: {
_id: {
year: { $year: "$pub_date" },
month: { $month: "$pub_date" },
day: { $dayOfMonth: "$pub_date" }
},
count: { $sum: 1 }
}
},
{$sort: {_id: -1}}
]
)
[
{
"_id": {
"year": 2020,
"month": 12,
"day": 31
},
"count": 35
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 30
},
"count": 53
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 29
},
"count": 47
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 28
},
"count": 49
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 27
},
"count": 6
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 26
},
"count": 9
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 25
},
"count": 12
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 24
},
"count": 41
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 23
},
"count": 72
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 22
},
"count": 59
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 21
},
"count": 50
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 20
},
"count": 21
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 19
},
"count": 12
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 18
},
"count": 47
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 17
},
"count": 61
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 16
},
"count": 58
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 15
},
"count": 70
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 14
},
"count": 53
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 13
},
"count": 11
},
{
"_id": {
"year": 2020,
"month": 12,
"day": 12
},
"count": 17
}
]
Now I'm stuck as I don't know how to get the average article output on each weekday.
UPDATE:
I managed to get the results I wanted but as I'm new to mongodb, I'm not sure if I did it the correct and best way. Here is what I did:
// Average number of articles published per weekday:
// (articles published on that weekday) / (a fixed 52 occurrences of the weekday).
db.nyt_art.aggregate(
[
// First stage: count articles per calendar day. The weekday is part of the
// key so it is still available to the next stage.
{
$group: {
_id: {
year: { $year: "$pub_date" },
month: { $month: "$pub_date" },
day: { $dayOfMonth: "$pub_date" },
weekday: { $dayOfWeek: "$pub_date" }
},
count: { $sum: 1 }
}
},
// Second stage: regroup the per-day counts by weekday.
{
$group: {
_id: { wd: "$_id.weekday"},
// Total number of articles published on this weekday.
total_on_wd: {
$sum: "$count"
},
// NOTE(review): this adds up the weekday *number* once per calendar day, so
// it is not the number of days seen for this weekday ({ $sum: 1 } would be).
// The field is not read by any later stage.
total_wds: {
$sum: "$_id.weekday"
}
}
},
// Third stage: turn the per-weekday total into an average.
{
$project: {
_id: { wd: "$_id.wd"},
avg_wd: {
// NOTE(review): 52 is a hard-coded divisor; it assumes every weekday occurs
// exactly 52 times in the data rather than using a counted value.
$divide: ["$total_on_wd", 52]
}
}
},
// Fourth stage: order the output by weekday number, ascending.
{$sort: {_id: 1}}
]
)
And the results:
[
{
"_id": {
"wd": 1
},
"avg_wd": 16.634615384615383
},
{
"_id": {
"wd": 2
},
"avg_wd": 52.78846153846154
},
{
"_id": {
"wd": 3
},
"avg_wd": 62.53846153846154
},
{
"_id": {
"wd": 4
},
"avg_wd": 63.30769230769231
},
{
"_id": {
"wd": 5
},
"avg_wd": 61.30769230769231
},
{
"_id": {
"wd": 6
},
"avg_wd": 49.21153846153846
},
{
"_id": {
"wd": 7
},
"avg_wd": 17.03846153846154
}
]
Is this actually delivering the correct results? Also, I'm not sure how to count how often each weekday occurred in that year (right now I'm just using a constant of 52 occurrences of each weekday per year).
I have a Mongo collection that looks like this with a bunch of months, days, years:
[
{
"Date": ISODate("2021-08-05T04:59:54.000Z"),
"Amount": 999,
"Business": "Business 1",
},
{
"Date": ISODate("2021-08-05T04:59:54.000Z"),
"Amount": 5.99,
"Business": "Business 2",
},
{
"Date": ISODate("2021-07-17T21:41:56.000Z"),
"Amount": 20000,
"Business": "Business 2",
},
{
"Date": ISODate("2021-06-17T21:41:56.000Z"),
"Amount": 200,
"Business": "Business 5",
}
]
I have done an aggregation like this
db.collection.aggregate({
$group: {
_id: {
year: {
$year: "$Date"
},
month: {
$month: "$Date"
}
},
sum: {
$sum: "$Amount"
}
}
})
...which partially gives me what I want which is a sum of amounts per year and month.
[
{
"_id": {
"month": 6,
"year": 2021
},
"sum": 200
},
{
"_id": {
"month": 7,
"year": 2021
},
"sum": 20000
},
{
"_id": {
"month": 8,
"year": 2021
},
"sum": 1004.99
}
]
What I would like however is to have something like the below where the year is at the top and the months are aggregated in a sum so that it's easier to iterate in the frontend but I have not been able to get it no matter what I have tried:
[
{
"year": 2021,
"sumAmount": 21204.99,
"months": [
{
"month": 7,
"amount": 20000
},
{
"month": 6,
"amount": 200
},
{
"month": 8,
"amount": 1004.99
}
]
},
{ "year" : 2020,
....
}
]
I have been pretty close in using another $group and $push but I have not been able to get what in my mind is a second group by month. Any help will be appreciated!
You just need one more $group to get your expected result. To control the order of the months, put a $sort stage between the two $group stages; the second $group then uses $push, which keeps that ordering in the final array.
// Two-level roll-up: total Amount per (year, month), then fold the months
// into a single document per year.
db.collection.aggregate([
  // Stage 1: one document per calendar month holding that month's total.
  { $group: {
      _id: { year: { $year: "$Date" }, month: { $month: "$Date" } },
      sum: { $sum: "$Amount" }
  } },
  // Stage 2: order by year, then month, so $push below sees months ascending.
  { $sort: { "_id.year": 1, "_id.month": 1 } },
  // Stage 3: one document per year with the yearly total and its month list.
  { $group: {
      _id: "$_id.year",
      sumAmount: { $sum: "$sum" },
      months: { $push: { month: "$_id.month", amount: "$sum" } }
  } }
])
Here is the Mongo playground for your reference.
I have an aggregate pipeline that is supposed to display an array of inventory activities by month, from the former to the latter part of each month.
exports.InventoryTable = asyncHandler(async (req, res, next) => {
Log.aggregate([
{
$group: {
_id: {
name: '$name',
quantity: '$quantity',
year: {$year: '$updatedAt'},
month: {$month: '$updatedAt'},
dayOfMonth: {$dayOfMonth: '$updatedAt'}
},
totalAmountSold: { $sum :'$modified_quantity' },
},
},
]).exec((err, results) => {
if (err) throw err;
res.json(results);
});
});
and here is the sample output :
[
{
"_id": {
"name": "Pop",
"quantity": 58,
"year": 2020,
"month": 6,
"dayOfMonth": 21
},
"totalAmountSold": -57
},
{
"_id": {
"name": "Cement",
"quantity": 51,
"year": 2020,
"month": 6,
"dayOfMonth": 21
},
"totalAmountSold": -50
},
{
"_id": {
"name": "Washing Machine",
"quantity": 85,
"year": 2020,
"month": 6,
"dayOfMonth": 21
},
"totalAmountSold": -20
},
{
"_id": {
"name": "Pop",
"quantity": 4,
"year": 2020,
"month": 6,
"dayOfMonth": 14
},
"totalAmountSold": -15
},
{
"_id": {
"name": "Cement",
"quantity": 1,
"year": 2020,
"month": 6,
"dayOfMonth": 20
},
"totalAmountSold": -17
},
{
"_id": {
"name": "Pop",
"quantity": 24,
"year": 2020,
"month": 6,
"dayOfMonth": 8
},
"totalAmountSold": -6
}
]
I would want to have the result displayed in the reverse order with the earliest record at the top of the document. I have tried making use of the sort() function :
]).sort({"id":-1}).exec((err, results) => {
if (err) throw err;
res.json(results);
});
````
but the record still stays the same. on second thought, I might be applying the "sort" incorrectly. I really require help with this one.
You need to use $sort within the pipeline:
{
$group: {
_id: {
name: '$name',
quantity: '$quantity',
year: {$year: '$updatedAt'},
month: {$month: '$updatedAt'},
dayOfMonth: {$dayOfMonth: '$updatedAt'}
},
totalAmountSold: { $sum :'$modified_quantity' },
}
},
{
$sort: {
"_id.year": 1,
"_id.month": 1,
"_id.dayOfMonth": 1,
}
}
I am stuck with achieving weekofMonth instead of WeekofYear. Can somebody guide me on how to get this right?
db.activity.aggregate([
{
$group:{
_id: {
week: { $week: "$createdAt" },
month: { $month: "$createdAt" },
year: { $year: "$createdAt" }
},
count: { $sum: 1 }
}
},
{ $match : { "_id.year" : 2016, "_id.month" : 5 } }
])
Output
/* 1 */
{
"_id" : {
"week" : 19,
"month" : 5,
"year" : 2016
},
"count" : 133.0
}
/* 2 */
{
"_id" : {
"week" : 18,
"month" : 5,
"year" : 2016
},
"count" : 1.0
}
In the above shown data, it is actually not displaying weekofMonth. How can I get this given week 18 is the first week of Month?
The $week operator gives you the week of year as described in the docs.
The week of month can be calculated by getting the day of month and dividing by 7.
// Count activity documents per (year, month, zero-based week of month).
db.activity.aggregate([
    {$project: {
        "year": {$year: "$createdAt"},
        "month": {$month: "$createdAt"},
        // $dayOfMonth is 1-based, so shift it to 0-based before dividing:
        // days 1-7 -> week 0, days 8-14 -> week 1, ... days 29-31 -> week 4.
        // Without the shift, day 7 would land in week 1 and week 0 would
        // only contain six days.
        "weekOfMonth": {$floor: {$divide: [{$subtract: [{$dayOfMonth: "$createdAt"}, 1]}, 7]}}
    }},
    {$group: {
        "_id": {"year": "$year", "month": "$month", "weekOfMonth": "$weekOfMonth"},
        count: { $sum: 1 }
    }},
    // Keep only the buckets for May 2016.
    {$match : { "_id.year" : 2016, "_id.month" : 5}}
])
Note that the week of month here is 0 based. If you want it to start at 1 just $add 1. Also, the $floor operator is new in version 3.2.
Edit
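You can simulate the floor using $mod (which exists in version 3.0). The expression below first shifts the 1-based day of month to 0-based, so that days 1-7 fall in week 0:
"weekOfMonth": {$subtract: [{$divide: [{$subtract: [{$dayOfMonth: "$createdAt"}, 1]}, 7]}, {$mod: [{$divide: [{$subtract: [{$dayOfMonth: "$createdAt"}, 1]}, 7]}, 1]}]},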
I have some log data stored in a mongo collection that includes basic information as a request_id and the time it was added to the collection, for example:
{
"_id" : ObjectId("55ae6ea558a5d3fe018b4568"),
"request_id" : "030ac9f1-aa13-41d1-9ced-2966b9a6g5c3",
"time" : ISODate("2015-07-21T16:00:00.00Z")
}
I was wondering if I could use the aggregation framework to aggregate some statistical data. I would like to get the counts of the objects created within each interval of N minutes for the last X hours.
So the output which I need for 10 minutes intervals for the last 1 hour should be something like the following:
{ "_id" : 0, "time" : ISODate("2015-07-21T15:00:00.00Z"), "count" : 67 }
{ "_id" : 0, "time" : ISODate("2015-07-21T15:10:00.00Z"), "count" : 113 }
{ "_id" : 0, "time" : ISODate("2015-07-21T15:20:00.00Z"), "count" : 40 }
{ "_id" : 0, "time" : ISODate("2015-07-21T15:30:00.00Z"), "count" : 10 }
{ "_id" : 0, "time" : ISODate("2015-07-21T15:40:00.00Z"), "count" : 32 }
{ "_id" : 0, "time" : ISODate("2015-07-21T15:50:00.00Z"), "count" : 34 }
I would use that to get data for graphs.
Any advice is appreciated!
There are a couple of ways of approaching this depending on which output format best suits your needs. The main note is that with the "aggregation framework" itself, you cannot actually return something "cast" as a date, but you can get values that are easily reconstructed into a Date object when processing results in your API.
The first approach is to use the "Date Aggregation Operators" available to the aggregation framework:
// Count documents per 10-minute interval using the date aggregation operators.
db.collection.aggregate([
// Restrict to the requested time window: startDate inclusive, endDate exclusive.
{ "$match": {
"time": { "$gte": startDate, "$lt": endDate }
}},
{ "$group": {
// Composite bucket key: year, day of year, hour, and the minute rounded
// down to a multiple of 10.
"_id": {
"year": { "$year": "$time" },
"dayOfYear": { "$dayOfYear": "$time" },
"hour": { "$hour": "$time" },
"minute": {
// minute - (minute mod 10) yields 0, 10, 20, 30, 40 or 50.
"$subtract": [
{ "$minute": "$time" },
{ "$mod": [ { "$minute": "$time" }, 10 ] }
]
}
},
// Number of documents in the bucket.
"count": { "$sum": 1 }
}}
])
Which returns a composite key for _id containing all the values you want for a "date". Alternately if just within an "hour" always then just use the "minute" part and work out the actual date based on the startDate of your range selection.
Or you can just use plain "Date math" to get the milliseconds since "epoch", which can again be fed to a Date constructor directly.
// Count documents per 10-minute interval, keyed by a single numeric value.
db.collection.aggregate([
// Restrict to the requested time window: startDate inclusive, endDate exclusive.
{ "$match": {
"time": { "$gte": startDate, "$lt": endDate }
}},
{ "$group": {
// Bucket key: ("time" - epoch) rounded down to a multiple of the interval,
// i.e. ms - (ms mod interval), where ms is "time" minus new Date(0).
"_id": {
"$subtract": [
{ "$subtract": [ "$time", new Date(0) ] },
{ "$mod": [
{ "$subtract": [ "$time", new Date(0) ] },
// Interval length: 10 minutes expressed in milliseconds.
1000 * 60 * 10
]}
]
},
// Number of documents in the bucket.
"count": { "$sum": 1 }
}}
])
In all cases what you do not want to do is use $project before actually applying $group. As a "pipeline stage", $project must "cycle" though all documents selected and "transform" the content.
This takes time, and adds to the execution total of the query. You can simply just apply to the $group directly as has been shown.
Or if you are really "pure" about a Date object being returned without post processing, then you can always use "mapReduce", since the JavaScript functions actually allow recasting as a date, but slower than the aggregation framework and of course without a cursor response:
// Count documents per 10-minute interval with mapReduce, using Date keys.
db.collection.mapReduce(
// Map: round the document's time down to the start of its 10-minute
// interval and emit that Date as the key with a count of 1.
function() {
var date = new Date(
this.time.valueOf()
- ( this.time.valueOf() % ( 1000 * 60 * 10 ) )
);
emit(date,1);
},
// Reduce: add up the counts emitted for the same interval key.
function(key,values) {
return Array.sum(values);
},
// Ask for the output inline rather than naming an output collection.
{ "out": { "inline": 1 } }
)
Your best bet is using aggregation though, as transforming the response is quite easy:
// Count documents per 10-minute interval and convert each bucket key back
// into a real Date object on the client.
db.collection.aggregate([
    // Restrict to the requested time window: startDate inclusive, endDate exclusive.
    { "$match": {
        "time": { "$gte": startDate, "$lt": endDate }
    }},
    { "$group": {
        // Bucket key: milliseconds since epoch, rounded down to the start of
        // the 10-minute interval. The key must be a single number because the
        // forEach below passes it to the Date constructor; a
        // { year, dayOfYear, hour, minute } sub-document would produce an
        // invalid Date there.
        "_id": {
            "$subtract": [
                { "$subtract": [ "$time", new Date(0) ] },
                { "$mod": [
                    { "$subtract": [ "$time", new Date(0) ] },
                    1000 * 60 * 10
                ]}
            ]
        },
        // Number of documents in the bucket.
        "count": { "$sum": 1 }
    }}
]).forEach(function(doc) {
    // Rebuild the interval start as a Date from the millisecond key.
    doc._id = new Date(doc._id);
    printjson(doc);
})
And then you have your interval grouping output with real Date objects.
Something like this?
pipeline = [
{"$project":
{"date": {
"year": {"$year": "$time"},
"month": {"$month": "$time"},
"day": {"$dayOfMonth": "$time"},
"hour": {"$hour": "$time"},
"minute": {"$subtract": [
{"$minute": "$time"},
{"$mod": [{"$minute": "$time"}, 10]}
]}
}}
},
{"$group": {"_id": "$date", "count": {"$sum": 1}}}
]
Example:
> db.foo.insert({"time": new Date(2015, 7, 21, 22, 21)})
> db.foo.insert({"time": new Date(2015, 7, 21, 22, 23)})
> db.foo.insert({"time": new Date(2015, 7, 21, 22, 45)})
> db.foo.insert({"time": new Date(2015, 7, 21, 22, 33)})
> db.foo.aggregate(pipeline)
and output:
{ "_id" : { "year" : 2015, "month" : 8, "day" : 21, "hour" : 20, "minute" : 40 }, "count" : 1 }
{ "_id" : { "year" : 2015, "month" : 8, "day" : 21, "hour" : 20, "minute" : 20 }, "count" : 2 }
{ "_id" : { "year" : 2015, "month" : 8, "day" : 21, "hour" : 20, "minute" : 30 }, "count" : 1 }
a pointer in lieu of a concrete answer. you can very easily do it for minutes, hours and given periods using the date aggregations . every 10 minutes will be a bit trickier but likely possible with some wrangling. nevertheless, the aggregation will be slow as nuts on large data sets.
i would suggest extracting the minutes post-insert
{
"_id" : ObjectId("55ae6ea558a5d3fe018b4568"),
"request_id" : "030ac9f1-aa13-41d1-9ced-2966b9a6g5c3",
"time" : ISODate("2015-07-21T16:00:00.00Z"),
"minutes": 16
}
and even though it sounds utterly absurd adding quartiles and sextiles or whatever that N might be.
{
"_id" : ObjectId("55ae6ea558a5d3fe018b4568"),
"request_id" : "030ac9f1-aa13-41d1-9ced-2966b9a6g5c3",
"time" : ISODate("2015-07-21T16:00:00.00Z"),
"minutes": 16,
"quartile": 1,
"sextile": 2,
}
First, try using $divide on the minutes. Note that $divide does not round its result (it does no ceil or floor), but check out
Is there a floor function in Mongodb aggregation framework?