Aggregation for computing distribution across array - mongodb

I'm trying to create an aggregation that will compute the distribution of values across an array of objects and return an array of computed values.
Here is a sample document
[
{
"duration": 1208,
"dataPoints": 2,
"binMin": 0,
"binMax": 5000
},
{
"duration": 25735,
"dataPoints": 3,
"binMin": 5000,
"binMax": 10000
},
{
"duration": 0,
"dataPoints": 0,
"binMin": 10000,
"binMax": 20000
},
{
"duration": 54088,
"dataPoints": 2,
"binMin": 20000,
"binMax": 28817
}
]
I need to add up the durations for each object, then compute the distribution across each object and return a new array like so:
[
{
"duration": 1208,
"dataPoints": 2,
"binMin": 0,
"binMax": 5000,
"ratio": 0.014907874763979
},
{
"duration": 25735,
"dataPoints": 3,
"binMin": 5000,
"binMax": 10000,
"ratio": 0.317594500870037
},
{
"duration": 0,
"dataPoints": 0,
"binMin": 10000,
"binMax": 20000,
"ratio": 0
},
{
"duration": 54088,
"dataPoints": 2,
"binMin": 20000,
"binMax": 28817,
"ratio": 0.667497624365983
}
]
I am able to calculate the total duration and divide to get the ratio value but it seems to be only doing it to the first element of the array.
This is my aggregation so far:
[{$project: {
_id: '$_id',
username: 1,
uuid: 1,
data: '$stats.dataHistogram'
}}, {$unwind: {
path: '$data'
}}, {$group: {
_id: '$_id',
data_bin: {
$first: '$data'
},
total_duration: {
$sum: '$data.duration'
}
}}, {$project: {
_id: '$_id',
total_duration: 1,
data_bin: 1,
ratio: {
$divide: [
'$data_bin.duration',
{
$add: [
'$total_duration',
1
]
}
]
}
}}]
(I'm adding a 1 to the $total_duration because it can be 0 some times and I get a "Cannot divide by zero" error)
I feel like I'm super close but not sure what the next steps should be. Thanks for the help!

You can use $reduce to compute the total duration first. Then apply element wise $divide by using $map
db.collection.aggregate([
{
"$addFields": {
"totalDuration": {
"$reduce": {
"input": "$stats.histogram",
"initialValue": 0,
"in": {
$add: [
"$$value",
"$$this.duration"
]
}
}
}
}
},
{
"$addFields": {
"totalDuration": {
"$cond": {
"if": {
$eq: [
"$totalDuration",
0
]
},
"then": 1,
"else": "$totalDuration"
}
}
}
},
{
"$addFields": {
"stats.histogram": {
"$map": {
"input": "$stats.histogram",
"as": "h",
"in": {
"duration": "$$h.duration",
"dataPoints": "$$h.dataPoints",
"binMin": "$$h.binMin",
"binMax": "$$h.binMax",
"ratio": {
"$divide": [
"$$h.duration",
"$totalDuration"
]
}
}
}
}
}
}
])
Here is the Mongo playground for your reference.

Related

Mongo DB query for count with condition

I have the following documents stored in "Deployments" collection in MongoDB version 4.2.
I would like to achieve the following results group by productId and between a range of dates.
I have achieved execution times using this query.
db.getCollection('Deployments').aggregate([
{
$match : {$and:[{ "startedAt": { $gte: new ISODate("2021-10-01") } },
{ "startedAt": { $lte: new ISODate("2021-11-17") } }]}
},
{
$group : {
_id:"$productId",
count: { $sum: 1 },
minExecutionTime:
{
$min:
{
$divide:[{$subtract:["$completedAt", "$startedAt"]}, 1000 * 60]
}
},
maxExecutionTime:
{
$max:
{
$divide:[{$subtract:["$completedAt", "$startedAt"]}, 1000 * 60]
}
},
avgExecutionTime:
{
$avg:
{
$divide:[{$subtract:["$completedAt", "$startedAt"]}, 1000 * 60]
}
}
}
}
])
Any help to add the counts to this query please?
How to truncate the execution times to 2 decimal places?
Please suggest in case of any optimizations to this query.
Documents:
[
{
"productId": 1,
"deploymentStatus": "Succeeded",
"startedAt": ISODate("2021-01-21T14:00:19.782Z"),
"completedAt": ISODate("2021-01-21T14:03:55.789Z")
},
{
"productId": 2,
"deploymentStatus": "Failed",
"startedAt": ISODate("2021-01-21T15:00:19.782Z"),
"completedAt": ISODate("2021-01-21T15:03:55.789Z")
},
{
"productId": 3,
"deploymentStatus": "Cancelled",
"startedAt": ISODate("2021-01-21T16:00:19.782Z"),
"completedAt": ISODate("2021-01-21T16:03:55.789Z")
},
{
"productId": 1,
"deploymentStatus": "Failed",
"startedAt": ISODate("2021-01-21T17:00:19.782Z"),
"completedAt": ISODate("2021-01-21T17:03:55.789Z")
},
{
"productId": 2,
"deploymentStatus": "Failed",
"startedAt": ISODate("2021-01-21T18:00:19.782Z"),
"completedAt": ISODate("2021-01-21T18:03:55.789Z")
},
{
"productId": 3,
"deploymentStatus": "Succeeded",
"startedAt": ISODate("2021-01-21T19:00:19.782Z"),
"completedAt": ISODate("2021-01-21T19:03:55.789Z")
},
{
"productId": 1,
"deploymentStatus": "Cancelled",
"startedAt": ISODate("2021-01-21T20:00:19.782Z"),
"completedAt": ISODate("2021-01-21T20:03:55.789Z")
},
{
"productId": 2,
"deploymentStatus": "Failed",
"startedAt": ISODate("2021-01-21T21:00:19.782Z"),
"completedAt": ISODate("2021-01-21T21:03:55.789Z")
},
{
"productId": 3,
"deploymentStatus": "Succeeded",
"startedAt": ISODate("2021-01-21T22:00:19.782Z"),
"completedAt": ISODate("2021-01-21T22:03:55.789Z")
}
]
Mongo Playground
Your aggregation is actually on the right track. For your 3 questions:
Any help to add the counts to this query please?
Just break the count into 3 conditional count using $cond
How to truncate the execution times to 2 decimal places?
Use $round
Please suggest in case of any optimizations to this query.
I did a minor tweak to pre-compute the duration in minute instead of computing them again in the $group stage
db.collection.aggregate([
{
$match: {
$and: [
{
"startedAt": {
$gte: ISODate("2021-01-21")
}
},
{
"startedAt": {
$lte: ISODate("2021-01-22")
}
}
]
}
},
{
"$addFields": {
"durationInMin": {
$round: [
{
$divide: [
{
$subtract: [
"$completedAt",
"$startedAt"
]
},
60000
]
},
2
]
}
}
},
{
$group: {
_id: "$productId",
SucceedCount: {
$sum: {
"$cond": {
"if": {
$eq: [
"$deploymentStatus",
"Succeeded"
]
},
"then": 1,
"else": 0
}
}
},
FailedCount: {
$sum: {
"$cond": {
"if": {
$eq: [
"$deploymentStatus",
"Failed"
]
},
"then": 1,
"else": 0
}
}
},
CancelledCount: {
$sum: {
"$cond": {
"if": {
$eq: [
"$deploymentStatus",
"Cancelled"
]
},
"then": 1,
"else": 0
}
}
},
minExecutionTime: {
$min: "$durationInMin"
},
maxExecutionTime: {
$max: "$durationInMin"
},
avgExecutionTime: {
$avg: "$durationInMin"
}
}
}
])
Here is the Mongo playground for your reference.

Mongo DB aggregate grouping multiple values that belong to the same document

I have documents that look like this
{
"_id": "5e3334cede31d9555e38dbee",
"time": 400,
"datetime": "2020-01-05T16:35:42.315Z",
"version": "2.0.30",
"hostname": "bvasilchik-lt.extron.com",
"testfile": "cards.txt",
"tests": 5,
"failures": 3,
"skips": 0,
"status": "Failed",
"__v": 0
}
I want to create a result that includes the documents that have the highest number of time per testfile name, so if the top 10 were all the same testfile name I'd only want to show the top one that had the same testfile name.
I have done this but I also wanted to include another field that also shows the number of tests matching that grouping, but the only ways I found were to add the $first or the $last or the $max or the $min for the tests field, but that wouldn't be correct b/c the highest time might have a different number of tests.
I am also matching results from a specific date range
const times = await Suite.aggregate([
{
"$match": {
datetime: { "$gte": dateRange.startDate, "$lt": dateRange.endDate, }
}
},
{
"$group": {
_id: "$testfile",
time: { "$max" : "$time" },
}
},
{
"$sort": {
time: order
}
},
{
"$project": {
_id: 0,
testfile: "$_id",
time: "$time"
}
}
])
this produces these results
[
{
"testfile": "lists.txt",
"time": 900
},
{
"testfile": "buttons.txt",
"time": 800
},
{
"testfile": "cards.txt",
"time": 400
},
{
"testfile": "popover.txt",
"time": 300
},
{
"testfile": "about-pages.neb",
"time": 76
}
]
but what I want it to return is
[
{
"testfile": "lists.txt",
"tests": 5,
"time": 900
},
{
"testfile": "buttons.txt",
"tests": 4,
"time": 800
},
{
"testfile": "cards.txt",
"tests": 8,
"time": 400
},
{
"testfile": "popover.txt",
"tests": 1,
"time": 300
},
{
"testfile": "about-pages.neb",
"tests": 2,
"time": 76
}
]
You need to add extra field into $group and $project stages.
You need to use $max operator for time field and accumulatetests field time:tests values. In the last stage, we $reduce tests field taking highest value
{
"$group": {
_id: "$testfile",
time: {
$max: "$time"
},
tests: {
"$push": {
time: "$time",
tests: "$tests"
}
}
}
},
{
"$sort": {
time: 1
}
},
{
"$project": {
_id: 0,
testfile: "$_id",
time: "$time",
tests: {
$reduce: {
input: "$tests",
initialValue: 0,
in: {
$add: [
"$$value",
{
$cond: [
{
$and: [
{
$eq: [
"$time",
"$$this.time"
]
},
{
$gt: [
"$$this.tests",
"$$value"
]
}
]
},
{
$subtract: [
"$$this.tests",
"$$value"
]
},
0
]
}
]
}
}
}
}
}
MongoPlayground

Get objects containing max values for multiple fields using aggregation in mongodb

I want to fetch the documents having highest value for a list of specifics fields. I don't know if it's possible in only one request.
Consider below data:
_id:1, kills:12, deaths:6, assists:1
_id:2, kills:2, deaths:2, assists:22
_id:3, kills:1, deaths:2, assists:3
_id:4, kills:0, deaths:23, assists:4
_id:5, kills:6, deaths:3, assists:5
_id:6, kills:7, deaths:1, assists:6
Answer should be something like
maxKills: { _id:1, kills:12, deaths:6, assists:1 },
maxDeaths: { _id:4, kills:0, deaths:23, assists:4 },
maxAssists: { _id:2, kills:2, deaths:2, assists:22 },
I have tried several queries, but I can't get the whole objects containing the max values.
db.coll.aggregate([{
$group: {
_id: null,
kills: { $max: "$stats.kills" },
deaths: { $max: "$stats.deaths" },
assists: { $max: "$stats.assists" },
}
}])
For example here I have all the max values I want but I don't get the whole matches Objects.
---- UPDATE ----
With this answer https://stackoverflow.com/a/33361913/9188650, I've made it works but I receive data in a not really user friendly way.
{
"$group": {
"_id": null,
"maxKills": { "$max": "$stats.kills" },
"maxDeaths": { "$max": "$stats.deaths" },
"maxAssists": { "$max": "$stats.assists" },
"matches": {
"$push": {
"champion": "$champion",
"gameId": "$gameId",
"kills": "$stats.kills",
"deaths": "$stats.deaths",
"assists": "$stats.assists",
}
}
}
},
{
"$project": {
"_id": 0,
"maxKills": 1,
"maxDeaths": 1,
"maxAssists": 1,
"matches": {
"$setDifference": [
{
"$map": {
"input": "$matches",
"as": "match",
"in": {
$switch: {
branches: [
{ case: { $eq: ["$maxKills", "$$match.kills"] }, then: "$$match" },
{ case: { $eq: ["$maxDeaths", "$$match.deaths"] }, then: "$$match" },
{ case: { $eq: ["$maxAssists", "$$match.assists"] }, then: "$$match" },
],
default: false
}
}
}
},
[false]
]
}
}
}
It will returns:
{
"maxKills": 25,
"maxDeaths": 20,
"maxAssists": 39,
"matches": [
{
"champion": {
"id": 145,
"name": "Kai'Sa",
},
"gameId": 4263819967,
"kills": 25,
"deaths": 3,
"assists": 16
},
{
"champion": {
"id": 8,
"name": "Vladimir",
},
"gameId": 4262731529,
"kills": 8,
"deaths": 20,
"assists": 3
},
{
"champion": {
"id": 22,
"name": "Ashe",
},
"gameId": 4340383097,
"kills": 9,
"deaths": 7,
"assists": 39
},
{
"champion": {
"id": 23,
"name": "Tryndamere",
},
"gameId": 4352236936,
"kills": 25,
"deaths": 6,
"assists": 22
}
]
}
My last issue are cases when multiple objects have the same max value (as the example above, 2 matches have 25 kills). I only want the oldest one in these cases.
You can do it easier by using $filter and $arrayElemAt after $group stage:
db.collection.aggregate([
{
$group: {
_id: null,
maxKills: { $max: "$kills" },
maxDeaths: { $max: "$deaths" },
maxAssists: { $max: "$assists" },
docs: { $push: "$$ROOT" }
}
},
{
$project: {
_id: 0,
maxKills: { $arrayElemAt: [ { $filter: { input: "$docs", cond: { $eq: [ "$$this.kills", "$maxKills" ] } } }, 0 ] },
maxDeaths: { $arrayElemAt: [ { $filter: { input: "$docs", cond: { $eq: [ "$$this.deaths", "$maxDeaths" ] } } }, 0 ] },
maxAssists: { $arrayElemAt: [ { $filter: { input: "$docs", cond: { $eq: [ "$$this.assists", "$maxAssists" ] } } }, 0 ] }
}
}
])
Mongo Playground

How to get count of multiple fields based on value in mongodb?

Collection exists as below:
[
{"currentLocation": "Chennai", "baseLocation": "Bengaluru"},
{"currentLocation": "Chennai", "baseLocation": "Bengaluru"},
{"currentLocation": "Delhi", "baseLocation": "Bengaluru"},
{"currentLocation": "Chennai", "baseLocation": "Chennai"}
]
Expected Output:
[
{"city": "Chennai", "currentLocationCount": 3, "baseLocationCount": 1},
{"city": "Bengaluru", "currentLocationCount": 0, "baseLocationCount": 3},
{"city": "Delhi", "currentLocationCount": 1, "baseLocationCount": 0}
]
What I have tried is:
db.getCollection('users').aggregate([{
$group: {
"_id": "$baselocation",
baseLocationCount: {
$sum: 1
}
},
}, {
$project: {
"_id": 0,
"city": "$_id",
"baseLocationCount": 1
}
}])
Got result as:
[
{"city": "Chennai", "baseLocationCount": 1},
{"city": "Bengaluru", "baseLocationCount": "3"}
]
I'm not familiar with mongo, so any help?
MongoDB Version - 3.4
Neil Lunn and myself had a lovely argument over this topic the other day which you can read all about here: Group by day with Multiple Date Fields.
Here are two solutions to your precise problem.
The first one uses the $facet stage. Bear in mind, though, that it may not be suitable for large collections because $facet produces a single (potentially huge) document that might be bigger than the current MongoDB document size limit of 16MB (which only applies to the result document and wouldn't be a problem during pipeline processing anyway):
collection.aggregate(
{
$facet:
{
"current":
[
{
$group:
{
"_id": "$currentLocation",
"currentLocationCount": { $sum: 1 }
}
}
],
"base":
[
{
$group:
{
"_id": "$baseLocation",
"baseLocationCount": { $sum: 1 }
}
}
]
}
},
{ $project: { "result": { $setUnion: [ "$current", "$base" ] } } }, // merge results into new array
{ $unwind: "$result" }, // unwind array into individual documents
{ $replaceRoot: { newRoot: "$result" } }, // get rid of the additional field level
{ $group: { "_id": "$_id", "currentLocationCount": { $sum: "$currentLocationCount" }, "baseLocationCount": { $sum: "$baseLocationCount" } } }, // group into final result)
{ $project: { "_id": 0, "city": "$_id", "currentLocationCount": 1, "baseLocationCount": 1 } } // group into final result
)
The second one works based on the $map stage instead:
collection.aggregate(
{
"$project": {
"city": {
"$map": {
"input": [ "current", "base" ],
"as": "type",
"in": {
"type": "$$type",
"name": {
"$cond": {
"if": { "$eq": [ "$$type", "current" ] },
"then": "$currentLocation",
"else": "$baseLocation"
}
}
}
}
}
}
},
{ "$unwind": "$city" },
{
"$group": {
"_id": "$city.name",
"currentLocationCount": {
"$sum": {
"$cond": {
"if": { "$eq": [ "$city.type", "current" ] },
"then": 1,
"else": 0
}
}
},
"baseLocationCount": {
"$sum": {
"$cond": {
"if": { "$eq": [ "$city.type", "base" ] },
"then": 1,
"else": 0
}
}
}
}
}
)

Group books and get count for each of their ratings

I have a Rating model with a book and rating value to it. I would like to get all the ratings count (ratings vary from 1 to 5) for each book in the database.
My schema simply looks like -
{
"_id": ObjectId("57e112312a52fe257e5d1d5c"),
"book": ObjectId("57e111142a52fe257e5d1d42"),
"rating": 4
}
{
"_id": ObjectId("57e7a002420d22d6106a4715"),
"book": ObjectId("57e111142a52fe257e5d1d42"),
"rating": 5
}
{
"_id": ObjectId("57e7a4cd98bfdb5a11962d54"),
"book": ObjectId("57e111142a52fe257e5d17676"),
"rating": 5
}
{
"_id": ObjectId("57e7a4cd98bfdb5a11962d54"),
"book": ObjectId("57e111142a52fe257e5d17676"),
"rating": 1
}
Currently, i have only been able to get to this point where i can get the no of ratings for each book but it doesn't specify exactly the rating value count.
This is my current query -
db.ratings.aggregate([
{$match: {book: {$in: [ObjectId("57e111142a52fe257e5d1d42"), ObjectId('57e6bef7cad79fa38555c643')]}}},
{$group: {_id: {book: "$book", value: "$value"} } },
{$group: {_id: "$_id.book", total: {$sum: 1}}},
])
The output is this -
{
"result": [
{
"_id": ObjectId("57e6bef7cad79fa38555c643"),
"total": 2
},
{
"_id": ObjectId("57e111142a52fe257e5d1d42"),
"total": 2
}
],
"ok": 1
}
However, i want to club all the documents and get a result with the count of ratings for each value of the rating field, something like below. The whole point is that i just want the count of ratings for each value for each book.
{
result: [
{
_id: "57e111142a52fe257e5d17676",
5_star_ratings: 1,
4_star_ratings: 3,
3_star_ratings: 4,
2_star_ratings: 1,
1_star_ratings: 0,
},
{
_id: "57e111142a52fe257e5d1d42",
5_star_ratings: 10,
4_star_ratings: 13,
3_star_ratings: 7,
2_star_ratings: 8,
1_star_ratings: 19,
}
.
.
.
.
]
}
How do i go about this?
Accomplishing the task require a $group pipeline that uses the $cond operator in the $sum accumulator operator. The $cond operator will evaluate a logical condition based on its first argument (if) and then returns the second argument where the evaluation is true (then) or the third argument where false (else). This converts the true/false logic into 1 and 0 numerical values that feed into $sum respectively:
{
"$sum": {
"$cond": [ { "$eq": [ "$rating", 1 ] }, 1, 0 ]
}
}
As a resulting operation, you might want to run the following aggregation pipeline:
var pipeline = [
{
"$match": {
"book": {
"$in": [
ObjectId("57e111142a52fe257e5d1d42"),
ObjectId('57e6bef7cad79fa38555c643')
]
}
}
},
{
"$group": {
"_id": "$book",
"5_star_ratings": {
"$sum": {
"$cond": [ { "$eq": [ "$rating", 5 ] }, 1, 0 ]
}
},
"4_star_ratings": {
"$sum": {
"$cond": [ { "$eq": [ "$rating", 4 ] }, 1, 0 ]
}
},
"3_star_ratings": {
"$sum": {
"$cond": [ { "$eq": [ "$rating", 3 ] }, 1, 0 ]
}
},
"2_star_ratings": {
"$sum": {
"$cond": [ { "$eq": [ "$rating", 2 ] }, 1, 0 ]
}
},
"1_star_ratings": {
"$sum": {
"$cond": [ { "$eq": [ "$rating", 1 ] }, 1, 0 ]
}
}
}
},
]
db.ratings.aggregate(pipeline)
For a more flexible and better performant approach which executes much faster than the above, consider running an alternative pipeline as follows
db.ratings.aggregate([
{
"$match": {
"book": {
"$in": [
ObjectId("57e111142a52fe257e5d1d42"),
ObjectId('57e6bef7cad79fa38555c643')
]
}
}
},
{
"$group": {
"_id": {
"book": "$name",
"rating": "$rating"
},
"count": { "$sum": 1 }
}
},
{
"$group": {
"_id": "$_id.book",
"counts": {
"$push": {
"rating": "$_id.rating",
"count": "$count"
}
}
}
}
])