MongoDB - using $count inside a $facet stage for the calculations - mongodb

I have a collection and I want to get the total number of documents and use that amount in a $facet to calculate the 95th percentile (in the example below, the duration for each document is defined as finish_time - start_time).
Let's say I have these documents:
[
{
"_id": ObjectId("178768747638364736373637"),
"start_time": ISODate("2019-02-03T12:00:00.000Z"),
"finish_time": ISODate("2019-02-03T12:01:00.000Z")
},
{
"_id": ObjectId("266747364736363536353555"),
"start_time": ISODate("2019-02-03T12:00:00.000Z"),
"finish_time": ISODate("2019-02-03T12:03:00.000Z")
},
{
"_id": ObjectId("367463536453623546353625"),
"start_time": ISODate("2019-02-03T12:00:00.000Z"),
"finish_time": ISODate("2019-02-03T12:08:00.000Z")
}
]
I'm expecting this output:
[
{
"Percentile95Index": 2.8499999999999996,
"_id": ObjectId("178768747638364736373637"),
"duration": 60,
"totalCount": 3
},
{
"Percentile95Index": 2.8499999999999996,
"_id": ObjectId("266747364736363536353555"),
"duration": 180,
"totalCount": 3
},
{
"Percentile95Index": 2.8499999999999996,
"_id": ObjectId("367463536453623546353625"),
"duration": 480,
"totalCount": 3
}
]
So I did this (https://mongoplayground.net/p/xtHLHouzNQr):
db.collection.aggregate([
{
// Since total count will always be a one-document result, I need $facet to run multiple aggregation pipelines and then merge results.
$facet: {
totalCount: [
{
$count: "value"
}
],
pipelineResults: [
{
"$project": {
duration: {
"$divide": [
{
"$subtract": [
"$finish_time",
"$start_time"
]
},
1000
]// duration is in seconds
},
Percentile95Index: {
"$multiply": [
0.95,
"$totalCount.value" // HERE I'D LIKE TO ACCESS THE TOTAL_COUNT!!!!
]
}
}
}
]
}
},
{
$unwind: "$totalCount"
},
{
$unwind: "$pipelineResults"
},
{
$replaceRoot: {
newRoot: {
$mergeObjects: [
"$pipelineResults",
{
totalCount: "$totalCount.value"
}
]
}
}
}
])
As you can see, inside the Percentile95Index field, I'd like to access the totalCount.value but it's not accessible there and I can't figure out how to do it.
Any help?

For my opinion, I don't think the $facet stage is needed based on your scenario. (Would be great if you can share the reason to use $facet).
To count the total documents in the collection and render the count value to each document, you may look for $setWindowFields.
$setWindowFields - Render the totalCount field with the value of the total number of documents in the collection.
$project - Decorate the output documents.
db.collection.aggregate([
{
$setWindowFields: {
output: {
totalCount: {
$sum: 1
}
}
}
},
{
"$project": {
totalCount: 1,
duration: {
"$divide": [
{
"$subtract": [
"$finish_time",
"$start_time"
]
},
1000
]
},
Percentile95Index: {
"$multiply": [
0.95,
"$totalCount"
]
}
}
}
])
Demo # MongoPlayground

Related

MongoDB Express filter results on count of nested list of ints

Im new to mongoDB, so having some difficulties filtering my collections as I need.
I have this collection
[
{
"id": "sdfsdfsdf",
"key": "tryrtyrty",
"createdAt": "2017-01-28T01:22:14.398Z",
"counts": [
170
],
"value": "Something"
},
{
"id": "hjmhjhjm",
"key": "yuiyuiyui",
"createdAt": "2017-01-28T01:22:14.398Z",
"counts": [
150,
160
],
"value": "Something"
}
]
I want to filter by range of dates (min-max date) and range of counts, meaning I want to give a min and max value for the totalCount of the sum in the field. Example, I would like to filter results whose min counts sum is 200 and max 400. This would only return the second result (the sum is 310, while the first result the sum is 170).
Right now I have this:
db.collection.aggregate([
{
$project: {
totalCount: {
$sum: {
"$filter": {
"input": "$counts",
"as": "bla",
"cond": {
"$gte": [
"$sum", // I think the error is here, I dont know how to reference the sum of the list
300 //I want records whose count sum is more than this value
]
}
}
}
}
}
}
])
This returns all the records with TotalCount on 0, which is not want I want, I would like the records matching the count condition with the correct TotalCount (and eventually matching the dates as well)
[
{
"_id": ObjectId("5a934e000102030405000000"),
"totalCount": 0
},
{
"_id": ObjectId("5a934e000102030405000001"),
"totalCount": 0
}
Desired output
[
{
"_id": ObjectId("5a934e000102030405000001"),
"totalCount": 310,
"key": "yuiyuiyui",
"createdAt": "2017-01-28T01:22:14.398Z"
}
]
Any help would be greatly appreciated. Even more if it comes with both dates and count filter.
You should not use $filter as it doesn't suitable for this scenario.
Stages:
set - Create totalCounts with $sum all elements in counts array.
$match - Fiter the documents which has totalCounts within the range.
$unset - Remove fields for decorating output document.
db.collection.aggregate([
{
$set: {
"totalCounts": {
$sum: "$counts"
}
}
},
{
$match: {
totalCounts: {
$gte: 200,
$lte: 400
}
}
},
{
$unset: [
"counts",
"id"
]
}
])
Sample Mongo Playground
For date range filter, you need $expr and $and operator as below
{
$match: {
totalCounts: {
$gte: 200,
$lte: 400
},
$expr: {
$and: [
{
$gte: [
{
"$toDate": "$createdAt"
},
/* Date from */
]
},
{
$lte: [
{
"$toDate": "$createdAt"
},
/* Date to */
]
}
]
}
}
}
Sample Mongo Playground (with date range filter)

Groupby Array elements and categorize them based on date in Mongodb Query

I have an array element in my DB and I want to group By and calculate the number of repetitions of different elements in this array based on different datetime. assume following collection:
{
_id: ObjectId(7df78ad8902c)
title: 'MongoDB Overview',
tags: ['SQL', 'database', 'NoSQL'],
created_at: 2021-10-03 10:05:51.755Z
},
{
_id: ObjectId(7df78ad8902d)
title: 'NoSQL Overview',
tags: ['mongodb', 'database', 'PHP'],
created_at: 2021-10-03 14:05:51.755Z
},
{
_id: ObjectId(7df78ad8902d)
title: 'Developing',
tags: ['java', 'btc/usdt', 'PHP'],
created_at: 2021-10-03 14:05:51.755Z
}
,
{
_id: ObjectId(7df78ad8902d)
title: 'databases for search',
tags: ['elasticsearch', 'database', 'PHP'],
created_at: 2021-10-03 12:05:51.755Z
}
I want to calculate the number of repetitions of different elements in tags field such as mongodb, database, noSQL based on datetime (for example hot hashtags in last hour, today or this month) in this collection. How can I solve this problem in mongo?
expected answer like .
1 - hot hashtags in last hour ['a' , 'b' , 'c']
2 - hot hastags in last 5 hours : ...
3 - today : ...
4 - this month : ...
Query
not calendar based, based on difference on milliseconds
keep only last 30 days data
facet and 4 group by
its exactly the same code 4x
The only difference is the multiply
last 30days : (now_date-created_at) <= (* 30 24 60 60 1000)
last 24h : (now_date-created_at) <= (* 24 60 60 1000)
last 120 hours(5days) : (now_date-created_at) <= (* 5 60 60 1000)
last 60 min : (now_date-created_at) <= (* 60 60 1000)
*subtraction works on dates also, and returns milliseconds
filter the date depending on what we want its 4 different filters
unwind
group by tag and count occurences
sort by count
limit 2 to keep like the 2 top hotttest tags, you can change it to any value, like limit 1 to keep only the hottest tag
*if you want calendar based like 3 October(3 days data only), query must be changed, the same is for day(query is for 24 hours) etc (in those cases we should use $hour $month etc)
Its not big change in query.
//same month
{"$eq" : [ {"$month" : "$$NOW"}, {"$month" : "$created_at"} ]
//same day(assuming same month from previous filter)
{"$eq" : [ {"$dayOfMonth" : "$$NOW"}, {"$dayOfMonth" : "$created_at"} ]
//same hour
*we could also use the new $dateTrunc to check for same month etc.
Test code here
(Query is big but its the same thing 4x)
db.collection.aggregate([
{
"$set": {
"created_at": {
"$dateFromString": {
"dateString": "$created_at"
}
}
}
},
{
"$unwind": {
"path": "$tags"
}
},
{
"$match": {
"$expr": {
"$lte": [
{
"$subtract": [
"$$NOW",
"$created_at"
]
},
{
"$multiply": [
30,
24,
60,
60,
1000
]
}
]
}
}
},
{
"$facet": {
"month-tag": [
{
"$match": {
"$expr": {
"$lte": [
{
"$subtract": [
"$$NOW",
"$created_at"
]
},
{
"$multiply": [
30,
24,
60,
60,
1000
]
}
]
}
}
},
{
"$group": {
"_id": "$tags",
"count": {
"$sum": 1
}
}
},
{
"$sort": {
"count": -1
}
},
{
"$limit": 2
},
{
"$project": {
"_id": 0,
"tag": "$_id"
}
}
],
"day-tag": [
{
"$match": {
"$expr": {
"$lte": [
{
"$subtract": [
"$$NOW",
"$created_at"
]
},
{
"$multiply": [
24,
60,
60,
1000
]
}
]
}
}
},
{
"$group": {
"_id": "$tags",
"count": {
"$sum": 1
}
}
},
{
"$sort": {
"count": -1
}
},
{
"$limit": 2
},
{
"$project": {
"_id": 0,
"tag": "$_id"
}
}
],
"5hour-tag": [
{
"$match": {
"$expr": {
"$lte": [
{
"$subtract": [
"$$NOW",
"$created_at"
]
},
{
"$multiply": [
5,
60,
60,
1000
]
}
]
}
}
},
{
"$group": {
"_id": "$tags",
"count": {
"$sum": 1
}
}
},
{
"$sort": {
"count": -1
}
},
{
"$limit": 2
},
{
"$project": {
"_id": 0,
"tag": "$_id"
}
}
],
"hour-tag": [
{
"$match": {
"$expr": {
"$lte": [
{
"$subtract": [
"$$NOW",
"$created_at"
]
},
{
"$multiply": [
60,
60,
1000
]
}
]
}
}
},
{
"$group": {
"_id": "$tags",
"count": {
"$sum": 1
}
}
},
{
"$sort": {
"count": -1
}
},
{
"$limit": 2
},
{
"$project": {
"_id": 0,
"tag": "$_id"
}
}
]
}
}
])

MongoDB: Assign document objects to field in '$project' stage

I have a user collection:
[
{"_id": 1,"name": "John", "age": 25, "valid_user": true}
{"_id": 2, "name": "Bob", "age": 40, "valid_user": false}
{"_id": 3, "name": "Jacob","age": 27,"valid_user": null}
{"_id": 4, "name": "Amelia","age": 29,"valid_user": true}
]
I run a '$facet' stage on this collection. Checkout this MongoPlayground.
I want to talk about the first output from the facet stage. The following is the response currently:
{
"user_by_valid_status": [
{
"_id": false,
"count": 1
},
{
"_id": true,
"count": 2
},
{
"_id": null,
"count": 1
}
]
}
However, I want to restructure the output in this way:
"analytics": {
"invalid_user": {
"_id": false
"count": 1
},
"valid_user": {
"_id": true
"count": 2
},
"user_with_unknown_status": {
"_id": null
"count": 1
}
}
The problem with using a '$project' stage along with 'arrayElemAt' is that the order may not be definite for me to associate an index with an attribute like 'valid_users' or others. Also, it gets further complicated because unlike the sample documents that I have shared, my collection may not always contain all the three categories of users.
Is there some way I can do this?
You can use $switch conditional operator,
$project to show value part in v with _id and count field as object, k to put $switch condition
db.collection.aggregate([
{
"$facet": {
"user_by_valid_status": [
{
"$group": {
"_id": "$valid_user",
"count": { "$sum": 1 }
}
},
{
$project: {
_id: 0,
v: { _id: "$_id", count: "$count" },
k: {
$switch: {
branches: [
{ case: { $eq: ["$_id", null] }, then: "user_with_unknown_status" },
{ case: { $eq: ["$_id", false] }, then: "invalid_user" },
{ case: { $eq: ["$_id", true] }, then: "valid_user" }
]
}
}
}
}
],
"users_above_30": [{ "$match": { "age": { "$gt": 30 } } }]
}
},
$project stage in root, convert user_by_valid_status array to object using $arrayToObject
{
$project: {
analytics: { $arrayToObject: "$user_by_valid_status" },
users_above_30: 1
}
}
])
Playground

Mongo DB aggregate grouping multiple values that belong to the same document

I have documents that look like this
{
"_id": "5e3334cede31d9555e38dbee",
"time": 400,
"datetime": "2020-01-05T16:35:42.315Z",
"version": "2.0.30",
"hostname": "bvasilchik-lt.extron.com",
"testfile": "cards.txt",
"tests": 5,
"failures": 3,
"skips": 0,
"status": "Failed",
"__v": 0
}
I want to create a result that includes the documents that have the highest number of time per testfile name, so if the top 10 were all the same testfile name I'd only want to show the top one that had the same testfile name.
I have done this but I also wanted to include another field that also shows the number of tests matching that grouping, but the only ways I found were to add the $first or the $last or the $max or the $min for the tests field, but that wouldn't be correct b/c the highest time might have a different number of tests.
I am also matching results from a specific date range
const times = await Suite.aggregate([
{
"$match": {
datetime: { "$gte": dateRange.startDate, "$lt": dateRange.endDate, }
}
},
{
"$group": {
_id: "$testfile",
time: { "$max" : "$time" },
}
},
{
"$sort": {
time: order
}
},
{
"$project": {
_id: 0,
testfile: "$_id",
time: "$time"
}
}
])
this produces these results
[
{
"testfile": "lists.txt",
"time": 900
},
{
"testfile": "buttons.txt",
"time": 800
},
{
"testfile": "cards.txt",
"time": 400
},
{
"testfile": "popover.txt",
"time": 300
},
{
"testfile": "about-pages.neb",
"time": 76
}
]
but what I want it to return is
[
{
"testfile": "lists.txt",
"tests": 5,
"time": 900
},
{
"testfile": "buttons.txt",
"tests": 4,
"time": 800
},
{
"testfile": "cards.txt",
"tests": 8,
"time": 400
},
{
"testfile": "popover.txt",
"tests": 1,
"time": 300
},
{
"testfile": "about-pages.neb",
"tests": 2,
"time": 76
}
]
You need to add extra field into $group and $project stages.
You need to use $max operator for time field and accumulatetests field time:tests values. In the last stage, we $reduce tests field taking highest value
{
"$group": {
_id: "$testfile",
time: {
$max: "$time"
},
tests: {
"$push": {
time: "$time",
tests: "$tests"
}
}
}
},
{
"$sort": {
time: 1
}
},
{
"$project": {
_id: 0,
testfile: "$_id",
time: "$time",
tests: {
$reduce: {
input: "$tests",
initialValue: 0,
in: {
$add: [
"$$value",
{
$cond: [
{
$and: [
{
$eq: [
"$time",
"$$this.time"
]
},
{
$gt: [
"$$this.tests",
"$$value"
]
}
]
},
{
$subtract: [
"$$this.tests",
"$$value"
]
},
0
]
}
]
}
}
}
}
}
MongoPlayground

MongoDB Get average of group considering rank of document

I have documents getting in order like:
{
"_id": "abcde1",
"value" : 300
},
{
"_id": "abcde2",
"value" : 200
},
{
"_id": "abcde3",
"value" : 400
},
{
"_id": "abcde4",
"value" : 500
},
{
"_id": "abcde5",
"value" : 600
}
i.e,
I want average of "_id" of first 2, first 4 and all 5 documents matching like in single query:
{
"value_2" : 250, // Average of first 2 documents
"value_4" : 350, // Average of first four documents
"value_5" : 400 // Average of all 5 documents
}
Is it possible to Group documents based on rank of document.
I can do 3 results in 3 separate queries. Is it possible in single query?
You could try running the following pipeline:
db.collection.aggregate([
// previous pipeline here
{
"$group": {
"_id": null,
"values": { "$push": "$value" }
}
},
{ "$unwind": { "path": "$values", "includeArrayIndex": "rank" } },
{
"$group": {
"_id": null,
"value_2_sum": {
"$sum": {
"$cond": [
{ "$lt": ["$rank", 2] },
"$values",
0
]
}
},
"value_2_count": {
"$sum": {
"$cond": [
{ "$lt": ["$rank", 2] },
1,
0
]
}
},
"value_4_sum": {
"$sum": {
"$cond": [
{ "$lt": ["$rank", 4] },
"$values",
0
]
}
},
"value_4_count": {
"$sum": {
"$cond": [
{ "$lt": ["$rank", 4] },
1,
0
]
}
},
"value_5": { "$avg": "$values" }
}
},
{
"$project": {
"value_2" : { "$divide": ["$value_2_sum", "$value_2_count"] }, // Average of first 2 documents
"value_4" : { "$divide": ["$value_4_sum", "$value_4_count"] }, // Average of first four documents
"value_5" : 1
}
}
])
You could use a $facet aggregation stage:
// { _id: "abcde1", value: 300 }
// { _id: "abcde2", value: 200 }
// { _id: "abcde3", value: 400 }
// { _id: "abcde4", value: 500 }
// { _id: "abcde5", value: 600 }
db.collection.aggregate([
{ $facet: {
value_2: [ { $limit: 2 }, { $group: { _id: null, value_2: { $avg: "$value" } } } ],
value_4: [ { $limit: 4 }, { $group: { _id: null, value_4: { $avg: "$value" } } } ],
value_5: [ { $limit: 5 }, { $group: { _id: null, value_5: { $avg: "$value" } } } ]
}},
// {
// value_2: [ { _id: null, value_2: 250 } ],
// value_4: [ { _id: null, value_4: 350 } ],
// value_5: [ { _id: null, value_5: 400 } ]
// }
{ $set: {
value_2: { $first: "$value_2.value_2" },
value_4: { $first: "$value_4.value_4" },
value_5: { $first: "$value_5.value_5" }
}}
])
// { "value_2" : 250, "value_4" : 350, "value_5" : 400 }
The $facet stage allows us to run multiple aggregation pipelines within a single stage on the same set of input documents. Each sub-pipeline has its own field in the output document where its results are stored as an array of documents.
Each field is thus produced by its own aggregation pipeline whose first stage is a simple $limit, followed by a $group stage that'll produce the $avg (average) of all considered documents.
The second part of the pipeline (the $set stage) is just there to clean-up the $facet output to the format you wished for.