Mongo DB aggregate grouping multiple values that belong to the same document - mongodb

I have documents that look like this
{
"_id": "5e3334cede31d9555e38dbee",
"time": 400,
"datetime": "2020-01-05T16:35:42.315Z",
"version": "2.0.30",
"hostname": "bvasilchik-lt.extron.com",
"testfile": "cards.txt",
"tests": 5,
"failures": 3,
"skips": 0,
"status": "Failed",
"__v": 0
}
I want to create a result that includes the documents that have the highest number of time per testfile name, so if the top 10 were all the same testfile name I'd only want to show the top one that had the same testfile name.
I have done this but I also wanted to include another field that also shows the number of tests matching that grouping, but the only ways I found were to add the $first or the $last or the $max or the $min for the tests field, but that wouldn't be correct b/c the highest time might have a different number of tests.
I am also matching results from a specific date range
const times = await Suite.aggregate([
{
"$match": {
datetime: { "$gte": dateRange.startDate, "$lt": dateRange.endDate, }
}
},
{
"$group": {
_id: "$testfile",
time: { "$max" : "$time" },
}
},
{
"$sort": {
time: order
}
},
{
"$project": {
_id: 0,
testfile: "$_id",
time: "$time"
}
}
])
this produces these results
[
{
"testfile": "lists.txt",
"time": 900
},
{
"testfile": "buttons.txt",
"time": 800
},
{
"testfile": "cards.txt",
"time": 400
},
{
"testfile": "popover.txt",
"time": 300
},
{
"testfile": "about-pages.neb",
"time": 76
}
]
but what I want it to return is
[
{
"testfile": "lists.txt",
"tests": 5,
"time": 900
},
{
"testfile": "buttons.txt",
"tests": 4,
"time": 800
},
{
"testfile": "cards.txt",
"tests": 8,
"time": 400
},
{
"testfile": "popover.txt",
"tests": 1,
"time": 300
},
{
"testfile": "about-pages.neb",
"tests": 2,
"time": 76
}
]

You need to add extra field into $group and $project stages.
You need to use $max operator for time field and accumulatetests field time:tests values. In the last stage, we $reduce tests field taking highest value
{
"$group": {
_id: "$testfile",
time: {
$max: "$time"
},
tests: {
"$push": {
time: "$time",
tests: "$tests"
}
}
}
},
{
"$sort": {
time: 1
}
},
{
"$project": {
_id: 0,
testfile: "$_id",
time: "$time",
tests: {
$reduce: {
input: "$tests",
initialValue: 0,
in: {
$add: [
"$$value",
{
$cond: [
{
$and: [
{
$eq: [
"$time",
"$$this.time"
]
},
{
$gt: [
"$$this.tests",
"$$value"
]
}
]
},
{
$subtract: [
"$$this.tests",
"$$value"
]
},
0
]
}
]
}
}
}
}
}
MongoPlayground

Related

MongoDB - using $count inside a $facet stage for the calculations

I have a collection and I want to get the total number of documents and use that amount in a $facet to calculate the 95th percentile (in the example below, the duration for each document is defined as finish_time - start_time).
Let's say I have these documents:
[
{
"_id": ObjectId("178768747638364736373637"),
"start_time": ISODate("2019-02-03T12:00:00.000Z"),
"finish_time": ISODate("2019-02-03T12:01:00.000Z")
},
{
"_id": ObjectId("266747364736363536353555"),
"start_time": ISODate("2019-02-03T12:00:00.000Z"),
"finish_time": ISODate("2019-02-03T12:03:00.000Z")
},
{
"_id": ObjectId("367463536453623546353625"),
"start_time": ISODate("2019-02-03T12:00:00.000Z"),
"finish_time": ISODate("2019-02-03T12:08:00.000Z")
}
]
I'm expecting this output:
[
{
"Percentile95Index": 2.8499999999999996,
"_id": ObjectId("178768747638364736373637"),
"duration": 60,
"totalCount": 3
},
{
"Percentile95Index": 2.8499999999999996,
"_id": ObjectId("266747364736363536353555"),
"duration": 180,
"totalCount": 3
},
{
"Percentile95Index": 2.8499999999999996,
"_id": ObjectId("367463536453623546353625"),
"duration": 480,
"totalCount": 3
}
]
So I did this (https://mongoplayground.net/p/xtHLHouzNQr):
db.collection.aggregate([
{
// Since total count will always be a one-document result, I need $facet to run multiple aggregation pipelines and then merge results.
$facet: {
totalCount: [
{
$count: "value"
}
],
pipelineResults: [
{
"$project": {
duration: {
"$divide": [
{
"$subtract": [
"$finish_time",
"$start_time"
]
},
1000
]// duration is in seconds
},
Percentile95Index: {
"$multiply": [
0.95,
"$totalCount.value" // HERE I'D LIKE TO ACCESS THE TOTAL_COUNT!!!!
]
}
}
}
]
}
},
{
$unwind: "$totalCount"
},
{
$unwind: "$pipelineResults"
},
{
$replaceRoot: {
newRoot: {
$mergeObjects: [
"$pipelineResults",
{
totalCount: "$totalCount.value"
}
]
}
}
}
])
As you can see, inside the Percentile95Index field, I'd like to access the totalCount.value but it's not accessible there and I can't figure out how to do it.
Any help?
For my opinion, I don't think the $facet stage is needed based on your scenario. (Would be great if you can share the reason to use $facet).
To count the total documents in the collection and render the count value to each document, you may look for $setWindowFields.
$setWindowFields - Render the totalCount field with the value of the total number of documents in the collection.
$project - Decorate the output documents.
db.collection.aggregate([
{
$setWindowFields: {
output: {
totalCount: {
$sum: 1
}
}
}
},
{
"$project": {
totalCount: 1,
duration: {
"$divide": [
{
"$subtract": [
"$finish_time",
"$start_time"
]
},
1000
]
},
Percentile95Index: {
"$multiply": [
0.95,
"$totalCount"
]
}
}
}
])
Demo # MongoPlayground

MongoDB - How to bring age group data

How to bring age group base data from a collection in MongoDB i.e how many people are 0-18, 19-24, 25-34, 35+
[
{
"_id": ObjectId("608be7c608c7de2367c89638"),
"status": true,
"gender": "Male",
"first_name": "Vinter",
"last_name": "R",
"dob": "1-2-1999"
},
{
"_id": ObjectId("608be7c608c7de2267c89639"),
"status": true,
"gender": "Male",
"first_name": "Ray",
"last_name": "Morgan",
"dob": "1-2-2015"
}
....
]
See the Mongo Playground:
https://mongoplayground.net/p/4ydNg9Plh6P
Interesting question!
Would like to credit to #Takis and #YuTing.
Good hint from #Takis's comment on $bucket.
#YuTing's answer is good.
Think this answer is shorter by utilizing the feature provided by MongoDB.
$toDate - Convert date string to Date (supported for version 4.0 above).
$dateDiff - Date subtraction and get the unit (Supported in version 5).
$$CURRENT - Variable to get the current iterated document. For adding into persons array field (via $push).
$switch - To display group value based on conditions (Optional).
db.collection.aggregate([
{
"$addFields": {
"age": {
$dateDiff: {
startDate: {
$toDate: "$dob"
},
endDate: "$$NOW",
unit: "year"
}
}
}
},
{
$bucket: {
groupBy: "$age",
// Field to group by
boundaries: [
0,
19,
25,
35
],
// Boundaries for the buckets
default: "Other",
// Bucket id for documents which do not fall into a bucket
output: {
// Output for each bucket
"count": {
$sum: 1
},
"persons": {
$push: "$$CURRENT"
}
}
}
},
{
$project: {
_id: 0,
group: {
$switch: {
branches: [
{
case: {
$lt: [
"$_id",
19
]
},
then: "0-18"
},
{
case: {
$lt: [
"$_id",
25
]
},
then: "19-24"
},
{
case: {
$lt: [
"$_id",
35
]
},
then: "25-34"
}
],
default: "35+"
}
},
count: 1,
persons: 1
}
}
])
Sample Mongo Playground
use $bucket
db.collection.aggregate([
{
$bucket: {
groupBy: {
"$subtract": [
{
$year: new Date()
},
{
$toInt: {
$substr: [
"$dob",
{
$subtract: [
{
$strLenCP: "$dob"
},
4
]
},
4
]
}
}
]
},
// Field to group by
boundaries: [
0,
19,
25,
35,
100
],
// Boundaries for the buckets
default: "Other",
// Bucket id for documents which do not fall into a bucket
output: {
// Output for each bucket
"count": {
$sum: 1
},
"artists": {
$push: {
"name": {
$concat: [
"$first_name",
" ",
"$last_name"
]
},
"age": {
"$subtract": [
{
$year: new Date()
},
{
$toInt: {
$substr: [
"$dob",
{
$subtract: [
{
$strLenCP: "$dob"
},
4
]
},
4
]
}
}
]
}
}
}
}
}
}
])
mongoplayground

Groupby Array elements and categorize them based on date in Mongodb Query

I have an array element in my DB and I want to group By and calculate the number of repetitions of different elements in this array based on different datetime. assume following collection:
{
_id: ObjectId(7df78ad8902c)
title: 'MongoDB Overview',
tags: ['SQL', 'database', 'NoSQL'],
created_at: 2021-10-03 10:05:51.755Z
},
{
_id: ObjectId(7df78ad8902d)
title: 'NoSQL Overview',
tags: ['mongodb', 'database', 'PHP'],
created_at: 2021-10-03 14:05:51.755Z
},
{
_id: ObjectId(7df78ad8902d)
title: 'Developing',
tags: ['java', 'btc/usdt', 'PHP'],
created_at: 2021-10-03 14:05:51.755Z
}
,
{
_id: ObjectId(7df78ad8902d)
title: 'databases for search',
tags: ['elasticsearch', 'database', 'PHP'],
created_at: 2021-10-03 12:05:51.755Z
}
I want to calculate the number of repetitions of different elements in tags field such as mongodb, database, noSQL based on datetime (for example hot hashtags in last hour, today or this month) in this collection. How can I solve this problem in mongo?
expected answer like .
1 - hot hashtags in last hour ['a' , 'b' , 'c']
2 - hot hastags in last 5 hours : ...
3 - today : ...
4 - this month : ...
Query
not calendar based, based on difference on milliseconds
keep only last 30 days data
facet and 4 group by
its exactly the same code 4x
The only difference is the multiply
last 30days : (now_date-created_at) <= (* 30 24 60 60 1000)
last 24h : (now_date-created_at) <= (* 24 60 60 1000)
last 120 hours(5days) : (now_date-created_at) <= (* 5 60 60 1000)
last 60 min : (now_date-created_at) <= (* 60 60 1000)
*subtraction works on dates also, and returns milliseconds
filter the date depending on what we want its 4 different filters
unwind
group by tag and count occurences
sort by count
limit 2 to keep like the 2 top hotttest tags, you can change it to any value, like limit 1 to keep only the hottest tag
*if you want calendar based like 3 October(3 days data only), query must be changed, the same is for day(query is for 24 hours) etc (in those cases we should use $hour $month etc)
Its not big change in query.
//same month
{"$eq" : [ {"$month" : "$$NOW"}, {"$month" : "$created_at"} ]
//same day(assuming same month from previous filter)
{"$eq" : [ {"$dayOfMonth" : "$$NOW"}, {"$dayOfMonth" : "$created_at"} ]
//same hour
*we could also use the new $dateTrunc to check for same month etc.
Test code here
(Query is big but its the same thing 4x)
db.collection.aggregate([
{
"$set": {
"created_at": {
"$dateFromString": {
"dateString": "$created_at"
}
}
}
},
{
"$unwind": {
"path": "$tags"
}
},
{
"$match": {
"$expr": {
"$lte": [
{
"$subtract": [
"$$NOW",
"$created_at"
]
},
{
"$multiply": [
30,
24,
60,
60,
1000
]
}
]
}
}
},
{
"$facet": {
"month-tag": [
{
"$match": {
"$expr": {
"$lte": [
{
"$subtract": [
"$$NOW",
"$created_at"
]
},
{
"$multiply": [
30,
24,
60,
60,
1000
]
}
]
}
}
},
{
"$group": {
"_id": "$tags",
"count": {
"$sum": 1
}
}
},
{
"$sort": {
"count": -1
}
},
{
"$limit": 2
},
{
"$project": {
"_id": 0,
"tag": "$_id"
}
}
],
"day-tag": [
{
"$match": {
"$expr": {
"$lte": [
{
"$subtract": [
"$$NOW",
"$created_at"
]
},
{
"$multiply": [
24,
60,
60,
1000
]
}
]
}
}
},
{
"$group": {
"_id": "$tags",
"count": {
"$sum": 1
}
}
},
{
"$sort": {
"count": -1
}
},
{
"$limit": 2
},
{
"$project": {
"_id": 0,
"tag": "$_id"
}
}
],
"5hour-tag": [
{
"$match": {
"$expr": {
"$lte": [
{
"$subtract": [
"$$NOW",
"$created_at"
]
},
{
"$multiply": [
5,
60,
60,
1000
]
}
]
}
}
},
{
"$group": {
"_id": "$tags",
"count": {
"$sum": 1
}
}
},
{
"$sort": {
"count": -1
}
},
{
"$limit": 2
},
{
"$project": {
"_id": 0,
"tag": "$_id"
}
}
],
"hour-tag": [
{
"$match": {
"$expr": {
"$lte": [
{
"$subtract": [
"$$NOW",
"$created_at"
]
},
{
"$multiply": [
60,
60,
1000
]
}
]
}
}
},
{
"$group": {
"_id": "$tags",
"count": {
"$sum": 1
}
}
},
{
"$sort": {
"count": -1
}
},
{
"$limit": 2
},
{
"$project": {
"_id": 0,
"tag": "$_id"
}
}
]
}
}
])

Is it possible to group (aggregate) objects with dates into incremental intervals in MongoDB?

I am currently trying to create an aggregation pipeline in MongoDB to group the items into incremental time intervals, but I only succeeded in grouping them in disjoint time intervals so far.
Sample data:
{
"eventID": "abc",
"date": ISODate("2020-11-05T12:05:11.790Z"),
...........
},
{
"eventID": "xyz",
"date": ISODate("2020-11-05T12:12:11.790Z"),
...........
},
{
"eventID": "klm",
"date": ISODate("2020-11-05T12:28:11.790Z"),
...........
}
Current solution:
$group: {
"_id": {
"year": { $year: "$date" },
"dayOfYear": { $dayOfYear: "$date" },
"hour": { $hour: "$date" },
"interval": {
"$subtract": [
{ "$minute": "$date" },
{ "$mod": [{ "$minute": "$date"}, 10 ] }
]
}
},
"grouped_data": { "$push": { "eventID": "$eventID", "date": "$date" },
"count": { $sum: 1 } }
}
Which returns the data grouped in 10 minutes intervals but those are disjoint intervals (time windows of 10minutes that do not intersect).
Eg:
{
"_id": {
"year": 2020,
"dayOfYear": "314",
"hour": 12,
"interval": 0, // = interval beginning at minute 0 of 12th hour of the day
},
"grouped_data": [{ "eventID": "abc", "date": ISODate("2020-11-05T12:05:11.790Z" }],
"count": 1
},
{
"_id": {
"year": 2020,
"dayOfYear": "314",
"hour": 12,
"interval": 10, // = beginning at minute 10
},
"grouped_data": [{ "eventID": "xyz", "date": ISODate("2020-11-05T12:12:11.790Z") }],
"count": 1
},
{
"_id": {
"year": 2020,
"dayOfYear": "314",
"hour": 12,
"interval": 20, // = beginning at minute 20
},
"grouped_data": [{ "eventID": "klm", "date": ISODate("2020-11-05T12:28:11.790Z") }],
"count": 1
}
What I am actually looking for is grouping them in 10 minutes(or whatever is needed) incremental intervals. Eg: 0-9, 1-10, 2-11, etc. instead of 0-9, 10-19, 20-29 etc.
Edit:
The end goal here is to check if a count threshold is surpassed on a interval length defined by the user.
If user asks "Are there more than 2 events on a 10minute time window?", based on the sample data above and my current solution, the condition is not met. (1 event in 0-9 interval, and 1 event in 10-19). With incremental intervals I should be able to find that there are indeed 2 events in 10 minutes, but in the time interval 5-14. Eg:
{
"_id": {
*whatever logic for grouping in 10minutes window*
},
"grouped_data": [
{ "eventID": "abc", "date": ISODate("2020-11-05T12:05:11.790Z") },
{ "eventID": "xyz", "date": ISODate("2020-11-05T12:12:11.790Z") }],
"count": 2
},
{
"_id": {
*whatever logic for grouping in 10minutes window*
},
"grouped_data": [
{ "eventID": "klm", "date": ISODate("2020-11-05T12:28:11.790Z") }]
"count": 1
},
For me it is not clear which output you like to get, but this aggregation pipeline makes the sliding-window group:
db.collection.aggregate([
{
$group: {
_id: null,
data: { $push: "$$ROOT" },
min_date: { $min: "$date" },
max_date: { $max: "$date" }
}
},
{
$addFields: {
interval: {
$range: [
{ $toInt: { $divide: [{ $toLong: "$min_date" }, 1000] } },
{ $toInt: { $divide: [{ $toLong: "$max_date" }, 1000] } },
10 * 60]
}
}
},
{
$set: {
interval: {
$map: {
input: "$interval",
in: { $toDate: { $multiply: ["$$this", 1000] } }
}
}
}
},
{ $unwind: "$interval" },
{
$project: {
grouped_data: {
$filter: {
input: "$data",
cond: {
$and: [
{ $gte: ["$$this.date", "$interval"] },
{ $lt: ["$$this.date", { $add: ["$interval", 1000 * 60 * 10] }] },
]
}
}
},
interval: 1
}
}
])
Boundaries are given by input data, however can also use fixes dates:
db.collection.aggregate([
{ $group: { _id: null, data: { $push: "$$ROOT" } } },
{
$addFields: {
interval: {
$range: [
{ $toInt: { $divide: [{ $toLong: ISODate("2020-01-01T00:00:00Z") }, 1000] } },
{ $toInt: { $divide: [{ $toLong: ISODate("2020-12-31T23:59:59Z") }, 1000] } },
10 * 60]
}
}
},
{
$set: {
interval: {
$map: {
input: "$interval",
in: { $toDate: { $multiply: ["$$this", 1000] } }
}
}
}
},
{ $unwind: "$interval" },
{
$project: {
grouped_data: {
$filter: {
input: "$data",
cond: {
$and: [
{ $gte: ["$$this.date", "$interval"] },
{ $lt: ["$$this.date", { $add: ["$interval", 1000 * 60 * 10] }] },
]
}
}
},
interval: 1
}
}
])
I will try to answer my own question, maybe it will help other people on the internet. The solution I came up with is based on the answer of #Wernfried (thanks!).
db.getCollection("events_en").aggregate([
{
$match: { eventID: "XYZ" }
},
{
$group: {
_id: null,
events: { $push: "$$ROOT" },
limit: { $push: { $toDate: { $add: [{ $toLong: "$date" }, 1000 * 60 * 10] } } }
}
},
{ $unwind: "$limit" },
{
$project: {
events: {
$filter: {
input: "$events",
cond: {
$and: [
{ $lt: ["$$this.date", "$limit"] },
{ $gte: ["$$this.date", { $subtract: ["$limit", 1000 * 60 * 10] }] },
]
}
}
},
limit: 1,
}
},
{
$addFields: {
count: {
$size: "$events"
}
}
}
])
This will create a limit for each event, based on its date + 10 minutes (or whatever). And afterwards it filters the events (which are now duplicated for each of the limit using $unwind: "$limit"), based on that limit. The result is something like this:
{
"_id" : null,
"limit" : ISODate("2020-11-05T12:28:27.000+0000"),
"events" : [
{
"_id" : 13,
"eventID" : "XYZ",
"date" : ISODate("2020-11-05T12:18:27.000+0000")
},
{
"_id" : 63,
"eventID" : "XYZ",
"date" : ISODate("2020-11-05T12:19:55.000+0000")
},
............................
{
"_id" : 90,
"eventID" : "XYZ",
"date" : ISODate("2020-11-05T12:27:57.000+0000")
}
],
"count" : 5
}
{
"_id" : null,
"limit" : ISODate("2020-11-05T12:29:55.000+0000"),
"events" : [
{
"_id" : 63,
"eventID" : "XYZ",
"date" : ISODate("2020-11-05T12:19:55.000+0000")
},
{
"_id" : 90,
"eventID" : "XYZ",
"date" : ISODate("2020-11-05T12:27:57.000+0000")
},
{
"_id" : 97,
"eventID" : "XYZ",
"date" : ISODate("2020-11-05T12:29:36.000+0000")
}
],
"count" : 3
}
As you can see, looking at the limit of each group and at the dates of the events in each group, these intervals are now incremental, not disjoint. (event X is found in multiple groups, as long as it doesnt exceeds the time interval of 10minutes)

MongoDB Get average of group considering rank of document

I have documents getting in order like:
{
"_id": "abcde1",
"value" : 300
},
{
"_id": "abcde2",
"value" : 200
},
{
"_id": "abcde3",
"value" : 400
},
{
"_id": "abcde4",
"value" : 500
},
{
"_id": "abcde5",
"value" : 600
}
i.e,
I want average of "_id" of first 2, first 4 and all 5 documents matching like in single query:
{
"value_2" : 250, // Average of first 2 documents
"value_4" : 350, // Average of first four documents
"value_5" : 400 // Average of all 5 documents
}
Is it possible to Group documents based on rank of document.
I can do 3 results in 3 separate queries. Is it possible in single query?
You could try running the following pipeline:
db.collection.aggregate([
// previous pipeline here
{
"$group": {
"_id": null,
"values": { "$push": "$value" }
}
},
{ "$unwind": { "path": "$values", "includeArrayIndex": "rank" } },
{
"$group": {
"_id": null,
"value_2_sum": {
"$sum": {
"$cond": [
{ "$lt": ["$rank", 2] },
"$values",
0
]
}
},
"value_2_count": {
"$sum": {
"$cond": [
{ "$lt": ["$rank", 2] },
1,
0
]
}
},
"value_4_sum": {
"$sum": {
"$cond": [
{ "$lt": ["$rank", 4] },
"$values",
0
]
}
},
"value_4_count": {
"$sum": {
"$cond": [
{ "$lt": ["$rank", 4] },
1,
0
]
}
},
"value_5": { "$avg": "$values" }
}
},
{
"$project": {
"value_2" : { "$divide": ["$value_2_sum", "$value_2_count"] }, // Average of first 2 documents
"value_4" : { "$divide": ["$value_4_sum", "$value_4_count"] }, // Average of first four documents
"value_5" : 1
}
}
])
You could use a $facet aggregation stage:
// { _id: "abcde1", value: 300 }
// { _id: "abcde2", value: 200 }
// { _id: "abcde3", value: 400 }
// { _id: "abcde4", value: 500 }
// { _id: "abcde5", value: 600 }
db.collection.aggregate([
{ $facet: {
value_2: [ { $limit: 2 }, { $group: { _id: null, value_2: { $avg: "$value" } } } ],
value_4: [ { $limit: 4 }, { $group: { _id: null, value_4: { $avg: "$value" } } } ],
value_5: [ { $limit: 5 }, { $group: { _id: null, value_5: { $avg: "$value" } } } ]
}},
// {
// value_2: [ { _id: null, value_2: 250 } ],
// value_4: [ { _id: null, value_4: 350 } ],
// value_5: [ { _id: null, value_5: 400 } ]
// }
{ $set: {
value_2: { $first: "$value_2.value_2" },
value_4: { $first: "$value_4.value_4" },
value_5: { $first: "$value_5.value_5" }
}}
])
// { "value_2" : 250, "value_4" : 350, "value_5" : 400 }
The $facet stage allows us to run multiple aggregation pipelines within a single stage on the same set of input documents. Each sub-pipeline has its own field in the output document where its results are stored as an array of documents.
Each field is thus produced by its own aggregation pipeline whose first stage is a simple $limit, followed by a $group stage that'll produce the $avg (average) of all considered documents.
The second part of the pipeline (the $set stage) is just there to clean-up the $facet output to the format you wished for.