Group by inner array element after $unwind - mongodb

I have the following time series data stored in mongodb
{
"_id" : ObjectId("59a46062e1aeb958a712490e"),
"channelName" : "ABC",
"rtData" : [
{
"ts" : ISODate("2017-08-28T18:26:42.837Z"),
"data" : [ 676.297664, 676.297664 ]
},
{
"ts" : ISODate("2017-08-28T18:27:42.837Z"),
"data" : [ 724.297664, 676.297664 ]
},
{
"ts" : ISODate("2017-08-28T18:29:42.837Z"),
"data" : [ 878.297, 676.297 ]
}
]
}
I want to group the data based on the ts field on hour and get the first element of rtData for that hour.
Here is what I have tried
db.channels.aggregate( [ {$match: {"channelName": "ABC"} }, { $unwind : "$rtData" }, { $group : {_id: { $hour: "$rtData.ts" }, ucast: { $sum: $rtData.data[0]} }
But the above code gives me the following output
{ "_id" : 28, "ucast" : 0 }
What I actually want is
{ "_id" : 28, "ucast" : 676.297664 }

You don't notate getting a first element of an array in an aggregation pipeline like that. You want $arrayElemAt which returns the array value by index:
db.channels.aggregate( [
{ $match: {"channelName": "ABC"} },
{ $unwind : "$rtData" },
{ $group : {
_id: { $hour: "$rtData.ts" },
ucast: { $sum: { $arrayElemAt: [ "$rtData.data", 0 ] } }
}}
])
If your MongoDB does not support $arrayElemAt ( prior to 3.2 ), then you can instead use $first in an additional $group on just the document key, done before you "accumulate" for the desired grouping key:
db.channels.aggregate( [
{ $match: {"channelName": "ABC"} },
{ $unwind : "$rtData" },
{ $group: {
_id: { _id: "$_id", ts: "$rtData.ts" },
data: { $first: "$rtData.data" }
}},
{ $group : {
_id: { $hour: "$_id.ts" },
ucast: { $sum: "$data" }
}}
])
In modern versions you can "double barrel" the $sum to both add up array elements as well as act as an accumulator if you wanted to "sum" all elements of the array:
db.channels.aggregate( [
{ $match: {"channelName": "ABC"} },
{ $unwind : "$rtData" },
{ $group : {
_id: { $hour: "$rtData.ts" },
ucast: { $sum: { $sum: "$rtData.data" } }
}}
])
And with older versions ( prior to 3.2 ) you would "double" $unwind for each array path instead:
db.channels.aggregate( [
{ $match: {"channelName": "ABC"} },
{ $unwind : "$rtData" },
{ $unwind : "$rtData.data" },
{ $group : {
_id: { $hour: "$rtData.ts" },
ucast: { $sum: "$rtData.data" }
}}
])

you need to use $first operator for that instead of $sum
db.channels.aggregate( [ {$match: {"channelName": "ABC"} }, { $unwind : "$rtData" }, { $group : {_id: { $hour: "$rtData.ts" }, ucast: { $first: $rtData.data} }
which will give you output like { "_id" : 28, "ucast" : [ 676.297664, 676.297664 ] }
if you want output like { "_id" : 28, "ucast" : 676.297664 } in next $project or $addFields stage use $arrayElemAt

Related

How can i count total documents and also grouped counts simultanously in mongodb aggregation?

I have a dataset in mongodb collection named visitorsSession like
{ip : 192.2.1.1,country : 'US', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'},
{ip : 192.3.1.8,country : 'UK', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'},
{ip : 192.5.1.4,country : 'UK', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'},
{ip : 192.8.1.7,country : 'US', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'},
{ip : 192.1.1.3,country : 'US', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'}
I am using this mongodb aggregation
[{$match: {
nsp : "/hrm.sbtjapan.com",
creationDate : {
$gte: "2019-12-15T00:00:00.359Z",
$lte: "2019-12-20T23:00:00.359Z"
},
type : "Visitors"
}}, {$group: {
_id : "$country",
totalSessions : {
$sum: 1
}
}}, {$project: {
_id : 0,
country : "$_id",
totalSessions : 1
}}, {$sort: {
country: -1
}}]
using above aggregation i am getting results like this
[{country : 'US',totalSessions : 3},{country : 'UK',totalSessions : 2}]
But i also total visitors also along with result like totalVisitors : 5
How can i do this in mongodb aggregation ?
You can use $facet aggregation stage to calculate total visitors as well as visitors by country in a single pass:
db.visitorsSession.aggregate( [
{
$match: {
nsp : "/hrm.sbtjapan.com",
creationDate : {
$gte: "2019-12-15T00:00:00.359Z",
$lte: "2019-12-20T23:00:00.359Z"
},
type : "Visitors"
}
},
{
$facet: {
totalVisitors: [
{
$count: "count"
}
],
countrySessions: [
{
$group: {
_id : "$country",
sessions : { $sum: 1 }
}
},
{
$project: {
country: "$_id",
_id: 0,
sessions: 1
}
}
],
}
},
{
$addFields: {
totalVisitors: { $arrayElemAt: [ "$totalVisitors.count" , 0 ] },
}
}
] )
The output:
{
"totalVisitors" : 5,
"countrySessions" : [
{
"sessions" : 2,
"country" : "UK"
},
{
"sessions" : 3,
"country" : "US"
}
]
}
You could be better off with two queries to do this.
To save the two db round trips following aggregation can be used which IMO is kinda verbose (and might be little expensive if documents are very large) to just count the documents.
Idea: Is to have a $group at the top to count documents and preserve the original documents using $push and $$ROOT. And then before other matches/filter ops $unwind the created array of original docs.
db.collection.aggregate([
{
$group: {
_id: null,
docsCount: {
$sum: 1
},
originals: {
$push: "$$ROOT"
}
}
},
{
$unwind: "$originals"
},
{ $match: "..." }, //and other stages on `originals` which contains the source documents
{
$group: {
_id: "$originals.country",
totalSessions: {
$sum: 1
},
totalVisitors: {
$first: "$docsCount"
}
}
}
]);
Sample O/P: Playground Link
[
{
"_id": "UK",
"totalSessions": 2,
"totalVisitors": 5
},
{
"_id": "US",
"totalSessions": 3,
"totalVisitors": 5
}
]

Count Both Outer and Inner embedded array in a single query

{
_id: ObjectId("5dbdacc28cffef0b94580dbd"),
"comments" : [
{
"_id" : ObjectId("5dbdacc78cffef0b94580dbf"),
"replies" : [
{
"_id" : ObjectId("5dbdacd78cffef0b94580dc0")
},
]
},
]
}
How to count the number of element in comments and sum with number of relies
My approach is do 2 query like this:
1. total elements of replies
db.posts.aggregate([
{$match: {_id:ObjectId("5dbdacc28cffef0b94580dbd")}},
{ $unwind: "$comments",},
{$project:{total:{$size:"$comments.replies"} , _id: 0} }
])
2. count total elements of comments
db.posts.aggregate([
{$match: {_id:ObjectId("5dbdacc28cffef0b94580dbd")}},
{$project:{total:{$size:"$comments.replies"} , _id: 0} }
])
Then sum up both, do we have any better solution to write the query like return the sum of of total element comments + replies
You can use $reduce and $concatArrays to "merge" an inner "array of arrays" into a single list and measure the $size of that. Then simply $add the two results together:
db.posts.aggregate([
{ "$match": { _id:ObjectId("5dbdacc28cffef0b94580dbd") } },
{ "$addFields": {
"totalBoth": {
"$add": [
{ "$size": "$comments" },
{ "$size": {
"$reduce": {
"input": "$comments.replies",
"initialValue": [],
"in": {
"$concatArrays": [ "$$value", "$$this" ]
}
}
}}
]
}
}}
])
Noting that an "array of arrays" is the effect of an expression like $comments.replies, so hence the operation to make these into a single array where you can measure all elements.
Try using the $unwind to flatten the list you get from the $project before using $count.
This is another way of getting the result.
Input documents:
{ "_id" : 1, "array1" : [ { "array2" : [ { id: "This is a test!"}, { id: "test1" } ] }, { "array2" : [ { id: "This is 2222!"}, { id: "test 222" }, { id: "222222" } ] } ] }
{ "_id" : 2, "array1" : [ { "array2" : [ { id: "aaaa" }, { id: "bbbb" } ] } ] }
The query:
db.arrsizes2.aggregate( [
{ $facet: {
array1Sizes: [
{ $project: { array1Size: { $size: "$array1" } } }
],
array2Sizes: [
{ $unwind: "$array1" },
{ $project: { array2Size: { $size: "$array1.array2" } } },
],
} },
{ $project: { result: { $concatArrays: [ "$array1Sizes", "$array2Sizes" ] } } },
{ $unwind: "$result" },
{ $group: { _id: "$result._id", total1: { $sum: "$result.array1Size" }, total2: { $sum: "$result.array2Size" } } },
{ $addFields: { total: { $add: [ "$total1", "$total2" ] } } },
] )
The output:
{ "_id" : 2, "total1" : 1, "total2" : 2, "total" : 3 }
{ "_id" : 1, "total1" : 2, "total2" : 5, "total" : 7 }

Fetch distinct values from Mongo DB nested array and output to a single array

given below is my data in mongo db.I want to fetch all the unique ids from the field articles ,which is nested under the jnlc_subjects index .The result should contain only the articles array with distinct object Ids.
Mongo Data
{
"_id" : ObjectId("5c9216f1a21a4a31e0c7fa56"),
"jnlc_journal_category" : "Biology",
"jnlc_subjects" : [
{
"subject" : "Conservation Biology",
"views" : "123",
"articles" : [
ObjectId("5c4e93d0135edb6812200d5f"),
ObjectId("5c4e9365135edb6a12200d60"),
ObjectId("5c4e93a8135edb6912200d61")
]
},
{
"subject" : "Micro Biology",
"views" : "20",
"articles" : [
ObjectId("5c4e9365135edb6a12200d60"),
ObjectId("5c4e93d0135edb6812200d5f"),
ObjectId("5c76323fbaaccf5e0bae7600"),
ObjectId("5ca33ce19d677bf780fc4995")
]
},
{
"subject" : "Marine Biology",
"views" : "8",
"articles" : [
ObjectId("5c4e93d0135edb6812200d5f")
]
}
]
}
Required result
I want to get output in following format
articles : [
ObjectId("5c4e9365135edb6a12200d60"),
ObjectId("5c4e93a8135edb6912200d61"),
ObjectId("5c76323fbaaccf5e0bae7600"),
ObjectId("5ca33ce19d677bf780fc4995"),
ObjectId("5c4e93d0135edb6812200d5f")
]
Try as below:
db.collection.aggregate([
{
$unwind: "$jnlc_subjects"
},
{
$unwind: "$jnlc_subjects.articles"
},
{ $group: {_id: null, uniqueValues: { $addToSet: "$jnlc_subjects.articles"}} }
])
Result:
{
"_id" : null,
"uniqueValues" : [
ObjectId("5ca33ce19d677bf780fc4995"),
ObjectId("5c4e9365135edb6a12200d60"),
ObjectId("5c4e93a8135edb6912200d61"),
ObjectId("5c4e93d0135edb6812200d5f"),
ObjectId("5c76323fbaaccf5e0bae7600")
]
}
Try with this
db.collection.aggregate([
{
$unwind:{
path:"$jnlc_subjects",
preserveNullAndEmptyArrays:true
}
},
{
$unwind:{
path:"$jnlc_subjects.articles",
preserveNullAndEmptyArrays:true
}
},
{
$group:{
_id:"$_id",
articles:{
$addToSet:"$jnlc_subjects.articles"
}
}
}
])
If you don't want to $group with _id ypu can use null instead of $_id
According to description as mentioned into above question,as a solution to it please try executing following aggregate operation.
db.collection.aggregate(
// Pipeline
[
// Stage 1
{
$match: {
"_id": ObjectId("5c9216f1a21a4a31e0c7fa56")
}
},
// Stage 2
{
$unwind: {
path: "$jnlc_subjects",
}
},
// Stage 3
{
$unwind: {
path: "$jnlc_subjects.articles"
}
},
// Stage 4
{
$group: {
_id: null,
articles: {
$addToSet: '$jnlc_subjects.articles'
}
}
},
// Stage 5
{
$project: {
articles: 1,
_id: 0
}
},
]
);

Need to sum from array object value in mongodb

I am trying to calculate total value if that value exits. But query is not working 100%. So can somebody help me to solve this problem. Here my sample document. I have attached two documents. Please these documents & find out best solution
Document : 1
{
"_id" : 1"),
"message_count" : 4,
"messages" : {
"data" : [
{
"id" : "11",
"saleValue": 1000
},
{
"id" : "112",
"saleValue": 1400
},
{
"id" : "22",
},
{
"id" : "234",
"saleValue": 111
}
],
},
"createdTime" : ISODate("2018-03-18T10:18:48.000Z")
}
Document : 2
{
"_id" : 444,
"message_count" : 4,
"messages" : {
"data" : [
{
"id" : "444",
"saleValue" : 2060
},
{
"id" : "444",
},
{
"id" : 234,
"saleValue" : 260
},
{
"id" : "34534",
}
]
},
"createdTime" : ISODate("2018-03-18T03:11:50.000Z")
}
Needed Output:
{
total : 4831
}
My query :
db.getCollection('myCollection').aggregate([
{
"$group": {
"_id": "$Id",
"totalValue": {
$sum: {
$sum: "$messages.data.saleValue"
}
}
}
}
])
So please if possible help me to solve this problem. Thanks in advance
It's not working correctly because it is aggregating all the documents in the collection; you are grouping on a constant "_id": "tempId", you just need to reference the correct key by adding the $ as:
db.getCollection('myCollection').aggregate([
{ "$group": {
"_id": "$tempId",
"totalValue": {
"$sum": { "$sum": "$messages.data.saleValue" }
}
} }
])
which in essence is a single stage pipeline version of an aggregate operation with an extra field that holds the sum expression before the group pipeline then calling that field as the $sum operator in the group.
The above works since $sum from MongoDB 3.2+ is available in both the $project and $group stages and when used in the $project stage, $sum returns the sum of the list of expressions. The expression "$messages.data.value" returns a list of numbers [120, 1200] which are then used as the $sum expression:
db.getCollection('myCollection').aggregate([
{ "$project": {
"values": { "$sum": "$messages.data.value" },
"tempId": 1,
} },
{ "$group": {
"_id": "$tempId",
"totalValue": { "$sum": "$values" }
} }
])
You can add a $unwind before your $group, in that way you will deconstructs the data array, and then you can group properly:
db.myCollection.aggregate([
{
"$unwind": "$messages.data"
},
{
"$group": {
"_id": "tempId",
"totalValue": {
$sum: {
$sum: "$messages.data.value"
}
}
}
}
])
Output:
{ "_id" : "tempId", "totalValue" : 1320 }
db.getCollection('myCollection').aggregate([
{
$unwind: "$messages.data",
$group: {
"_id": "tempId",
"totalValue": { $sum: "$messages.data.value" }
}
}
])
$unwind
According to description as mentioned into above question, as a solution please try executing following aggregate query
db.myCollection.aggregate(
// Pipeline
[
// Stage 1
{
$unwind: {
path: '$messages.data'
}
},
// Stage 2
{
$group: {
_id: {
pageId: '$pageId'
},
total: {
$sum: '$messages.data.saleValue'
}
}
},
// Stage 3
{
$project: {
pageId: '$_id.pageId',
total: 1,
_id: 0
}
}
]
);
You can do it without using $group. Grouping made other data to be managed and addressed. So, I prefer using $sum and $map as shown below:
db.getCollection('myCollection').aggregate([
{
$addFields: {
total: {
$sum: {
$map: {
input: "$messages.data",
as: "message",
in: "$$message.saleValue",
},
},
},
},
},
}
])

MongoDB Aggregation: Compute Running Totals from sum of previous rows

Sample Documents:
{ time: ISODate("2013-10-10T20:55:36Z"), value: 1 }
{ time: ISODate("2013-10-10T22:43:16Z"), value: 2 }
{ time: ISODate("2013-10-11T19:12:66Z"), value: 3 }
{ time: ISODate("2013-10-11T10:15:38Z"), value: 4 }
{ time: ISODate("2013-10-12T04:15:38Z"), value: 5 }
It's easy to get the aggregated results that is grouped by date.
But what I want is to query results that returns a running total
of the aggregation, like:
{ time: "2013-10-10" total: 3, runningTotal: 3 }
{ time: "2013-10-11" total: 7, runningTotal: 10 }
{ time: "2013-10-12" total: 5, runningTotal: 15 }
Is this possible with the MongoDB Aggregation?
EDIT: Since MongoDB v5.0 the prefered approach would be to use the new $setWindowFields aggregation stage as shared by Xavier Guihot.
This does what you need. I have normalised the times in the data so they group together (You could do something like this). The idea is to $group and push the time's and total's into separate arrays. Then $unwind the time array, and you have made a copy of the totals array for each time document. You can then calculated the runningTotal (or something like the rolling average) from the array containing all the data for different times. The 'index' generated by $unwind is the array index for the total corresponding to that time. It is important to $sort before $unwinding since this ensures the arrays are in the correct order.
db.temp.aggregate(
[
{
'$group': {
'_id': '$time',
'total': { '$sum': '$value' }
}
},
{
'$sort': {
'_id': 1
}
},
{
'$group': {
'_id': 0,
'time': { '$push': '$_id' },
'totals': { '$push': '$total' }
}
},
{
'$unwind': {
'path' : '$time',
'includeArrayIndex' : 'index'
}
},
{
'$project': {
'_id': 0,
'time': { '$dateToString': { 'format': '%Y-%m-%d', 'date': '$time' } },
'total': { '$arrayElemAt': [ '$totals', '$index' ] },
'runningTotal': { '$sum': { '$slice': [ '$totals', { '$add': [ '$index', 1 ] } ] } },
}
},
]
);
I have used something similar on a collection with ~80 000 documents, aggregating to 63 results. I am not sure how well it will work on larger collections, but I have found that performing transformations(projections, array manipulations) on aggregated data does not seem to have a large performance cost once the data is reduced to a manageable size.
here is another approach
pipeline
db.col.aggregate([
{$group : {
_id : { time :{ $dateToString: {format: "%Y-%m-%d", date: "$time", timezone: "-05:00"}}},
value : {$sum : "$value"}
}},
{$addFields : {_id : "$_id.time"}},
{$sort : {_id : 1}},
{$group : {_id : null, data : {$push : "$$ROOT"}}},
{$addFields : {data : {
$reduce : {
input : "$data",
initialValue : {total : 0, d : []},
in : {
total : {$sum : ["$$this.value", "$$value.total"]},
d : {$concatArrays : [
"$$value.d",
[{
_id : "$$this._id",
value : "$$this.value",
runningTotal : {$sum : ["$$value.total", "$$this.value"]}
}]
]}
}
}
}}},
{$unwind : "$data.d"},
{$replaceRoot : {newRoot : "$data.d"}}
]).pretty()
collection
> db.col.find()
{ "_id" : ObjectId("4f442120eb03305789000000"), "time" : ISODate("2013-10-10T20:55:36Z"), "value" : 1 }
{ "_id" : ObjectId("4f442120eb03305789000001"), "time" : ISODate("2013-10-11T04:43:16Z"), "value" : 2 }
{ "_id" : ObjectId("4f442120eb03305789000002"), "time" : ISODate("2013-10-12T03:13:06Z"), "value" : 3 }
{ "_id" : ObjectId("4f442120eb03305789000003"), "time" : ISODate("2013-10-11T10:15:38Z"), "value" : 4 }
{ "_id" : ObjectId("4f442120eb03305789000004"), "time" : ISODate("2013-10-13T02:15:38Z"), "value" : 5 }
result
{ "_id" : "2013-10-10", "value" : 3, "runningTotal" : 3 }
{ "_id" : "2013-10-11", "value" : 7, "runningTotal" : 10 }
{ "_id" : "2013-10-12", "value" : 5, "runningTotal" : 15 }
>
Here is a solution without pushing previous documents into a new array and then processing them. (If the array gets too big then you can exceed the maximum BSON document size limit, the 16MB.)
Calculating running totals is as simple as:
db.collection1.aggregate(
[
{
$lookup: {
from: 'collection1',
let: { date_to: '$time' },
pipeline: [
{
$match: {
$expr: {
$lt: [ '$time', '$$date_to' ]
}
}
},
{
$group: {
_id: null,
summary: {
$sum: '$value'
}
}
}
],
as: 'sum_prev_days'
}
},
{
$addFields: {
sum_prev_days: {
$arrayElemAt: [ '$sum_prev_days', 0 ]
}
}
},
{
$addFields: {
running_total: {
$sum: [ '$value', '$sum_prev_days.summary' ]
}
}
},
{
$project: { sum_prev_days: 0 }
}
]
)
What we did: within the lookup we selected all documents with smaller datetime and immediately calculated the sum (using $group as the second step of lookup's pipeline). The $lookup put the value into the first element of an array. We pull the first array element and then calculate the sum: current value + sum of previous values.
If you would like to group transactions into days and after it calculate running totals then we need to insert $group to the beginning and also insert it into $lookup's pipeline.
db.collection1.aggregate(
[
{
$group: {
_id: {
$substrBytes: ['$time', 0, 10]
},
value: {
$sum: '$value'
}
}
},
{
$lookup: {
from: 'collection1',
let: { date_to: '$_id' },
pipeline: [
{
$group: {
_id: {
$substrBytes: ['$time', 0, 10]
},
value: {
$sum: '$value'
}
}
},
{
$match: {
$expr: {
$lt: [ '$_id', '$$date_to' ]
}
}
},
{
$group: {
_id: null,
summary: {
$sum: '$value'
}
}
}
],
as: 'sum_prev_days'
}
},
{
$addFields: {
sum_prev_days: {
$arrayElemAt: [ '$sum_prev_days', 0 ]
}
}
},
{
$addFields: {
running_total: {
$sum: [ '$value', '$sum_prev_days.summary' ]
}
}
},
{
$project: { sum_prev_days: 0 }
}
]
)
The result is:
{ "_id" : "2013-10-10", "value" : 3, "running_total" : 3 }
{ "_id" : "2013-10-11", "value" : 7, "running_total" : 10 }
{ "_id" : "2013-10-12", "value" : 5, "running_total" : 15 }
Starting in Mongo 5, it's a perfect use case for the new $setWindowFields aggregation operator:
// { time: ISODate("2013-10-10T20:55:36Z"), value: 1 }
// { time: ISODate("2013-10-10T22:43:16Z"), value: 2 }
// { time: ISODate("2013-10-11T12:12:66Z"), value: 3 }
// { time: ISODate("2013-10-11T10:15:38Z"), value: 4 }
// { time: ISODate("2013-10-12T05:15:38Z"), value: 5 }
db.collection.aggregate([
{ $group: {
_id: { $dateToString: { format: "%Y-%m-%d", date: "$time" } },
total: { $sum: "$value" }
}},
// e.g.: { "_id" : "2013-10-11", "total" : 7 }
{ $set: { "date": "$_id" } }, { $unset: ["_id"] },
// e.g.: { "date" : "2013-10-11", "total" : 7 }
{ $setWindowFields: {
sortBy: { date: 1 },
output: {
running: {
$sum: "$total",
window: { documents: [ "unbounded", "current" ] }
}
}
}}
])
// { date: "2013-10-11", total: 7, running: 7 }
// { date: "2013-10-10", total: 3, running: 10 }
// { date: "2013-10-12", total: 5, running: 15 }
Let's focus on the $setWindowFields stage that:
chronologically $sorts grouped documents by date: sortBy: { date: 1 }
adds the running field in each document (output: { running: { ... }})
which is the $sum of totals ($sum: "$total")
on a specified span of documents (the window)
which is in our case any previous document: window: { documents: [ "unbounded", "current" ] } }
as defined by [ "unbounded", "current" ] meaning the window is all documents seen between the first document (unbounded) and the current document (current).