Unable to compute average time - mongodb

Three days into MongoDB, and I am not finding it very fluid. I am simply trying to compute the average of a time field, but I keep running into all sorts of problems.
Here is my code:
db.results.group({
    key: { "profile.Zend_Http_Client_Adapter_Socket::read==>fgets": { $exists: true } },
    initial: { count: 0, total: 0 },
    reduce: function(doc, out) {
        out.count++;
        out.total += doc."profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt";
    },
    finalize: function(out) {
        out.avg = out.total / out.count;
    }
});
The error:
SyntaxError: Unexpected String
The above is a very childish error, but I can't understand why it would state this. The only plausible reason I can think of is that the keys I have specified here are in quotes, hence Mongo is getting confused.
BUT these keys ARE in quotes in my collection, so there should be no reason why I keep getting this syntax error, right?
Sample document:
{
"_id" : ObjectId("532a2a986803faba658b456b"),
"profile" : {
"main()==>register_shutdown_function" : {
"ct" : 1,
"wt" : 13,
"cpu" : 0,
"mu" : 1568,
"pmu" : 1000
},
"main()==>load::htdocs/index.php" : {
"ct" : 1,
"wt" : 17,
"cpu" : 0,
"mu" : 1736,
"pmu" : 4296
},
{"Zend_Http_Client_Adapter_Curl::write==>curl_exec" : {
"ct" : 3,
"wt" : 54782314,
"cpu" : 16001,
"mu" : 83288,
"pmu" : 49648
}, ....

As per the comment, your problem is one of forming valid JavaScript: doc."profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt" is a syntax error, since a quoted string cannot follow the dot operator. Field names containing special characters must be accessed with bracket notation, one level at a time, e.g. doc.profile["Zend_Http_Client_Adapter_Socket::read==>fgets"].wt. Also, your "key" value would not seem to be what you really want: that is a query condition, not a grouping key. There is, however, the aggregate function, which you should be favoring over the use of "group":
db.results.aggregate([
{ "$match": {
"$and": [
{ "profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt": {
"$exists": true
}},
{ "profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt": {
"$not": { "$type": 2 }
}}
]
}},
{ "$group": {
"_id": null,
"total": { "$sum":
"$profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt"
},
"count": { "$sum": 1 }
}},
{ "$project": {
"_id": 0,
"avg": { "$divide": [ "$total", "$count" ] }
}}
])
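For reference, against the single test document shown under "Problems With Data" below, this pipeline returns one document of the form:
{ "avg" : 54782314 }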
The aggregation pipeline largely supersedes earlier introduced functions such as group and distinct, and for all but trivial operations it should be your favored choice.
It also runs much faster, as it is processed in native code rather than the JavaScript engine.
Also see the SQL to aggregation mapping chart in the documentation.
Problems With Data
Your sample is not very complete. To sort out all the issues, I had to insert a document like this:
{
"profile": {
"Zend_Http_Client_Adapter_Socket::read==>fgets": {
"ct" : 3,
"wt" : 54782314,
"cpu" : 16001,
"mu" : 83288,
"pmu" : 49648
}
}
}
Also your document example has some invalid fields in it:
{
"_id" : ObjectId("532a2a986803faba658b456b"),
"profile" : {
"main()==>register_shutdown_function" : {
"ct" : 1,
"wt" : 13,
"cpu" : 0,
"mu" : 1568,
"pmu" : 1000
},
"main()==>load::htdocs/index.php" : { <-- Invalid
"ct" : 1,
"wt" : 17,
"cpu" : 0,
"mu" : 1736,
"pmu" : 4296
},
So that field cannot exist as it has a . in the field name, which for obvious sub-document reasons is not allowed.

Neil's answer led me to the correct solution:
db.results.aggregate([
{
$match: {
"profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt": {
"$exists": true
}
}
},
{
$group: {
"_id": null,
"total": {
$sum: "$profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt"
},
"count": {
$sum: 1
}
}
},
{
$project: {
"_id": 0,
"count": "$count",
"avg": {
$divide: [
"$total",
"$count"
]
}
}
}
]);
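For reference, the final $project shapes each result as { "count": <n>, "avg": <number> }, with the _id suppressed.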

Related

In MongoDB how to get the max/min/avg/count of a single matched element occurring inside an embedded array?

I have a MongoDB collection as below, and I want to get the min/max/avg/count of the xxx field inside all documents matching { "Parsed.FileId": "421462559", "Parsed.MessageId": "123" }.
Note that the Fields array of each document contains only a single element with SchemaName = "xxx".
Is this possible with MongoDB aggregation (or another feature), and how?
{
"_id" : NumberLong(409),
"Parsed" : {
"FileId" : "421462559",
"MessageId": "123",
"Fields" : [
{
"SchemaName" : "xxx",
"Type" : 0,
"Value" : 6
},
{
"SchemaName" : "yyy",
"Type" : 0,
"Value" : 5
}
]
}
},
{
"_id" : NumberLong(510),
"Parsed" : {
"FileId" : "421462559",
"MessageId": "123",
"Fields" : [
{
"SchemaName" : "xxx",
"Type" : 0,
"Value" : 10
},
{
"SchemaName" : "yyy",
"Type" : 0,
"Value" : 20
}
]
}
}
For the example collection above, I expect to get the following result for field xxx:
{
count: 2,
min: 6,
max: 10,
avg: 8
}
You can use the aggregation below. Basically you need to first $unwind the nested array and then use $group with the corresponding accumulators, i.e. $min, $max, $sum and $avg:
db.collection.aggregate([
{ "$match": { "Parsed.Fields.SchemaName": "xxx" }},
{ "$unwind": "$Parsed.Fields" },
{ "$match": { "Parsed.Fields.SchemaName": "xxx" }},
{ "$group": {
"_id": null,
"count": { "$sum": 1 },
"max": { "$max": "$Parsed.Fields.Value" },
"min": { "$min": "$Parsed.Fields.Value" },
"avg": { "$avg": "$Parsed.Fields.Value" }
}}
])
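Run against the two sample documents above, this returns the expected figures (the "_id": null key comes from the $group stage):
{ "_id" : null, "count" : 2, "max" : 10, "min" : 6, "avg" : 8 }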

Counting results in aggregate selection

My MongoDB database has this structure:
{
"_id" : ObjectId("5c1ccc20fc0f60769227d455"),
"type" : 0,
"id" : "hwJyzAHyfjXUlrGhblT7txWd",
"userowner" : 1.0,
"campid" : "9548",
"date" : 1545391136,
"useragent" : "mozilla/5.0 (windows nt 10.0; win64; x64; rv:65.0) gecko/20100101 firefox/65.0",
"domain" : "",
"referer" : "",
"country" : "en",
"language" : "en-US",
"languages" : [
"en-US",
"en"
],
"screenres" : [
"1920*1080"
],
"avscreenres" : [
"1080*1858"
],
"webgl" : "angle (nvidia geforce gtx 1060 6gb direct3d11 vs_5_0 ps_5_0)",
"hash" : 123,
"timezone" : -180,
"result" : true,
"resultreason" : "learning",
"remoteip" : "0.0.0.0"
}
Every document has a field "result" with a boolean value.
I run this aggregation:
db.getCollection('clicks').aggregate([
{ $match: {userowner: 1, date:{$gte: 0, $lte: 9545392055}} },
{ $group : {_id : "$campid",
number: {$sum: 1}}}
])
and get this result:
/* 1 */
{
"_id" : "4587",
"number" : 2.0
}
/* 2 */
{
"_id" : "9548",
"number" : 1346.0
}
How can I count the number of "true" and "false" values in the "result" field and get a result like this:
/* 1 */
{
"_id" : "4587",
"number" : 2.0,
"passed":100,
"blocked":120
}
/* 2 */
{
"_id" : "9548",
"number" : 1346.0,
"passed":100,
"blocked":120
}
I hope this works as per your requirement.
db.getCollection('clicks').aggregate([
    {
        $match: {
            userowner: 1,
            date: { $gte: 0, $lte: 9545392055 }
        }
    },
    {
        $group: {
            _id: "$campid",
            passed: {
                $sum: { $cond: [{ $eq: ["$result", true] }, 1, 0] }
            },
            blocked: {
                $sum: { $cond: [{ $eq: ["$result", false] }, 1, 0] }
            },
            number: { $sum: 1 }
        }
    },
    {
        $project: {
            _id: 0,
            campid: "$_id",
            number: 1,
            passed: 1,
            blocked: 1
        }
    }
])
Output:
{
"passed" : 3,
"blocked" : 2,
"number" : 5,
"campid" : "4587"
}
{
"passed" : 2,
"blocked" : 1,
"number" : 3,
"campid" : "9548"
}
Refer to $group, $cond, and $eq for more info.
With MongoDb 3.6 and newer, you can leverage the use of $arrayToObject operator within a $replaceRoot pipeline to get the desired result.
You would need to group the documents initially by the campid and the result fields, aggregate the sum, and pass the results to yet another group pipeline stage. This group stage will prepare the documents in a way that $arrayToObject will give you the desired object, by creating a key-value array using $push.
The result from this is then fed to the $replaceRoot pipeline to bring the passed and blocked fields to the root of the document.
The following aggregate pipeline describes the above:
db.getCollection('clicks').aggregate([
{ "$match": { "userowner": 1, "date": { "$gte": 0, "$lte": 9545392055 } } },
{ "$group": {
"_id": {
"campid": "$campid",
"result": { "$cond": [ "$result", "passed", "blocked" ] }
},
"count": { "$sum": 1 }
} },
{ "$group": {
"_id": "$_id.campid",
"number": { "$sum": "$count" },
"counts": {
"$push": {
"k": "$_id.result",
"v": "$count"
}
}
} },
{ "$replaceRoot": {
"newRoot": {
"$mergeObjects": [
{ "$arrayToObject": "$counts" },
"$$ROOT"
]
}
} },
{ "$project": { "counts": 0 } }
])
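One caveat with this dynamic approach: if a campid never produced a given result value, the corresponding key ("passed" or "blocked") is simply absent from the output, since nothing was pushed for it. If both keys must always be present, a minimal sketch (assuming you want zero defaults) is to append a stage such as:
{ "$addFields": {
    "passed": { "$ifNull": [ "$passed", 0 ] },
    "blocked": { "$ifNull": [ "$blocked", 0 ] }
}}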

Is it recommended to use unwind in working with large amount of data with nested documents on MongoDB [duplicate]

I am new to MongoDB and trying to work with nested documents. I have a query as below:
db.EndpointData.aggregate([
{ "$group" : { "_id" : "$EndpointId", "RequestCount" : { "$sum" : 1 }, "FirstActivity" : { "$min" : "$DateTime" }, "LastActivity" : { "$max" : "$DateTime" }, "Tags" : { "$push" : "$Tags" } } },
{ "$unwind" : "$Tags" },
{ "$unwind" : "$Tags" },
{ "$group" : { "_id" : "$_id", "RequestCount" : { "$first" : "$RequestCount" }, "Tags" : { "$push" : "$Tags" }, "FirstActivity" : { "$first" : "$FirstActivity" }, "LastActivity" : { "$first" : "$LastActivity" } } },
{ "$unwind" : "$Tags" },
{ "$unwind" : "$Tags.Sensors" },
{ "$group" : { "_id" : { "EndpointId" : "$_id", "Uid" : "$Tags.Uid", "Type" : "$Tags.Sensors.Type" }, "RequestCount" : { "$first" : "$RequestCount" }, "FirstActivity" : { "$first" : "$FirstActivity" }, "LastActivity" : { "$first" : "$LastActivity" } } },
{ "$group" : { "_id" : { "EndpointId" : "$_id.EndpointId", "Uid" : "$_id.Uid" }, "count" : { "$sum" : 1 }, "RequestCount" : { "$first" : "$RequestCount" }, "FirstActivity" : { "$first" : "$FirstActivity" }, "LastActivity" : { "$first" : "$LastActivity" } } },
{ "$group" : { "_id" : "$_id.EndpointId", "TagCount" : { "$sum" : 1 }, "SensorCount" : { "$sum" : "$count" }, "RequestCount" : { "$first" : "$RequestCount" }, "FirstActivity" : { "$first" : "$FirstActivity" }, "LastActivity" : { "$first" : "$LastActivity" } } }])
and my data structure is as below
{
"_id": "6aef51dfaf42ea1b70d0c4db",
"EndpointId": "98799bcc-e86f-4c8a-b340-8b5ed53caf83",
"DateTime": "2018-05-06T19:05:02.666Z",
"Url": "test",
"Tags": [
{
"Uid": "C1:3D:CA:D4:45:11",
"Type": 1,
"DateTime": "2018-05-06T19:05:02.666Z",
"Sensors": [
{
"Type": 1,
"Value": { "$numberDecimal": "-95" }
},
{
"Type": 2,
"Value": { "$numberDecimal": "-59" }
},
{
"Type": 3,
"Value": { "$numberDecimal": "11.029802536740132" }
}
]
},
{
"Uid": "C1:3D:CA:D4:45:11",
"Type": 1,
"DateTime": "2018-05-06T19:05:02.666Z",
"Sensors": [
{
"Type": 1,
"Value": { "$numberDecimal": "-92" }
},
{
"Type": 2,
"Value": { "$numberDecimal": "-59" }
}
]
}
]
}
This query works and is correct: I count the Tags, the Sensors, and the number of occurrences of each EndpointId. But the problem is that when I work with a large amount of data (about 10,000,000 documents) I run into memory problems. It seems that having four levels of $unwind creates the problem in this query. How can I reduce the number of unwinds in this query?
As long as your data has unique sensor and tag readings per document, which is what you have presented to date, then you simply don't need $unwind at all.
In fact, all you really need is a single $group:
db.endpoints.aggregate([
// In reality you would $match to limit the selection of documents
{ "$match": {
"DateTime": { "$gte": new Date("2018-05-01"), "$lt": new Date("2018-06-01") }
}},
{ "$group": {
"_id": "$EndpointId",
"FirstActivity" : { "$min" : "$DateTime" },
"LastActivity" : { "$max" : "$DateTime" },
"RequestCount": { "$sum": 1 },
"TagCount": {
"$sum": {
"$size": { "$setUnion": ["$Tags.Uid",[]] }
}
},
"SensorCount": {
"$sum": {
"$sum": {
"$map": {
"input": { "$setUnion": ["$Tags.Uid",[]] },
"as": "tag",
"in": {
"$size": {
"$reduce": {
"input": {
"$filter": {
"input": {
"$map": {
"input": "$Tags",
"in": {
"Uid": "$$this.Uid",
"Type": "$$this.Sensors.Type"
}
}
},
"cond": { "$eq": [ "$$this.Uid", "$$tag" ] }
}
},
"initialValue": [],
"in": { "$setUnion": [ "$$value", "$$this.Type" ] }
}
}
}
}
}
}
}
}}
])
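To unpack the SensorCount expression: $setUnion over "$Tags.Uid" yields the distinct tag Uids in the document; for each such Uid, the inner $map pairs every tag with its list of Sensors.Type values, $filter keeps only the entries for that Uid, $reduce set-unions their Type lists into the distinct sensor types seen for that Uid, and $size counts them. The outer $sum then adds the per-tag counts, giving the number of distinct (Uid, Type) combinations per document.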
Or if you actually do need to accumulate those "unique" values of "Sensors" and "Tags" from across different documents, then you still need initial $unwind statements to get the right grouping, but nowhere near as much as you presently have:
db.endpoints.aggregate([
// In reality you would $match to limit the selection of documents
{ "$match": {
"DateTime": { "$gte": new Date("2018-05-01"), "$lt": new Date("2018-06-01") }
}},
{ "$unwind": "$Tags" },
{ "$unwind": "$Tags.Sensors" },
{ "$group": {
"_id": {
"EndpointId": "$EndpointId",
"Uid": "$Tags.Uid",
"Type": "$Tags.Sensors.Type"
},
"FirstActivity": { "$min": "$DateTime" },
"LastActivity": { "$max": "$DateTime" },
"RequestCount": { "$addToSet": "$_id" }
}},
{ "$group": {
"_id": {
"EndpointId": "$_id.EndpointId",
"Uid": "$_id.Uid",
},
"FirstActivity": { "$min": "$FirstActivity" },
"LastActivity": { "$max": "$LastActivity" },
"count": { "$sum": 1 },
"RequestCount": { "$addToSet": "$RequestCount" }
}},
{ "$group": {
"_id": "$_id.EndpointId",
"FirstActivity": { "$min": "$FirstActivity" },
"LastActivity": { "$max": "$LastActivity" },
"TagCount": { "$sum": 1 },
"SensorCount": { "$sum": "$count" },
"RequestCount": { "$addToSet": "$RequestCount" }
}},
{ "$addFields": {
"RequestCount": {
"$size": {
"$reduce": {
"input": {
"$reduce": {
"input": "$RequestCount",
"initialValue": [],
"in": { "$setUnion": [ "$$value", "$$this" ] }
}
},
"initialValue": [],
"in": { "$setUnion": [ "$$value", "$$this" ] }
}
}
}
}}
],{ "allowDiskUse": true })
And from MongoDB 4.0 you can use $toString on the ObjectId within _id and simply merge the unique keys for those in order to keep the RequestCount, using $mergeObjects. This is cleaner and a bit more scalable than pushing nested array content and flattening it:
db.endpoints.aggregate([
// In reality you would $match to limit the selection of documents
{ "$match": {
"DateTime": { "$gte": new Date("2018-05-01"), "$lt": new Date("2018-06-01") }
}},
{ "$unwind": "$Tags" },
{ "$unwind": "$Tags.Sensors" },
{ "$group": {
"_id": {
"EndpointId": "$EndpointId",
"Uid": "$Tags.Uid",
"Type": "$Tags.Sensors.Type"
},
"FirstActivity": { "$min": "$DateTime" },
"LastActivity": { "$max": "$DateTime" },
"RequestCount": {
"$mergeObjects": {
"$arrayToObject": [[{ "k": { "$toString": "$_id" }, "v": 1 }]]
}
}
}},
{ "$group": {
"_id": {
"EndpointId": "$_id.EndpointId",
"Uid": "$_id.Uid",
},
"FirstActivity": { "$min": "$FirstActivity" },
"LastActivity": { "$max": "$LastActivity" },
"count": { "$sum": 1 },
"RequestCount": { "$mergeObjects": "$RequestCount" }
}},
{ "$group": {
"_id": "$_id.EndpointId",
"FirstActivity": { "$min": "$FirstActivity" },
"LastActivity": { "$max": "$LastActivity" },
"TagCount": { "$sum": 1 },
"SensorCount": { "$sum": "$count" },
"RequestCount": { "$mergeObjects": "$RequestCount" }
}},
{ "$addFields": {
"RequestCount": {
"$size": {
"$objectToArray": "$RequestCount"
}
}
}}
],{ "allowDiskUse": true })
Either form returns the same data, though the order of keys in the result may vary:
{
"_id" : "89799bcc-e86f-4c8a-b340-8b5ed53caf83",
"FirstActivity" : ISODate("2018-05-06T19:05:02.666Z"),
"LastActivity" : ISODate("2018-05-06T19:05:02.666Z"),
"RequestCount" : 2,
"TagCount" : 4,
"SensorCount" : 16
}
The result is obtained from these sample documents which you originally gave as a sample source in the original question on the topic:
{
"_id" : ObjectId("5aef51dfaf42ea1b70d0c4db"),
"EndpointId" : "89799bcc-e86f-4c8a-b340-8b5ed53caf83",
"DateTime" : ISODate("2018-05-06T19:05:02.666Z"),
"Url" : "test",
"Tags" : [
{
"Uid" : "C1:3D:CA:D4:45:11",
"Type" : 1,
"DateTime" : ISODate("2018-05-06T19:05:02.666Z"),
"Sensors" : [
{
"Type" : 1,
"Value" : NumberDecimal("-95")
},
{
"Type" : 2,
"Value" : NumberDecimal("-59")
},
{
"Type" : 3,
"Value" : NumberDecimal("11.029802536740132")
},
{
"Type" : 4,
"Value" : NumberDecimal("27.25")
},
{
"Type" : 6,
"Value" : NumberDecimal("2924")
}
]
},
{
"Uid" : "C1:3D:CA:D4:45:11",
"Type" : 1,
"DateTime" : ISODate("2018-05-06T19:05:02.666Z"),
"Sensors" : [
{
"Type" : 1,
"Value" : NumberDecimal("-95")
},
{
"Type" : 2,
"Value" : NumberDecimal("-59")
},
{
"Type" : 3,
"Value" : NumberDecimal("11.413037961112279")
},
{
"Type" : 4,
"Value" : NumberDecimal("27.25")
},
{
"Type" : 6,
"Value" : NumberDecimal("2924")
}
]
},
{
"Uid" : "E5:FA:2A:35:AF:DD",
"Type" : 1,
"DateTime" : ISODate("2018-05-06T19:05:02.666Z"),
"Sensors" : [
{
"Type" : 1,
"Value" : NumberDecimal("-97")
},
{
"Type" : 2,
"Value" : NumberDecimal("-58")
},
{
"Type" : 3,
"Value" : NumberDecimal("10.171658037099185")
}
]
}
]
}
/* 2 */
{
"_id" : ObjectId("5aef51e0af42ea1b70d0c4dc"),
"EndpointId" : "89799bcc-e86f-4c8a-b340-8b5ed53caf83",
"Url" : "test",
"Tags" : [
{
"Uid" : "E2:02:00:18:DA:40",
"Type" : 1,
"DateTime" : ISODate("2018-05-06T19:05:04.574Z"),
"Sensors" : [
{
"Type" : 1,
"Value" : NumberDecimal("-98")
},
{
"Type" : 2,
"Value" : NumberDecimal("-65")
},
{
"Type" : 3,
"Value" : NumberDecimal("7.845424441900629")
},
{
"Type" : 4,
"Value" : NumberDecimal("0.0")
},
{
"Type" : 6,
"Value" : NumberDecimal("3012")
}
]
},
{
"Uid" : "12:3B:6A:1A:B7:F9",
"Type" : 1,
"DateTime" : ISODate("2018-05-06T19:05:04.574Z"),
"Sensors" : [
{
"Type" : 1,
"Value" : NumberDecimal("-95")
},
{
"Type" : 2,
"Value" : NumberDecimal("-59")
},
{
"Type" : 3,
"Value" : NumberDecimal("12.939770381907275")
}
]
}
]
}
The bottom line is that you can either use the first form given here, which accumulates "within each document" and then "per endpoint" in a single stage and is the most optimal, or you actually need to identify things like the "Uid" on the tags or the "Type" on the sensors where those values occur more than once over any combination of documents grouped by endpoint.
Your sample data supplied to date only shows that these values are "unique within each document", therefore the first given form would be most optimal if this is the case for all remaining data.
In the event that it is not, then "unwinding" the two nested arrays in order to "aggregate the detail across documents" is the only way to approach this. You can limit the date range or other criteria as most "queries" typically have some bounds and do not actually work on the "whole" collection data, but the main fact remains that arrays would be "unwound" creating essentially a document copy for every array member.
The point on optimization is that you only need to do this "twice", since there are only two arrays. Doing successive $group then $unwind then $group again is always a sure sign you are doing something really wrong. Once you "take something apart", you should only ever need to "put it back together" once. The series of graded steps demonstrated here is that once-only approach, which is what optimizes.
Outside of the scope of your question still remains:
Add other realistic constraints to the query to reduce the documents processed, maybe even do so in "batches" and combine results
Add the allowDiskUse option to the pipeline to let temporary storage be used (actually demonstrated in the commands above).
Consider that "nested arrays" are probably not the best storage method for the analysis you want to do. It's always more efficient, when you know you need to $unwind, to simply write the data in that "unwound" form directly into a collection (a sketch follows below).
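For that last point, a minimal sketch, assuming a hypothetical target collection named endpointReadings: unwind once, write the flat documents out with $out (or $merge on MongoDB 4.2+), and then run the per-endpoint grouping against that flat collection instead.
db.endpoints.aggregate([
    { "$unwind": "$Tags" },
    { "$unwind": "$Tags.Sensors" },
    // one flat document per (tag, sensor) reading
    { "$out": "endpointReadings" }
], { "allowDiskUse": true })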
If you're dealing with data on the order of 10,000,000 documents, you're going to run into aggregation pipeline size limits easily. Specifically, according to the MongoDB documentation, there is a pipeline RAM use limit of 100MB. If each document carries at least 10 bytes of data, then 10,000,000 of them are enough to hit that limit, and your documents would certainly exceed that amount.
There are a few options available to you to resolve this problem:
1) You can use the allowDiskUse option as noted in the documentation (a one-line example follows this list).
2) You can project your documents further between unwind stages to limit document size (very unlikely to be enough on its own).
3) You can periodically generate summary documents on subsets of your data, and then perform your aggregations on those summary documents. If, for example, you run summary documents on subsets of size 1,000, you can reduce the number of documents in your pipelines from 10,000,000 to just 10,000.
4) You can look into sharding your collection and running these aggregate operations on a cluster to reduce the load on any single server.
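For option 1, this is just an extra options document passed to aggregate, for example:
db.EndpointData.aggregate([ /* ...pipeline stages... */ ], { allowDiskUse: true })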
Options 1 and 2 are both very short-term solutions. They're easy to implement, but won't help much in the long run. Options 3 and 4, however, are far more involved and trickier to implement, but will provide the greatest amount of scalability and are more likely to continue meeting your needs long-term.
Do be warned, however, that if you plan to approach option 4, you need to be very prepared. A sharded collection cannot be unsharded, and messing up can cause potentially irreparable data loss. Having a dedicated DBA with experience with MongoDB clusters is recommended.

MongoDB Group By count occurrences of values and output as new field

I have a 3 Collections Assignments, Status, Assignee.
Assignments Fields : [_id, status, Assignee]
Assignee and Status Fields : [_id, name].
There can be many assignments associated with the various Status and Assignee documents (linked via the _id field). There is no nesting or complex data.
I need a query over all assignment ids where the Assignees are the rows, the Statuses are the columns, and each combined cell is the count, with total counts at the end.
(An image visualizing the desired pivot table was attached to the original question.) I am new to the complex MongoDB aggregate framework; kindly guide me on how to achieve this query.
Note: data in the Status and Assignee collections will be dynamic; nothing is predetermined in the query, so the rows and columns are going to grow dynamically in the future. If the query supported pagination, that would be of great help. I cannot write a query with hard-coded status names like 'pending', 'completed' etc., as data will grow and existing values may change, e.g. to 'pending task' or 'completed work'.
Below is my query
db.getCollection('Assignments').aggregate([
{
"$group": {
"_id": {
"assignee": "$assignee",
"statusId": "$statusId"
},
"statusCount": { "$sum": 1 }
}
},
{
"$group": {
"_id": "$_id.assignee",
"statuses": {
"$push": {
"statusId": "$_id.statusId",
"count": "$statusCount"
},
},
"count": { "$sum": "$statusCount" }
}
},
]);
Below is the output format:
{
"_id" : "John",
"statuses" : {
"statusId" : "Pending",
"count" : 3.0
},
"count" : 3.0
}
{
"_id" : "Katrina",
"statuses" : [{
"statusId" : "Pending",
"count" : 1.0
},
{
"statusId" : "Completed",
"count" : 1.0
},
{
"statusId" : "Assigned",
"count" : 1.0
}],
"count" : 3.0
}
{
"_id" : "Collins",
"statuses" : {
"statusId" : "Pending",
"count" : 4.0
},
"count" : 4.0
}
Expected Output is:
{
"_id" : "Katrina",
"Pending" : 1.0,
"Completed" : 1.0,
"Assigned" : 1.0,
"totalCount" : 3.0
}
Any idea how to get the various statusIds for each assignee as keys, not values, in a single document?
You need another $group stage, after an $unwind, to count statuses based on the statusId string value:
{
"$group": {
"_id": "$_id",
"Pending" : {
"$sum": {
"$cond": [
{ "$eq": [
"$statuses.statusId",
"Pending"
]},
"$statuses.count",
0
]
}
},
"Completed" : {
"$sum": {
"$cond": [
{ "$eq": [
"$statuses.statusId",
"Completed"
]},
"$statuses.count",
0
]
}
},
"Assigned" : {
"$sum": {
"$cond": [
{ "$eq": [
"$statuses.statusId",
"Assigned"
]},
"$statuses.count",
0
]
}
},
"totalCount": { "$sum": 1 }
}
}
The final aggregate command:
db.getCollection('Assignments').aggregate([
{
"$group": {
"_id": {
"assignee": "$assignee",
"statusId": "$statusId"
},
"statusCount": { "$sum": 1 }
}
},
{
"$group": {
"_id": "$_id.assignee",
"statuses": {
"$push": {
"statusId": "$_id.statusId",
"count": "$statusCount"
},
},
"count": { "$sum": "$statusCount" }
}
},
{ "$unwind": "$statuses" },
{
"$group": {
"_id": "$_id",
"Pending" : {
"$sum": {
"$cond": [
{ "$eq": [
"$statuses.statusId",
"Pending"
]},
"$statuses.count",
0
]
}
},
"Completed" : {
"$sum": {
"$cond": [
{ "$eq": [
"$statuses.statusId",
"Completed"
]},
"$statuses.count",
0
]
}
},
"Assigned" : {
"$sum": {
"$cond": [
{ "$eq": [
"$statuses.statusId",
"Assigned"
]},
"$statuses.count",
0
]
}
},
"totalCount": { "$sum": 1 }
}
}
]);
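Note that this produces the expected flat shape, but only for the status names hard-coded into the $group stage; given the dynamic statuses mentioned in the question, the $arrayToObject approach below avoids that limitation.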
Why not just keep statuses as an object, so each status is a key/val pair? If that works for you, do the following:
db.getCollection('Assignments').aggregate(
[
{
"$group": {
"_id": {
"assignee": "$assignee",
"statusId": "$statusId"
},
"statusCount": { "$sum": 1 }
},
},
{
"$group" : {
"_id" : "$_id.assignee",
"statuses" : {
"$push" : {
"k" : "$_id.statusId", // <- "k" as key value important for $arrayToObject Function
"v" : "$statusCount" // <- "v" as key value important for $arrayToObject Function
}
},
"count" : {
"$sum" : "$statusCount"
}
}
},
{
"$project" : {
"_id" : 1.0,
"statuses" : {
"$arrayToObject" : "$statuses"
},
"totalCount" : "$count"
}
}
],
{
"allowDiskUse" : false
}
);
This gives you:
{
"_id" : "Katrina",
"statuses": {
"Pending" : 1.0,
"Completed" : 1.0,
"Assigned" : 1.0,
},
"totalCount" : 3.0
}
A compromise: it is one layer deeper, but statuses still has the shape you wanted, and it stays dynamic as each new statusId is added.
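If you do want the status keys at the root, exactly as in the expected output, one hedged option is to reuse the $replaceRoot/$mergeObjects pattern (MongoDB 3.6+) from the earlier "Counting results" answer and append two more stages:
{ "$replaceRoot": {
    "newRoot": { "$mergeObjects": [ "$statuses", "$$ROOT" ] }
}},
{ "$project": { "statuses": 0 } }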

mongodb aggregation conditional adding field based on value in array

Please excuse the title; I could not find a better description for what I am trying to do.
I have a collection of messages which stores the following information:
code: a unique identification code of the message
from: phone number the message was sent from
to: phone number the message was sent to
message: the message text
readings: an array of ObjectIds. The ids reference documents in another collection named "users". If an ObjectId is present here, it means that this message has been read by that particular user.
Example Data
{
"_id" : ObjectId("59ba30c95869d32a803e4c4d"),
"code" : "SM54c9366e9b8544e89bdcf2ee841adea7",
"from" : "+49157xxxxxxx",
"to" : "+49160xxxxxxxx",
"message" : "xxxxxxxx",
"createdAt" : ISODate("2017-09-14T07:33:39.000Z"),
"lastModifiedAt" : ISODate("2017-09-14T07:33:32.324Z"),
"status" : "delivered",
"room" : ObjectId("59bfa293bd7717251cecfae7"),
"readings" : [
ObjectId("59c25751dcfdaf2944ee2fae"),
ObjectId("59c25751dcfdaf2944e32fae")
],
}
/* 2 */
{
"_id" : ObjectId("59ba3270f53b7f2fb4fa807f"),
"code" : "SM04585672d02644018e3ff466d73c571d",
"from" : "+49xxxxxxx",
"to" : "+49xxxxxxxx",
"message" : "xxxxxxx",
"createdAt" : ISODate("2017-09-14T07:40:42.000Z"),
"lastModifiedAt" : ISODate("2017-09-14T07:40:34.338Z"),
"status" : "delivered",
"room" : ObjectId("59bfa293bd7717251cecfae7"),
"readings" : [
ObjectId("59c25751dcfdaf2944ee2fae")
],
}
What I want to achieve is that a message gets an additional field "hasRead" if a specific user has read the message.
Here is the result I want to achieve:
{
"_id" : ObjectId("59ba30c95869d32a803e4c4d"),
"code" : "SM54c9366e9b8544e89bdcf2ee841adea7",
"to" : "+491606983534",
"message" : "Schau mer mal",
"createdAt" : ISODate("2017-09-14T07:33:39.000Z"),
"lastModifiedAt" : ISODate("2017-09-14T07:33:32.324Z"),
"status" : "delivered",
"room" : ObjectId("59bfa293bd7717251cecfae7"),
"hasRead" : true
}
/* 2 */
{
"_id" : ObjectId("59ba3270f53b7f2fb4fa807f"),
"code" : "SM04585672d02644018e3ff466d73c571d",
"to" : "+491606983534",
"message" : "Schau mer mal",
"createdAt" : ISODate("2017-09-14T07:40:42.000Z"),
"lastModifiedAt" : ISODate("2017-09-14T07:40:34.338Z"),
"status" : "delivered",
"room" : ObjectId("59bfa293bd7717251cecfae7"),
"hasRead" : true
}
I constructed an aggregation with the following stages, but it looks so BIG for such a simple task, and I wonder if there is a more elegant, lighter way to do this.
The stages are:
$addFields: checks if the readings array is empty; if it is, it adds a dummy ObjectId, otherwise it keeps the readings array as-is
$unwind: unwinds the readings array
$addFields: adds a field "hasRead" by checking whether a specific ObjectId matches the "readings" field: true if equal, else false
$group: groups by all fields except "hasRead"; "hasRead" is based on $max of hasRead
$project: restructures the result to make it a flat object
And here is my code:
db.getCollection('sms').aggregate([
{ $addFields: {
"readings": {
"$cond": {
if: { $or: [ { "$gt": [ {"$size": "$readings"},0] } ]} ,
then: "$readings",
else: [ObjectId("000000000000000000000000")]
}
}
}},
{ $unwind: "$readings" },
{ $addFields: {
"hasRead": {
$cond: {
if: {
$eq: ["$readings", ObjectId("59c25751dcfdaf2944ee2fae")]
},
then: true,
else: false
}
}
}
},
{ $group: {
_id: {
_id: "$_id",
code: "$code",
from: "$from",
to: "$to",
message: "$message",
createdAt: "$createdAt",
lastModifiedAt: "$lastModifiedAt",
room: "$room"
},
hasRead: { $max: "$hasRead" }
}},
{ $project: {
"_id": "$_id._id",
"code": "$_id.code",
"from": "$_id.from",
"to": "$_id.to",
"message": "$_id.message",
"createdAt": "$_id.createdAt",
"lastModifiedAt": "$_id.lastModifiedAt",
"room": "$_id.room",
"hasRead": "$hasRead"
}}
])
After browsing through answers Neil (see comment) gave to another question, I could simplify the query to this:
db.getCollection('sms').aggregate([
{ "$addFields": {
"hasRead" : {
"$filter": {
"input": { "$setUnion": [ "$readings", []] },
"as": "o",
"cond" : {
"$eq": [ "$$o",ObjectId("59c25751dcfdaf2944ee2fae")]
}
}
}
}
},
{ "$project": {
"_id": 1,
"code": 1,
"to": 1,
"message": 1,
"createdAt": 1,
"lastModifiedAt" : 1,
"status": 1,
"room": 1,
"hasRead": {
"$cond": {
if: { "$gt": [ { "$size": "$hasRead" }, 0 ] }, // test the filtered array, not the original readings
then: true,
else: false
}
}
}
}
])
Way too late for this, but you can simply write:
db.getCollection("sms").aggregate([
{
$project: {
_id: 1,
code: 1,
to: 1,
message: 1,
createdAt: 1,
lastModifiedAt: 1,
status: 1,
room: 1,
hasRead: {
$in: [ObjectId("59c25751dcfdaf2944ee2fae"), "$readings"],
},
},
},
]);
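With the two sample messages and user ObjectId("59c25751dcfdaf2944ee2fae"), both documents come back with "hasRead": true, matching the desired output above.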
Often the simplest solution is the correct one :)