Sum array number values across multiple documents - mongodb

I have Mongo documents which contain arrays of numbers in a fixed order (one entry per day), and I want to sum the values at each array position across multiple documents, grouped by a field outside of the array.
{"_id" : "1",
"group" : "A",
"value_list" : [1,2,3,4,5,6,7]
},
{"_id" : "2",
"group" : "B",
"value_list" : [10,20,30,40,50,60,70]
},
{"_id" : "3",
"group" : "A",
"value_list" : [1,2,3,4,5,6,7]
},
{"_id" : "4",
"group" : "B",
"value_list" : [10,20,30,40,50,60,70]
}
So the results I'm after are listed below.
There are two group A documents above, and at position 1 of the value_list array both documents have the value 1, so 1+1=2. At position 2 the value is 2 in both documents, so 2+2=4, and so on.
There are two group B documents above, and at position 1 of the value_list array both documents have the value 10, so 10+10=20. At position 2 the value is 20 in both documents, so 20+20=40, and so on.
{"_id" : "30",
"group" : "A",
"value_list" : [2,4,6,8,10,12,14]
},
{"_id" : "30",
"group" : "A",
"value_list" : [20,40,60,80,100,120,140]
}
How would I do this using Mongo Script? Thanks, Matt

Certainly the most "scalable" way is to use the includeArrayIndex option of $unwind in order to track the positions and then $sum the "unwound" combinations, before adding back into array format:
db.getCollection('test').aggregate([
  { "$unwind": { "path": "$value_list", "includeArrayIndex": "index" } },
  { "$group": {
    "_id": {
      "group": "$group",
      "index": "$index"
    },
    "value_list": { "$sum": "$value_list" }
  }},
  { "$sort": { "_id": 1 } },
  { "$group": {
    "_id": "$_id.group",
    "value_list": { "$push": "$value_list" }
  }},
  { "$sort": { "_id": 1 } }
])
Note you need to $sort after the first $group in order to maintain the array positions.
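For illustration, with the sample data the documents coming out of the first $group (after the $sort) would look something like this, where the index written by includeArrayIndex is a 64-bit long:
{ "_id" : { "group" : "A", "index" : NumberLong(0) }, "value_list" : 2 }
{ "_id" : { "group" : "A", "index" : NumberLong(1) }, "value_list" : 4 }
{ "_id" : { "group" : "A", "index" : NumberLong(2) }, "value_list" : 6 }
and so on through index 6 for group "A", followed by the same sequence for group "B". The second $group then simply pushes those pre-sorted sums back into an array per group.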
If you can get away with it, you could instead gather all the arrays together and combine them with $reduce:
db.getCollection('test').aggregate([
  { "$group": {
    "_id": "$group",
    "value_list": { "$push": "$value_list" }
  }},
  { "$addFields": {
    "value_list": {
      "$reduce": {
        "input": "$value_list",
        "initialValue": [],
        "in": {
          "$map": {
            "input": {
              "$zip": {
                "inputs": [ "$$this", "$$value" ],
                "useLongestLength": true
              }
            },
            "in": { "$sum": "$$this" }
          }
        }
      }
    }
  }},
  { "$sort": { "_id": 1 } }
])
Essentially you create an "array of arrays" with the initial $push, which you then process with $reduce. The $zip does a "pairwise" assignment per element position, and $map then adds each pair together using $sum.
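To see those mechanics in isolation, here is a minimal sketch run against literal arrays; the $limit stage is only there to give the expression a single document to evaluate against, reusing the same 'test' collection name as above:
db.getCollection('test').aggregate([
  { "$limit": 1 },
  { "$project": {
    "_id": 0,
    "zipped": { "$zip": { "inputs": [ [1,2,3], [10,20,30] ] } },
    "summed": {
      "$map": {
        "input": { "$zip": { "inputs": [ [1,2,3], [10,20,30] ] } },
        "in": { "$sum": "$$this" }
      }
    }
  }}
])
This returns { "zipped" : [ [ 1, 10 ], [ 2, 20 ], [ 3, 30 ] ], "summed" : [ 11, 22, 33 ] }, which is exactly the per-position pairing and addition that the $reduce loop performs against each accumulated array.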
While a bit more efficient, it's not really practical for large data, as you would likely exceed the BSON document size limit by pushing all of the grouped arrays into a single array before you "reduce" it.
Either method produces the same result:
/* 1 */
{
"_id" : "A",
"value_list" : [
2.0,
4.0,
6.0,
8.0,
10.0,
12.0,
14.0
]
}
/* 2 */
{
"_id" : "B",
"value_list" : [
20.0,
40.0,
60.0,
80.0,
100.0,
120.0,
140.0
]
}

Related

Group sums of an attribute by the values of another array attribute

I have a collection "tagsCount" that looks like that:
{
"_id" : ObjectId("59e3a46a48507851d411ad78"),
"tags" : [ "Marketing" ],
"cpt" : 14354
},
{
"_id" : ObjectId("59e3a46a48507851d411ad79"),
"tags" : [
"chatbot",
"Content marketing",
"Intelligence artificielle",
"Marketing digital",
"Personnalisation"
],
"cpt" : 9037
}
Of course there are many more lines.
I want to get the sum of "cpt" grouped by the values of "tags".
I have come up with that:
db.tagsCount.aggregate([
  { "$project": { "tags": 1 }},
  { "$unwind": "$tags"},
  { "$group": {
    "_id" : "$tags",
    "cpt" : "$cpt",
    "count": { "$sum": "$cpt" }
  }}
])
But that doesn't do the trick: I get the list of all the different tags, but the count always has a value of 0.
Is it possible to do what I want?
The problem is that your aggregation pipeline starts with a $project stage which passes only tags through to the following stages, so the $group executes on documents that no longer contain cpt. (The bare cpt : "$cpt" in your $group is also invalid: every field in a $group other than _id must use an accumulator such as $sum or $first.) Here's a working example:
db.tagsCount.aggregate([
  { "$unwind": "$tags"},
  { "$group": {
    "_id": "$tags",
    "count": { "$sum": "$cpt" }
  }},
  { "$project": { "tag": "$_id", "_id": 0, "count": 1 }}
])

How to match and group array elements with the max value in aggregation

I need help to get the array element having the maximum value of a field (level) from a document, then count the total occurrences grouped by the array element field "bssid".
Consider the following data
/* 1 */
{
"_id" : "18:59:36:0c:94:a3",
"timestamp" : "1460012567",
"apdata" : [{
"bssid" : "f4:b7:e2:56:e4:20",
"ssid" : "Test Network2",
"level" : -55
}, {
"bssid" : "b8:a3:86:67:03:56",
"ssid" : "Test Network1",
"level" : -76
}]
}
/* 2 */
{
"_id" : "d0:b3:3f:b9:42:38",
"timestamp" : "1460013345",
"apdata" : [{
"bssid" : "f4:b7:e2:56:e4:20",
"ssid" : "Test Network2",
"level" : -65
}, {
"bssid" : "b8:a3:86:67:03:56",
"ssid" : "Test Network1",
"level" : -46
}]
}
/* 3 */
{
"_id" : "d0:b3:3f:b9:42:41",
"timestamp" : "1460013145",
"apdata" : [{
"bssid" : "f4:b7:e2:56:e4:20",
"ssid" : "Test Network2",
"level" : -65
}, {
"bssid" : "b8:a3:86:67:03:56",
"ssid" : "Test Network1",
"level" : -46
}]
}
The output required is
{
"bssid" : "f4:b7:e2:56:e4:20",
"ssid" : "Test Network2",
"count" : 1
}, {
"bssid" : "b8:a3:86:67:03:56",
"ssid" : "Test Network1",
"count" : 2
}
Which is the count of times each bssid had the maximum value within the array of each document over the whole collection.
If you have MongoDB 3.2 available then you can do something like this:
db.sample.aggregate([
  { "$project": {
    "apdata": {
      "$arrayElemAt": [
        { "$filter": {
          "input": "$apdata",
          "as": "el",
          "cond": {
            "$eq": [
              "$$el.level",
              { "$max": {
                "$map": {
                  "input": "$apdata",
                  "as": "data",
                  "in": "$$data.level"
                }
              }}
            ]
          }
        }},
        0
      ]
    }
  }},
  { "$group": {
    "_id": "$apdata.bssid",
    "ssid": { "$first": "$apdata.ssid" },
    "count": { "$sum": 1 }
  }}
])
For at least MongoDB 2.6 you need to do this:
db.sample.aggregate([
  { "$unwind": "$apdata" },
  { "$group": {
    "_id": "$_id",
    "apdata": { "$push": "$apdata" },
    "max": { "$max": "$apdata.level" }
  }},
  { "$unwind": "$apdata" },
  { "$redact": {
    "$cond": {
      "if": { "$eq": [ "$apdata.level", "$max" ] },
      "then": "$$KEEP",
      "else": "$$PRUNE"
    }
  }},
  { "$group": {
    "_id": "$apdata.bssid",
    "ssid": { "$first": "$apdata.ssid" },
    "count": { "$sum": 1 }
  }}
])
And for MongoDB 2.4 or 2.2 like this:
db.sample.aggregate([
  { "$unwind": "$apdata" },
  { "$group": {
    "_id": "$_id",
    "apdata": { "$push": "$apdata" },
    "max": { "$max": "$apdata.level" }
  }},
  { "$unwind": "$apdata" },
  { "$project": {
    "apdata": 1,
    "isMax": { "$eq": [ "$apdata.level", "$max" ] }
  }},
  { "$match": { "isMax": true } },
  { "$group": {
    "_id": "$apdata.bssid",
    "ssid": { "$first": "$apdata.ssid" },
    "count": { "$sum": 1 }
  }}
])
In all cases $max is used to get the "maximum" value of the array in each document "first", and then you can use that to "filter" the array content prior to using it in a $group. The approaches only vary with the MongoDB version:
MongoDB 3.2: Allows the $max to work directly on an "array" of values. So the $map is used to just get the "level" values and find out what that "max" actually is.
Then the $filter can be used to just return the array element which matches that "max" value, and finally $arrayElemAt is used to return that "only" ( out of two possible and "zero" index ) element as a plain document.
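Walking the first sample document through that expression, step by step:
// Inner $map extracts the levels:                 [ -55, -76 ]
// $max picks the highest:                         -55
// $filter keeps the elements whose level matches: [ { "bssid" : "f4:b7:e2:56:e4:20", "ssid" : "Test Network2", "level" : -55 } ]
// $arrayElemAt [ ..., 0 ] unwraps that to:        { "bssid" : "f4:b7:e2:56:e4:20", "ssid" : "Test Network2", "level" : -55 }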
The whole process can be done in $group "only" if you basically repeat that whole statement for both the _id and in order to get the $first "ssid" value, but it's a bit easier to write in a $project separately to demonstrate.
MongoDB 2.6: This lacks the fancier operators, most notably the ability of $max to work "directly" on an array. So here you need to $unwind the array first and then $group back on the original document, solely in order to get that "max" value.
Then the process really needs you to $unwind again since you will be grouping on the element from the array later, and then use $redact to filter the content. This is a "logical" form of $match where you can directly compare the "level" against the computed "max" from the earlier stage. So the element that is not the "max" is removed.
MongoDB 2.4: Is again basically the same logic, except instead of $redact you actually need the physical $project in order to put a field in the document to use in filtering with $match.
All versions share the same final $group: the path "apdata.bssid" supplies the grouping key, $first takes the "ssid" on that grouping boundary, and a simple $sum counts the occurrences of the grouping key in the results.
Everything returns just as follows:
{ "_id" : "f4:b7:e2:56:e4:20", "ssid" : "Test Network2", "count" : 1 }
{ "_id" : "b8:a3:86:67:03:56", "ssid" : "Test Network1", "count" : 2 }
Actually the most "efficient" form for MongoDB 3.2 would be as follows:
db.sample.aggregate([
  { "$group": {
    "_id": {
      "$arrayElemAt": [
        { "$map": {
          "input": {
            "$filter": {
              "input": "$apdata",
              "as": "el",
              "cond": {
                "$eq": [
                  "$$el.level",
                  { "$max": {
                    "$map": {
                      "input": "$apdata",
                      "as": "data",
                      "in": "$$data.level"
                    }
                  }}
                ]
              }
            }
          },
          "as": "apdata",
          "in": {
            "bssid": "$$apdata.bssid",
            "ssid": "$$apdata.ssid"
          }
        }},
        0
      ]
    },
    "count": { "$sum": 1 }
  }}
])
The output has a slightly different form due to the compound _id, but it comes from a single $group stage only, without repeating the whole expression just to find the array element data for the "max" value:
{
"_id" : {
"bssid" : "b8:a3:86:67:03:56",
"ssid" : "Test Network1"
},
"count" : 2
}
{
"_id" : {
"bssid" : "f4:b7:e2:56:e4:20",
"ssid" : "Test Network2"
},
"count" : 1
}

MongoDB group by and sum and get counts

I have this collection in my database:
{ "IdUser" : "1", "IdItem" : "1" },
{ "IdUser" : "1", "IdItem" : "2" },
{ "IdUser" : "1", "IdItem" : "3" },
{ "IdUser" : "2", "IdItem" : "4" },
{ "IdUser" : "2", "IdItem" : "5" },
{ "IdUser" : "4", "IdItem" : "6" },
{ "IdUser" : "5", "IdItem" : "7" }
How can I obtain this result:
Users with one item: 2
Users with two items: 1
Users with three items: 1
You need to first $group your documents by IdUser, counting the number of times each IdUser appears in your collection with the $sum accumulator operator. This allows you, in the next stage, to group the documents by that "count" and return the number of "users" with the same number of "items".
db.items.aggregate([
  { "$group": {
    "_id": "$IdUser",
    "count": { "$sum": 1 }
  }},
  { "$group": {
    "_id": "$count",
    "nUsers": { "$sum": 1 }
  }}
])
Using the aggregate() method will give you the desired result, though the output documents are shaped differently: instead of a key/value pair, you get two separate fields showing the item count and the number of users with that count. The following aggregation pipeline demonstrates this:
db.collection.aggregate([
  { "$group": {
    "_id": "$IdUser",
    "count": {
      "$sum": { "$cond": [ { "$gt": [ "$IdItem", null ] }, 1, 0 ] }
    }
  }},
  { "$group": {
    "_id": "$count",
    "users": { "$push": "$_id" }
  }},
  { "$project": {
    "_id": 0,
    "number_of_items": "$_id",
    "number_of_users": { "$size": "$users" }
  }}
])
In the above, you $group all the documents by the user key to get their counts, taking into consideration documents that may not have the item field, which would be excluded from the count. A further $group operation is then necessary to collect the users per count, in the form of an array.
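The $gt comparison against null works because BSON's comparison order ranks existing values above null, so the $cond adds 1 for every document where IdItem is present. With the sample documents, the first $group stage emits something like this (order not guaranteed):
{ "_id" : "1", "count" : 3 }
{ "_id" : "2", "count" : 2 }
{ "_id" : "4", "count" : 1 }
{ "_id" : "5", "count" : 1 }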
The last step in the pipeline $project serves to reshape the final output so that you get the following output (with the sample documents supplied with the question):
{ "number_of_items" : 1, "number_of_users" : 2 }
{ "number_of_items" : 3, "number_of_users" : 1 }
{ "number_of_items" : 2, "number_of_users" : 1 }

Mongodb aggregation, finding within an array of values

I have a schema that creates documents with the following structure:
{
"_id" : "2014-07-16:52TEST",
"date" : ISODate("2014-07-16T23:52:59.811Z"),
"name" : "TEST"
"values" : [
[
1405471921000,
0.737121
],
[
1405471922000,
0.737142
],
[
1405471923000,
0.737142
],
[
1405471924000,
0.737142
]
]
}
In the values, the first index is a timestamp. What I'm trying to do is query a specific timestamp to find the closest value ($gte).
I've tried the following aggregate query:
[
  { "$match": {
    "values": {
      "$elemMatch": { "0": { "$gte": 1405471923000 } }
    },
    "name" : 'TEST'
  }},
  { "$project" : {
    "name" : 1,
    "values" : 1
  }},
  { "$unwind": "$values" },
  { "$match": { "values.0": { "$gte": 1405471923000 } } },
  { "$limit" : 1 },
  { "$sort": { "values.0": -1 } },
  { "$group": {
    "_id": "$name",
    "values": { "$push": "$values" }
  }}
]
This seems to work, but it doesn't pull the closest value. It pulls anything greater than or equal to the timestamp, and the sort doesn't seem to get applied, so it can return a timestamp that is far in the future.
Any suggestions would be great!
Thank you
There are a couple of things wrong with the approach here even though it is a fair effort. You are right that you need to $sort here, but the problem is that you cannot "sort" on an inner element with an array. In order to get a value that can be sorted you must $unwind the array first as it otherwise will not sort on an array position.
You also certainly do not want $limit in the pipeline, least of all placed before the $sort, where it truncates the results before they are ever ordered. You might be testing this against a single document, but "limit" actually acts on the entire set of documents in the pipeline, so if more than one document matched your condition the others would be thrown away.
The key thing you want to do here is use $first in your $group stage, which is applied once you have sorted to get the "closest" element that you want.
db.collection.aggregate([
  // Documents that have an array element matching the condition
  { "$match": {
    "values": { "$elemMatch": { "0": { "$gte": 1405471923000 } } }
  }},
  // Unwind the top level array
  { "$unwind": "$values" },
  // Filter just the elements that match the condition
  { "$match": { "values.0": { "$gte": 1405471923000 } } },
  // Take a copy of the inner array
  { "$project": {
    "date": 1,
    "name": 1,
    "values": 1,
    "valCopy": "$values"
  }},
  // Unwind the inner array copy
  { "$unwind": "$valCopy" },
  // Filter the inner elements
  { "$match": { "valCopy": { "$gte": 1405471923000 } }},
  // Sort on the now "timestamp" values ascending for nearest
  { "$sort": { "valCopy": 1 } },
  // Take the "first" values
  { "$group": {
    "_id": "$_id",
    "date": { "$first": "$date" },
    "name": { "$first": "$name" },
    "values": { "$first": "$values" }
  }},
  // Optionally push back to array to match the original structure
  { "$group": {
    "_id": "$_id",
    "date": { "$first": "$date" },
    "name": { "$first": "$name" },
    "values": { "$push": "$values" }
  }}
])
And this produces your document with just the "nearest" timestamp value matching the original document form:
{
"_id" : "2014-07-16:52TEST",
"date" : ISODate("2014-07-16T23:52:59.811Z"),
"name" : "TEST",
"values" : [
[
1405471923000,
0.737142
]
]
}

mongodb get filtered count

I have no extended knowledge of how to write MongoDB queries, but I wanted to ask how I could query a collection to get something like this:
{
Total: 1000,
Filtered: 459,
DocumentArray: []
}
Of course I want to do that in one query, so that I do not need to do something like this:
db.collection.find();
db.collection.find().count();
db.colection.count();
Well you could do something along these lines:
Considering documents like this:
{ "_id" : ObjectId("531251829df82824bdb53578"), "name" : "a", "type" : "b" }
{ "_id" : ObjectId("531251899df82824bdb53579"), "name" : "a", "type" : "c" }
{ "_id" : ObjectId("5312518e9df82824bdb5357a"), "type" : "c", "name" : "b" }
And an aggregate pipeline like this:
db.collection.aggregate([
  { "$group": {
    "_id": null,
    "totalCount": { "$sum": 1 },
    "docs": { "$push": {
      "name": "$name",
      "type": "$type"
    }}
  }},
  { "$unwind": "$docs" },
  { "$match": { "docs.name": "a" } },
  { "$group": {
    "_id": null,
    "totalCount": { "$first": "$totalCount" },
    "filteredCount": { "$sum": 1 },
    "docs": { "$push": "$docs" }
  }}
])
But I would not recommend it. It will certainly blow up on any "real" collection by exceeding the maximum BSON document size, and I doubt it would perform very well. But that is how it can be done, even if the utility is purely academic.
Just do what you are doing if you need the information. That is the "right way" to do it.
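That is, just issue the queries separately; a sketch in the shell, where { "name": "a" } stands in for whatever filter you are applying:
// Total count, filtered count, and the filtered documents as three plain queries
var query = { "name": "a" };
var result = {
  "Total": db.collection.count(),
  "Filtered": db.collection.find(query).count(),
  "DocumentArray": db.collection.find(query).toArray()
};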