MongoDB Property String Starts With Query And Set Group Id - mongodb

Data structure - one document in a big collection:
{
OPERATINGSYSTEM: "Android 6.0"
}
Issue: The operatingsystem can equal e.g. "Android 5.0", "Android 6.0", "Windows Phone", "Windows Phone 8.1"
There is no property which only contains the kind of operating system e.g. only Android.
I need to get the count of windows phones, and android phones.
My temporary solution:
db.getCollection('RB').find(
{OPERATINGSYSTEM: {$regex: "^Android"}}
).count();
I'm running that query once per prefix, replacing "^Android" with "Windows Phone" and so on, which takes a lot of time and has to be run in parallel.
Using the aggregation framework I thought about this:
db.RB.aggregate([
{$group: {_id: {OPERATINGSYSTEM: "$OPERATINGSYSTEM"}}}
])
But using this I get an entry for each operating system version: Android 5.0, Android 6.0, etc.
The solution I'm searching for should return data in this format:
{
"Android": 50,
"Windows Phone": 100
}
How can this be done in a single query?

Provided your strings consistently have the numeric version as the last thing in the string, you can use $split with the aggregation framework to turn the "space delimited" content into an array, then remove the last element from the array before reconstructing the string:
Given data like :
{ "name" : "Android 6.0" }
{ "name" : "Android 7.0" }
{ "name" : "Windows Phone 10" }
You can try:
db.getCollection('phones').aggregate([
{ "$group": {
"_id": {
"$let": {
"vars": { "split": { "$split": [ "$name", " " ] } },
"in": {
"$reduce": {
"input": { "$slice": [ "$$split", 0, { "$subtract": [ { "$size": "$$split" }, 1 ] } ] },
"initialValue": "",
"in": {
"$cond": {
"if": { "$eq": [ "$$value", "" ] },
"then": "$$this",
"else": { "$concat": [ "$$value", " ", "$$this" ] }
}
}
}
}
}
},
"count": { "$sum": 1 }
}},
{ "$replaceRoot": {
"newRoot": {
"$arrayToObject": [[{ "k": "$_id", "v": "$count" }]]
}
}}
])
All of that is possible if your MongoDB is at least MongoDB 3.4, which supports both $split and $reduce. The $replaceRoot is really just about naming the keys, and is not strictly required.
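For the three sample documents above, this should produce one output document per distinct prefix, roughly:
{ "Android" : 2 }
{ "Windows Phone" : 1 }
Without the $replaceRoot stage the same counts would come back in the plain $group form, i.e. { "_id" : "Android", "count" : 2 }.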
Alternately you can use mapReduce:
db.getCollection('phones').mapReduce(
function() {
// the key is everything before the first digit, minus the trailing space
var re = /\d+/g;
emit(this.name.substr(0, this.name.search(re) - 1), 1);
},
function(key,values) { return Array.sum(values) },
{ "out": { "inline": 1 } }
)
Here it is easier to break down the string at the index where the first numeric value occurs. In either case, you are not required to "hardcode" anything, and the values of the keys are completely dependent on the strings in context.
Keep in mind though that unless there is an extremely large number of possible values, running parallel .count() operations "should" be the fastest to process, since returning cursor counts is a lot faster than actually counting the aggregated entries.
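If the set of prefixes is known up front, one way to issue those counts in parallel is from the driver rather than the shell. A minimal Node.js sketch (the connected db handle and the prefix list are assumptions, not from the question):
const prefixes = ["Android", "Windows Phone"];
// issue one countDocuments() per prefix and wait for them all concurrently
const counts = await Promise.all(
  prefixes.map(p =>
    db.collection("RB").countDocuments({ OPERATINGSYSTEM: { $regex: "^" + p } })
  )
);
// counts comes back in the same order as the prefixes array, e.g. [ 50, 100 ]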

You can use map reduce, and apply your logic in the map function.
var map = function(){
var name = this.op.includes("android") ? "Android" : ""; // could be a regexp
if(name === ""){
name = this.op.includes("windows") ? "Windows" : "";
}
emit(name, 1);
}
var reduce = function(key, values){
return Array.sum(values)
}
db.operating.mapReduce(map, reduce, {out: "total"})
https://docs.mongodb.com/manual/tutorial/map-reduce-examples/
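With { out: "total" } the grouped counts are written to the total collection rather than returned inline, so you can read them back with a plain find, for example:
db.total.find()
// documents have the mapReduce key/value shape, e.g. { "_id" : "Android", "value" : <count> }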

Related

MongoDb - Update all properties in an object using MongoShell

I have a collection with many documents containing shipping prices:
{
"_id": {
"$oid": "5f7439c3bc3395dd31ca4f19"
},
"adapterKey": "transport1",
"pricegrid": {
"10000": 23.66,
"20000": 23.75,
"30000": 23.83,
"31000": 43.5,
"40000": 44.16,
"50000": 49.63,
"60000": 50.25,
"70000": 52,
"80000": 56.62,
"90000": 59,
"100000": 62.5,
"119000": 68.85,
"149000": 80,
"159000": 87,
"179000": 94,
"199000": 100.13,
"249000": 118.5,
"299000": 138.62,
"999000": 208.63
},
"zones": [
"25"
],
"franco": null,
"tax": 20,
"doc_created": {
"$date": "2020-09-30T07:54:43.966Z"
},
"idConfig": "0000745",
"doc_modified": {
"$date": "2020-09-30T07:54:43.966Z"
}
}
In pricegrid, all the properties can be different from one grid to another.
I'd like to update all the prices in the field "pricegrid" (price * 1.03 + 1).
I tried this :
db.shipping_settings.updateMany(
{ 'adapterKey': 'transport1' },
{
$mul: { 'pricegrid.$': 1.03 },
$inc: { 'pricegrid.$': 1}
}
)
Resulting in this error :
MongoServerError: Updating the path 'pricegrid.$' would create a conflict at 'grille.$'
So I tried with only $mul (planning on doing $inc in another query) :
db.livraison_config.updateMany(
{ 'adapterKey': 'transport1' },
{
$mul: { 'pricegrid.$': 1.03 }
}
)
But in that case, I get this error :
MongoServerError: The positional operator did not find the match needed from the query.
Could you please point me to the correct way to write this request?
You can use an aggregation pipeline in an update. First use $objectToArray to convert pricegrid into an array of k-v tuples. Then use $map to perform the computation. Finally, use $arrayToObject to convert it back.
db.collection.update({
"adapterKey": "transport1"
},
[
{
$set: {
pricegrid: {
"$objectToArray": "$pricegrid"
}
}
},
{
"$set": {
"pricegrid": {
"$map": {
"input": "$pricegrid",
"as": "p",
"in": {
"k": "$$p.k",
"v": {
"$add": [
{
"$multiply": [
"$$p.v",
1.03
]
},
1
]
}
}
}
}
}
},
{
$set: {
pricegrid: {
"$arrayToObject": "$pricegrid"
}
}
}
])
Here is the Mongo playground for your reference.
You can do it with the Aggregation framework:
$objectToArray - to transform the pricegrid object into an array so you can iterate over its items
$map - to iterate over the array generated in the previous step
$sum and $multiply - to perform the mathematical operations
$arrayToObject - to transform the updated array back into an object
db.collection.update({
"adapterKey": "transport1"
},
[
{
"$set": {
"pricegrid": {
"$arrayToObject": {
"$map": {
"input": {
"$objectToArray": "$pricegrid"
},
"in": {
k: "$$this.k",
v: {
"$sum": [
1,
{
"$multiply": [
"$$this.v",
1.03
]
}
]
}
}
}
}
}
}
}
],
{
"multi": true
})
Working example
I might be wrong, but it looks like there's currently no support for this feature - there's actually an open jira-issue that addresses this topic. Doesn't look like this is going to be implemented though.

How to use $set and dot notation to update embedded array elements using corresponding old element?

I have following documents in a MongoDb:
from pymongo import MongoClient
client = MongoClient(host='my_host', port=27017)
database = client.forecast
collection = database.regions
collection.delete_many({})
regions = [
{
'id': 'DE',
'sites': [
{
'name': 'paper_factory',
'energy_consumption': 1000
},
{
'name': 'chair_factory',
'energy_consumption': 2000
},
]
},
{
'id': 'FR',
'sites': [
{
'name': 'pizza_factory',
'energy_consumption': 3000
},
{
'name': 'foo_factory',
'energy_consumption': 4000
},
]
}
]
collection.insert_many(regions)
Now I would like to copy the property sites.energy_consumption to a new field sites.new_field for each site:
set_stage = {
"$set": {
"sites.new_field": "$sites.energy_consumption"
}
}
pipeline = [set_stage]
collection.aggregate(pipeline)
However, instead of copying the individual value per site, all site values are collected and added as an array. Instead of 'new_field': [1000, 2000] I would like to get 'new_field': 1000 for the first site:
{
"_id": ObjectId("61600c11732a5d6b103ba6be"),
"id": "DE",
"sites": [
{
"name": "paper_factory",
"energy_consumption": 1000,
"new_field": [
1000,
2000
]
},
{
"name": "chair_factory",
"energy_consumption": 2000,
"new_field": [
1000,
2000
]
}
]
},
{
"_id": ObjectId("61600c11732a5d6b103ba6bf"),
"id": "FR",
"sites": [
{
"name": "pizza_factory",
"energy_consumption": 3000,
"new_field": [
3000,
4000
]
},
{
"name": "foo_factory",
"energy_consumption": 4000,
"new_field": [
3000,
4000
]
}
]
}
=> What expression can I use to only use the corresponding entry of the array?
Is there some sort of current-index operator:
$sites[<current_index>].energy_consumption
or an alternative dot operator (which would remind me of the difference between * multiplication and .* element-wise matrix multiplication)?
$sites:energy_consumption
Or is this a bug?
Edit
I also tried to use the "$" positional operator, e.g. with
sites.$.new_field
or
$sites.$.energy_consumption
but then I get the error
FieldPath field names may not start with '$'
Related:
https://docs.mongodb.com/manual/reference/operator/aggregation/set/#std-label-set-add-field-to-embedded
In MongoDB how do you use $set to update a nested value/embedded document?
If the field is a member of an array, then selecting it selects the values of all of the members.
{ar: [{"a": 1}, {"a": 2}]}
"$ar.a" = [1, 2]
Also, you can't mix update operators with aggregation, so you can't use things like
$sites.$.energy_consumption; if you are doing aggregation you have to use aggregation operators, the only exception being the $match stage, where you can use query operators.
Query
An alternative, slightly different solution from yours, using $setField.
I guess it will be faster, but probably with little difference.
There is no need to use JavaScript; it will be slower.
This is a >= MongoDB 5 solution; $setField is a new operator.
Test code here
aggregate(
[{"$set":
{"sites":
{"$map":
{"input":"$sites",
"in":
{"$setField":
{"field":"new_field",
"input":"$$this",
"value":"$$this.energy_consumption"}}}}}}]
)
use $addFields
db.collection.update({},
[
{
"$addFields": {
"sites": {
$map: {
input: "$sites",
as: "s",
in: {
name: "$$s.name",
energy_consumption: "$$s.energy_consumption",
new_field: {
$map: {
input: "$sites",
as: "value",
in: "$$value.energy_consumption"
}
}
}
}
}
}
}
])
mongoplayground
I found the following ugly workarounds that set the complete sites array instead of only specifying a new field with dot notation:
a) based on javascript function
set_stage = {
"$set": {
"sites": {
"$function": {
"body": "function(sites) {return sites.map(site => {site.new_field = site.energy_consumption_in_mwh; return site})}",
"args": ["$sites"],
"lang": "js"
}
}
}
}
b) based on map and mergeObjects
set_stage = {
"$set": {
"sites": {
"$map": {
"input": "$sites",
"in": {
"$mergeObjects": ["$$this", {
"new_field": "$$this.energy_consumption_in_mwh"
}]
}
}
}
}
}
If there is some kind of $$this context for the dot operator expression, allowing a more elegant solution, please let me know.

Return only matched sub-document elements within a nested array

The main collection is retailer, which contains an array for stores. Each store contains an array of offers (you can buy in this store). This offers array has an array of sizes. (See example below)
Now I try to find all offers, which are available in the size L.
{
"_id" : ObjectId("56f277b1279871c20b8b4567"),
"stores" : [
{
"_id" : ObjectId("56f277b5279871c20b8b4783"),
"offers" : [
{
"_id" : ObjectId("56f277b1279871c20b8b4567"),
"size": [
"XS",
"S",
"M"
]
},
{
"_id" : ObjectId("56f277b1279871c20b8b4567"),
"size": [
"S",
"L",
"XL"
]
}
]
}
}
I've try this query: db.getCollection('retailers').find({'stores.offers.size': 'L'})
I expect some Output like that:
{
"_id" : ObjectId("56f277b1279871c20b8b4567"),
"stores" : [
{
"_id" : ObjectId("56f277b5279871c20b8b4783"),
"offers" : [
{
"_id" : ObjectId("56f277b1279871c20b8b4567"),
"size": [
"S",
"L",
"XL"
]
}
]
}
}
But the output of my query also contains the non-matching offer with the sizes XS, S and M.
How can I force MongoDB to return only the offers which match my query?
Greetings and thanks.
So the query you have actually selects the "document" just like it should. But what you are looking for is to "filter the arrays" contained so that the elements returned only match the condition of the query.
The real answer is of course that unless you are really saving a lot of bandwidth by filtering out such detail then you should not even try, or at least beyond the first positional match.
MongoDB has a positional $ operator which will return an array element at the matched index from a query condition. However, this only returns the "first" matched index of the "outer" most array element.
db.getCollection('retailers').find(
{ 'stores.offers.size': 'L'},
{ 'stores.$': 1 }
)
In this case, it means the "stores" array position only. So if there were multiple "stores" entries, then only "one" of the elements that contained your matched condition would be returned. But that does nothing for the inner array of "offers", and as such every "offer" within the matched "stores" array would still be returned.
MongoDB has no way of "filtering" this in a standard query, so the following does not work:
db.getCollection('retailers').find(
{ 'stores.offers.size': 'L'},
{ 'stores.$.offers.$': 1 }
)
The only tool MongoDB actually has to do this level of manipulation is the aggregation framework. But the analysis should show you why you "probably" should not do this, and instead just filter the array in code.
Here is how you can achieve this, per version, starting with the most recent.
First with MongoDB 3.2.x with using the $filter operation:
db.getCollection('retailers').aggregate([
{ "$match": { "stores.offers.size": "L" } },
{ "$project": {
"stores": {
"$filter": {
"input": {
"$map": {
"input": "$stores",
"as": "store",
"in": {
"_id": "$$store._id",
"offers": {
"$filter": {
"input": "$$store.offers",
"as": "offer",
"cond": {
"$setIsSubset": [ ["L"], "$$offer.size" ]
}
}
}
}
}
},
"as": "store",
"cond": { "$ne": [ "$$store.offers", [] ]}
}
}
}}
])
Then with MongoDB 2.6.x and above with $map and $setDifference:
db.getCollection('retailers').aggregate([
{ "$match": { "stores.offers.size": "L" } },
{ "$project": {
"stores": {
"$setDifference": [
{ "$map": {
"input": {
"$map": {
"input": "$stores",
"as": "store",
"in": {
"_id": "$$store._id",
"offers": {
"$setDifference": [
{ "$map": {
"input": "$$store.offers",
"as": "offer",
"in": {
"$cond": {
"if": { "$setIsSubset": [ ["L"], "$$offer.size" ] },
"then": "$$offer",
"else": false
}
}
}},
[false]
]
}
}
}
},
"as": "store",
"in": {
"$cond": {
"if": { "$ne": [ "$$store.offers", [] ] },
"then": "$$store",
"else": false
}
}
}},
[false]
]
}
}}
])
And finally in any version above MongoDB 2.2.x where the aggregation framework was introduced.
db.getCollection('retailers').aggregate([
{ "$match": { "stores.offers.size": "L" } },
{ "$unwind": "$stores" },
{ "$unwind": "$stores.offers" },
{ "$match": { "stores.offers.size": "L" } },
{ "$group": {
"_id": {
"_id": "$_id",
"storeId": "$stores._id",
},
"offers": { "$push": "$stores.offers" }
}},
{ "$group": {
"_id": "$_id._id",
"stores": {
"$push": {
"_id": "$_id.storeId",
"offers": "$offers"
}
}
}}
])
Let's break down the explanations.
MongoDB 3.2.x and greater
So generally speaking, $filter is the way to go here since it is designed with this purpose in mind. Since there are multiple levels of the array, you need to apply this at each level. So first you are diving into each "offers" within "stores" to examine and $filter that content.
The simple comparison here is "Does the "size" array contain the element I am looking for". In this logical context, the short thing to do is use the $setIsSubset operation to compare an array ("set") of ["L"] to the target array. Where that condition is true ( it contains "L" ) then the array element for "offers" is retained and returned in the result.
In the higher level $filter, you are then looking to see if the result from that previous $filter returned an empty array [] for "offers". If it is not empty, then the element is returned or otherwise it is removed.
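As a minimal illustration of that membership test, using the "size" arrays from the sample document:
{ "$setIsSubset": [ [ "L" ], [ "S", "L", "XL" ] ] }   // true  -> the offer is kept
{ "$setIsSubset": [ [ "L" ], [ "XS", "S", "M" ] ] }   // false -> the offer is filtered out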
MongoDB 2.6.x
This is very similar to the modern process except that since there is no $filter in this version you can use $map to inspect each element and then use $setDifference to filter out any elements that were returned as false.
So $map is going to return the whole array, but the $cond operation just decides whether to return the element or instead a false value. In the comparison of $setDifference to a single element "set" of [false] all false elements in the returned array would be removed.
In all other ways, the logic is the same as above.
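As a small sketch of that trick, $setDifference treats both arguments as "sets" and returns the members of the first that are not in the second, so the false placeholders drop out (note it also removes duplicates and does not guarantee order):
{ "$setDifference": [ [ { "a": 1 }, false, { "a": 2 } ], [ false ] ] }
// -> [ { "a": 1 }, { "a": 2 } ]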
MongoDB 2.2.x and up
So below MongoDB 2.6 the only tool for working with arrays is $unwind, but you should not use the aggregation framework "just" for this purpose alone.
The process indeed appears simple, by simply "taking apart" each array, filtering out the things you don't need then putting it back together. The main care is in the "two" $group stages, with the "first" to re-build the inner array, and the next to re-build the outer array. There are distinct _id values at all levels, so these just need to be included at every level of grouping.
But the problem is that $unwind is very costly. Though it does still have a purpose, its main usage intent is not to do this sort of filtering per document. In fact in modern releases its only usage should be when an element of the array(s) needs to become part of the "grouping key" itself.
Conclusion
So it's not a simple process to get matches at multiple levels of an array like this, and in fact it can be extremely costly if implemented incorrectly.
Only the two modern listings should ever be used for this purpose, as they employ a "single" pipeline stage in addition to the "query" $match in order to do the "filtering". The resulting effect is little more overhead than the standard forms of .find().
In general though, those listings still have an amount of complexity to them, and indeed unless you are really drastically reducing the content returned by such filtering in a way that makes a significant improvement in bandwidth used between the server and client, then you are better off filtering the result of the initial query and basic projection.
db.getCollection('retailers').find(
{ 'stores.offers.size': 'L'},
{ 'stores.$': 1 }
).forEach(function(doc) {
// Technically this is only "one" store. So omit the projection
// if you wanted more than "one" match
doc.stores = doc.stores.filter(function(store) {
store.offers = store.offers.filter(function(offer) {
return offer.size.indexOf("L") != -1;
});
return store.offers.length != 0;
});
printjson(doc);
})
So working with the returned object "post" query processing is far less obtuse than using the aggregation pipeline to do this. And as stated, the only "real" difference would be that you are discarding the other elements on the "server" as opposed to removing them "per document" when received, which may save a little bandwidth.
But unless you are doing this in a modern release with only $match and $project, then the "cost" of processing on the server will greatly outweigh the "gain" of reducing that network overhead by stripping the unmatched elements first.
In all cases, you get the same result:
{
"_id" : ObjectId("56f277b1279871c20b8b4567"),
"stores" : [
{
"_id" : ObjectId("56f277b5279871c20b8b4783"),
"offers" : [
{
"_id" : ObjectId("56f277b1279871c20b8b4567"),
"size" : [
"S",
"L",
"XL"
]
}
]
}
]
}
As your array is embedded we cannot use $elemMatch; instead you can use the aggregation framework to get your results:
db.retailers.aggregate([
{$match:{"stores.offers.size": 'L'}}, //just precondition can be skipped
{$unwind:"$stores"},
{$unwind:"$stores.offers"},
{$match:{"stores.offers.size": 'L'}},
{$group:{
_id:{id:"$_id", "storesId":"$stores._id"},
"offers":{$push:"$stores.offers"}
}},
{$group:{
_id:"$_id.id",
stores:{$push:{_id:"$_id.storesId","offers":"$offers"}}
}}
]).pretty()
What this query does is unwind the arrays (twice), match the size, and then reshape the document back to its previous form. You can remove the $group steps to see how it prints.
Have fun!
It also works without aggregate.
Here is the solution link: https://mongoplayground.net/p/Q5lxPvGK03A
db.collection.find({
"stores.offers.size": "L"
},
{
"stores": {
"$filter": {
"input": {
"$map": {
"input": "$stores",
"as": "store",
"in": {
"_id": "$$store._id",
"offers": {
"$filter": {
"input": "$$store.offers",
"as": "offer",
"cond": {
"$setIsSubset": [
[
"L"
],
"$$offer.size"
]
}
}
}
}
}
},
"as": "store",
"cond": {
"$ne": [
"$$store.offers",
[]
]
}
}
}
})

Turn _ids into Keys of New Object

I have a huge bunch of documents as such:
{
_id: '1abc',
colors: [
{ value: 'red', count: 2 },
{ value: 'blue', count: 3}
]
},
{
_id: '2abc',
colors: [
{ value: 'red', count: 7 },
{ value: 'blue', count: 34},
{ value: 'yellow', count: 12}
]
}
Is it possible to make use of aggregate() to get the following?
{
_id: 'null',
colors: {
"1abc": [
{ value: 'red', count: 2 },
{ value: 'blue', count: 3}
],
"2abc": [
{ value: 'red', count: 7 },
{ value: 'blue', count: 34},
{ value: 'yellow', count: 12}
]
}
}
Basically, is it possible to turn all of the original documents' _ids into keys of a new object in the singular new aggregated document?
So far, when trying to use $group, I have not been able to use a variable value, e.g. $_id, on the left-hand side of an assignment. Am I missing something, or is it simply impossible?
I can do this easily using Javascript but it is unbearably slow. Hence why I am looking to see if it is possible using mongo native aggregate(), which will probably be faster.
If impossible... I would appreciate any suggestions that could point towards a sufficient alternative (changing the structure, etc.?). Thank you!
Like I said in the comments, whilst there are things you can do with the aggregation framework or even mapReduce to make the "server" reshape the response, it's kind of silly to do so.
Let's consider the cases:
Aggregate
db.collection.aggregate([
{ "$match": { "_id": { "$in": ["1abc","2abc"] } } },
{ "$group": {
"_id": null,
"result": { "$push": "$$ROOT" }
}},
{ "$project": {
"colors": {
"1abc": {
"$arrayElemAt": [
{ "$map": {
"input": {
"$filter": {
"input": "$result",
"as": "r",
"cond": { "$eq": [ "$$r._id", "1abc" ] },
}
},
"as": "r",
"in": "$$r.colors"
}},
0
]
},
"2abc": {
"$arrayElemAt": [
{ "$map": {
"input": {
"$filter": {
"input": "$result",
"as": "r",
"cond": { "$eq": [ "$$r._id", "2abc" ] },
}
},
"as": "r",
"in": "$$r.colors"
}},
0
]
}
}
}}
])
So the aggregation framework simply does not dynamically generate the "keys" of a document. If you want to process this way, then you need to know all of the "values" that you are going to use to make the keys in the result.
After putting everything into one document with $group, you can then work with the result array to extract data for your "keys". The basic operators here are:
$filter to get the matched element of the array for the "value" that you want.
$map to return just the specific property from the filtered array
$arrayElemAt to just grab the single element that was filtered out of the resulting mapped array
So it really isn't practical in a lot of cases, and the coding of the statement is fairly involved.
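To make that concrete, here is roughly what those three steps produce for the "1abc" key, using the sample documents from the question (a sketch of the intermediate values, not actual output):
// after $group, "result" holds both documents:
//   [ { "_id": "1abc", "colors": [...] }, { "_id": "2abc", "colors": [...] } ]
// $filter keeps only the element whose _id is "1abc":
//   [ { "_id": "1abc", "colors": [ { "value": "red", "count": 2 }, { "value": "blue", "count": 3 } ] } ]
// $map projects just the colors field:
//   [ [ { "value": "red", "count": 2 }, { "value": "blue", "count": 3 } ] ]
// $arrayElemAt with index 0 unwraps that single remaining element:
//   [ { "value": "red", "count": 2 }, { "value": "blue", "count": 3 } ]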
MapReduce
db.collection.mapReduce(
function() {
var obj = { "colors": {} };
obj.colors[this._id] = this.colors;
emit(null,obj);
},
function(key,values) {
var obj = { "colors": {} };
values.forEach(function(value) {
Object.keys(value.colors).forEach(function(key) {
obj.colors[key] = value.colors[key];
});
})
return obj;
},
{ "out": { "inline": 1 } }
)
Since it is actually written in a "language" then you have the ability to loop structures and "build things" in a more dynamic way.
However, close inspection should tell you that the "reducer" function here is not doing anything more than processing "all the results" that have been "stuffed into it" by each emitted document.
That means that "iterating the values" fed to the reducer is really no different to "iterating the cursor", and that leads to the next conclusion.
Cursor Iteration
var result = { "colors": {} };
db.collection.find().forEach(function(doc) {
result.colors[doc._id] = doc.colors;
})
printjson(result)
The simplicity of this should really speak volumes. It is after all doing exactly what you are trying to "shoehorn" into a server operation and nothing more; it simply "rolls up its sleeves" and gets on with the task at hand.
The key point here is none of the process requires any "aggregation" in a real sense, that cannot be equally achieved by simply iterating the cursor and building up the response document.
This is really why you always need to look at what you are doing and choose the right method. "Server side" aggregation has a primary task of "reducing" a result so you would not need to iterate a cursor. But nothing here "reduces" anything. It's just all of the data, transformed into a different format.
Therefore the simple approach for this type of "transform" is to just iterate the cursor and build up your transformed version of "all the results" anyway.

MongoDB advanced aggregation

I'm a total newbie to MongoDB. I'm working on a private project for my golf club to analyze rounds.
I use meteorJS for the application and have tried some aggregation on the command line, but I'm not sure I'm even approaching the task the right way.
A sample document:
{
"_id" : "2KasYR3ytsaX8YuoT",
"course" : {
"id" : "rHmYJBhRtSt38m68s",
"name" : "CourseXYZ"
},
"player" : {
"id" : "tdaYaSvXJueDq4oTN",
"firstname" : "Just",
"lastname" : "aPlayer"
},
"event" : "Training Day",
"tees" : [
{
"tee" : 1,
"par" : 4,
"fairway" : "straight",
"greenInRegulation" : true,
"putts" : 3,
"strokes" : 5
},
{
"tee" : 2,
"par" : 5,
"fairway" : "right",
"greenInRegulation" : true,
"putts" : 2,
"strokes" : 5
},
{
"tee" : 3,
"par" : 5,
"fairway" : "right",
"greenInRegulation" : false,
"shotType": "bunker",
"putts" : 2,
"strokes" : 5
}
]
}
My attempt so far:
db.analysis.aggregate([
{$unwind: "$tees"},
{$group: {
_id:"$player.id",
strokes: {$sum: "$tees.strokes"},
par: {$sum: "$tees.par"},
putts: {$sum: "$tees.putts"},
teesPlayed: {$sum:1}
}}
])
And this is what I want as a result:
{
"_id" : "tdaYaSvXJueDq4oTN",
"strokes" : 15,
"par" : 14,
"putts" : 7,
"teesPlayed" : 3
// here comes what I want to add:
"fairway.straight": 1 // where tees.fairway equals "straight"
"fairway.right": 2 // where tees.fraiway equals "right" (etc.)
"shotType.bunker": 1 // where shotType equals "bunker" etc.
}
There are a few ways of approaching this depending on your overall needs and which MongoDB server version you have available as a target for your project.
Whilst "meteor" installations and default project setups do not "bundle" a MongoDB 3.2 instance, there is no need why your project cannot use such an instance as an external target. If it's a new project to get off the ground, then I would highly recommend working against the latest version available. And maybe even possibly against latest development releases, depending on your own targeted release cycle. Work with what is most fresh, and your application will be too.
For that reason, we start with the latest at the top of the list.
MongoDB 3.2 way - Fast
The big feature in MongoDB 3.2 that makes it really stand out here in terms of performance is a change in how $sum operates. Previously it was just an accumulator operator for $group, working on singular numeric values to produce a total.
The big improvement is the added expression usage, where $sum can be applied directly to an array of values, i.e. { "$sum": [1,2,3] } results in 6. So now you can "nest" the operations with anything that produces an array of values from a source. Most notably here, that is $map:
db.analysis.aggregate([
{ "$group": {
"_id": "$player.id",
"strokes": {
"$sum": {
"$sum": {
"$map": {
"input": "$tees",
"as": "tee",
"in": "$$tee.strokes"
}
}
}
},
"par": {
"$sum": {
"$sum": {
"$map": {
"input": "$tees",
"as": "tee",
"in": "$$tee.par"
}
}
}
},
"putts": {
"$sum": {
"$sum": {
"$map": {
"input": "$tees",
"as": "tee",
"in": "$$tee.putts"
}
}
}
},
"teesPlayed": { "$sum": { "$size": "$tees" } },
"shotsRight": {
"$sum": {
"$size": {
"$filter": {
"input": "$tees",
"as": "tee",
"cond": { "$eq": [ "$$tee.fairway", "right" ] }
}
}
}
},
"shotsStraight": {
"$sum": {
"$size": {
"$filter": {
"input": "$tees",
"as": "tee",
"cond": { "$eq": [ "$$tee.fairway", "straight" ] }
}
}
}
},
"bunkerShot": {
"$sum": {
"$size": {
"$filter": {
"input": "$tees",
"as": "tee",
"cond": { "$eq": [ "$$tee.shotType", "bunker" ] }
}
}
}
}
}}
])
So here each field is split out either by doing the double $sum trick on the single field values from the array items, or, for the result fields that rather want "counts", by processing the arrays with $filter to restrict to matching items and then taking the length of the matches with $size.
Though this looks long-winded in pipeline construction it will yield the fastest results. And though you need to specify all of the result keys with the associated logic, there is nothing stopping "generation" of the data structure necessary for the pipeline as the result of other queries on the data set.
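As a small sketch of that "double $sum" trick for the strokes field, using the sample document from the question:
// inner expression, evaluated per document:
//   { "$sum": { "$map": { "input": "$tees", "as": "tee", "in": "$$tee.strokes" } } }
//   -> { "$sum": [ 5, 5, 5 ] } -> 15
// the outer $sum is the $group accumulator, adding that 15 for each of the player's documents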
The other Aggregate Way - A bit slower
Of course not every project can practically use the latest version of things. So before the MongoDB 3.2 release that introduced some of the operators used above, the only real practical way to work with array data and conditionally work with different elements and sums was to process first with $unwind.
So essentially we start with the query you began to construct, but then add in the handling for the different fields:
db.analysis.aggregate([
{ "$unwind": "$tees" },
{ "$group": {
"_id": "$player.id",
"strokes": { "$sum": "$tees.strokes" },
"par": { "$sum": "$tees.par" },
"putts": { "$sum": "$tees.putts" },
"teedsPlayed": { "$sum": 1 },
"shotsRight": {
"$sum": {
"$cond": [
{ "$eq": [ "$tees.fairway", "right" ] },
1,
0
]
}
},
"shotsStraight": {
"$sum": {
"$cond": [
{ "$eq": [ "$tees.fairway", "straight" ] },
1,
0
]
}
},
"bunkerShot": {
"$sum": {
"$cond": [
{ "$eq": [ "$tees.shotType", "bunker" ] },
1,
0
]
}
}
}}
])
So you should note that there is still "some" similarity to the first listing, in that where the $filter statements all have some logic within their "cond" argument, that logic is rather transposed to the $cond operator here.
As a "ternary" operator ( if/then/else) , it's job it is to evaluate a logical condition (if) and either return the next argument where that condition was true (then) or otherwise return the last argument where it is false (else). In this case either 1 or 0 depending on whether the tested condition matched. This gives the "counts" to $sum as is required.
In either statement, the produced results come out like this:
{
"_id" : "tdaYaSvXJueDq4oTN",
"strokes" : 15,
"par" : 14,
"putts" : 7,
"teesPlayed" : 3,
"shotsRight" : 2,
"shotsStraight" : 1,
"bunkerShot" : 1
}
Since this is an aggregate statement with $group, one rule is that the "keys" (apart from needing to be specified in the constructed statement) must be at the "top level" of the structure. So no "nested" structures are allowed within a $group, hence the flat names for each key.
If you really must transform, then you can by adding a $project stage following the $group in each example:
{ "$project": {
"strokes": 1,
"par": 1,
"putts": 1,
"teesPlayed": 1,
"fairway": {
"straight": "$shotsStraight",
"right": "$shotsRight"
},
"shotType": {
"bunker": "$bunkerShot"
}
}}
So a bit of "re-shaping" can be done, but of course all the names and structure must be specified, though again you could in theory just generate this all in code. It is just a data structure after all.
The bottom line here is that $unwind adds cost, and quite a lot of cost. It is basically going to add a copy of each document in the pipeline for processing "per" every array element contained in each document. So not only is there the cost of processing all of those produced things, but also the cost of "producing" them in the first place.
MapReduce - Slower still, but more flexible on the keys
And finally, as an approach, there is mapReduce:
db.analysis.mapReduce(
function() {
var data = { "strokes": 0 ,"par": 0, "putts": 0, "teesPlayed": 0, "fairway": {} };
this.tees.forEach(function(tee) {
// Increment common values
data.strokes += tee.strokes;
data.par += tee.par;
data.putts += tee.putts;
data.teesPlayed++;
// Do dynamic keys
if (!data.fairway.hasOwnProperty(tee.fairway))
data.fairway[tee.fairway] = 0;
data.fairway[tee.fairway]++;
if (tee.hasOwnProperty('shotType')) {
if (!data.hasOwnProperty('shotType'))
data.shotType = {};
if (!data.shotType.hasOwnProperty(tee.shotType))
data.shotType[tee.shotType] = 0;
data.shotType[tee.shotType]++
}
});
emit(this.player.id,data);
},
function(key,values) {
var data = { "strokes": 0 ,"par": 0, "putts": 0, "teesPlayed": 0, "fairway": {} };
values.forEach(function(value) {
// Common keys
data.strokes += value.strokes;
data.par += value.par;
data.putts += value.putts;
data.teesPlayed += value.teesPlayed;
Object.keys(value.fairway).forEach(function(fairway) {
if (!data.fairway.hasOwnProperty(fairway))
data.fairway[fairway] = 0;
data.fairway[fairway] += value.fairway[fairway];
});
if (value.hasOwnProperty('shotType')) {
if (!data.hasOwnProperty('shotType'))
data.shotType = {};
Object.keys(value.shotType).forEach(function(shotType) {
if (!data.shotType.hasOwnProperty(shotType))
data.shotType[shotType] = 0;
data.shotType[shotType] += value.shotType[shotType];
});
}
});
return data;
},
{ "out": { "inline": 1 } }
)
And the output from this comes out immediately in the nested structure, but of course in the typical mapReduce "key/value" form, where "key" is the grouping _id and "value" contains all of the output:
{
"_id" : "tdaYaSvXJueDq4oTN",
"value" : {
"strokes" : 15,
"par" : 14,
"putts" : 7,
"teesPlayed" : 3,
"fairway" : {
"straight" : 1,
"right" : 2
},
"shotType" : {
"bunker" : 1
}
}
}
The "out" options for mapReduce are either the "inline" as shown here where you can fit all the result in memory ( and within the 16MB BSON limit ), or alternately to another collection from which you can read later. There is a similar $out for .aggregate(), but this is generally negated by aggregation output being available as a "cursor", unless of course you really want it in a collection instead.
Concluding
So it all depends on how you really want to approach this. If speed is of the utmost importance then .aggregate() is generally going to yield the fastest results. On the other hand, if you want to work "dynamically" with the produced "keys", then mapReduce allows the logic to be generally self-contained, without the need for another inspection pass to generate the required aggregation pipeline statement.
I am not clear how to do that through aggregation; however, there is one workaround, like this:
db.collection.find({}).forEach(function(doc) {
var ret = {};
ret._id = doc._id;
doc.tees.forEach(function(obj) {
for (var k in obj) {
var type = typeof obj[k];
if (type === 'number') {
if (ret.hasOwnProperty(k)) {
ret[k] += obj[k];
} else {
ret[k] = obj[k];
}
} else if (type === 'string') {
if (ret.hasOwnProperty(k+'.'+obj[k])) {
ret[k+'.'+obj[k]] += 1;
} else {
ret[k+'.'+obj[k]] = 1;
}
}
}
});
printjson(ret);
});