Union of two collections using MapReduce in MongoDB

I'm trying to unite two collections using MapReduce. They have identical structure, for example:
db.tableR.insert({product:"A", quantity:150});
db.tableR.insert({product:"B", quantity:100});
db.tableR.insert({product:"C", quantity:60});
db.tableR.insert({product:"D", quantity:200});
db.tableS.insert({product:"A", quantity:150});
db.tableS.insert({product:"B", quantity:100});
db.tableS.insert({product:"F", quantity:220});
db.tableS.insert({product:"G", quantity:130});
I want MapReduce to remove the duplicates.
I'm creating a map function that divides each collection according to quantity:
map = function(){
    if (this.quantity < 150){
        var key = 0;
    } else {
        var key = 1;
    }
    var value = {"product": this.product, "quantity": this.quantity};
    emit(key, value);
};
Now I want the reduce function to remove the duplicates, but I can't find a way to add new entries to the reduced variable.
This is what I tried:
reduce = function(keys, values){
    var reduced = {
        product: "",
        quantity: ""
    };
    for (var i = 0; i < values.length; i++)
    {
        if (values[i].product !== null) {reduced.insert({product: values[i].product, quantity: values[i].quantity})}
    }
    return reduced;
};
db.tableR.mapReduce(map,reduce,{out:'map_reduce_result'});
db.tableS.mapReduce(map,reduce,{out:'map_reduce_result'});
db.map_reduce_result.find();
What function can I use?
My expected output:
{"_id" : 0, "value" : {"product" : "B","quantity" : 100}}
{"_id" : 0, "value" : {"product" : "C","quantity" : 60}}
{"_id" : 0, "value" : {"product" : "G","quantity" : 130}}
{"_id" : 1, "value" : {"product" : "A","quantity" : 150}}
{"_id" : 1, "value" : {"product" : "D","quantity" : 200}}
{"_id" : 1, "value" : {"product" : "F","quantity" : 220}}

The reduce function can only return a single value, so you want it to execute once for every row you expect in the output. The reduce function gets called once for each unique key returned by your map function. Your keys were 0 and 1, so it would only get called twice for each collection - once for key 0 and once for key 1. Hence, the maximum number of results would only be 2 for each collection.
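For illustration with the sample data above, the reduce calls for tableR would receive something like this (the order of the values is not guaranteed):
reduce(0, [ {"product": "B", "quantity": 100}, {"product": "C", "quantity": 60} ]);   // quantity < 150
reduce(1, [ {"product": "A", "quantity": 150}, {"product": "D", "quantity": 200} ]);  // quantity >= 150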
What you need to do is set the key to the product in the map function:
map = function(){
    emit(this.product, {product: this.product, quantity: this.quantity});
};
Now, the reduce function will get called for every unique product value. Our new reduce function just returns the first value in the array (if there were duplicates within the same collection it would just take the first; you could be smarter here and take the highest or lowest quantity, or the sum of the quantities, etc.).
reduce = function(keys, values){
    return values[0];
};
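If you want to make that choice explicit, here is a minimal sketch of the idea mentioned above - keeping the entry with the highest quantity when the same product is emitted more than once within a collection:
reduce = function(keys, values){
    // pick the value with the largest quantity instead of relying on input order
    var best = values[0];
    for (var i = 1; i < values.length; i++) {
        if (values[i].quantity > best.quantity) {
            best = values[i];
        }
    }
    return best;
};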
Run your first map reduce job:
db.tableR.mapReduce(map,reduce,{out:'map_reduce_result'});
Run your second, but this time merge the result:
db.tableS.mapReduce(map,reduce,{out: {merge: 'map_reduce_result'}});
Now db.map_reduce_result.find() returns:
{ "_id" : "A", "value" : { "product" : "A", "quantity" : 150 } }
{ "_id" : "B", "value" : { "product" : "B", "quantity" : 100 } }
{ "_id" : "C", "value" : { "product" : "C", "quantity" : 60 } }
{ "_id" : "D", "value" : { "product" : "D", "quantity" : 200 } }
{ "_id" : "F", "value" : { "product" : "F", "quantity" : 220 } }
{ "_id" : "G", "value" : { "product" : "G", "quantity" : 130 } }
Obviously the _id doesn't match what you are looking for. If you absolutely need that you can use the aggregation framework like so:
db.map_reduce_result.aggregate([{$project: {
    _id: {$cond: { if: { $gte: [ "$value.quantity", 150 ] }, then: 1, else: 0 }},
    value: 1
}}]);
This results in:
{ "_id" : 1, "value" : { "product" : "A", "quantity" : 150 } }
{ "_id" : 0, "value" : { "product" : "B", "quantity" : 100 } }
{ "_id" : 0, "value" : { "product" : "C", "quantity" : 60 } }
{ "_id" : 1, "value" : { "product" : "D", "quantity" : 200 } }
{ "_id" : 1, "value" : { "product" : "F", "quantity" : 220 } }
{ "_id" : 0, "value" : { "product" : "G", "quantity" : 130 } }
Note: If two rows from different collections have the same product ID but different quantities, I am unsure which one will be returned.

Related

Mongo aggregation - Sorting using a field value from previous pipeline as the sort field

I have produced the output below using a MongoDB aggregation (the levelsCount field is built by a $group stage inside the pipeline):
{
"_id" : "1",
"name" : "First",
"levelsCount" : [
{ "_id" : "level_One", "levelNum" : 1, "count" : 1 },
{ "_id" : "level_Three", "levelNum" : 3, "count" : 1 },
{ "_id" : "level_Four", "levelNum" : 4, "count" : 8 }
]
}
{
"_id" : "2",
"name" : "Second",
"levelsCount" : [
{ "_id" : "level_One", "levelNum" : 1, "count" : 5 },
{ "_id" : "level_Two", "levelNum" : 2, "count" : 2 },
{ "_id" : "level_Three", "levelNum" : 3, "count" : 1 },
{ "_id" : "level_Four", "levelNum" : 4, "count" : 3 }
]
}
{
"_id" : "3",
"name" : "Third",
"levelsCount" : [
{ "_id" : "level_One", "levelNum" : 1, "count" : 1 },
{ "_id" : "level_Two", "levelNum" : 2, "count" : 3 },
{ "_id" : "level_Three", "levelNum" : 3, "count" : 2 },
{ "_id" : "level_Four", "levelNum" : 4, "count" : 3 }
]
}
Now, I need to sort these documents based on the levelNum and count fields of the levelsCount array elements. That is, if two documents both had count 5 for levelNum: 1 (level_One), then the sort would go on to compare the count for levelNum: 2 (level_Two), and so on.
I see how a $sort stage would work on multiple fields (something like { $sort : { level_One : 1, level_Two: 1 } }), but the problem is how to access the levelNum value of each array element and use it as a field name to sort on. (I couldn't manage it even after $unwinding the levelsCount array.)
P.S.: The initial order of the levelsCount array's elements may differ between documents and is not important.
Edit:
The expected output of the above structure would be:
// Sorted result:
{
"_id" : "2",
"name" : "Second",
"levelsCount" : [
{ "_id" : "level_One", "levelNum" : 1, "count" : 5 }, // "level_One's count: 5" is greater than "level_One's count: 1" in two other documents, regardless of other level_* fields. Therefore this whole document with "name: Second" is ordered first.
{ "_id" : "level_Two", "levelNum" : 2, "count" : 2 },
{ "_id" : "level_Three", "levelNum" : 3, "count" : 1 },
{ "_id" : "level_Four", "levelNum" : 4, "count" : 3 }
]
}
{
"_id" : "3",
"name" : "Third",
"levelsCount" : [
{ "_id" : "level_One", "levelNum" : 1, "count" : 1 },
{ "_id" : "level_Two", "levelNum" : 2, "count" : 3 }, // "level_Two's count" in this document exists with value (3) while the "level_Two" doesn't exist in the below document which mean (0) value for count. So this document with "name: Third" is ordered higher than the below document.
{ "_id" : "level_Three", "levelNum" : 3, "count" : 2 },
{ "_id" : "level_Four", "levelNum" : 4, "count" : 3 }
]
}
{
"_id" : "1",
"name" : "First",
"levelsCount" : [
{ "_id" : "level_One", "levelNum" : 1, "count" : 1 },
{ "_id" : "level_Three", "levelNum" : 3, "count" : 1 },
{ "_id" : "level_Four", "levelNum" : 4, "count" : 8 }
]
}
Of course, I'd prefer to have the output documents in the format below, but the first problem is sorting all the docs:
{
"_id" : "1",
"name" : "First",
"levelsCount" : [
{ "level_One" : 1 },
{ "level_Three" : 1 },
{ "level_Four" : 8 }
]
}
You can sort by levelNum in descending order and count in ascending order:
db.collection.aggregate([
    {
        $sort: {
            "levelsCount.levelNum": -1,
            "levelsCount.count": 1
        }
    }
])
For the key-value format of the levelsCount array:
use $map to iterate over the levelsCount array
build a { k, v } pair from each element (the level name as key, its count as value) and convert it to an object using $arrayToObject
{
    $addFields: {
        levelsCount: {
            $map: {
                input: "$levelsCount",
                in: {
                    $arrayToObject: [
                        [{ k: "$$this._id", v: "$$this.count" }]
                    ]
                }
            }
        }
    }
}
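For reference, the two stages can be combined into a single pipeline. This is only a sketch - db.collection stands in for your actual collection name - and the $sort has to come before the reshaping, because levelNum and count no longer exist afterwards:
db.collection.aggregate([
    {
        $sort: {
            "levelsCount.levelNum": -1,
            "levelsCount.count": 1
        }
    },
    {
        $addFields: {
            levelsCount: {
                $map: {
                    input: "$levelsCount",
                    in: {
                        $arrayToObject: [
                            [{ k: "$$this._id", v: "$$this.count" }]
                        ]
                    }
                }
            }
        }
    }
])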

Find MongoDB docs where all sub-docs match criteria

I have some Product documents that each contain a list of ProductVariation sub-documents. I need to find all the Product docs where ALL their child ProductVariation docs have zero quantity.
Schemas look like this:
var mongoose = require('mongoose');

var ProductVariation = new mongoose.Schema({
    type: String,
    quantity: Number,
    price: Number
});

var Product = new mongoose.Schema({
    name: String,
    variations: [ProductVariation]
});
I am a little new to MongoDB, so I'm not even sure where to start here.
Try using $not wrapped around { "$gt" : 0 }:
> db.products.find()
{ "_id" : ObjectId("5b7cae558ff28edda6ba4a67"), "name" : "widget", "variations" : [ { "type" : "color", "quantity" : 0, "price" : 10 }, { "type" : "size", "quantity" : 0, "price" : 5 } ] }
{ "_id" : ObjectId("5b7cae678ff28edda6ba4a68"), "name" : "foo", "variations" : [ { "type" : "color", "quantity" : 2, "price" : 15 }, { "type" : "size", "quantity" : 0, "price" : 5 } ] }
{ "_id" : ObjectId("5b7cae7f8ff28edda6ba4a69"), "name" : "bar", "variations" : [ { "type" : "color", "quantity" : 0, "price" : 15 }, { "type" : "size", "quantity" : 1, "price" : 5 } ] }
> db.products.find({"variations.quantity": { "$not" : { "$gt" : 0 } } })
{ "_id" : ObjectId("5b7cae558ff28edda6ba4a67"), "name" : "widget", "variations" : [ { "type" : "color", "quantity" : 0, "price" : 10 }, { "type" : "size", "quantity" : 0, "price" : 5 } ] }
It can also take advantage of an index on { "variations.quantity" : 1 }.
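If you are issuing the query through Mongoose rather than the shell, a minimal sketch might look like this (the model registration and the promise handling are assumptions for illustration, not part of the original question):
// Register a model for the Product schema defined above (name is illustrative).
var ProductModel = mongoose.model('Product', Product);

// Find products where no variation has a quantity greater than 0.
ProductModel.find({ 'variations.quantity': { $not: { $gt: 0 } } })
    .exec()
    .then(function (products) {
        console.log(products);
    });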

Filling in with documents with default values after find/aggregate

I have a collection:
{ "name" : "A", "value" : 1, "date" : ISODate("2014-01-01T00:00:00.000Z") }
{ "name" : "B", "value" : 7, "date" : ISODate("2014-01-01T00:00:00.000Z") }
{ "name" : "A", "value" : 3, "date" : ISODate("2014-01-02T00:00:00.000Z") }
{ "name" : "B", "value" : 8, "date" : ISODate("2014-01-02T00:00:00.000Z") }
{ "name" : "B", "value" : 8, "date" : ISODate("2014-01-03T00:00:00.000Z") }
{ "name" : "A", "value" : 5, "date" : ISODate("2014-01-04T00:00:00.000Z") }
{ "name" : "A", "value" : 4, "date" : ISODate("2014-01-05T00:00:00.000Z") }
The document for A on 3rd Jan 2014 is not available. When I do a find/aggregate on A, I would like that document to appear in my result set with a default value (or, better, the same value as the previous date). For example:
{ "name" : "A", "value" : 1, "date" : ISODate("2014-01-01T00:00:00.000Z") }
{ "name" : "A", "value" : 3, "date" : ISODate("2014-01-02T00:00:00.000Z") }
{ "name" : "A", "value" : 3 (or default value -1), "date" : ISODate("2014-01-03T00:00:00.000Z") }
{ "name" : "A", "value" : 5, "date" : ISODate("2014-01-04T00:00:00.000Z") }
{ "name" : "A", "value" : 4, "date" : ISODate("2014-01-05T00:00:00.000Z") }
How can this be done?
One thing you need in order to be able to do this in the aggregation framework is an array of the dates that you want your report to cover. For example, for the input that you show, you might have an array:
days = [ ISODate("2014-01-01T00:00:00Z"), ISODate("2014-01-02T00:00:00Z"),
ISODate("2014-01-03T00:00:00Z"), ISODate("2014-01-04T00:00:00Z"),
ISODate("2014-01-05T00:00:00Z"), ISODate("2014-01-06T00:00:00Z") ];
to indicate that you want every one of these six days represented.
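If you don't want to type the array out by hand, you can generate it in the shell with something like this (a sketch covering the same six days as above):
var days = [];
var d = ISODate("2014-01-01T00:00:00Z");
var end = ISODate("2014-01-06T00:00:00Z");
while (d <= end) {
    days.push(d);
    d = new Date(d.getTime() + 24 * 60 * 60 * 1000);  // advance one day
}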
Here is the aggregation that you would run:
db.coll.aggregate( [
    {$group : {_id:{name:"$name",date:"$date"}, value:{$sum:"$value"}}},
    {$group : {_id:"$_id.name", days:{$addToSet:"$_id.date"}, docs:{$push:"$$ROOT"}}},
    {$project : {missingDays:{$setDifference:[days,"$days"]}, docs:1}},
    {$unwind : "$missingDays"},
    {$unwind : "$docs"},
    {$group : {
        _id:"$_id",
        days:{$addToSet:{date:"$docs._id.date",value:"$docs.value"}},
        missingDays:{$addToSet:{date:"$missingDays",value:{$literal:0}}}
    }},
    {$project : {_id:0, name:"$_id", date:{$setUnion:["$days","$missingDays"]}}},
    {$unwind : "$date"},
    {$sort : {date:1,name:1}}
] )
On your sample input with days defined as above it outputs:
{ "name" : "A", "date" : { "date" : ISODate("2014-01-01T00:00:00Z"), "value" : 1 } }
{ "name" : "A", "date" : { "date" : ISODate("2014-01-02T00:00:00Z"), "value" : 3 } }
{ "name" : "A", "date" : { "date" : ISODate("2014-01-03T00:00:00Z"), "value" : 0 } }
{ "name" : "A", "date" : { "date" : ISODate("2014-01-04T00:00:00Z"), "value" : 5 } }
{ "name" : "A", "date" : { "date" : ISODate("2014-01-05T00:00:00Z"), "value" : 4 } }
{ "name" : "A", "date" : { "date" : ISODate("2014-01-06T00:00:00Z"), "value" : 0 } }
{ "name" : "B", "date" : { "date" : ISODate("2014-01-01T00:00:00Z"), "value" : 7 } }
{ "name" : "B", "date" : { "date" : ISODate("2014-01-02T00:00:00Z"), "value" : 8 } }
{ "name" : "B", "date" : { "date" : ISODate("2014-01-03T00:00:00Z"), "value" : 8 } }
{ "name" : "B", "date" : { "date" : ISODate("2014-01-04T00:00:00Z"), "value" : 0 } }
{ "name" : "B", "date" : { "date" : ISODate("2014-01-05T00:00:00Z"), "value" : 0 } }
{ "name" : "B", "date" : { "date" : ISODate("2014-01-06T00:00:00Z"), "value" : 0 } }
The first $group stage may not be necessary in your case - it's there in case there are multiple documents for the same name and date, in which case you want to add up their values. The second $group and the $project stage figure out the difference between the days present for each name and the array of days you want covered, creating missingDays, which will be given the value 0 in the next $group stage. That $group stage creates, for each name, an array of dates that have data and an array of missing dates that don't. It structures them the same way so that the following $project stage can create a union of them using the $setUnion operator. After that, all that's left is to $unwind the array of dates and sort it whichever way you want.
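To see what $setDifference contributes here, a stripped-down sketch (using the days array defined above) shows just the missing dates per name:
db.coll.aggregate([
    {$group : {_id:"$name", days:{$addToSet:"$date"}}},
    {$project : {missingDays:{$setDifference:[days, "$days"]}}}
])
// For "A" this would yield missingDays = [ 2014-01-03, 2014-01-06 ];
// for "B", missingDays = [ 2014-01-04, 2014-01-05, 2014-01-06 ].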

Why does the mapper in MapReduce sometimes generate more documents than the original data in MongoDB?

I am performing a state-wise population count and getting extra documents along with the expected output. Checking the reason, I found that the mapper generates far more intermediate data than there is in the original collection. How can I resolve this? The total count of documents in the source collection is 29468.
Sample from the Dataset:
{ "city" : "SPLENDORA", "loc" : [ -95.199308, 30.232609 ], "pop" : 11287, "state" : "TX", "_id" : "77372" }
{ "city" : "SPRING", "loc" : [ -95.377329, 30.053241 ], "pop" : 33118, "state" : "TX", "_id" : "77373" }
{ "city" : "TOMBALL", "loc" : [ -95.62006, 30.073923 ], "pop" : 19801, "state" : "TX", "_id" : "77375" }
{ "city" : "WILLIS", "loc" : [ -95.497583, 30.432025 ], "pop" : 9988, "state" : "TX", "_id" : "77378" }
{ "city" : "KLEIN", "loc" : [ -95.528481, 30.023377 ], "pop" : 35275, "state" : "TX", "_id" : "77379" }
{ "city" : "CONROE", "loc" : [ -95.492392, 30.225725 ], "pop" : 1635, "state" : "TX", "_id" : "77384" }
map function:
var m=function(){ emit(this.city,this.pop);}
reduce function:
var r=function(c,p){ return p;}
MapReduce output to a new collection:
{ "_id" : "81080", "value" : 172 }
{ "_id" : "81250", "value" : 467 }
{ "_id" : "82057", "value" : 60 }
{ "_id" : "95411", "value" : 133 }
{ "_id" : "95414", "value" : 226 }
{ "_id" : "95440", "value" : 2876 }
{ "_id" : "95455", "value" : 843 }
{ "_id" : "95467", "value" : 328 }
{ "_id" : "95489", "value" : 358 }
{ "_id" : "95495", "value" : 367 }
{ "_id" : "98791", "value" : 5345 }
{ "_id" : "PLEASANT GROVE", "value" : [ 8458, 15703, 80, 772,
{ "_id" : "POINTBLANK", "value" : 2911 }
{ "_id" : "PORTER", "value" : [ 13541, 19024, 985, 425, 2705 ]
{ "_id" : "SHEPHERD", "value" : [ 9604, 17397, 2078 ] }
{ "_id" : "SPLENDORA", "value" : 11287 }
{ "_id" : "SPRING", "value" : [ 33118, 8379, 21805, 8540 ] }
{ "_id" : "TOMBALL", "value" : 19801 }
{ "_id" : "WILLIS", "value" : [ 9988, 2769, 2574 ] }
{ "_id" : "KLEIN", "value" : 35275 }
Your output isn't as expected because your reduce function is incorrect. The prototype for a reduce function is function(key,values) {...}, where values is an array associated with the key.
Your reduce function is returning the values array rather than reducing it.
To sum up the values for a given key, your reduce() function should look like:
var r = function(key, values) {
    return Array.sum(values);
};
If you want to calculate population by state, your map() function is also incorrect: you should be emitting the state & population instead of city & population:
var m = function() {
    emit(this.state, this.pop);
};
Putting that together, your output should end up looking like:
{
"_id" : "AK",
"value" : 550043
},
{
"_id" : "AL",
"value" : 4040587
},
{
"_id" : "AR",
"value" : 2350725
}
...
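For reference, here is a minimal sketch of running the whole corrected job end to end (the collection names zips and state_population are assumptions for illustration - substitute your own):
var m = function() { emit(this.state, this.pop); };
var r = function(key, values) { return Array.sum(values); };
db.zips.mapReduce(m, r, { out: "state_population" });
db.state_population.find().sort({ _id: 1 });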
The MongoDB manual has further details on writing and testing your reduce function:
Requirements for the reduce function
Troubleshooting the reduce function

Multiple group operations using Mongo aggregation framework

Given a set of questions that have linked survey and category id:
> db.questions.find().toArray();
[
{
"_id" : ObjectId("4fda05bc322b1c95b531ac25"),
"id" : 1,
"name" : "Question 1",
"category_id" : 1,
"survey_id" : 1,
"score" : 5
},
{
"_id" : ObjectId("4fda05cb322b1c95b531ac26"),
"id" : 2,
"name" : "Question 2",
"category_id" : 1,
"survey_id" : 1,
"score" : 3
},
{
"_id" : ObjectId("4fda05d9322b1c95b531ac27"),
"id" : 3,
"name" : "Question 3",
"category_id" : 2,
"survey_id" : 1,
"score" : 4
},
{
"_id" : ObjectId("4fda4287322b1c95b531ac28"),
"id" : 4,
"name" : "Question 4",
"category_id" : 2,
"survey_id" : 1,
"score" : 7
}
]
I can find the category average with:
db.questions.aggregate(
    { $group : {
        _id : "$category_id",
        avg_score : { $avg : "$score" }
    }}
);
{
"result" : [
{
"_id" : 1,
"avg_score" : 4
},
{
"_id" : 2,
"avg_score" : 5.5
}
],
"ok" : 1
}
How can I get the average of the category averages (note this is different from simply averaging all questions)? I would assume I would need multiple group operations, but this fails:
> db.questions.aggregate(
... { $group : {
... _id : "$category_id",
... avg_score : { $avg : "$score" },
... }},
... { $group : {
... _id : "$survey_id",
... avg_score : { $avg : "$score" },
... }}
... );
{
"errmsg" : "exception: the _id field for a group must not be undefined",
"code" : 15956,
"ok" : 0
}
>
It's important to understand that the operations in the argument to aggregate() form a pipeline. This means that the input to any element of the pipeline is the stream of documents produced by the previous element in the pipeline.
In your example, your first query creates a pipeline of documents that look like this:
{
"_id" : 2,
"avg_score" : 5.5
},
{
"_id" : 1,
"avg_score" : 4
}
This means that the second element of the pipeline sees a series of documents whose only keys are "_id" and "avg_score". The keys "category_id" and "score" no longer exist in this document stream.
If you want to further aggregate on this stream, you'll have to aggregate using the keys that exist at this stage in the pipeline. Since you want to average the averages, you need to put a single constant value in the _id field, so that all of the input documents get grouped into a single result.
The following code produces the correct result:
db.questions.aggregate(
    { $group : {
        _id : "$category_id",
        avg_score : { $avg : "$score" }
    }},
    { $group : {
        _id : "all",
        avg_score : { $avg : "$avg_score" }
    }}
);
When run, it produces the following output:
{
"result" : [
{
"_id" : "all",
"avg_score" : 4.75
}
],
"ok" : 1
}
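As a quick check, 4.75 is indeed the mean of the two category averages produced by the first stage: (4 + 5.5) / 2 = 4.75.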