I've been trying to use MapReduce in MongoDB to do what I think is a simple procedure. I don't know if this is the right approach, or if I should even be using MapReduce. I googled the keywords I could think of and tried the docs where I thought I would have the most success - but found nothing. Maybe I'm thinking too hard about this?
I have two collections: details and gpas
details is made up of a whole bunch of documents (3+ million). The studentid element can be repeated two times, one for each year, like the following:
{ "_id" : ObjectId("4d49b7yah5b6d8372v640100"), "classes" : [1,17,19,21], "studentid" : "12345a", "year" : 1}
{ "_id" : ObjectId("4d76b7oij7s2d8372v640100"), "classes" : [2,12,19,22], "studentid" : "98765a", "year" : 1}
{ "_id" : ObjectId("4d49b7oij7s2d8372v640100"), "classes" : [32,91,101,217], "studentid" : "12345a", "year" : 2}
{ "_id" : ObjectId("4d76b7rty7s2d8372v640100"), "classes" : [1,11,18,22], "studentid" : "24680a", "year" : 1}
{ "_id" : ObjectId("4d49b7oij7s2d8856v640100"), "classes" : [32,99,110,215], "studentid" : "98765a", "year" : 2}
...
gpas has elements with the same studentid's from details. Only one entry per studentid, like this:
{ "_id" : ObjectId("4d49b7yah5b6d8372v640111"), "studentid" : "12345a", "overall" : 97, "subscore": 1}
{ "_id" : ObjectId("4f76b7oij7s2d8372v640213"), "studentid" : "98765a", "overall" : 85, "subscore": 5}
{ "_id" : ObjectId("4j49b7oij7s2d8372v640871"), "studentid" : "24680a", "overall" : 76, "subscore": 2}
...
In the end I want to have a collection with one row for each student in this format:
{ "_id" : ObjectId("4d49b7yah5b6d8372v640111"), "studentid" : "12345a", "classes_1": [1,17,19,21], "classes_2": [32,91,101,217], "overall" : 97, "subscore": 1}
{ "_id" : ObjectId("4f76b7oij7s2d8372v640213"), "studentid" : "98765a", "classes_1": [2,12,19,22], "classes_2": [32,99,110,215], "overall" : 85, "subscore": 5}
{ "_id" : ObjectId("4j49b7oij7s2d8372v640871"), "studentid" : "24680a", "classes_1": [1,11,18,22], "classes_2": [], "overall" : 76, "subscore": 2}
...
The way I was going to do this was by running MapReduce like this:
// Map step for the `details` collection: emits one value per document, keyed by
// studentid, carrying that document's classes and year. The GPA fields are
// emitted as zero placeholders.
var mapDetails = function() {
    var detailValue = {
        studentid: this.studentid,
        classes: this.classes,
        year: this.year,
        overall: 0,
        subscore: 0
    };
    emit(this.studentid, detailValue);
};
// Map step for the `gpas` collection: emits the student's overall/subscore keyed
// by studentid, with an empty classes array and year 0 (the reduce function
// checks for year 0 to recognise GPA values).
var mapGpas = function() {
    var gpaValue = {
        studentid: this.studentid,
        classes: [],
        year: 0,
        overall: this.overall,
        subscore: this.subscore
    };
    emit(this.studentid, gpaValue);
};
// Reduce step shared by both jobs.
// Values with year 0 supply overall/subscore; any other value supplies the
// studentid and, for year 1 or 2, the matching classes array.
// Note: the returned object has classes_1/classes_2 but no `classes` or `year`
// field, so its shape differs from the values the map functions emit.
var reduce = function(key, values) {
    var merged = { studentid: "0", classes_1: [], classes_2: [], overall: 0, subscore: 0};
    for (var n = 0; n < values.length; n++) {
        var entry = values[n];
        if (entry.year == 0) {
            // GPA record: only the scores are taken from it.
            merged.overall = entry.overall;
            merged.subscore = entry.subscore;
            continue;
        }
        if (entry.year == 1) {
            merged.classes_1 = entry.classes;
        }
        if (entry.year == 2) {
            merged.classes_2 = entry.classes;
        }
        merged.studentid = entry.studentid;
    }
    return merged;
};
// Run the details job first, then the gpas job, both writing to the 'joined' collection.
// With the {reduce: ...} output action, MongoDB re-applies the reduce function to the
// existing and the new value when a key is already present in the target collection.
res = db.details.mapReduce(mapDetails, reduce, {out: {reduce: 'joined'}})
res = db.gpas.mapReduce(mapGpas, reduce, {out: {reduce: 'joined'}})
But when I run it, this is my resulting collection:
{ "_id" : "12345a", "value" : { "studentid" : "12345a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 97, "subscore" : 1 } }
{ "_id" : "98765a", "value" : { "studentid" : "98765a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 85, "subscore" : 5 } }
{ "_id" : "24680a", "value" : { "studentid" : "24680a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 76, "subscore" : 2 } }
I'm missing the classes arrays.
Also, as an aside, how do I access the elements in resulting MapReduce value element? Does MapReduce always output to value or whatever else you name it?
This is similar to a question that was asked on the MongoDB-users Google Groups.
https://groups.google.com/group/mongodb-user/browse_thread/thread/60a8b683e2626ada?pli=1
The answer references an on-line tutorial which looks similar to your example:
http://tebros.com/2011/07/using-mongodb-mapreduce-to-join-2-collections/
For more information on MapReduce in MongoDB, please see the documentation:
http://www.mongodb.org/display/DOCS/MapReduce
Additionally, there is a useful step-by-step walkthrough of how a MapReduce operation works in the "Extras" Section of the MongoDB Cookbook article titled, "Finding Max And Min Values with Versioned Documents":
http://cookbook.mongodb.org/patterns/finding_max_and_min/
Forgive me if you have already read some of the referenced documents. I have included them for the benefit of other users who may be reading this post and new to using MapReduce in MongoDB
It is important that the outputs from the 'emit' statements in the Map functions match the outputs of the Reduce function. If there is only one document output by the Map function, the Reduce function might not be run at all, and then your output collection will have mismatched documents.
I have slightly modified your map statements to emit documents in the format of your desired output, with two separate "classes" arrays.
I have also reworked your reduce statement to add new classes to the classes_1 and classes_2 arrays, only if they do not already exist.
// Map step for the `details` collection. Emits a value already shaped like the
// reduce output (classes_1 / classes_2), placing this document's classes in the
// slot that matches its year; the GPA fields are zero placeholders.
var mapDetails = function(){
    var isFirstYear = (this.year == 1);
    var isSecondYear = (this.year == 2);
    emit(this.studentid, {
        studentid: this.studentid,
        classes_1: isFirstYear ? this.classes : [],
        classes_2: isSecondYear ? this.classes : [],
        year: this.year,
        overall: 0,
        subscore: 0
    });
};
// Map step for the `gpas` collection. Emits the scores in the same shape as the
// details values: both classes arrays empty and year 0, which the reduce
// function uses to recognise a GPA value.
var mapGpas = function() {
    var scores = {
        studentid: this.studentid,
        classes_1: [],
        classes_2: [],
        year: 0,
        overall: this.overall,
        subscore: this.subscore
    };
    emit(this.studentid, scores);
};
// Reduce step shared by the details and gpas jobs.
//
// key    - the studentid the values were emitted under.
// values - objects emitted by mapDetails/mapGpas, or objects previously returned
//          by this function (MongoDB may feed reduce output back in).
// Returns {studentid, classes_1, classes_2, overall, subscore} with the class
// lists merged without duplicates.
var r = function(key, values) {
    var outs = { studentid: "0", classes_1: [], classes_2: [], overall: 0, subscore: 0};

    // Appends every item of `source` that `target` does not already contain.
    // (`class` is a reserved word in current JavaScript engines, hence `cls`.)
    var mergeUnique = function(target, source) {
        (source || []).forEach(function(cls) {
            if (target.indexOf(cls) == -1) {
                target.push(cls);
            }
        });
    };

    values.forEach(function(v){
        outs.studentid = v.studentid;
        mergeUnique(outs.classes_1, v.classes_1);
        mergeUnique(outs.classes_2, v.classes_2);
        // A GPA emit is marked with year 0. A previously reduced value has no
        // `year` field, so also accept any value that already carries non-zero
        // scores; otherwise they would be lost on a re-reduce.
        if (v.year == 0 || v.overall || v.subscore) {
            outs.overall = v.overall;
            outs.subscore = v.subscore;
        }
    });
    return outs;
};
// Run the details job, then the gpas job, both into the 'joined' collection.
// With the {reduce: ...} output action, a key that already exists in 'joined' is
// merged by calling the reduce function on the stored and the new value.
res = db.details.mapReduce(mapDetails, r, {out: {reduce: 'joined'}})
res = db.gpas.mapReduce(mapGpas, r, {out: {reduce: 'joined'}})
Running the two MapReduce operations results in the following collection, which matches your desired format:
> db.joined.find()
{ "_id" : "12345a", "value" : { "studentid" : "12345a", "classes_1" : [ 1, 17, 19, 21 ], "classes_2" : [ 32, 91, 101, 217 ], "overall" : 97, "subscore" : 1 } }
{ "_id" : "24680a", "value" : { "studentid" : "24680a", "classes_1" : [ 1, 11, 18, 22 ], "classes_2" : [ ], "overall" : 76, "subscore" : 2 } }
{ "_id" : "98765a", "value" : { "studentid" : "98765a", "classes_1" : [ 2, 12, 19, 22 ], "classes_2" : [ 32, 99, 110, 215 ], "overall" : 85, "subscore" : 5 } }
>
MapReduce always outputs documents in the form of {_id:"id", value:"value"}
There is more information available on working with sub-documents in the document titled, "Dot Notation (Reaching into Objects)":
http://www.mongodb.org/display/DOCS/Dot+Notation+%28Reaching+into+Objects%29
If you would like the output of MapReduce to appear in a different format, you will have to do that programmatically in your application.
Hopefully this will improve your understanding of MapReduce, and get you one step closer to producing your desired output collection. Good Luck!
You cannot use m/r for this since that is designed to only apply on one collection. Reading from more than one collection will break sharding compatibility and is therefore not allowed. You can do what you want with either the new aggregation framework (2.1+) or do this inside your application.
Related
I have a collection of 6-7 million event records. I have another collection of ~100,000 hourly weather records spanning the same timeframe as the event records. I am using an $aggregate pipeline with $lookup to merge in relevant weather data for each event in the event collection.
THE PROBLEM: I have been running this on the full EVENT dataset for more than 8 HOURS, with no result. I have a deadline and I'm wondering if I will get a result...ever.
PLEASE HELP
Here is a sample event record:
{
"_id" : ObjectId("5dedae8111cd89b173b00910"),
"EventType" : "P",
"Jurisdiction" : "ABCD",
"Year" : 2006,
"JulianDay" : 91,
"CallReceipt" : ISODate("2006-04-01T00:00:37Z"),
"EventClosed" : ISODate("2006-04-01T00:05:25Z"),
"FinalType" : "EFGHI",
"EventWindowStart" : ISODate("2006-04-01T00:00:00Z"),
"EventWindowEnd" : ISODate("2006-04-01T01:00:00Z")
}
Here is a weather record:
{
"_id" : ObjectId("5dc3cd909fc78c0c78a336da"),
"DATE" : ISODate("2012-01-01T00:02:00Z"),
"REPORT_TYPE" : "FM-16",
"SOURCE" : 7,
"HourlyAltimeterSetting" : "30.06",
"HourlyDewPointTemperature" : "36",
"HourlyDryBulbTemperature" : "37",
"HourlyPresentWeatherType" : "BR:1 ||",
"HourlyRelativeHumidity" : 93,
"HourlySkyConditions" : "SCT:04 7 BKN:07 15 OVC:08 33",
"HourlyStationPressure" : "29.46",
"HourlyVisibility" : "5.00",
"HourlyWetBulbTemperature" : 37,
"HourlyWindDirection" : "260",
"HourlyWindSpeed" : 5,
"REM" : "MET10101/01/12 00:02:02 SPECI KROC 010502Z 26004KT 5SM BR SCT00
7 BKN015 OVC033 03/02 A3006 RMK AO2 RTX (MP)",
"REPORT_MODE" : "hourly"
}
Here is my code, typed directly into the mongo shell:
// For every EVENTS document, attach the WEATHER documents whose DATE lies inside the
// event's [EventWindowStart, EventWindowEnd] window, then write the result to MERGED.
db.EVENTS.aggregate([
{
$lookup:
{
from: "WEATHER",
// Expose the event's window bounds to the sub-pipeline as $$start / $$end.
let : { start : "$EventWindowStart", end: "$EventWindowEnd"},
pipeline : [
// NOTE(review): range comparisons inside $expr may not be able to use the index on
// DATE (depends on server version), in which case WEATHER is scanned once per event;
// confirm with explain() on a small sample.
{ $match :
{ $expr:
{ $and:
[
{$gte: ["$DATE", "$$start"]},
{$lte: ["$DATE", "$$end"]}
]
}
}
},
// Keep only the weather fields of interest.
// NOTE(review): the sample weather record has HourlyWindSpeed, not WindSpeed; confirm
// the field name, since a non-existent field is simply omitted from the output.
{$project: {
_id : 0,
HourlyDryBulbTemperature : 1,
HourlyPrecipitation : 1,
HourlyVisibility : 1,
WindSpeed: 1
}
}
],
as: "HourlyWeatherData"
}
},
// Write the joined documents to the MERGED collection.
{$out: "MERGED" }
])
On a small test subset I get the desired output. So the code works, as far as I can tell...
Sample output:
{
"_id" : ObjectId("5dedae8111cd89b173b00910"),
"EventType" : "P",
"Jurisdiction" : "ABCD",
"Year" : 2006,
"JulianDay" : 91,
"CallReceipt" : ISODate("2006-04-01T00:00:37Z"),
"EventClosed" : ISODate("2006-04-01T00:05:25Z"),
"FinalType" : "EFGHI",
"EventWindowStart" : ISODate("2006-04-01T00:00:00Z"),
"EventWindowEnd" : ISODate("2006-04-01T01:00:00Z"),
"HourlyWeatherData" : [
{
"HourlyDryBulbTemperature" : "59",
"HourlyPrecipitation" : "0.00",
"HourlyVisibility" : "10.00"
},
{
"HourlyDryBulbTemperature" : "59",
"HourlyVisibility" : "9.94"
}
]
}
PS: I do have ascending indexes on the event window fields in EVENTS, and an ascending and descending index on the DATE in WEATHER.
I have 1000 user records in a collection, of which 459 documents have gender male and the remaining 541 have gender female
//document structure
> db.user_details.find().pretty()
{
"_id" : ObjectId("557e610d626754910f0974a4"),
"id" : 0,
"name" : "Leanne Flinn",
"email" : "leanne.flinn#unilogic.com",
"work" : "Unilogic",
"dob" : "Fri Jun 11 1965 20:50:58 GMT+0530 (IST)",
"age" : 5,
"gender" : "female",
"salary" : 35696,
"hobbies" : "Acrobatics,Meditation,Music"
}
{
"_id" : ObjectId("557e610d626754910f0974a5"),
"id" : 1,
"name" : "Edward Young",
"email" : "edward.young#solexis.com",
"work" : "Solexis",
"dob" : "Wed Feb 12 1941 16:45:53 GMT+0530 (IST)",
"age" : 1,
"gender" : "female",
"salary" : 72291,
"hobbies" : "Acrobatics,Meditation,Music"
}
{
"_id" : ObjectId("557e610d626754910f0974a6"),
"id" : 2,
"name" : "Haydee Milligan",
"email" : "haydee.milligan#dalserve.com",
"work" : "Dalserve",
"dob" : "Tue Sep 13 1994 13:45:04 GMT+0530 (IST)",
"age" : 17,
"gender" : "male",
"salary" : 20026,
"hobbies" : "Papier-Mache"
}
{
"_id" : ObjectId("557e610d626754910f0974a7"),
"id" : 3,
"name" : "Lyle Keesee",
"email" : "lyle.keesee#terrasys.com",
"work" : "Terrasys",
"dob" : "Tue Apr 25 1922 13:39:46 GMT+0530 (IST)",
"age" : 79,
"gender" : "female",
"salary" : 48032,
"hobbies" : "Acrobatics,Meditation,Music"
}
{
"_id" : ObjectId("557e610d626754910f0974a8"),
"id" : 4,
"name" : "Shea Mercer",
"email" : "shea.mercer#pancast.com",
"work" : "Pancast",
"dob" : "Mon Apr 08 1935 06:10:30 GMT+0530 (IST)",
"age" : 51,
"gender" : "male",
"salary" : 31511,
"hobbies" : "Acrobatics,Photography,Papier-Mache"
}
Number of users in each gender
> db.user_details.find({gender:'male'}).count()
459
>
> db.user_details.find({gender:'female'}).count()
541
> db.user_details.find({name:{$ne:null}}).count()
1000
> db.user_details.find({age:{$ne:null}}).count()
1000
Map reduce code
// Map step: emits one {name, age} object per user document, keyed by gender.
mapper = function(){
emit(this.gender, {name:this.name,age:this.age})
}
// Reduce step: returns the number of elements in `users` for this call.
// Note that it returns a number while the mapper emits {name, age} objects, and that a
// number returned by an earlier call and passed back in as one element of `users`
// is counted as a single element.
reducer = function(gender, users){
var res = 0;
users.forEach(function(user){
res = res + 1
})
return res;
}
// Run the job over user_details; {inline: 1} returns the results in the command
// response instead of writing them to a collection.
db.user_details.mapReduce(mapper, reducer, {out: {inline:1}})
Why map reduce result has only 112 documents? It should contain 459 and 541 for male and female respectively, isn't it?
// Map reduce result
{
"results" : [
{
"_id" : "female",
"value" : 56
},
{
"_id" : "male",
"value" : 46
}
],
"timeMillis" : 45,
"counts" : {
"input" : 1000,
"emit" : 1000,
"reduce" : 20,
"output" : 2
},
"ok" : 1
}
Note : I know this is not a proper way to use map reduce, Actually i faced some more creepy problem in map reduce. Once i get solution to this question i could solve that
Your problem here is that you have missed one of the core concepts of how mapReduce works. The relevant documentation that explains this is found here:
MongoDB can invoke the reduce function more than once for the same key. In this case, the previous output from the reduce function for that key will become one of the input values to the next reduce function invocation for that key.
And then also a bit later:
the type of the return object must be identical to the type of the value emitted by the map function
What those two statements mean is you need to use the exact same signature issued from both the mapper and the reducer functions as the reduce process will indeed get called "multiple times".
This is how mapReduce deals with large data: by not necessarily processing all of the values for a given "key" at once, but doing it in incremental "chunks".
Therefore, if all you want in the output is a "number", then all you "emit" is just a "number" as well:
// Sums `age` per gender. The emitted value and the reduce result are both plain
// numbers, so a reduce result can be fed back into another reduce call.
db.collection.mapReduce(
function() {
emit(this.gender, this.age);
},
function(key,values) {
return Array.sum( values )
},
{ "out": { "inline": 1 } }
)
Or just "count" per type:
// Counts documents per gender: every document emits 1 and the reduce step adds the
// values up, so partial sums that are reduced again still give the right total.
db.collection.mapReduce(
function() {
emit(this.gender, 1);
},
function(key,values) {
return Array.sum( values )
},
{ "out": { "inline": 1 } }
)
The point is "you need to put out the same as what you put in", as it will likely "go back in again". So whatever data you want to collect, the output structure for both mapper and reducer must be the same.
This is probably wrong.
users.forEach(function(user){
res = res + 1
})
Try this,
// NOTE(review): Array.sum only yields a count if the map function emits numbers
// (e.g. emit(this.gender, 1)); the mapper in the question emits {name, age} objects.
function(gender, users){
return Array.sum( users)
}
There is a mistake in the reduce function.
MONGODB reduce function can be called multiple times for the same KEY, so in your reduce code its getting overridden.
Also, in the map function you are emitting a document of structure { user, age}, but in the reduce function you are returning the count.
// Reduce step that returns an object {user, age}: `user` is the number of users
// seen for this gender and `age` is the sum of their ages.
//
// gender - the key the values were emitted under.
// doc    - array of values: raw emits (objects with an `age`) and/or objects
//          returned by an earlier call of this function.
var reduce = function(gender, doc) {
    var reducedVal = { user: 0, age: 0 };
    for (var idx = 0; idx < doc.length; idx++) {
        var value = doc[idx];
        // A previous reduce result already carries a numeric count in `user`;
        // a raw emit does not and therefore stands for exactly one user.
        reducedVal.user += (typeof value.user === "number") ? value.user : 1;
        reducedVal.age += (typeof value.age === "number") ? value.age : 0;
    }
    return reducedVal;
};
please check the below link as well:
http://thejackalofjavascript.com/mapreduce-in-mongodb/
This is a proper way to use map reduce(), for display gender-wise count of users
// Counts users per gender and stores the result in the `genderCount` collection:
// each document emits 1 under its gender and the reduce step sums the values.
db.yourCollectionName.mapReduce(
function(){
emit(this.gender,1);
},
function(k,v){
return Array.sum(v);
},
{out:"genderCount"}
);
// Show the stored per-gender counts.
db.genderCount.find();
I am a MongoDB newbie! I am trying to process some Twitter data. My goal is to group tweets by user and time interval (for simplicity, a daily interval) and count each user's unique hashtags on that day. My idea is to build a new DB which contains only the user, date and hashtags. Here is the data format:
> db.sampledDB.findOne()
{
"_id" : NumberLong("2334234"),
"replyid" : NumberLong(-1),
"userid" : NumberLong(21313),
"replyuserid" : NumberLong(-1),
"createdAt" : ISODate("2013-07-02T22:35:06Z"),
"tweettext" : "RT #BBCBreaking: Plane carrying Bolivia President Morales is diverted to Austria on suspicion US fugitive #Snowden is on board - Bolivian m…",
"screenName" : "x83",
"name" : "david x",
"retweetCount" : NumberLong(0),
"retweet_id" : NumberLong("12313223"),
"retweet_userid" : NumberLong(123123123),
"source" : "Twitter for Windows Phone",
"hashtags" : [
{
"start" : 106,
"end" : 114,
"text" : "Snowden"
}
],
"mentions" : [
{
"start" : 3,
"end" : 15,
"id" : NumberLong(876678),
"screenName" : "BBCBreaking",
"name" : "BBC Breaking News"
}
],
"media" : [ ]
}
I use mapReduce like this:
MAP:
// Map step: emits the tweet's whole hashtags array (objects with start/end/text)
// under a {userid, date} key, so tweets group per user and day.
map = function(){
// format the date as year/month/day using the Date object's local-time getters
var format = this.createdAt.getFullYear() + '/' + (this.createdAt.getMonth()+1) + '/' + this.createdAt.getDate();
var key = {userid:this.userid, date:format};
emit(key,{hashtags:this.hashtags}); }
REDUCE:
// Reduce step: collects one entry per value into result.a.
// Note that it reads values[idx].hashtag, while the map function emits objects whose
// key is `hashtags`, and that it returns {a: [...]}, a different shape from the
// emitted {hashtags: [...]} values.
reduce = function(key,values){
var result = {a:[]};
for (var idx=0;idx<values.length;idx++){
result.a.push(values[idx].hashtag);
}
return result};
it results to:
{
"_id" : {
"userid" : NumberLong(7686787),
"date" : "2013/7/5"
},
"value" : {
"hashtag" : [
{
"start" : 24,
"end" : 44,
"text" : "SıkSöylenenYalanlar"
},
{
"start" : 45,
"end" : 60,
"text" : "ZimmermanTrial"
},
{
"start" : 61,
"end" : 84,
"text" : "ZaynMalikYouArePerfect"
},
{
"start" : 85,
"end" : 99,
"text" : "TrayvonMartin"
},
{
"start" : 100,
"end" : 110,
"text" : "Wimbledon"
},
{
"start" : 111,
"end" : 118,
"text" : "Футбол"
},
{
"start" : 119,
"end" : 127,
"text" : "Snowden"
},
{
"start" : 128,
"end" : 138,
"text" : "TFFistifa"
}
]
}
},
{
"_id" : {
"userid" : NumberLong(45666),
"date" : "2013/7/5"
},
"value" : {
"hashtag" : [
{
"start" : 24,
"end" : 44,
"text" : "SıkSöylenenYalanlar"
},
{
"start" : 45,
"end" : 60,
"text" : "ZimmermanTrial"
},
{
"start" : 61,
"end" : 84,
"text" : "ZaynMalikYouArePerfect"
},
{
"start" : 85,
"end" : 99,
"text" : "TrayvonMartin"
},
{
"start" : 100,
"end" : 110,
"text" : "Wimbledon"
},
{
"start" : 111,
"end" : 118,
"text" : "Футбол"
},
{
"start" : 119,
"end" : 127,
"text" : "Snowden"
},
{
"start" : 128,
"end" : 138,
"text" : "TFFistifa"
}
]
}
},
But I just want to keep the text element of hashtags. I tried to change the reducer to values[idx].hashtag.text or values[idx].hashtag["text"] which did not help.
UPDATE:
I suspect my problem is similar to MapReduce problem, but I dont know to fix mine
You might also consider using the aggregation framework which can produce the results shown below. The pipeline would look similar to this:
// Per user and calendar day, collect the distinct hashtag texts of that user's tweets.
db.sampledDB.aggregate([
    // Keep only the fields the later stages need: reduce each hashtag to its text
    // and split createdAt into year/month/day so the time of day is ignored.
    {$project: {
        userid: "$userid",
        "hashtags": "$hashtags.text",
        date: {
            year: { $year: "$createdAt" },
            month: { $month: "$createdAt"},
            day: {$dayOfMonth: "$createdAt"} }}},
    // One document per hashtag text.
    {$unwind: "$hashtags" },
    // Group by (date, userid); $addToSet keeps each hashtag only once.
    { $group: { _id : {
            date: "$date",
            userid: "$userid"},
        hashtags: { $addToSet:"$hashtags" }
    }}
])
Might produce a result like:
[
{
"_id" : {
"date" : {
"year" : 2013,
"month" : 8,
"day" : 4
},
"userid" : NumberLong(362337301)
},
"hashtags" : [
"tagger",
"stackoverflow",
"twitter"
]
}, /* more */
A brief explanation of the aggregation framework pipeline:
Using $project, grab only the fields that will matter through the rest of the pipeline. Before doing this, if there was a specific date or range that would have been desired, using $match would have been a great step to filter some of the results efficiently). Note that the createdAt field has been split into the respective pieces so that the time of day will later be ignored when grouping. After the projection has occurred, the new field will be called date in the example. Here, the hash tags have been simplified to be only the text property, and the name reused as "hashtags".
Next, as "hashtags" is an array at this point (would look like: ['tagger', 'stackoverflow', 'twitter'] for example, the pipeline creates a new document for each element in the "hashtag" array.
Finally, the grouping pipeline operator uses the combination of userid and date as a grouper, and adds all unique hash tags to a field called "hashtags".
As an alternative to splitting the date, you can also just treat the createdAt field as a string, and remove the time by using this in the pipeline:
date: {$substr: ["$createdAt",0, 10] }
It would produce something like:
2013-07-02
Edit
As you've pointed out, there is currently a 16MB limit on the document that is output from an aggregation. While this is scheduled to be changed in the 2.6 version of MongoDB, you may be able to get a MapReduce to work as well. It's a bit messier given that MapReduce wasn't necessarily intended for this type of work, so the results may not be exactly what you want.
// Map step: emits one value per hashtag of the tweet (the hashtag's text), keyed by
// {userid, date}, where date is "year/month/day" built from createdAt with the
// Date object's local-time getters. Tweets without hashtags emit nothing.
// Declared with `var` so that defining it does not rely on an implicit global.
var map = function() {
    var format = this.createdAt.getFullYear() + '/'
        + (this.createdAt.getMonth()+1) + '/' + this.createdAt.getDate();
    var key = {userid:this.userid, date:format};
    var hashtags = this.hashtags || [];
    for(var i=0, l=hashtags.length; i < l; i++) {
        emit(key, hashtags[i].text);
    }
};
// Reduce step: returns the distinct hashtags for the key as one comma-joined string.
//
// key    - the {userid, date} key.
// values - hashtag strings; each is either a single tag emitted by map or a
//          comma-joined list returned by an earlier call of this function.
var reduce = function(key, values){
    var list = values || [];
    // Used as a set; created without a prototype so that any tag text is a safe key.
    var tags = Object.create(null);
    for (var i = 0, l = list.length; i < l; i++) {
        // Split so that a previous reduce result ("a,b") contributes its individual
        // tags instead of being treated as one tag named "a,b".
        var parts = String(list[i] || "").split(',');
        for (var j = 0; j < parts.length; j++) {
            if (parts[j].length > 0) {
                tags[parts[j]] = "";
            }
        }
    }
    var unique = [];
    for (var t in tags) {
        unique.push(t);
    }
    return unique.join(',');
};
Instead of emitting the array, it emits each hash tag in the map. The reduce eliminates duplicates using a simple associative array and then returns a joined string with all of the hash tags. MongoDB does not support returning an array of results via the reduce function (the idea is that a reduce should be providing one result, not an array of results).
Results:
{
"_id" : {
"userid" : NumberLong(262317302),
"date" : "2013/7/2"
},
"value" : "Wisconsin,Space,Cheese"
}
If you don't need to do this work frequently, you could also just write a shell script in the MongoDB console that extracts the hash tags into a new collection. Then, just run it when you need to.
here is how I managed to produce the same result as the answer above. just for presenting another solution.
// Map step: emits a single {hashtags: [text, ...]} value per tweet, keyed by
// {userid, date}, where date is "year/month/day" built from createdAt with the
// Date object's local-time getters. A tweet without a hashtags array emits an
// empty list instead of throwing.
var map = function(){
    var day = this.createdAt.getFullYear() + '/' + (this.createdAt.getMonth()+1) + '/' + this.createdAt.getDate();
    var key = {userid:this.userid, date:day};
    var tweetTags = this.hashtags || [];
    var values = {hashtags:[]};
    for (var idx=0; idx<tweetTags.length; idx++){
        values.hashtags.push(tweetTags[idx].text);
    }
    emit(key,values);
};
// Reduce step: merges the hashtag lists of all values for a {userid, date} key into
// one {hashtags: [...]} object, keeping each hashtag only once. The result has the
// same shape as the emitted values, so it can be reduced again.
var reduce = function(key,values){
    var hashtag_list = {hashtags: []};
    for (var i = 0; i < values.length; i++) {
        var tags = values[i].hashtags || [];
        for (var j = 0; j < tags.length; j++) {
            if (hashtag_list.hashtags.indexOf(tags[j]) === -1) {
                hashtag_list.hashtags.push(tags[j]);
            }
        }
    }
    return hashtag_list;
};
Try:
values[idx].text
hashtag is not a property of the object, but text is.
In my database, I have millions of documents. Each of them has a time stamp. Some have the same time stamp. I want to get some points (a few hundreds or potentially more like thousands) to draw a graph. I don't want all the points. I want every n points I pick 1 point. I know there's aggregation framework and I tried that. The problem with that is since my data is huge. When I do aggregation work, The result exceeds document maximum size, 16MB, easily. There's also a function called skip in mongodb but it only skips first n documents. Are there good ways to achieve what I want? Or is there way to make aggregation result bigger? Thanks in advance!
I'm not sure how you can do this with either A/F or M/R - just skipping so that you have (f.e.) each 10th point is not something M/R allows you to do—unless you select each point based on a random value with a 10% chance... which is probably not what you want. But that does work:
// Start from a clean target: the job below writes to the `output` collection
// (db.so.output would refer to a different collection named "so.output").
db.output.drop();
// Number of input documents, for comparison with the job's "emit" count.
db.so.find().count();
// Map step for random sampling: emits the whole document under its own _id with a
// probability of about 10%, and emits nothing otherwise.
// Declared with `var` so that defining it does not rely on an implicit global.
var map = function() {
    // Math.random() returns a value in [0, 1), so "< 0.1" is true about 10% of the time.
    if (Math.random() < 0.1) {
        emit(this._id, this);
    }
};
// Reduce step. The map function emits each document under its own _id, so all values
// for a key are that same document; return it as a single value. (A MongoDB reduce
// function must return one value of the emitted shape, not the array of values.)
var reduce = function(key, values) {
    return values[0];
};
// Run the sampling job over the `so` collection, writing the sampled documents to `output`.
db.so.mapReduce( map, reduce, { out: 'output' } );
// List the sampled documents.
db.output.find();
Which outputs something like:
{
"result" : "output",
"timeMillis" : 4,
"counts" : {
"input" : 23,
"emit" : 3,
"reduce" : 0,
"output" : 3
},
"ok" : 1,
}
> db.output.find();
{ "_id" : ObjectId("51ffc4bc16473d7b84172d85"), "value" : { "_id" : ObjectId("51ffc4bc16473d7b84172d85"), "date" : ISODate("2013-08-05T15:24:45Z") } }
{ "_id" : ObjectId("51ffc75316473d7b84172d8e"), "value" : { "_id" : ObjectId("51ffc75316473d7b84172d8e") } }
{ "_id" : ObjectId("51ffc75316473d7b84172d8f"), "value" : { "_id" : ObjectId("51ffc75316473d7b84172d8f") } }
or:
> db.so.mapReduce( map, reduce, { out: 'output' } );
{
"result" : "output",
"timeMillis" : 19,
"counts" : {
"input" : 23,
"emit" : 2,
"reduce" : 0,
"output" : 2
},
"ok" : 1,
}
> db.output.find();
{ "_id" : ObjectId("51ffc4bc16473d7b84172d83"), "value" : { "_id" : ObjectId("51ffc4bc16473d7b84172d83"), "date" : ISODate("2013-08-05T15:24:25Z") } }
{ "_id" : ObjectId("51ffc4bc16473d7b84172d86"), "value" : { "_id" : ObjectId("51ffc4bc16473d7b84172d86"), "date" : ISODate("2013-08-05T15:25:15Z") } }
Depending on a random factor.
This is a follow-up from this question, where I tried to solve this problem with the aggregation framework. Unfortunately, I have to wait before being able to update this particular mongodb installation to a version that includes the aggregation framework, so have had to use MapReduce for this fairly simple pivot operation.
I have input data in the format below, with multiple daily dumps:
"_id" : "daily_dump_2013-05-23",
"authors_who_sold_books" : [
{
"id" : "Charles Dickens",
"original_stock" : 253,
"customers" : [
{
"time_bought" : 1368627290,
"customer_id" : 9715923
}
]
},
{
"id" : "JRR Tolkien",
"original_stock" : 24,
"customers" : [
{
"date_bought" : 1368540890,
"customer_id" : 9872345
},
{
"date_bought" : 1368537290,
"customer_id" : 9163893
}
]
}
]
}
I'm after output in the following format, that aggregates across all instances of each (unique) author across all daily dumps:
{
"_id" : "Charles Dickens",
"original_stock" : 253,
"customers" : [
{
"date_bought" : 1368627290,
"customer_id" : 9715923
},
{
"date_bought" : 1368622358,
"customer_id" : 9876234
},
etc...
]
}
I have written this map function...
// Map step: for every author entry in this daily dump, emits the author's customers
// and original_stock under the author's id, with num_sold set to 1.
// Uses an index loop and a local `author` variable (no implicit global, no
// for...in over an array); a dump without authors_who_sold_books emits nothing.
function map() {
    var authors = this.authors_who_sold_books || [];
    for (var i = 0; i < authors.length; i++) {
        var author = authors[i];
        emit(author.id, {customers: author.customers, original_stock: author.original_stock, num_sold: 1});
    }
}
...and this reduce function.
// Reduce step: adds up customers.length over all values and returns only {num_sold}.
// Note that the returned object has no customers/original_stock fields, unlike the
// values the map function emits, and that `sum` and `i` are assigned without `var`.
function reduce(key, values) {
sum = 0
for (i in values)
{
// number of customers in this value
sum += values[i].customers.length
}
return {num_sold : sum};
}
However, this gives me the following output:
{
"_id" : "Charles Dickens",
"value" : {
"customers" : [
{
"date_bought" : 1368627290,
"customer_id" : 9715923
},
{
"date_bought" : 1368622358,
"customer_id" : 9876234
},
],
"original_stock" : 253,
"num_sold" : 1
}
}
{ "_id" : "JRR Tolkien", "value" : { "num_sold" : 3 } }
{
"_id" : "JK Rowling",
"value" : {
"customers" : [
{
"date_bought" : 1368627290,
"customer_id" : 9715923
},
{
"date_bought" : 1368622358,
"customer_id" : 9876234
},
],
"original_stock" : 183,
"num_sold" : 1
}
}
{ "_id" : "John Grisham", "value" : { "num_sold" : 2 } }
The even indexed documents have the customers and original_stock listed, but an incorrect sum of num_sold.
The odd indexed documents only have the num_sold listed, but it is the correct number.
Could anyone tell me what it is I'm missing, please?
Your problem is due to the fact that the format of the output of the reduce function should be identical to the format of the map function (see requirements for the reduce function for an explanation).
You need to change the code to something like the following to fix the problem, :
// Map step: for every author entry in this daily dump, emits
// {customers, original_stock, num_sold} under the author's id, where num_sold is the
// number of customers in that entry. The emitted shape is the same as the shape the
// reduce function returns.
// Uses an index loop and local variables; an entry without a customers array is
// treated as having no customers.
function map() {
    var authors = this.authors_who_sold_books || [];
    for (var i = 0; i < authors.length; i++) {
        var author = authors[i];
        var customers = author.customers || [];
        emit(author.id, {customers: customers, original_stock: author.original_stock, num_sold: customers.length});
    }
}
// Reduce step: merges all values for one author into a single
// {customers, num_sold, original_stock} object.
//
// key    - the author id.
// values - objects emitted by map or returned by an earlier call of this function.
// Returns the concatenated customers, the summed num_sold, and the original_stock
// of the first value (0 when there are no values).
function reduce(key, values) {
    var result = {customers:[] , num_sold:0, original_stock: (values.length ? values[0].original_stock : 0)};
    // Index loop with a local counter: no implicit global and no for...in over an array.
    for (var i = 0; i < values.length; i++) {
        result.num_sold += values[i].num_sold;
        // Guard against a value without customers so no `undefined` entry is appended.
        result.customers = result.customers.concat(values[i].customers || []);
    }
    return result;
}
I hope that helps.
Note : the change num_sold: author.customers.length in the map function. I think that's what you want