MapReduce trouble with counting - mongodb

I've got a problem, I have data in mongodb which looks like this:
{"miejscowosci_str":"OneCity", "wojewodztwo":"FirstRegionName", "ZIP-Code" : "...", ...}
{"miejscowosci_str":"TwoCity", "wojewodztwo":"FirstRegionName", "ZIP-Code" : "...", ...}
{"miejscowosci_str":"ThreeCity", "wojewodztwo":"SecondRegionName", "ZIP-Code" : "...", ...}
{"miejscowosci_str":"FourCity", "wojewodztwo":"SecondRegionName", "ZIP-Code" : "...", ...}
and so on
What I want is to list all regions (wojewodztwo) and to count average number of zip codes per region, I know how to count all zip codes in region:
var map = function() {
emit(this.wojewodztwo,1);
};
var reduce = function(key, val) {
var count = 0;
for(i in val) {
count += val[i];
}
return count;
};
db.kodypocztowe.mapReduce(
map,
reduce,
{ out : "result" }
);
But I don't know how to count number of cities (miejscowosci_str) so I could divide number of ZIP-Codes in region through number of cities in the same region.
One city can have multiple number of zip-codes.
Have you got any ideas?

I'm making a couple of assumptions here :
cities can have multiple zip codes
zip codes are unique
you are not trying to get the answer to M101P week 5 questions !
Rather than just counting the cities in one go, why not build up a list of city/zip objects in the map phase and then reduce this to a list of zips and unique cities in the map phase. Then you can use the finalize phase to calculate the averages.
Note : if the data set is large you might want to consider using the aggregation framework instead, this is shown after the map/reduce example
db.kodypocztowe.drop();
db.result.drop();
db.kodypocztowe.insert([
{"miejscowosci_str":"OneCity", "wojewodztwo":"FirstRegionName", "ZIP-Code" : "1"},
{"miejscowosci_str":"TwoCity", "wojewodztwo":"FirstRegionName", "ZIP-Code" : "2"},
{"miejscowosci_str":"ThreeCity", "wojewodztwo":"SecondRegionName", "ZIP-Code" : "3"},
{"miejscowosci_str":"FourCity", "wojewodztwo":"SecondRegionName", "ZIP-Code" : "4"},
{"miejscowosci_str":"FourCity", "wojewodztwo":"SecondRegionName", "ZIP-Code" : "5"},
]);
// map the data to { region : [{citiy : name , zip : code }] }
// Note : a city can be in multiple zips but zips are assumed to be unique
var map = function() {
emit(this.wojewodztwo, {city:this.miejscowosci_str, zip:this['ZIP-Code']});
};
//
// convert the data to :
//
// {region : {cities: [], zips : []}}
//
// note : always add zips
// note : only add cities if they are not already there
//
var reduce = function(key, val) {
var res = {zips:[], cities:[]}
for(i in val) {
var city = val[i].city;
res.zips.push(val[i].zip);
if(res.cities.indexOf(city) == -1) {
res.cities.push(city);
}
}
return res;
};
//
// finalize the data to get the average number of zips / region
var finalize = function(key, res) {
res.average = res.zips.length / res.cities.length;
delete res.cities;
delete res.zips;
return res;
}
print("==============");
print(" map/reduce")
print("==============");
db.kodypocztowe.mapReduce(
map,
reduce,
{ out : "result" , finalize:finalize}
);
db.result.find().pretty()
print("==============");
print(" aggregation")
print("==============");
db.kodypocztowe.aggregate( [
// get the number of zips / [region,city]
{ "$group" :
{
_id : {"region" : "$wojewodztwo", city : "$miejscowosci_str"},
zips:{$sum:1}
}
},
// get the number of cities per region and sum the number of zips
{ "$group" :
{
_id : "$_id.region" ,
cities:{$sum:1},
zips:{$sum:"$zips"},
}
},
// project the data into the same format that map/reduce generated
{ "$project" :
{
"value.average":{$divide: ["$zips","$cities"]}
}
}
]);
I hope that helps.

Related

How to detect the re-reduce stage in MongoDB map/reduce?

I use the following map/reduce setup to collect some data into array:
map: function() { emit(this.key, [this.item]); },
reduce: function(key, values) {
var items = [];
values.forEach( function(value) {items.concat(value.item);} );
return items;
},
out: {reduce: "result_collection"}
I want to improve the code and detect if the resulting collection has been changed during the re-reduce stage (when mongo invokes reduce with the current content of the "result_collection").
In other words, how to know that any documents have been emitted by the Map contain "item" that does not exist in the "result_collection" yet (under the same key, of course)?
This information can help at some further processing stages e.g. query "result_collection" to get the documents that have been updated during the map/reduce stage.
If you must do this, use a finalize function to adjust the value after all reduction is finished. You'll have to add more logic to the reduce function to handle the modified output.
I'll show you an example with the simple map-reduce defined by the following map and reduce functions:
var map = function() { emit(this.k, this.v) }
var reduce = function(key, values) { return Array.sum(values) }
On documents that look like { "k" : 0, "v" : 1 }, the map-reduce defined by the above functions produces result documents that look like { "_id" : 0, "value" : 17 }. Define a finalize function to modify the final document:
var finalize = function (key, reducedValue) { return { "m" : true, "v" : reducedValue } }
Now modify reduce to handle an element of values that might be an object of the above form:
var reduce2 = function(key, values) {
var sum = 0;
for (var i = 0; i < values.length; i++) {
if (typeof values[i] == "object") { sum += values[i].v }
else { sum += values[i] }
}
return sum
}
Output looks like
{ "_id" : 0, "value" : { "m" : true, "v" : 14 } }
{ "_id" : 1, "value" : { "m" : true, "v" : 34 } }
{ "_id" : 2, "value" : { "m" : true, "v" : 8 } }
so you can tell what's been modified by value.m. Your further processing can set v.m to false so you'll see what hasn't been processed yet after each map-reduce.

MongoDB MapReduce, different results with the "same approach", what I'm missing?

I know I'm missing something with MapReduce in MongoDB. I'm trying to build a tag-frequency collection and I'm getting different results, even if it seems that map and reduce functions are the "same".
Example document (forget values 100, 45... I'm not using them):
{
...
tags: [['Rock', 100], ['Indie Pop', 45], ...]
}
Emitting a scalar value 1:
var map = function () {
if (this.tags) {
this.tags.forEach(function (tag) {
emit(tag[0], 1); // Emit just 1
});
}
};
var reduce = function (key, vals) { // Vals should be [1, ...]
return vals.length; // Count the length of the array
};
db.tracks.mapReduce(map, reduce, { out: 'mapreduce_out' });
db.mapreduce_out.find().sort({ value: -1 }).limit(3);
Output is:
{ "_id" : "rubyrigby1", "value" : 9 }
{ "_id" : "Dom", "value" : 7 }
{ "_id" : "Feel Better", "value" : 7 }
Emitting an object { count: 1 }:
var map = function () {
if (this.tags) {
this.tags.forEach(function (tag) {
emit(tag[0], { count: 1 }); // Emit an object
});
}
};
var reduce = function (key, vals) { // vals should be [{ count: 1 }, ...]
var count = 0;
vals.forEach(function (val) {
count += val.count; // Accumul
});
return { count: count };
};
db.tracks.mapReduce(map, reduce, { out: 'mapreduce_out' });
db.mapreduce_out.find().sort({ 'value.count': -1 }).limit(3);
Result is different and appears to be "right":
{ "_id" : "rock", "value" : { "count" : 9472 } }
{ "_id" : "pop", "value" : { "count" : 7103 } }
{ "_id" : "electronic", "value" : { "count" : 5727 } }
What's wrong with the first approach?
Consider a collection of a thousand documents all with the tag 'tagname':
for (var i = 0; i < 1000; i++) {
db.collection.insert({tags: [['tagname']]});
}
If I write a proper mapReduce I should get the output {"_id": "tagname", "count": 1000}. But if I use your map and reduce functions I'll get a count of 101 instead of 1000.
The reason is, MongoDB calls your reduce function repeatedly with intermediate results, in order to avoid keeping too large a batch of results in memory. You can actually see this by putting a print statement in your reduce:
var reduce = function (key, vals) {
print(vals);
return vals.length; // Count the length of the array
};
The print output appears in the server log. The reduce function is called with the first 100 1's, and it returns 100. So far so good. Then MongoDB calls it again with the first reduce's output plus the next 100 1's:
reduce([100, 1, 1, ..., 1]) // 100 plus 100 more 1's
So now it returns 101, because that's the length of the array. But clearly it should return 200, the sum of the array. So to get a correct result, change your reduce function:
reduce = function (key, vals) {
var sum = 0;
vals.forEach(function(val) { sum += val; });
return sum;
}

Mongo: count the number of word occurrences in a set of documents

I have a set of documents in Mongo. Say:
[
{ summary:"This is good" },
{ summary:"This is bad" },
{ summary:"Something that is neither good nor bad" }
]
I'd like to count the number of occurrences of each word (case insensitive), then sort in descending order. The result should be something like:
[
"is": 3,
"bad": 2,
"good": 2,
"this": 2,
"neither": 1,
"nor": 1,
"something": 1,
"that": 1
]
Any idea how to do this? Aggregation framework would be preferred, as I understand it to some degree already :)
MapReduce might be a good fit that can process the documents on the server without doing manipulation on the client (as there isn't a feature to split a string on the DB server (open issue).
Start with the map function. In the example below (which likely needs to be more robust), each document is passed to the map function (as this). The code looks for the summary field and if it's there, lowercases it, splits on a space, and then emits a 1 for each word found.
var map = function() {
var summary = this.summary;
if (summary) {
// quick lowercase to normalize per your requirements
summary = summary.toLowerCase().split(" ");
for (var i = summary.length - 1; i >= 0; i--) {
// might want to remove punctuation, etc. here
if (summary[i]) { // make sure there's something
emit(summary[i], 1); // store a 1 for each word
}
}
}
};
Then, in the reduce function, it sums all of the results found by the map function and returns a discrete total for each word that was emitted above.
var reduce = function( key, values ) {
var count = 0;
values.forEach(function(v) {
count +=v;
});
return count;
}
Finally, execute the mapReduce:
> db.so.mapReduce(map, reduce, {out: "word_count"})
The results with your sample data:
> db.word_count.find().sort({value:-1})
{ "_id" : "is", "value" : 3 }
{ "_id" : "bad", "value" : 2 }
{ "_id" : "good", "value" : 2 }
{ "_id" : "this", "value" : 2 }
{ "_id" : "neither", "value" : 1 }
{ "_id" : "or", "value" : 1 }
{ "_id" : "something", "value" : 1 }
{ "_id" : "that", "value" : 1 }
A basic MapReduce example
var m = function() {
var words = this.summary.split(" ");
if (words) {
for(var i=0; i<words.length; i++) {
emit(words[i].toLowerCase(), 1);
}
}
}
var r = function(k, v) {
return v.length;
};
db.collection.mapReduce(
m, r, { out: { merge: "words_count" } }
)
This will insert word counts into a collection name words_count which you can sort (and index)
Note that it doesn't use stemming, omit punctuation, handles stop words etc.
Also note you can optimize the map function by accumulating repeating word(s) occurrences and emitting the count, not just 1
You can use #split.
Try Below query
db.summary.aggregate([
{ $project : { summary : { $split: ["$summary", " "] } } },
{ $unwind : "$summary" },
{ $group : { _id: "$summary" , total : { "$sum" : 1 } } },
{ $sort : { total : -1 } }
]);
Old question but since 4.2 this can be done with $regexFindAll now.
db.summaries.aggregate([
{$project: {
occurences: {
$regexFindAll: {
input: '$summary',
regex: /\b\w+\b/, // match words
}
}
}},
{$unwind: '$occurences'},
{$group: {
_id: '$occurences.match', // group by each word
totalOccurences: {
$sum: 1 // add up total occurences
}
}},
{$sort: {
totalOccurences: -1
}}
]);
This will output docs in the following format:
{
_id: "matchedwordstring",
totalOccurences: number
}

Mongodb MapReduce for grouping up to n per category using Mongoid

I have a weird problem with MongoDB (2.0.2) map reduce.
So, the story goes like this:
I have Ad model (look for model source extract below) and I need to group up to n ads per category in order to have a nice ordered listing I can later use to do more interesting things.
# encoding: utf-8
class Ad
include Mongoid::Document
cache
include Mongoid::Timestamps
field :title
field :slug, :unique => true
def self.aggregate_latest_active_per_category
map = "function () {
emit( this.category, { id: this._id });
}"
reduce = "function ( key, value ) {
return { ads:v };
}"
self.collection.map_reduce(map, reduce, { :out => "categories"} )
end
All fun and games up until now.
What I expect is to get a result in a form which resembles (mongo shell for db.categories.findOne() ):
{
"_id" : "category_name",
"value" : {
"ads" : [
{
"id" : ObjectId("4f2970e9e815f825a30014ab")
},
{
"id" : ObjectId("4f2970e9e815f825a30014b0")
},
{
"id" : ObjectId("4f2970e9e815f825a30014b6")
},
{
"id" : ObjectId("4f2970e9e815f825a30014b8")
},
{
"id" : ObjectId("4f2970e9e815f825a30014bd")
},
{
"id" : ObjectId("4f2970e9e815f825a30014c1")
},
{
"id" : ObjectId("4f2970e9e815f825a30014ca")
},
// ... and it goes on and on
]
}
}
Actually, it would be even better if I could get value to contain only array but MongoDB complains about not supporting that yet, but, with later use of finalize function, that is not a big problem I want to ask about.
Now, back to problem. What actually happens when I do map reduce is that it spits out something like :
{
"_id" : "category_name",
"value" : {
"ads" : [
{
"ads" : [
{
"ads" : [
{
"ads" : [
{
"ads" : [
{
"id" : ObjectId("4f2970d8e815f825a3000011")
},
{
"id" : ObjectId("4f2970d8e815f825a3000017")
},
{
"id" : ObjectId("4f2970d8e815f825a3000019")
},
{
"id" : ObjectId("4f2970d8e815f825a3000022")
},
// ... on and on and on
... and while I could probably work out a way to use this it just doesn't look like something I should get.
So, my questions (finally) are:
Am I doing something wrong and what is it?
I there something wrong with MongoDB map reduce (I mean besides all the usual things when compared to hadoop)?
Yes, you're doing it wrong. Inputs and outputs of map and reduce should be uniform. Because they are meant to be executed in parallel, and reduce might be run over partially reduced results. Try these functions:
var map = function() {
emit(this.category, {ads: [this._id]});
};
var reduce = function(key, values) {
var result = {ads: []};
values.forEach(function(v) {
v.ads.forEach(function(a) {
result.ads.push(a)
});
});
return result;
}
This should produce documents like:
{_id: category, value: {ads: [ObjectId("4f2970d8e815f825a3000011"),
ObjectId("4f2970d8e815f825a3000019"),
...]}}

mongodb map reduce: "first/lowest" value?

I have documents like this:
{
"_id" : "someid",
"name" : "somename",
"action" : "do something",
"date" : ISODate("2011-08-19T09:00:00Z")
}
I want to map reduce them into something like this:
{
"_id" : "someid",
"value" : {
"count" : 100,
"name" : "somename",
"action" : "do something",
"date" : ISODate("2011-08-19T09:00:00Z")
"firstEncounteredDate" : ISODate("2011-07-01T08:00:00Z")
}
}
I want to group the map reduced documents by "name", "action", and "date". But every document should has this "firstEncounteredDate" containing the earliest "date" (that is actually grouped by "name" and "action").
If I group by name, action and date, firstEncounteredDate would always be date, that's why I'd like to know if there's any way to get "the earliest date" (grouped by "name", and "action" from the entire document) while doing map-reduce.
How can I do this in map reduce?
Edit: more detail on firstEncounteredDate (courtesy to #beny23)
Seems like a two-pass map-reduce would fit the bill, somewhat akin to this example: http://cookbook.mongodb.org/patterns/unique_items_map_reduce/
In pass #1, group the original "name"x"action"x"date" documents by just "name" and "action", collecting the various "date" values into a "dates" array during reduce. Use a 'finalize' function to find the minimum of the collected dates.
Untested code:
// phase i map function :
function () {
emit( { "name": this.name, "action": this.action } ,
{ "count": 1, "dates": [ this.date ] } );
}
// phase i reduce function :
function( key, values ) {
var result = { count: 0, dates: [ ] };
values.forEach( function( value ) {
result.count += value.count;
result.dates = result.dates.concat( value.dates );
}
return result;
}
// phase i finalize function :
function( key, reduced_value ) {
var earliest = new Date( Math.min.apply( Math, reduced_value.dates ) );
reduced_value.firstEncounteredDate = earliest ;
return reduced_value;
}
In pass #2, use the documents generated in pass #1 as input. For each "name"x"action" document, emit a new "name"x"action"x"date" document for each collected date, along with the now determined minimum date common to that "name"x"action" pair. Group by "name"x"action"x"date", summing up the count for each individual date during reduce.
Equally untested code:
// phase ii map function :
function() {
this.dates.forEach( function( d ) {
emit( { "name": this.name, "action": this.action, "date" : d } ,
{ "count": 1, "firstEncounteredDate" : this.firstEncounteredDate } );
}
}
// phase ii reduce function :
function( key, values ) {
// note: value[i].firstEncounteredDate should all be identical, so ...
var result = { "count": 0,
"firstEncounteredDate": values[0].firstEncounteredDate };
values.forEach( function( value ) {
result.count += value.count;
}
return result;
}
Pass #2 does not do a lot of heavy lifting, obviously -- it's mostly copying each document N times, one for each unique date. We could easily build a map of unique dates to their incidence counts during the reduce step of pass #1. (In fact, if we don't do this, there's no real point in having a "count" field in the values from pass #1.) But doing the second pass is a fairly effortless way of generating a full target collection containing the desired documents.