MapReduce, MongoDB and node-mongodb-native - mongodb

I'm using the node-mongodb-native library to run a MapReduce on MongoDB (from node.js).
Here's my code:
var map = function() {
emit(this._id, {'count': this.count});
};
var reduce = function(key, values) {
return {'testing':1};
};
collection.mapReduce(
map,
reduce,
{
query:{ '_id': /s.*/g },
sort: {'count': -1},
limit: 10,
jsMode: true,
verbose: false,
out: { inline: 1 }
},
function(err, results) {
logger.log(results);
}
);
Two questions:
1) Basically, my reduce function is ignored. No matter what I put in it, the output remains just the result of my map function (no 'testing', in this case). Any ideas?
2) I get an error unless an index is defined on the field used for the sort (in this case - the count field). I understand this is to be expected. It seems inefficient as surely the right index would be (_id, count) and not (count), as in theory the _id should be used first (for the query), and only then the sorting should be applied to the applicable results. Am I missing something here? Is MongoDB inefficient? Is this a bug?
Thanks! :)

The reason why the reduce function is never called is due to you emitting a single value for each key so there is no reason for the reduce function to actually execute. Here is an example of how you trigger the reduce function
collection.insert([{group: 1, price:41}, {group: 1, price:22}, {group: 2, price:12}], {w:1}, function(err, r) {
// String functions
var map = function() {
emit(this.group, this.price);
};
var reduce = function(key, values) {
return Array.sum(values);
};
collection.mapReduce(
map,
reduce,
{
query:{},
// sort: {'count': -1},
// limit: 10,
// jsMode: true,
// verbose: false,
out: { inline: 1 }
},
function(err, results) {
console.log("----------- 0")
console.dir(err)
console.dir(results)
// logger.log(results);
}
);
Notice that we are emitting by the "group" key meaning there is n >= 0 entries grouped by the "group" key. Since you are emitting _id each key is unique and thus the reduce function is not needed.
http://docs.mongodb.org/manual/reference/command/mapReduce/#requirements-for-the-reduce-function

Related

Scope work strangely in mapReduce of MongoDB for the purpose of producing cumulative frequency

I have a collection called user, and I want to get cumulative frequency of number of users by date based on the _id field. The desired result should be something like that:
{
{_id: 2013-12-02, value: 10}, //upto 2013-12-02 there are 10 users
{_id: 2014-01-05, value: 20}, //upto 2014-01-05 there are totally 20 users
….
}
I try to get the above using the following mapReduce call:
db.user.mapReduce(
function(){var date = this._id.getTimestamp();
emit(new Date(date.getFullYear()+"-"+date.getMonth()+"-"+date.getDate()), 1)},
function(key, values) {cum = cum + Array.sum(values); return cum},
{out: "newUserAnalysis",
sort: {_id: 1},
scope: {cum: 0}})
But it seems that the cum variable reset to zero after the first return statement encountered in the reduce function. Why? Is there any other method to get what I want?
Many thanks.
cum should not be reset as it's a global variable in map, reduce and finalize functions during the whole mapReduce processing.
But reduce function has 3 requirements to be observed to assure processing correctly, particularly for bulky data handling since reduce function will be called repeatedly even on the same key. Normally the length of values in map function would not exceed 100. In a word, your design can't assure cum is called on the right sequence as you expect, which will produce incorrect statistics.
Following code for your reference:
// map and reduce per day then save to a collection.
db.user.mapReduce(function() {
var date = this._id.getTimestamp();
emit(new Date(date.getFullYear() + "-" + (date.getMonth() + 1) + "-"
+ date.getDate()), 1);
}, function(key, values) {
return Array.sum(values);
}, {
out : "newUserAnalysis",
sort : {
_id : 1
}
});
// Do accumulation one by one.
var cursor = db.newUserAnalysis.find().sort({_id:1});
var newValue = 0, first = true;
while (cursor.hasNext()) {
var doc = cursor.next();
newValue += doc.value;
if (first) {
first = false;
} else {
db.newUserAnalysis.update({_id:doc._id}, {$set:{value:newValue}});
}
}

findAndModify query not executing in callback to aggregation

I have an aggregation query on a students collection that is returning two sets of results
for each student like this
{ _id: 1543,
name: 'Bill Jackson',
scores: { type: 'homework', score: 38.86823689842918 } }
{ _id: 1543,
name: 'Bill Jackson',
scores: { type: 'homework', score: 15.861613903793295 } }
That's working fine. Now in the callback I want to remove one of the scores for each student. I use ugly nested conditionals below to isolate which of the two records I want to remove, and, once that's achieved I create a find and Modify query to remove the doc but there's no evidence of it getting run. Neither the error or success callback to the findAndModify are getting run, however I am able to log that I'm inside the area where the findAndModify is getting called.
Is it possible to query the db in the callback to an aggregation? If not, how should I perform an operation that persists in the db?
//aggregation query ommitted
, function(err, result) { //callbackstarts here with result of aggregation query that returns two records for each student
for (var i=0; i<result.length; i++) {
var id = result[i]['_id'];
if (id === result[i]['_id']){
if (foo && foo === result[i]['_id']){
//if we're in here, we know we need to remove score associated with this result[i]['_id']
//create findAndModify to remove the record
var query = { '_id' : result[i]['_id']}
var sort = []
var operation = { '$pull' : { 'scores.score' : result[i]['scores']['score'] } };
var options = []
console.log('this code is getting called but findAndModify not')
db.collection('students').findAndModify(query, sort, operation, options,function(err, doc) {
if(err) throw err;
if (!doc) {
console.log("record not found");
}
else {
console.log("changed doc" + doc);
}
});
}else {
var foo = result[i]['_id'] //part of logic to isolate which of two records to remove
}

mongodb get list of fields in all collections by using map reduce job

I have list of collection names in mongo database.I need a script to get all fileds names in each collections by passing collection name as argument by using map reduce job in mongo database.
This is what I have so far:
mr = db.runCommand({
"mapreduce" : "collectionname",
"map" : function() { for (var key in this) { emit(key, null); } },
"reduce" : function(key, stuff) { return null; },
"out": "collectioname" + "_keys"
})
or in one line for executing it in the mongo shell:
mr = db.runCommand({
"mapreduce" : "collectionname",
"map" : function() { for (var key in this) { emit(key, null); } },
"reduce" : function(key, stuff) { return null; },
"out": "collectioname" + "_keys"
})
This command was used to get list of fields in the collection. But this one is working only on primary one. I need to loop it (get all fields in each collection in the database). Thanks a lot.
The for loop you are looking for is:
var allCollections = db.getCollectionNames();
for (var i = 0; i < allCollections.length; ++i) {
var collectioname = allCollections[i];
// now do your map reduce for one collection here
// there are the merge or reduce options for the output collection
// in case you want to gather everything in one collection instead of:
// collectioname + '_keys'
}
save this in a script.js
Then run it:
mongo myDb script.js
or write it in one line and execute it the mongo shell:
var allCollections = db.getCollectionNames(); for (var i = 0; i < allCollections.length; ++i) { var collectioname = allCollections[i]; if (collectioname === 'system.indexes') continue; db.runCommand({ "mapreduce" : collectioname, "map" : function() { for (var key in this) { emit(key, null); } }, "reduce" : function(key, stuff) { return null; }, "out": collectioname + "_keys" }) }
But, please go thoroughly over the mapReduce page and run ALL the examples there. This should make it clear how to use the emit function to have proper results at the end. You currently emit a null for a certain key. There and the return of the reduce are the places where you should use some data and aggregate the values you want (for example the collection name, which would be a constant being passed through).

Group By (Aggregate Map Reduce Functions) in MongoDB using Scala (Casbah/Rogue)

Here's a specific query I'm having trouble with. I'm using Lift-mongo-
records so that i can use Rogue. I'm happy to use Rogue specific
syntax , or whatever works.
While there are good examples for using javascript strings via java noted below, I'd like to know what the best practices might be.
Imagine here that there is a table like
comments {
_id
topic
title
text
created
}
The desired output is a list of topics and their count, for example
cats (24)
dogs (12)
mice (5)
So a user can see an list, ordered by count, of a distinct/group by
Here's some psuedo SQL:
SELECT [DISTINCT] topic, count(topic) as topic_count
FROM comments
GROUP BY topic
ORDER BY topic_count DESC
LIMIT 10
OFFSET 10
One approach is using some DBObject DSL like
val cursor = coll.group( MongoDBObject(
"key" -> MongoDBObject( "topic" -> true ) ,
//
"initial" -> MongoDBObject( "count" -> 0 ) ,
"reduce" -> "function( obj , prev) { prev.count += obj.c; }"
"out" -> "topic_list_result"
))
[...].sort( MongoDBObject( "created" ->
-1 )).skip( offset ).limit( limit );
Variations of the above do not compile.
I could just ask "what am I doing wrong" but I thought I could make my
confusion more acute:
can I chain the results directly or do I need "out"?
what kind of output can I expect - I mean, do I iterate over a
cursor, or the "out" param
is "cond" required?
should I be using count() or distinct()
some examples contain a "map" param...
A recent post I found which covers the java driver implies I should
use strings instead of a DSL :
http://blog.evilmonkeylabs.com/2011/02/28/MongoDB-1_8-MR-Java/
Would this be the preferred method in either casbah or Rogue?
Update: 9/23
This fails in Scala/Casbah (compiles but produces error {MapReduceError 'None'} )
val map = "function (){ emit({ this.topic }, { count: 1 }); }"
val reduce = "function(key, values) { var count = 0; values.forEach(function(v) { count += v['count']; }); return {count: count}; }"
val out = coll.mapReduce( map , reduce , MapReduceInlineOutput )
ConfiggyObject.log.debug( out.toString() )
I settled on the above after seeing
https://github.com/mongodb/casbah/blob/master/casbah-core/src/test/scala/MapReduceSpec.scala
Guesses:
I am misunderstanding the toString method and what the out.object is?
missing finalize?
missing output specification?
https://jira.mongodb.org/browse/SCALA-43 ?
This works as desired from command line:
map = function (){
emit({ this.topic }, { count: 1 });
}
reduce = function(key, values) { var count = 0; values.forEach(function(v) { count += v['count']; }); return {count: count}; };
db.tweets.mapReduce( map, reduce, { out: "results" } ); //
db.results.ensureIndex( {count : 1});
db.results.find().sort( {count : 1});
Update
The issue has not been filed as a bug at Mongo.
https://jira.mongodb.org/browse/SCALA-55
The following worked for me:
val coll = MongoConnection()("comments")
val reduce = """function(obj,prev) { prev.csum += 1; }"""
val res = coll.group( MongoDBObject("topic"->true),
MongoDBObject(), MongoDBObject( "csum" -> 0 ), reduce)
res was an ArrayBuffer full of coll.T which can be handled in the usual ways.
Appears to be a bug - somewhere.
For now, I have a less-than-ideal workaround working now, using eval() (slower, less safe) ...
db.eval( "map = function (){ emit( { topic: this.topic } , { count: 1 }); } ; ");
db.eval( "reduce = function(key, values) { var count = 0; values.forEach(function(v) { count += v['count']; }); return {count: count}; }; ");
db.eval( " db.tweets.mapReduce( map, reduce, { out: \"tweetresults\" } ); ");
db.eval( " db.tweetresults.ensureIndex( {count : 1}); ");
Then I query the output table normally via casbah.

MongoDB map reduce producing different result to db.collection.find()

I have a map reduce like this:
map:
function() {
emit(this.username, {sent:this.sent, received:this.received});
}
reduce:
function(key, values) {
var result = {sent: 0, received: 0, entries:0};
values.forEach(function (value) {
result.sent += value.sent;
result.received += value.received;
result.entries += 1;
});
return result;
}
I've been monitoring the amount of entries processed in the result map, as you can see. I've found I get much lower numbers of accessed records than I should.
For my particular data set, the output is like so:
[{u'_id': u'1743', u'value': {u'received': 1406545.0, u'sent': 26251138.0, u'entries': 316.0}}]
As I'm running the map reduce with a query option, specifying a username and a date range.
If I perform the same query using db.collection.find() as follows, the count is different:
> db.entire_database.find({username: '1743', time : { $lte: ISODate('2011-08-12 12:40:00'), $gte: ISODate('2011-08-12 08:40:00') }}).count()
1915
The full map reduce query is this:
db.entire_database.mapReduce(m, r, {out: 'myoutput', query: { username: '1743', time : { $lte: ISODate('2011-08-12 12:40:00'), $gte: ISODate('2011-08-12 08:40:00') } } })
So basically, I'm unsure why the count is so radically different? Why is the find() giving me 1915, but the map reduce is 316?
Your map function needs to emit an object with the same form as the reduce function (ie. it should have an entries field set to 1). You can read more about this here.
Basically, the values that are passed to the reduce function are not necessarily the raw outputs emitted from map. Rather than being called once, the reduce function is called many times on 'groups' of values produced by map, the results of which are then combined again by being passed into a further call of the reduce function. This is what makes MapReduce horizontally scalable, because any group of emitted values can be farmed out to any server in any order before being combined later.
So I would restructure your functions slightly like this:
map:
function() {
emit(this.username, {sent:this.sent, received:this.received, entries : 1});
}
reduce:
function(key, values) {
var result = {sent: 0, received: 0, entries:0};
values.forEach(function (value) {
result.sent += value.sent;
result.received += value.received;
result.entries += value.entries;
});
return result;
}