mongodb mapreduce can save emit into collection? - mongodb

I have a function map created, is this
// Map function for the tweets collection: for each tweet, emits the
// tweet's country code as the key, and as the value an object holding
// a tweet count of 1 plus a {hashtagText: 1} dictionary built from the
// tweet's entities.hashtags array.
var m = function() {
    // 'var' keeps hashtags function-local -- the original version
    // assigned without 'var' and leaked an implicit global.
    var hashtags = {};
    this.entities.hashtags.forEach(function(tag) {
        hashtags[tag.text] = 1;
    });
    var valor = {
        numtweets: 1,         // one tweet per emitted value
        dic_hastag: hashtags  // key name kept as-is; consumers may rely on it
    };
    // Shell-side debug output; drop once verified via an output collection.
    print(" value: " + tojson(valor));
    emit(this.place.country_code, valor);
};
I start from a collection called tweets, and the output of my map function should have a variable numtweets: 1 and a variable hashtags with the entire list of the tweet's hashtags, each with a 1.
example
numtweets: 1, hashtags: {"hast1": 1, "hast2": 2, "hast3": 1}
1.- Can I have the result saved in a collection, to prove that it works well, instead of using print?
2.- If I do have to run a mapReduce, what should the reduce function be so that it does nothing, so that executing this yields exactly the output of the map function?
db.runCommand({
mapReduce: "tweets",
map: m,
reduce: r,
query: {"place.country_code": "AD"},
out: {replace: "resultat5fi"}
});
Any suggestions, help, anything will be welcome.

I suggest you to see this.
This the way to debug map functions easily with mongodb.
first, create your map function as you did
Then define an emit function that will print (or insert into a collection)
Example of emit function that insert into a toto collection.
// Shell-only stand-in for the real emit(): instead of feeding the
// map-reduce engine it persists every (key, value) pair into the
// db.toto collection, so map output can be inspected with find().
var emit = function(key, value) {
db.toto.insert({key: key, value: value});
}
Invoke a find function and apply your map function on each record
Example :
// Drive the map function by hand: iterate every document in tweets and
// apply the map to it, so the overridden emit() above captures its
// output. NOTE(review): 'map' must name the map function under test
// (the question defines it as 'm' earlier) -- confirm before running.
var myCursor = db.tweets.find( {} );
while (myCursor.hasNext()) {
var doc = myCursor.next();
map.apply(doc);
print();
}
Then look at your toto collection
This way :
db.toto.find()

Related

mongodb mapReduce returns NaN because of minus character

My aim is to create a map function like this
map = function () { var key = {key:this.name-1}; emit(key, {count:1}); }
I pass the parameter "name-1" in Java code. But mongo takes it as "name minus 1". So the key becomes an integer, and during the map reduce I get a NaN problem.
The map actually looks like this
map = function () { var key = {key:this.name - 1}; emit(key, {count:1}); }
how to overcome this problem?
Phew finally this works.
map = function () { var key = {key:this["name-1"]}; emit(key, {count:1}); }

how to calculate count and unique count over two fields in mongo reduce function

I have a link tracking table that has (amongst other fields) track_redirect and track_userid. I would like to output both the total count for a given link, and also the unique count - counting duplicates by the user id. So we can differentiate if someone has clicked the same link 5 times.
I've tried emitting this.track_userid in both the key and values parts but can't get to grips with how to correctly access them in the reduce function.
So if I roll back to when it actually worked, I have the very simple code below - just like it would be in a 'my first mapreduce function' example
map
// map: emit a count of 1 keyed by the redirect target, skipping any
// document that has no track_redirect field.
function() {
if(this.track_redirect) {
emit(this.track_redirect,1);
}
}
reduce
// reduce: sum all the per-document counts emitted for one key.
// (for..in yields the array's indices here, so vals[i] is each count.)
function(k, vals) {
var sum = 0;
for (var i in vals) {
sum += vals[i];
}
return sum;
}
I'd like to know the correct way to emit the additional userid information and access it in the mapreduce please. or am i thinking about it in the wrong way?
in case it's not clear, I don't want to calculate the total clicks a userid has made, but to count the unique clicks of each url + userid - not counting any duplicate clicks a userid made on each link
can someone point me in the right direction please? thanks!
You can actually pass arbitrary object on the second parameter of the emit call. That means you can take advantage of this and store the userid in it. For example, your map function can look like this:
// map: for every tracked click, emit the redirect target as the key.
// The value carries both a click count of 1 and an object whose single
// property name is the clicking user's id, so duplicate users collapse
// naturally when the objects are merged during reduce.
var mapFunc = function() {
    if (!this.track_redirect) {
        return; // nothing to count for documents without a redirect
    }
    var clickers = {};
    clickers[this.track_userid] = 1;
    emit(this.track_redirect, {
        users_clicked: clickers,
        total_clicks: 1
    });
};
And your reduce function might look like this:
// reduce: fold any mix of raw map outputs and previously-reduced partial
// results into one summary with the same shape the map function emits.
var reduceFunc = function(key, values) {
    var summary = {
        users_clicked: {},
        total_clicks: 0
    };
    for (var i = 0; i < values.length; i++) {
        var partial = values[i];
        summary.total_clicks += partial.total_clicks;
        // Merge the userid-keyed objects; a duplicate userid simply
        // overwrites the existing property, which is what keeps the
        // set of users unique.
        Object.extend(summary.users_clicked, partial.users_clicked);
    }
    return summary;
};
The users_clicked property of the summary object basically stores the id of every user as a property (since you can't have duplicate properties, you can guarantee that it will store unique users). Also note that you have to be careful of the fact that some of the values passed to the reduce function can be result of a previous reduce and the sample code above takes that into account. You can find more about the said behavior in the docs here.
In order to get the unique count, you can pass in the finalizer function that gets called when the reduce phase is completed:
// finalize: runs once per key after reduce completes. Converts the
// userid-keyed object into a unique-user count (number of own
// properties) and reshapes the value for presentation.
var finalFunc = function(key, value) {
    var uniqueUsers = 0;
    for (var userId in value.users_clicked) {
        if (value.users_clicked.hasOwnProperty(userId)) {
            uniqueUsers += 1;
        }
    }
    return {
        redirect: key,
        total_clicks: value.total_clicks,
        unique_clicks: uniqueUsers
    };
};
Finally, you can execute the map reduce job like this (modify the out attribute to fit your needs):
db.users.mapReduce(mapFunc, reduceFunc, { finalize: finalFunc, out: { inline: 1 }});

How to access this._id in map function in MongoDB MapReduce?

I'm doing a MapReduce in Mongo to generate a reverse index of tokens for some documents. I am having trouble accessing document's _id in the map function.
Example document:
{
"_id" : ObjectId("4ea42a2c6fe22bf01f000d2d"),
"attributes" : {
"name" : "JCDR 50W38C",
"upi-tokens" : [
"50w38c",
"jcdr"
]
},
"sku" : "143669259486830515"
}
(The field attributes['upi-tokens'] is a list of text tokens I want to create the reverse index for.)
Map function (source of the problem):
m = function () {
// BUG (the subject of this question): inside the forEach callback,
// 'this' is no longer the document being mapped, so this._id is
// undefined -- which is why the output collection contains nulls.
// The poster's own answer below fixes it with a for..in loop.
this.attributes['upi-tokens'].forEach(
function (token) { emit(token, {ids: [ this._id ]} ); }
); }
Reduce function:
r = function (key, values) {
var results = new Array;
// BUG (acknowledged by the poster below): for..in yields the array's
// indices, so 'v' is a key, not a value -- v.ids is undefined and
// should be values[v].ids.
for (v in values) {
results = results.concat(v.ids);
}
return {ids:results};
}
MapReduce call:
db.offers.mapReduce(m, r, { out: "outcollection" } )
PROBLEM Resulting collection has null values everywhere where I'd expect an id instead of actual ObjectID strings.
Possible reason:
I was expecting the following 2 functions to be equivalent, but they aren't.
m1 = function (d) { print(d['_id']); }
m2 = function () { print(this['_id']); }
Now I run:
db.offers.find().forEach(m1)
db.offers.find().forEach(m2)
The difference is that m2 prints undefined for each document while m1 prints the ids as desired. I have no clue why.
Questions:
How do I get the _id of the current object in the map function for use in MapReduce? this._id or this['_id'] doesn't work.
Why exactly aren't m1 and m2 equivalent?
Got it to work... I made quite simple JS mistakes:
the inner forEach() in the map function seems to override the 'this' object; this is no longer the main document (which has an _id) but the iterated object inside the loop...
...or it was simply because in JS the for..in loop only returns the keys, not values, i.e.
for (v in values) {
now requires
values[v]
to access the actual array value. Duh...
The way I circumvented mistake #1 is by using for..in loop instead of ...forEach() loop in the map function:
m = function () {
// Using for..in here (instead of forEach with an inner callback)
// keeps 'this' bound to the document being mapped, so this._id is
// the document's ObjectId as intended.
for (t in this.attributes['upi-tokens']) {
var token = this.attributes['upi-tokens'][t];
emit (token, { ids: [ this._id ] });
}
}
That way "this" refers to what it needs to.
Could also do:
that = this;
this.attributes['upi-tokens'].forEach( function (d) {
...
that._id...
...
}
probably would work just fine.
Hope this helps someone.

Group By (Aggregate Map Reduce Functions) in MongoDB using Scala (Casbah/Rogue)

Here's a specific query I'm having trouble with. I'm using Lift-mongo-
records so that i can use Rogue. I'm happy to use Rogue specific
syntax , or whatever works.
While there are good examples for using javascript strings via java noted below, I'd like to know what the best practices might be.
Imagine here that there is a table like
comments {
_id
topic
title
text
created
}
The desired output is a list of topics and their count, for example
cats (24)
dogs (12)
mice (5)
So a user can see an list, ordered by count, of a distinct/group by
Here's some pseudo SQL:
SELECT [DISTINCT] topic, count(topic) as topic_count
FROM comments
GROUP BY topic
ORDER BY topic_count DESC
LIMIT 10
OFFSET 10
One approach is using some DBObject DSL like
val cursor = coll.group( MongoDBObject(
"key" -> MongoDBObject( "topic" -> true ) ,
//
"initial" -> MongoDBObject( "count" -> 0 ) ,
"reduce" -> "function( obj , prev) { prev.count += obj.c; }"
"out" -> "topic_list_result"
))
[...].sort( MongoDBObject( "created" ->
-1 )).skip( offset ).limit( limit );
Variations of the above do not compile.
I could just ask "what am I doing wrong" but I thought I could make my
confusion more acute:
can I chain the results directly or do I need "out"?
what kind of output can I expect - I mean, do I iterate over a
cursor, or the "out" param
is "cond" required?
should I be using count() or distinct()
some examples contain a "map" param...
A recent post I found which covers the java driver implies I should
use strings instead of a DSL :
http://blog.evilmonkeylabs.com/2011/02/28/MongoDB-1_8-MR-Java/
Would this be the preferred method in either casbah or Rogue?
Update: 9/23
This fails in Scala/Casbah (compiles but produces error {MapReduceError 'None'} )
val map = "function (){ emit({ this.topic }, { count: 1 }); }"
val reduce = "function(key, values) { var count = 0; values.forEach(function(v) { count += v['count']; }); return {count: count}; }"
val out = coll.mapReduce( map , reduce , MapReduceInlineOutput )
ConfiggyObject.log.debug( out.toString() )
I settled on the above after seeing
https://github.com/mongodb/casbah/blob/master/casbah-core/src/test/scala/MapReduceSpec.scala
Guesses:
I am misunderstanding the toString method and what the out.object is?
missing finalize?
missing output specification?
https://jira.mongodb.org/browse/SCALA-43 ?
This works as desired from command line:
map = function (){
emit({ this.topic }, { count: 1 });
}
reduce = function(key, values) { var count = 0; values.forEach(function(v) { count += v['count']; }); return {count: count}; };
db.tweets.mapReduce( map, reduce, { out: "results" } ); //
db.results.ensureIndex( {count : 1});
db.results.find().sort( {count : 1});
Update
The issue has now been filed as a bug at Mongo.
https://jira.mongodb.org/browse/SCALA-55
The following worked for me:
// Working Casbah group() call: key on 'topic', no condition, start the
// per-group accumulator csum at 0, and let the reduce closure bump it
// once per matching document.
val coll = MongoConnection()("comments")
val reduce = """function(obj,prev) { prev.csum += 1; }"""
val res = coll.group( MongoDBObject("topic"->true),
MongoDBObject(), MongoDBObject( "csum" -> 0 ), reduce)
res was an ArrayBuffer full of coll.T which can be handled in the usual ways.
Appears to be a bug - somewhere.
For now, I have a less-than-ideal workaround working now, using eval() (slower, less safe) ...
db.eval( "map = function (){ emit( { topic: this.topic } , { count: 1 }); } ; ");
db.eval( "reduce = function(key, values) { var count = 0; values.forEach(function(v) { count += v['count']; }); return {count: count}; }; ");
db.eval( " db.tweets.mapReduce( map, reduce, { out: \"tweetresults\" } ); ");
db.eval( " db.tweetresults.ensureIndex( {count : 1}); ");
Then I query the output table normally via casbah.

MongoDB map reduce producing different result to db.collection.find()

I have a map reduce like this:
map:
// map (question's version): emits {sent, received} per document. Note
// the emitted value has no 'entries' field, while the reduce below
// fabricates one -- the shape mismatch the accepted answer fixes.
function() {
emit(this.username, {sent:this.sent, received:this.received});
}
reduce:
// reduce (question's version): counts how many values it is handed via
// 'entries += 1'. Because reduce may be re-invoked on already-reduced
// partial results, each partial result counts as just one entry, which
// is why the final entries total (316) falls far short of the 1915
// documents matched by the equivalent find().
function(key, values) {
var result = {sent: 0, received: 0, entries:0};
values.forEach(function (value) {
result.sent += value.sent;
result.received += value.received;
result.entries += 1;
});
return result;
}
I've been monitoring the amount of entries processed in the result map, as you can see. I've found I get much lower numbers of accessed records than I should.
For my particular data set, the output is like so:
[{u'_id': u'1743', u'value': {u'received': 1406545.0, u'sent': 26251138.0, u'entries': 316.0}}]
As I'm running the map reduce with a query option, specifying a username and a date range.
If I perform the same query using db.collection.find() as follows, the count is different:
> db.entire_database.find({username: '1743', time : { $lte: ISODate('2011-08-12 12:40:00'), $gte: ISODate('2011-08-12 08:40:00') }}).count()
1915
The full map reduce query is this:
db.entire_database.mapReduce(m, r, {out: 'myoutput', query: { username: '1743', time : { $lte: ISODate('2011-08-12 12:40:00'), $gte: ISODate('2011-08-12 08:40:00') } } })
So basically, I'm unsure why the count is so radically different? Why is the find() giving me 1915, but the map reduce is 316?
Your map function needs to emit an object with the same form as the reduce function (ie. it should have an entries field set to 1). You can read more about this here.
Basically, the values that are passed to the reduce function are not necessarily the raw outputs emitted from map. Rather than being called once, the reduce function is called many times on 'groups' of values produced by map, the results of which are then combined again by being passed into a further call of the reduce function. This is what makes MapReduce horizontally scalable, because any group of emitted values can be farmed out to any server in any order before being combined later.
So I would restructure your functions slightly like this:
map:
// Fixed map: the emitted value now carries entries: 1, giving it the
// same shape as the objects the reduce function returns.
function() {
emit(this.username, {sent:this.sent, received:this.received, entries : 1});
}
reduce:
// Fixed reduce: sums the entries carried by each value instead of
// counting the values themselves, so re-reducing partial results
// preserves the true document count.
function(key, values) {
var result = {sent: 0, received: 0, entries:0};
values.forEach(function (value) {
result.sent += value.sent;
result.received += value.received;
result.entries += value.entries;
});
return result;
}