mapReduce gives different results in upgraded 2.2.0 - mongodb

I've been using mongodb 2.0.4 in development, and on deploying to production I failed to realize they were running 2.2.0. The mapReduce function that I was using is no longer behaving as it was under 2.0.4, and I can't figure out why.
mongodb 2.0.4 (note: M,F,T,I,C,H,R,D should total 144, and does in this example):
{
"_id" : "",
"value" : {
"tag" : "",
"networth" : 43558505,
"land" : 201837,
"alive" : 144,
"M" : 86,
"F" : 6,
"T" : 5,
"I" : 10,
"C" : 17,
"H" : 4,
"R" : 12,
"D" : 4,
"gdi" : 15
}
}
mongo 2.2.0(m+f+t+i+c+h+r+d totals up to 108, when it should total 144)
{
"_id" : "",
"value" : {
"tag" : "",
"networth" : 43558505,
"land" : 201837,
"alive" : 144,
"M" : 67,
"F" : 5,
"T" : 3,
"I" : 6,
"C" : 13,
"H" : 3,
"R" : 9,
"D" : 2,
"gdi" : 15
}
}
Here is the map/reduce functions I'm using:
// Map function
/**
 * Map step: emits one value per score document, keyed by the document's tag.
 * Runs with `this` bound to the current document.
 */
var map = function() {
    var doc = this;
    emit(doc.tag, {
        tag: doc.tag,
        networth: doc.networth,
        land: doc.land,
        alive: doc.alive,
        gdi: doc.gdi,
        gov: doc.gov
    });
};
// Reduce function
/**
 * Reduce step: sums networth/land/alive/gdi for one tag and counts documents
 * per government code.
 *
 * NOTE: this is the version the question is about. It reads `vals[i].gov`,
 * which exists on values emitted by map but not on the object this function
 * returns, so the gov counters of an already-reduced value are not carried
 * over when that value is passed to reduce again.
 *
 * @param k    the tag being reduced
 * @param vals array of values for that tag
 * @returns    the aggregated totals object
 */
var reduce = function(k, vals) {
    // Declared locally: the original assigned to an undeclared (global) variable.
    var reducedVals = { tag: k, networth: 0, land: 0, alive: 0, M: 0, F: 0, T: 0, I: 0, C: 0, H: 0, R: 0, D: 0, gdi: 0 };
    for (var i = 0; i < vals.length; i++) {
        reducedVals.networth += vals[i].networth;
        reducedVals.land += vals[i].land;
        reducedVals.alive += vals[i].alive;
        reducedVals.gdi += vals[i].gdi;
        if (vals[i].gov == "M") reducedVals.M = reducedVals.M + 1;
        if (vals[i].gov == "F") reducedVals.F = reducedVals.F + 1;
        if (vals[i].gov == "T") reducedVals.T = reducedVals.T + 1;
        if (vals[i].gov == "I") reducedVals.I = reducedVals.I + 1;
        if (vals[i].gov == "C") reducedVals.C = reducedVals.C + 1;
        if (vals[i].gov == "H") reducedVals.H = reducedVals.H + 1;
        if (vals[i].gov == "R") reducedVals.R = reducedVals.R + 1;
        if (vals[i].gov == "D") reducedVals.D = reducedVals.D + 1;
    }
    return reducedVals;
};
// Perform the map reduce
// Run the map-reduce over documents with alive == 1 and replace the
// 'alliances' collection with the output.
collection.mapReduce(map, reduce, {out: {replace : 'alliances'}, query: {"alive":1}}, function(err, collection) {
    // Report a failed run instead of silently ignoring the error argument.
    if (err) {
        console.error(err);
    }
    // Mapreduce returns the temporary collection with the results
    db.close();
});
So brief rundown.. the collection has a bunch of scores, in this format:
"alive" : 1,
"countryNumber" : 47,
"deleted" : 0,
"gdi" : 0,
"gov" : "C",
"land" : 20111,
"name" : "AtheistCelebratingXmas",
"networth" : 9793082,
"protection" : 0,
"rank" : 1,
"resetid" : 407,
"serverid" : 9,
"tag" : "Evolve",
"vacation" : 0
I'm essentially grouping by tag, summing up the networth, land and alive columns. Then checking the gov column for its value and summing up the totals of D,R,etc. Is there a particular reason I'm missing for why this won't work correctly in 2.2 vs 2.0.4? Either way, will the new aggregate commands do this more easily? I gave it a brief look over, and can get the group by tag, with sums for networth and alive columns to work--but wouldn't know where to begin with the gov column.

The shape of the object you emit from your map function must be the same as the object returned from your reduce function. This is because the results of a reduce call can get fed back into reduce when MongoDB decides to parallelize your map-reduce.
So you need to change your map to assemble the values to emit like this so they have the same structure as what's returned from reduce:
// Value to emit from map (fragment of the map body; `this` is the current document).
// It carries no "gov" field; the government code becomes a counter key instead.
var value = {
"tag" : this.tag,
"networth" : this.networth,
"land" : this.land,
"alive" : this.alive,
"gdi" : this.gdi
};
// Count this document under its own government code, e.g. value["C"] = 1.
// NOTE(review): the other gov counters are left unset here, so reduce presumably
// has to treat a missing counter as 0 — confirm when updating reduce.
value[this.gov] = 1;
and then update your reduce function accordingly.
BTW, given enough docs this would have failed in 2.0.4 as well. It's just that 2.2 uses different thresholds for when to parallelize.

I accepted JohnnyHK answer as he answered why my code was not working from one version to another; however, I felt I should post what I modified in the code to resolve the issue.
Map Function:
/**
 * Map step: emits, keyed by tag, a value with the same fields that reduce
 * returns. The counter matching the document's gov code is 1, all others 0.
 * Runs with `this` bound to the current document.
 */
var map = function() {
    var doc = this;
    var govCodes = ["M", "F", "T", "I", "C", "H", "R", "D"];
    var fields = {
        tag: doc.tag,
        networth: doc.networth,
        land: doc.land,
        alive: doc.alive,
        gdi: doc.gdi
    };
    // One counter per government code: 1 for this document's code, otherwise 0.
    for (var n = 0; n < govCodes.length; n++) {
        fields[govCodes[n]] = (doc.gov == govCodes[n]) ? 1 : 0;
    }
    emit(doc.tag, fields);
};
Reduce Function:
/**
 * Reduce step: field-wise sum of all values for one tag.
 * Every field it reads from a value is also present on the object it returns,
 * so an already-reduced value can be passed back in.
 *
 * @param k    the tag being reduced
 * @param vals array of values with numeric networth, land, alive, gdi and M..D fields
 * @returns    the summed totals object
 */
var reduce = function(k, vals) {
    // Declared locally: the original assigned to an undeclared (global) variable.
    var reducedVals = { tag: k, networth: 0, land: 0, alive: 0, M: 0, F: 0, T: 0, I: 0, C: 0, H: 0, R: 0, D: 0, gdi: 0 };
    for (var i = 0; i < vals.length; i++) {
        reducedVals.networth += vals[i].networth;
        reducedVals.land += vals[i].land;
        reducedVals.alive += vals[i].alive;
        reducedVals.gdi += vals[i].gdi;
        reducedVals.M += vals[i].M;
        reducedVals.F += vals[i].F;
        reducedVals.T += vals[i].T;
        reducedVals.I += vals[i].I;
        reducedVals.C += vals[i].C;
        reducedVals.H += vals[i].H;
        reducedVals.R += vals[i].R;
        reducedVals.D += vals[i].D;
    }
    return reducedVals;
};

Related

mongoDB mapreduce taking so long time for running 3m document

I have one collection with 3 million documents. Each document has 40 fields. Fields are like the follow .
{
"b_date" : "2016-04-05",
"d_date" : "2016-06-25",
"pos" : "MISC",
"origin" : "DXB",
"destination" : "HGA",
"pax" : 1,
"pax_1" : 2
},
{
"b_date" : "2016-04-05",
"d_date" : "2016-06-25",
"pos" : "MISC",
"origin" : "DXB",
"destination" : "HGA",
"pax" : 4,
"pax_1" : 5
},
{
"b_date" : "2016-04-05",
"d_date" : "2016-06-26",
"pos" : "MISC",
"origin" : "DXB",
"destination" : "HGA",
"pax" : 3,
"pax_1" : 3
}
Now I want to get the sum of pax and pax_1 by grouping b_date,d_date,pos,origin,destination fields.
cumulative pax is grouping of pos,origin,destination fields but cumulative pax and pax_1 should increase based on ascending order of b_date and d_date.
Expected result is.
{
"_id.dep_date" : "2016-04-05",
"_id.sale_date" : "2016-06-25",
"_id.pos" : "MISC",
"_id.origin" : "DXB",
"_id.destination" : "HGA",
"value.pax" : 5,
"value.cumulative_pax":5,
"value.pax_1" : 7,
"value.cumulative_pax_1":7,
},
{
"_id.dep_date" : "2016-04-05",
"_id.sale_date" : "2016-06-26",
"_id.pos" : "MISC",
"_id.origin" : "DXB",
"_id.destination" : "HGA",
"value.pax" : 3,
"value.cumulative_pax":8,
"value.pax_1" : 3,
"value.cumulative_pax_1":10,
}
my mapReduce code
// Sums pax / pax_1 per (pos, origin, destination, dep_date, sale_date) and adds
// a running total per (pos, origin, destination) in finalize.
db.collection.mapReduce(
    // map: one key per route and date pair, carrying that document's pax counts.
    function() {
        emit(
            {
                "pos" : this.pos,
                "origin" : this.origin,
                "destination" : this.destination,
                'dep_date': this.d_date,
                'sale_date': this.b_date
            },
            {
                'pax': this.pax,
                'pax_1': this.pax_1
            }
        );
    },
    // reduce: sum both counters; the result has the same fields as an emitted value.
    function(key, values) {
        var paxt = 0;
        var paxt_1 = 0;
        for (var i = 0; i < values.length; i++) {
            paxt += values[i].pax;
            paxt_1 += values[i].pax_1;
        }
        return {'pax': paxt, 'pax_1': paxt_1};
    },
    {
        // State shared between finalize calls: the previous key's fields and the
        // running totals.
        'scope': {
            'pos': '',
            'origin': '',
            'destination': '',
            'dep_date': '',
            'sale_date': '',
            'result': {}
        },
        // NOTE(review): the running total assumes finalize is invoked in ascending
        // key order by a single worker — confirm for your deployment.
        'finalize': function(key, value) {
            // A new (pos, origin, destination) group restarts the running totals.
            // The original condition ended in a dangling `||`, which is a syntax error.
            if (pos != key.pos ||
                origin != key.origin ||
                destination != key.destination) {
                result['cumulative_pax'] = 0;
                result['cumulative_pax_1'] = 0;
            }
            // pax is this key's own sum; cumulative_pax accumulates across keys
            // (the two assignments were swapped in the original).
            result['pax'] = value.pax;
            result['cumulative_pax'] += value.pax;
            result['pax_1'] = value.pax_1;
            result['cumulative_pax_1'] += value.pax_1;
            pos = key.pos;
            origin = key.origin;
            destination = key.destination;
            dep_date = key.dep_date;
            sale_date = key.sale_date;
            return result;
        },
        'out': 'some_collection'
    }
)
This map-reduce returns the expected values, but it takes a very long time, about 3 hours. Is that because 'b_date' and 'd_date' are stored as string-formatted dates? If not, how can I optimize it?
Aggregation is returning result within 3 minutes but I couldn't get cumulative pax by using aggregation.
Map Reduce code,
// Sums pax / pax_1 per (pos, origin, destination, dep_date, sale_date) and adds
// a running total per (pos, origin, destination) in finalize.
db.collection.mapReduce(
    // map: one key per route and date pair, carrying that document's pax counts.
    function() {
        emit(
            {
                "pos" : this.pos,
                "origin" : this.origin,
                "destination" : this.destination,
                'dep_date': this.d_date,
                'sale_date': this.b_date
            },
            {
                'pax': this.pax,
                'pax_1': this.pax_1
            }
        );
    },
    // reduce: sum both counters; the result has the same fields as an emitted value.
    function(key, values) {
        var paxt = 0;
        var paxt_1 = 0;
        for (var i = 0; i < values.length; i++) {
            paxt += values[i].pax;
            paxt_1 += values[i].pax_1;
        }
        return {'pax': paxt, 'pax_1': paxt_1};
    },
    {
        // State shared between finalize calls: the previous key's fields and the
        // running totals.
        'scope': {
            'pos': '',
            'origin': '',
            'destination': '',
            'dep_date': '',
            'sale_date': '',
            'result': {}
        },
        // NOTE(review): the running total assumes finalize is invoked in ascending
        // key order by a single worker — confirm for your deployment.
        'finalize': function(key, value) {
            // A new (pos, origin, destination) group restarts the running totals.
            // The original condition ended in a dangling `||`, which is a syntax error.
            if (pos != key.pos ||
                origin != key.origin ||
                destination != key.destination) {
                result['cumulative_pax'] = 0;
                result['cumulative_pax_1'] = 0;
            }
            // pax is this key's own sum; cumulative_pax accumulates across keys
            // (the two assignments were swapped in the original).
            result['pax'] = value.pax;
            result['cumulative_pax'] += value.pax;
            result['pax_1'] = value.pax_1;
            result['cumulative_pax_1'] += value.pax_1;
            pos = key.pos;
            origin = key.origin;
            destination = key.destination;
            dep_date = key.dep_date;
            sale_date = key.sale_date;
            return result;
        },
        'out': 'some_collection'
    }
)

MongoDB : query collection based on Date

I have a collection in MongoDB in this format
db.logins.find().pretty()
{
"cust_id" : "samueal",
"created_at" : "2011-03-09 10:31:02.765"
}
{
"cust_id" : "sade",
"created_at" : "2011-03-09 10:33:11.157"
}
{
"cust_id" : "sade",
"created_at" : "2011-03-10 10:33:35.595"
}
{
"cust_id" : "samueal",
"created_at" : "2011-03-10 10:39:06.388"
}
This is my mapReduce function
// Map step: emit a count of 1 per login document, keyed by customer id.
// Declared with `var` so it is not an implicit global.
var m = function() { emit(this.cust_id, 1); };
// Reduce step: sum the counts for one key. It returns a number, the same type
// map emits. Uses an index loop rather than for...in over an array.
var r = function (k, vals) {
    var sum = 0;
    for (var i = 0; i < vals.length; i++) {
        sum += vals[i];
    }
    return sum;
};
/**
 * Builds the query filter for the month that contains "today minus 32 days".
 *
 * @returns {{created_at: RegExp}} filter matching created_at strings that start
 *          with "YYYY-MM" for that month
 */
var q = function() {
    var cutoff = new Date();
    cutoff.setDate(cutoff.getDate() - 32);
    // getMonth() is 0-based, so pad based on the 1-based month number. The
    // original tested getMonth() < 10 and produced "010" for October.
    var monthNumber = cutoff.getMonth() + 1;
    var month = (monthNumber < 10 ? "0" : "") + monthNumber;
    var prefix = cutoff.getFullYear() + "-" + month;
    // created_at values in this collection begin with the date, so anchor the
    // pattern at the start of the string.
    return {"created_at": new RegExp("^" + prefix)};
};
res = db.logins.mapReduce(m, r, { query : q(), out : "userLoginCountMonthly" });
With this i am geting the output as
{ "_id" : "sade", "value" : 2 }
{ "_id" : "samueal", "value" : 2 }
but I need to generate a report output in this format
For Example
Name Date Logins
sade 2011-03-09 1
sade 2011-03-10 2
samueal 2011-03-09 1
samueal 2011-03-10 1
Could anybody please help me achieve this?
Edited Part
Currently I am getting the output as
{ "_id" : "dsstest 2011-03-09", "value" : 4 }
{ "_id" : "dsstest 2011-03-10", "value" : 14 }
Is it possible that i can get in this format
{ "_id" : "dsstest" , "date" : "2011-03-09", "value" : 4 }
{ "_id" : "dsstest" , "date" : "2011-03-10", "value" : 14 }
Your mapping function is insufficient as it doesn't produce key that has the date in it.
I also don't quite understand why in the sample of what you want sade gets two logins. From what you are saying you want, you should need:
/**
 * Map step: emits 1 per login, keyed by customer and calendar day.
 * The day is the part of created_at before the first space. Entries whose day
 * sorts before the hard-coded cutoff "2011-11-11" are skipped.
 */
var m = function() {
    var stamp = this.created_at;
    var day = stamp.substring(0, stamp.indexOf(' '));
    // Only entries on or after the cutoff day are counted.
    if (day >= "2011-11-11") {
        emit({cust: this.cust_id, day: day}, 1);
    }
};
/**
 * Reduce step: adds up the numeric counts for one key.
 */
var r = function(k, values) {
    return values.reduce(function(total, count) {
        return total + count;
    }, 0);
};
And to run:
> db.logins.mapReduce(m, r, { query : q(), out : "userLoginCountMonthly" });
And finally, to produce the report:
> db.userLoginCountMonthly.find().forEach(function(e){print(e._id.cust +' ' + e._id.day+' '+e.value);})
This would list the amount of logins for each user, for each day (within your search scope).

SQL to MapReduce: how sql avg function could be done in MongoDB?

How can I write the SQL avg function in MongoDB MapReduce?
I've tried the following: sum the values and divide by a count. But where should I do the division, in the reduce function or in the finalize function?
For example:
i have the following document
{
"_id" : ObjectId("511b7d1b3daee1b1446ecdfe"),
"l_linenumber" : 1,
"l_quantity" : 17,
"l_extendedprice" : 21168.23,
"l_discount" : 0.04,
"l_tax" : 0.02,
"l_returnflag" : "N",
"l_linestatus" : "O",
"l_shipdate" : ISODate("1996-03-13T03:00:00Z"),
"l_commitdate" : ISODate("1996-02-12T03:00:00Z"),
"l_receiptdate" : ISODate("1996-03-22T03:00:00Z"),
"l_shipinstruct" : "DELIVER IN PERSON",
"l_shipmode" : "TRUCK",
"l_comment" : "blithely regular ideas caj",
}
And the SQL query is:
select
l_returnflag,
l_linestatus,
sum(l_quantity) as sum_qty,
sum(l_extendedprice) as sum_base_price,
sum(l_extendedprice*(1-l_discount)) as sum_disc_price,
sum(l_extendedprice*(1-l_discount)*(1+l_tax)) as sum_charge,
avg(l_quantity) as avg_qty,
avg(l_extendedprice) as avg_price,
avg(l_discount) as avg_disc,
count(*) as count_order
from
lineitem
where
l_shipdate <= DATE_SUB('1998-12-01',INTERVAL 90 DAY)
group by
l_returnflag,
l_linestatus
order by
l_returnflag,
l_linestatus;
I done this mapreduce function:
// Question's attempt: emulate the SQL GROUP BY (l_returnflag, l_linestatus) query
// with a mapReduce command that writes its output to 'query001'.
db.runCommand({
mapreduce: "lineitem",
map: function Map() {
var dataInicial = new Date("Dec 1, 1998");
// NOTE(review): dataFinal starts as the current date and only its day-of-month
// is set (to dataInicial's day minus 90), so it is not "1998-12-01 minus 90 days".
var dataFinal = new Date();
dataFinal.setDate(dataInicial.getDate()-90);
if( this.l_shipdate<=dataFinal) {
// The avg_* fields carry raw per-document values here; reduce sums them and
// finalize divides by count_order.
emit(
{returnflag: this.l_returnflag, linestatus: this.l_linestatus},
{
sum_qty: this.l_quantity,
sum_base_price: this.l_extendedprice,
sum_disc_price: this.l_extendedprice*(1-this.l_discount),
sum_charge: this.l_extendedprice*(1-this.l_discount)*(1+this.l_tax),
avg_qty: this.l_quantity,
avg_price: this.l_extendedprice,
avg_disc: this.l_discount,
count_order: 1
}
);
}
},
// Field-wise sum of all values for a key; returns the same fields that map emits.
reduce: function(key, values) {
var ret = {sum_qty: 0, sum_base_price: 0, sum_disc_price: 0, sum_charge: 0,
avg_qty: 0, avg_price: 0, avg_disc: 0, count_order: 0};
for (var i = 0; i < values.length; i++) {
ret.sum_qty += values[i].sum_qty;
ret.sum_base_price += values[i].sum_base_price;
ret.sum_disc_price += values[i].sum_disc_price;
ret.sum_charge += values[i].sum_charge;
ret.avg_qty += values[i].avg_qty;
ret.avg_price += values[i].avg_price;
ret.avg_disc += values[i].avg_disc;
ret.count_order += values[i].count_order;
}
return ret;
},
// Turns the accumulated sums into averages.
finalize: function(key, value) {
value.avg_qty = value.avg_qty/value.count_order;
// NOTE(review): the next two lines divide value.avg_qty (already averaged on the
// line above) rather than value.avg_price / value.avg_disc.
value.avg_price = value.avg_qty/value.count_order;
value.avg_disc = value.avg_qty/value.count_order;
return value;
},
out: 'query001'
});
The results for avg_qty, avg_price and avg_disc are incorrect. What is going on? Or should the sum and the division by count be done inside the reduce function?
Here is how you do it with MapReduce:
// Map step: key by (returnflag, linestatus); the value carries the price to sum
// and a count of 1. Declared with `var` so it is not an implicit global.
var m = function () {
    emit(
        {returnflag: this.l_returnflag, linestatus: this.l_linestatus},
        {sum_base_price: this.l_extendedprice, count: 1}
    );
};
/**
 * Reduce step: sums sum_base_price and count over all values for a key.
 * Returns the same fields that map emits, so a reduced value can be reduced again.
 *
 * @param name   the key being reduced (unused)
 * @param values array of {sum_base_price, count} objects
 * @returns      {sum_base_price, count} totals
 */
var r = function (name, values) {
    var res = {sum_base_price: 0, count: 0};
    // Each `v` is already one value; the original indexed it with an undefined
    // `i` and never closed the forEach( call.
    values.forEach(function (v) {
        res.sum_base_price += v.sum_base_price;
        res.count += v.count;
    });
    return res;
};
// Finalize step: derive the average from the reduced sum and count. Doing the
// division here, after all reducing is done, keeps reduce safe to re-run.
// Declared with `var` so it is not an implicit global.
var f = function (key, res) {
    res.avg = res.sum_base_price / res.count;
    return res;
};
Then, call mapReduce:
// Cutoff date from the SQL: DATE_SUB('1998-12-01', INTERVAL 90 DAY).
var dataFinal = new Date(1998, 11, 1);   // month argument is 0-based: 11 = December
dataFinal.setDate(dataFinal.getDate() - 90);

// Filter with `query` before map runs; $lte mirrors the SQL `<=`.
db.lineitem.mapReduce( m, r,
    { finalize : f,
      out : {inline : 1},
      query: {l_shipdate: {$lte: dataFinal}}
    }
);
So you are not filtering in the map function, you are doing it in a query before calling map (that's more efficient).
In aggregation framework it would be:
// dataFinal must hold the cutoff Date (1998-12-01 minus 90 days) before this runs.
// $lte mirrors the SQL `<=`. The field is l_extendedprice, as in the sample document.
db.lineitem.aggregate( [
    {$match: {l_shipdate: {$lte: dataFinal}}},
    {$group: { _id: {returnflag: "$l_returnflag", linestatus: "$l_linestatus"},
               sum_base_price: {$sum: "$l_extendedprice"},
               avg_base_price: {$avg: "$l_extendedprice"},
               count: {$sum: 1}
             }
    }
])
Add your other fields as needed...

Unable to set query filter in a mongodb mapReduce command

I am trying to filter a mapReduce command with a query. The query does not seem to be used by the mapReduce command. When I use runCommand with the same parameters, the query filter is used. I tried with MongoDB 2.2.1 and 2.0.1.
The query of my mapReduce function is not used.
// Map step: emit the duration of every document with a positive duration under
// the single key "dur". Declared with `var` so it is not an implicit global.
var m = function () {
    if (this.duration > 0) {
        emit("dur", this.duration);
    }
};
/**
 * Reduce step: returns the arithmetic mean of `values`.
 *
 * NOTE(review): when reduce is invoked more than once for the same key (the
 * sample output reports "reduce" : 106 for a single output key), this averages
 * earlier averages rather than the raw durations. The thread is about the
 * query option, so the behaviour is kept as posted — confirm whether a
 * sum/count value plus a finalize step is wanted instead.
 *
 * @param key    the key being reduced (unused)
 * @param values array of numbers
 * @returns      the mean of values (NaN for an empty array)
 */
var r = function (key, values) {
    var sum = 0;
    for (var i = 0; i < values.length; i++) {
        sum += values[i];
    }
    // The original kept a separate counter that always ended equal to values.length.
    return sum / values.length;
};
This command doesn't work :
res = db.movies.mapReduce(m,r, {out: { inline : 1}},{query:{kinds:'Action'}});
{
"results" : [
{
"_id" : "dur",
"value" : 5148.227224559308
}
],
"timeMillis" : 1849,
"counts" : {
"input" : 105472,
"emit" : 69602,
"reduce" : 106,
"output" : 1
},
"ok" : 1,
}
This command work :
res = db.runCommand({mapReduce : "movies", map : m, reduce : r, query : {kinds:'Action'}, out : {inline:1} })
{
"results" : [
{
"_id" : "dur",
"value" : 6134.118191572414
}
],
"timeMillis" : 238,
"counts" : {
"input" : 3577,
"emit" : 2910,
"reduce" : 4,
"output" : 1
},
"ok" : 1
}
With runCommand the query is used. Any ideas ?
You need to combine the out and query options into a single object:
res = db.movies.mapReduce(m,r, {out: { inline : 1}, query: {kinds: 'Action'} });

MongoDB group by Functionalities

In MySQL
select a,b,count(1) as cnt from list group by a, b having cnt > 2;
I have to execute the group by function using having condition in mongodb.
But i am getting following error. Please share your input.
In MongoDB
> res = db.list.group({key:{a:true,b:true},
... reduce: function(obj,prev) {prev.count++;},
... initial: {count:0}}).limit(10);
Sat Jan 7 16:36:30 uncaught exception: group command failed: {
"errmsg" : "exception: group() can't handle more than 20000 unique keys",
"code" : 10043,
"ok" : 0
Once it will be executed, we need to run the following file on next.
for (i in res) {if (res[i].count>2) printjson(res[i])};
Regards,
Kumaran
MongoDB's group command is very limited in most cases. For instance:
- the result set must have fewer than 10000 unique keys (the error above shows a 20000-key limit on that server)
- it will not work in sharded environments
So it is better to use map-reduce. The query would look like this:
// First (faulty) attempt: the key is the constant {a:true,b:true} for every
// document, so nothing is grouped by the documents' own a/b values.
map = function() { emit({a:true,b:true},{count:1}); }
// Reduce step: sum the per-document counts for one key. Returns {count}, the
// same fields map emits. Declared with `var` so it is not an implicit global.
var reduce = function (k, values) {
    var result = {count: 0};
    values.forEach(function (value) {
        result.count += value.count;
    });
    return result;
};
and then
db.list.mapReduce(map,reduce,{out: { inline : 1}})
It's an untested version. Let me know if it works.
EDIT:
The earlier map function was faulty; that's why you were not getting the results. It should have been:
// Map step: group by the document's own a and b values, counting 1 per document.
// Declared with `var` so it is not an implicit global.
var map = function () {
    emit({a: this.a, b: this.b}, {count: 1});
};
Test data:
> db.multi_group.insert({a:1,b:2})
> db.multi_group.insert({a:2,b:2})
> db.multi_group.insert({a:3,b:2})
> db.multi_group.insert({a:1,b:2})
> db.multi_group.insert({a:3,b:2})
> db.multi_group.insert({a:7,b:2})
> db.multi_group.mapReduce(map,reduce,{out: { inline : 1}})
{
"results" : [
{
"_id" : {
"a" : 1,
"b" : 2
},
"value" : {
"count" : 2
}
},
{
"_id" : {
"a" : 2,
"b" : 2
},
"value" : {
"count" : 1
}
},
{
"_id" : {
"a" : 3,
"b" : 2
},
"value" : {
"count" : 2
}
},
{
"_id" : {
"a" : 7,
"b" : 2
},
"value" : {
"count" : 1
}
}
],
"timeMillis" : 1,
"counts" : {
"input" : 6,
"emit" : 6,
"reduce" : 2,
"output" : 4
},
"ok" : 1,
}
EDIT2:
Complete solution including applying having count >= 2
// Map step: group by (a, b). `_id` is emitted as a one-element array so the
// emitted value has the same shape as the object reduce returns.
var map = function () {
    emit({a: this.a, b: this.b}, {count: 1, _id: [this._id]});
};

/**
 * Reduce step: sums the counts and collects the document ids for one key.
 * Both emitted and reduced values hold `_id` as an array, so a reduced value
 * fed back into reduce is flattened correctly rather than pushed as a nested
 * array.
 *
 * @param k      the {a, b} key being reduced (unused)
 * @param values array of {count, _id: [...]} objects
 * @returns      {count, _id: [...]} with all ids concatenated
 */
var reduce = function (k, values) {
    var result = {count: 0, _id: []};
    values.forEach(function (value) {
        result.count += value.count;
        result._id = result._id.concat(value._id);
    });
    return result;
};
>db.multi_group.mapReduce(map,reduce,{out: { replace : "multi_result"}})
> db.multi_result.find({'value.count' : {$gte : 2}})
{ "_id" : { "a" : 1, "b" : 2 }, "value" : { "_id" : [ ObjectId("4f0adf2884025491024f994c"), ObjectId("4f0adf3284025491024f994f") ], "count" : 2 } }
{ "_id" : { "a" : 3, "b" : 2 }, "value" : { "_id" : [ ObjectId("4f0adf3084025491024f994e"), ObjectId("4f0adf3584025491024f9950") ], "count" : 2 } }
You should use MapReduce instead. Group has its limitations.
In future you'll be able to use the Aggregation Framework. But for now, use map/reduce.
Depends on the number of your groups, you might find a simpler and faster solution than group or MapReduce by using distinct:
// Collect every (a, b) combination that occurs more than twice.
// distinct() returns a plain array in the shell, so iterate by index; the
// original called hasNext()/next() on it as if it were a cursor.
var res = [];
var aValues = db.list.distinct('a');
var bValues = db.list.distinct('b');   // fetched once, not once per value of a
for (var i = 0; i < aValues.length; i++) {
    for (var j = 0; j < bValues.length; j++) {
        var cnt = db.list.count({'a': aValues[i], 'b': bValues[j]});
        if (cnt > 2) {
            // The original line was missing a comma and the closing parenthesis.
            res.push({'a': aValues[i], 'b': bValues[j], 'cnt': cnt});
        }
    }
}
It will be faster if you have indexes on a and b
db.list.ensureIndex({'a':1,'b':1})