How could I write the SQL AVG function in MapReduce with MongoDB?
I've tried the following: sum the values and divide by a count. But the problem is where do I do it — in the reduce function or in the finalize function?
For example:
i have the following document
{
"_id" : ObjectId("511b7d1b3daee1b1446ecdfe"),
"l_linenumber" : 1,
"l_quantity" : 17,
"l_extendedprice" : 21168.23,
"l_discount" : 0.04,
"l_tax" : 0.02,
"l_returnflag" : "N",
"l_linestatus" : "O",
"l_shipdate" : ISODate("1996-03-13T03:00:00Z"),
"l_commitdate" : ISODate("1996-02-12T03:00:00Z"),
"l_receiptdate" : ISODate("1996-03-22T03:00:00Z"),
"l_shipinstruct" : "DELIVER IN PERSON",
"l_shipmode" : "TRUCK",
"l_comment" : "blithely regular ideas caj",
}
And the SQL query is:
select
l_returnflag,
l_linestatus,
sum(l_quantity) as sum_qty,
sum(l_extendedprice) as sum_base_price,
sum(l_extendedprice*(1-l_discount)) as sum_disc_price,
sum(l_extendedprice*(1-l_discount)*(1+l_tax)) as sum_charge,
avg(l_quantity) as avg_qty,
avg(l_extendedprice) as avg_price,
avg(l_discount) as avg_disc,
count(*) as count_order
from
lineitem
where
l_shipdate <= DATE_SUB('1998-12-01',INTERVAL 90 DAY)
group by
l_returnflag,
l_linestatus
order by
l_returnflag,
l_linestatus;
I wrote this MapReduce function:
db.runCommand({
mapreduce: "lineitem",
map: function Map() {
// NOTE(review): this cutoff computation is wrong. dataFinal starts as
// new Date() (today) and only its day-of-month is replaced with
// dataInicial.getDate()-90, so it is NOT "1998-12-01 minus 90 days".
var dataInicial = new Date("Dec 1, 1998");
var dataFinal = new Date();
dataFinal.setDate(dataInicial.getDate()-90);
if( this.l_shipdate<=dataFinal) {
emit(
{returnflag: this.l_returnflag, linestatus: this.l_linestatus},
{
sum_qty: this.l_quantity,
sum_base_price: this.l_extendedprice,
sum_disc_price: this.l_extendedprice*(1-this.l_discount),
sum_charge: this.l_extendedprice*(1-this.l_discount)*(1+this.l_tax),
// avg_* carry raw per-document values here; the actual averages
// are only computed in finalize, after count_order is complete.
avg_qty: this.l_quantity,
avg_price: this.l_extendedprice,
avg_disc: this.l_discount,
count_order: 1
}
);
}
},
// Reduce folds partial aggregates; it returns the same shape it receives,
// which is required because MongoDB may re-reduce its own outputs.
reduce: function(key, values) {
var ret = {sum_qty: 0, sum_base_price: 0, sum_disc_price: 0, sum_charge: 0,
avg_qty: 0, avg_price: 0, avg_disc: 0, count_order: 0};
for (var i = 0; i < values.length; i++) {
ret.sum_qty += values[i].sum_qty;
ret.sum_base_price += values[i].sum_base_price;
ret.sum_disc_price += values[i].sum_disc_price;
ret.sum_charge += values[i].sum_charge;
ret.avg_qty += values[i].avg_qty;
ret.avg_price += values[i].avg_price;
ret.avg_disc += values[i].avg_disc;
ret.count_order += values[i].count_order;
}
return ret;
},
finalize: function(key, value) {
value.avg_qty = value.avg_qty/value.count_order;
// BUG (why avg_price/avg_disc come out wrong): both lines below divide
// value.avg_qty — which the line above just overwrote — instead of
// dividing value.avg_price and value.avg_disc by count_order.
value.avg_price = value.avg_qty/value.count_order;
value.avg_disc = value.avg_qty/value.count_order;
return value;
},
out: 'query001'
});
The answers for avg_qty, avg_price and avg_disc are incorrect. What is going on? Or should the sum and division by count be done inside the reduce function?
Here is how you do it with MapReduce:
// Map: emit one partial-aggregate document per input doc, keyed by
// (returnflag, linestatus). count:1 lets the finalizer derive the average.
var m = function () {
    emit(
        { returnflag: this.l_returnflag, linestatus: this.l_linestatus },
        { sum_base_price: this.l_extendedprice, count: 1 }
    );
};
// Reduce: fold partial aggregates together. It must return the same shape
// it receives, because MongoDB may re-reduce its own incremental outputs.
var r = function (name, values) {
    var res = { sum_base_price: 0, count: 0 };
    // FIX vs. original post: the forEach callback parameter IS the element,
    // so use v.sum_base_price (not v[i].sum_base_price), and the forEach
    // call was missing its closing ");".
    values.forEach(function (v) {
        res.sum_base_price += v.sum_base_price;
        res.count += v.count;
    });
    return res;
};
// Finalize: runs exactly once per key after all reducing, which makes it
// the safe place to do the division for the average.
var f = function (key, res) {
    res.avg = res.sum_base_price / res.count;
    return res;
};
Then, call mapReduce:
// NOTE(review): dataFinal must already be defined in the shell session
// (the "1998-12-01 minus 90 days" cutoff) before running this command.
// Filtering via the "query" option runs before map and can use indexes.
db.lineitem.mapReduce( m, r,
{ finalize : f,
out : {inline : 1},
query: {l_shipdate:{$lt:dataFinal}}
}
);
So you are not filtering in the map function, you are doing it in a query before calling map (that's more efficient).
In aggregation framework it would be:
// Aggregation-framework equivalent: $match filters first (may use indexes),
// then $group computes the per-(returnflag, linestatus) aggregates.
db.lineitem.aggregate( [
{$match: {l_shipdate: {$lt: dataFinal}}},
// FIX vs. original post: the $match stage was missing its closing "}".
{$group: { _id: {returnflag: "$l_returnflag", linestatus: "$l_linestatus"},
// FIX: the document field is "l_extendedprice" (see the sample
// document) — "$l_extended_price" would silently aggregate nothing.
sum_base_price: {$sum: "$l_extendedprice"},
avg_base_price: {$avg: "$l_extendedprice"},
count: {$sum: 1}
}
}
])
Add your other fields as needed...
Related
I have databases where I have:
{
"_id" : ObjectId("5e750506cb670d8afe8c94da"),
"height" : "166.27",
"weight" : "68.62",
"nationality" : "Afghanistan"
(more fields)
},
{
"_id" : ObjectId("5e750506cb670d8afe8c99d5"),
"height" : "162.44",
"weight" : "71.8",
"nationality" : "Afghanistan"
(more fields)
}
...
I created firstly some aggregation:
// Pipeline: $project derives each person's BMI ($toDouble because height and
// weight are stored as strings; /100 converts cm to metres), $group then
// aggregates per nationality, and $sort orders by country name.
db.getCollection('people').aggregate([{
$project:
{
"nationality": "$nationality",
"weight": "$weight",
"height": "$height",
// bmi = weight_kg / (height_m ^ 2)
"bmi": {$divide: [{"$toDouble": "$weight"}, {$pow: [{$divide: [{"$toDouble": "$height"}, 100]},2]}]}
}
},
{
$group:
{
_id: "$nationality",
"weight": {$sum:{"$toDouble": "$weight"}},
"height": {$sum:{"$toDouble": "$height"}},
"avgbmi": {$avg: "$bmi"},
"sumbmi": {$sum: "$bmi"},
"minbmi": {$min: "$bmi"},
"maxbmi": {$max: "$bmi"}
}
}, {
$sort: {_id: 1}
}])
and the result was:
{
"_id" : "Afghanistan",
"weight" : 2729.14,
"height" : 6587.53,
"avgbmi" : 24.8445707030224,
"sumbmi" : 968.938257417875,
"minbmi" : 17.0990239681871,
"maxbmi" : 31.6807242778146
}
...
Currently I am trying to rewrite it to map-reduce. But here I have a problem with finding the min and max. So far I have:
// Map: emit one partial aggregate per person, keyed by nationality.
var mapFunction2 = function() {
    var key = this.nationality;
    // height/weight are stored as strings, so convert before arithmetic;
    // /100 turns cm into metres: bmi = weight_kg / (height_m ^ 2).
    var heightM = parseFloat(this.height) / 100;
    var bmi = parseFloat(this.weight) / (heightM * heightM);
    var value = {
        count: 1,
        weight: this.weight,
        height: this.height,
        sumbmi: bmi,
        // FIX vs. original post: also emit the bmi as minbmi/maxbmi so the
        // reducer has something to fold min/max over. This additionally
        // covers keys with a single document, for which reduce never runs.
        minbmi: bmi,
        maxbmi: bmi
    };
    emit(key, value);
};
// Reduce: fold partial aggregates for one nationality. Must be re-reduce
// safe: an element of `values` can be a raw mapped value or the output of
// a previous reduce pass.
var reduceFunction2 = function(key, values) {
    // FIX vs. original post: `reducedVal` was an implicit global (missing
    // var), and minbmi/maxbmi were initialized to 0 and never updated —
    // which is why the min/max results looked strange.
    var reducedVal = { count: 0, weight: 0, height: 0, sumbmi: 0,
                       minbmi: Infinity, maxbmi: -Infinity };
    for (var idx = 0; idx < values.length; idx++) {
        var v = values[idx];
        reducedVal.count += v.count;
        reducedVal.weight += parseFloat(v.weight);
        reducedVal.height += parseFloat(v.height);
        reducedVal.sumbmi += v.sumbmi;
        // Values produced by the original mapper carry no minbmi/maxbmi;
        // for those, the single document's bmi is in sumbmi.
        var lo = (v.minbmi !== undefined) ? v.minbmi : v.sumbmi;
        var hi = (v.maxbmi !== undefined) ? v.maxbmi : v.sumbmi;
        if (lo < reducedVal.minbmi) reducedVal.minbmi = lo;
        if (hi > reducedVal.maxbmi) reducedVal.maxbmi = hi;
    }
    return reducedVal;
};
var finalizeFunction2 = function (key, reducedVal) {
    // The average is only meaningful once every partial sum has been folded
    // together, so it is derived here rather than inside the reducer.
    var totalBmi = parseFloat(reducedVal.sumbmi);
    reducedVal.avgbmi = totalBmi / reducedVal.count;
    return reducedVal;
};
// out:{inline:1} returns results directly instead of writing a collection.
db.getCollection('people').mapReduce( mapFunction2, reduceFunction2, {finalize:finalizeFunction2, out:{inline:1}}).results
I have a problem here because my min and max are strange (min was around 20 but max was 940, and the array in reduce has length == 3).
Can you help me? Thank you in advance.
the document in my mongo collection like this:
{
"_id" : ObjectId("568f7e67676b4ddf133999e8"),
"auth_dic" : {
"2406" : [
"44735"
],
"6410" : [
"223423"
]
...
...
},
"user_id" : "fdasd23423"
}
this is one user; there are many ids like '2406' and '6410' in 'auth_dic'
I want to use the map-reduce method to compute how many users have '2406', how many users have '6410', and so on for the other ids.
my mapper is:
# NOTE(review): this mapper is broken, as the question states — auth_dic is
# an embedded document (a plain object), and objects have no forEach method;
# forEach only exists on arrays. Iterate its keys with a for-in loop instead
# (see the answer below).
mapper = Code('''
function(){
this.auth_dic.forEach(function(app_id){
emit(app_id, 1);
});
}
''')
my reducer is:
# Sums the emitted 1's per key. Because it returns a plain number — the same
# shape it receives — it stays correct even when MongoDB re-reduces its own
# partial outputs.
reducer = Code('''
function(key, values){
var total = 0;
for (var i = 0; i < values.length; i++) {
total += values[i];
}
return total;
}
''')
but the mapper does not work, because the function 'forEach' cannot be used to iterate over a dict. What can I do to solve this problem?
Run the following MapReduce operation:
mr = db.runCommand({
    // Count, for each dynamic key of auth_dic, how many documents carry it.
    "mapreduce" : "collectionName",
    "map" : function() {
        var dic = this.auth_dic;
        Object.keys(dic).forEach(function(appId) {
            emit(appId, 1);
        });
    },
    "reduce" : function(key, val) { return Array.sum(val); },
    "out": "collectionName" + "_keys"
})
Then query the resulting collection so as to find all the counts for the dynamic keys:
// The output collection's name comes back in mr.result; list all key counts.
db[mr.result].find()
I need to do the weighted average.
Did the coding as below
// Weighted average per nkey: sum(price*qty) / sum(qty).
db.runCommand(
{ mapreduce : "<collecton>" ,
map: function ()
{
emit ({nkey: this.nkey}, {price: this.tags["31"], qty: this.tags["32"]});
},
// NOTE(review): reduce only runs for keys with two or more emitted values.
// A key emitted exactly once skips reduce entirely, so its output keeps the
// raw {price, qty} shape (see the third row of the result below) — that is
// the behavior this question is about.
reduce: function(key, vals)
{
var ret = {wavg:0};
var mul1 = 0.0;
var sum1 = 0.0;
for (var i=0;i<vals.length;i++)
{ mul1 += vals[i].price * vals[i].qty;
sum1 += vals[i].qty;
}
ret.wavg = mul1/sum1;
return ret;
},
out: 'res2', verbose: true
}
);
> db.res2.find()
{ "_id" : { "nkey" : "key1" }, "value" : { "wavg" : 311.7647058823529 } }
{ "_id" : { "nkey" : "ke2" }, "value" : { "wavg" : 585.7142857142857 } }
{ "_id" : { "nkey" : "key3" }, "value" : { "price" : 1000, "qty" : 110 } }
{ "_id" : { "nkey" : "key4" }, "value" : { "wavg" : 825 } }
If you notice, in the final reducer output (third row), it didn't actually go through the reduce functionality. The key occurs only once, hence a single result was emitted. But I still want the reduce function to act on it to get the weighted average. I can't just go ahead with price and qty wherever I have only one occurrence of the key, since I need the weighted average for that key as well.
Is there any way to achieve this ?
This is essentially how mapReduce works in that the reducer is never called when you only have one result. But you can always alter such results with a finalize stage:
db.runCommand({
  "mapreduce" : "<collecton>" ,
  "map": function () {
    emit (
      { "nkey": this.nkey},
      { "price": this.tags["31"], qty: this.tags["32"]}
    );
  },
  // Reduce only runs for keys with 2+ emitted values; single-value keys
  // flow straight through to finalize carrying the raw mapped document.
  "reduce": function(key, vals) {
    var ret = { "wavg": 0 };
    var mul1 = 0.0;
    var sum1 = 0.0;
    for ( var i=0; i<vals.length; i++ ) {
      mul1 += vals[i].price * vals[i].qty;
      sum1 += vals[i].qty;
    }
    ret.wavg = mul1/sum1;
    return ret;
  },
  // Finalize sees every key exactly once, so it can normalize never-reduced
  // values into the same { wavg } shape (a single item's weighted average
  // is just its price).
  "finalize": function(key,value) {
    // FIX vs. original post: the hasOwnProperty call was missing its
    // closing parenthesis, which made the whole command a syntax error.
    if (value.hasOwnProperty("price")) {
      value.wavg = value.price;
      delete value["price"];
    }
    return value;
  },
  "out": 'res2',
  "verbose": true
});
Or otherwise alternately just sum your keys in the reduce stage and do all the division in the finalize stage if that suits you thinking better. But then you would need to do your "multiplication" part in the "mapper" for that to work.
I know I'm missing something with MapReduce in MongoDB. I'm trying to build a tag-frequency collection and I'm getting different results, even if it seems that map and reduce functions are the "same".
Example document (forget values 100, 45... I'm not using them):
{
...
tags: [['Rock', 100], ['Indie Pop', 45], ...]
}
Emitting a scalar value 1:
// First approach: emit the scalar 1 for every tag occurrence.
var map = function () {
if (this.tags) {
this.tags.forEach(function (tag) {
emit(tag[0], 1); // Emit just 1
});
}
};
// NOTE(review): this reducer is NOT re-reduce safe. MongoDB may call it
// again on its own partial outputs, at which point vals mixes raw 1's with
// previously-returned counts, and vals.length undercounts (this is exactly
// what the accepted explanation below demonstrates).
var reduce = function (key, vals) { // Vals should be [1, ...]
return vals.length; // Count the length of the array
};
db.tracks.mapReduce(map, reduce, { out: 'mapreduce_out' });
db.mapreduce_out.find().sort({ value: -1 }).limit(3);
Output is:
{ "_id" : "rubyrigby1", "value" : 9 }
{ "_id" : "Dom", "value" : 7 }
{ "_id" : "Feel Better", "value" : 7 }
Emitting an object { count: 1 }:
// Second approach: emit { count: 1 } and sum the counts. This one is
// re-reduce safe: the reducer returns the same { count } shape it receives,
// so partial outputs can be fed back through reduce without losing totals.
var map = function () {
if (this.tags) {
this.tags.forEach(function (tag) {
emit(tag[0], { count: 1 }); // Emit an object
});
}
};
var reduce = function (key, vals) { // vals should be [{ count: 1 }, ...]
var count = 0;
vals.forEach(function (val) {
count += val.count; // Accumulate the partial counts
});
return { count: count };
};
db.tracks.mapReduce(map, reduce, { out: 'mapreduce_out' });
db.mapreduce_out.find().sort({ 'value.count': -1 }).limit(3);
Result is different and appears to be "right":
{ "_id" : "rock", "value" : { "count" : 9472 } }
{ "_id" : "pop", "value" : { "count" : 7103 } }
{ "_id" : "electronic", "value" : { "count" : 5727 } }
What's wrong with the first approach?
Consider a collection of a thousand documents all with the tag 'tagname':
// Seed data: 1000 documents that all carry the single tag 'tagname'.
for (var i = 0; i < 1000; i++) {
db.collection.insert({tags: [['tagname']]});
}
If I write a proper mapReduce I should get the output {"_id": "tagname", "count": 1000}. But if I use your map and reduce functions I'll get a count of 101 instead of 1000.
The reason is, MongoDB calls your reduce function repeatedly with intermediate results, in order to avoid keeping too large a batch of results in memory. You can actually see this by putting a print statement in your reduce:
// Instrumented reducer: print() (a mongo shell helper) writes the
// intermediate vals arrays to the server log, exposing how MongoDB calls
// reduce repeatedly on partial batches.
var reduce = function (key, vals) {
print(vals);
return vals.length; // Count the length of the array
};
The print output appears in the server log. The reduce function is called with the first 100 1's, and it returns 100. So far so good. Then MongoDB calls it again with the first reduce's output plus the next 100 1's:
reduce([100, 1, 1, ..., 1]) // 100 plus 100 more 1's
So now it returns 101, because that's the length of the array. But clearly it should return 200, the sum of the array. So to get a correct result, change your reduce function:
// Re-reduce-safe reducer: SUM the values instead of counting them. An
// element of vals may itself be the already-summed output of a previous
// reduce pass, not just a raw 1 from the mapper, so summing is correct
// where vals.length is not.
// FIX vs. original post: declare with var instead of assigning to an
// implicit global (which also throws in strict mode).
var reduce = function (key, vals) {
    var sum = 0;
    vals.forEach(function(val) { sum += val; });
    return sum;
};
I have a Collection with many columns: col1,col2, WebsiteCode, CreatedDate, col3....,coln.
I want to group by WebsiteCode in a range of CreatedDate.
So I do:
map :
function Map() {
// Group solely by WebsiteCode; the CreatedDate range is applied through the
// mapReduce "query" option, not inside the mapper.
var key={WebsiteCode:this.WebsiteCode};
// NOTE(review): `val` is assigned without var (implicit global), and the
// emitted shape ({Count}) must match whatever Reduce returns — see the
// answer below for why a mismatch produces NaN.
val={Count: 1};
emit(key,val);
}
reduce :
// NOTE(review): this is the bug behind Total = NaN — Reduce returns {Total}
// but the mapper emits {Count}. When MongoDB re-reduces partial results,
// those have no Count field, so `res.Total += undefined` yields NaN.
function Reduce(key, values) {
var res = {Total:0};
values.forEach(function(value) {
res.Total += value.Count;
});
return res;
}
And query range DateTime:
{ "CreatedDate" : { "$gte" : dateFrom , "$lte" : dateTo } }
Finally, I run this mapreduce command.
The result returns not what I expected with many rows having Total = NaN
Ex: {_id:{WebsiteCode:"websitecode1"}}, {value:{Total:NaN}}
But when I run count command:
db.collect.find({ "WebsiteCode" : "websitecode1", "CreatedDate" : { "$gte" : dateFrom, "$lte" : dateTo } }).count();
Result return: 927
Could you explain to me what I did wrong?
Your reduce function must return a value that has the same shape as the emitted values. So you can't use Count in the map and Total in the reduce.
Try this instead:
function Reduce(key, values) {
    // Keep the accumulator shaped exactly like the mapped values ({Count}),
    // so the output of one reduce pass can safely be reduced again.
    var acc = {Count:0};
    for (var i = 0; i < values.length; i++) {
        acc.Count += values[i].Count;
    }
    return acc;
}