Rewrite aggregation to map reduce method in mongo - mongodb

I have databases where I have:
{
"_id" : ObjectId("5e750506cb670d8afe8c94da"),
"height" : "166.27",
"weight" : "68.62",
"nationality" : "Afghanistan"
(more fields)
},
{
"_id" : ObjectId("5e750506cb670d8afe8c99d5"),
"height" : "162.44",
"weight" : "71.8",
"nationality" : "Afghanistan"
(more fields)
}
...
I created firstly some aggregation:
db.getCollection('people').aggregate([{
$project:
{
"nationality": "$nationality",
"weight": "$weight",
"height": "$height",
"bmi": {$divide: [{"$toDouble": "$weight"}, {$pow: [{$divide: [{"$toDouble": "$height"}, 100]},2]}]}
}
},
{
$group:
{
_id: "$nationality",
"weight": {$sum:{"$toDouble": "$weight"}},
"height": {$sum:{"$toDouble": "$height"}},
"avgbmi": {$avg: "$bmi"},
"sumbmi": {$sum: "$bmi"},
"minbmi": {$min: "$bmi"},
"maxbmi": {$max: "$bmi"}
}
}, {
$sort: {_id: 1}
}])
and the result was:
{
"_id" : "Afghanistan",
"weight" : 2729.14,
"height" : 6587.53,
"avgbmi" : 24.8445707030224,
"sumbmi" : 968.938257417875,
"minbmi" : 17.0990239681871,
"maxbmi" : 31.6807242778146
}
...
Currently I am trying to rewrite it to map-reduce. But here I have problem with findind a min and max. So far I have:
var mapFunction2 = function() {
var key = this.nationality
var value = {
count: 1,
weight: this.weight,
height: this.height,
sumbmi: parseFloat(this.weight)/(parseFloat(this.height)/100 * parseFloat(this.height)/100)
};
emit(key, value);
};
var reduceFunction2 = function(key, values) {
reducedVal = { count: 0, weight: 0 , height: 0, sumbmi: 0, minbmi: 0, maxbmi: 0};
for (var idx = 0; idx < values.length; idx++) {
reducedVal.count += values[idx].count;
var weight = parseFloat(values[idx].weight);
var height = parseFloat(values[idx].height);
reducedVal.weight += weight;
reducedVal.height += height;
reducedVal.sumbmi += values[idx].sumbmi;
}
return reducedVal;
};
var finalizeFunction2 = function (key, reducedVal) {
reducedVal.avgbmi = parseFloat(reducedVal.sumbmi)/reducedVal.count;
return reducedVal;
};
db.getCollection('people').mapReduce( mapFunction2, reduceFunction2, {finalize:finalizeFunction2, out:{inline:1}}).results
I have problem here because my min and max are strange...? (min was around 20 but max was 940 and the array in reduce have lenght == 3)
Can you help me? Thank you in advance.

Related

Is it possible to retrieve a 'time span' from a MongoDB query, using the timestamp within an ObjectId?

We have a basic enquiry management tool that we're using to track some website enquiries in our administration suite, and we're using the ObjectId of each document in our enquiries collection to sort the enquiries by the date they were added.
{
"_id" : ObjectId("53a007db144ff47be1000003"),
"comments" : "This is a test enquiry. Please ignore. We'll delete it shortly.",
"customer" : {
"name" : "Test Enquiry",
"email" : "test#test.com",
"telephone" : "07890123456",
"mobile" : "07890123456",
"quote" : false,
"valuation" : false
},
"site" : [],
"test" : true,
"updates" : [
{
"_id" : ObjectId("53a007db144ff47be1000001"),
"status" : "New",
"status_id" : ObjectId("537de7c3a5e6e668ffc2335c"),
"status_index" : 100,
"substatus" : "New Web Enquiry",
"substatus_id" : ObjectId("5396bb9fa5e6e668ffc23388"),
"notes" : "New enquiry received from website.",
},
{
"_id" : ObjectId("53a80c977d299cfe91bacf81"),
"status" : "New",
"status_id" : ObjectId("537de7c3a5e6e668ffc2335c"),
"status_index" : 100,
"substatus" : "Attempted Contact",
"substatus_id" : ObjectId("53a80e06a5e6e668ffc2339e"),
"notes" : "In this test, we pretend that we've not managed to get hold of the customer on the first attempt.",
},
{
"_id" : ObjectId("53a80e539b966b8da5c40c36"),
"status" : "Approved",
"status_id" : ObjectId("52e77a49d85e95f00ebf6c72"),
"status_index" : 200,
"substatus" : "Enquiry Confirmed",
"substatus_id" : ObjectId("53901f1ba5e6e668ffc23372"),
"notes" : "In this test, we pretend that we've got hold of the customer after failing to contact them on the first attempt.",
}
]
}
Within each enquiry is an updates array of objects which also have an ObjectId as their main identity field. We're using an $unwind and $group aggregation to pull the first and latest updates, as well as the count of updates, making sure we only take enquiries where there have been more than one update (as one is automatically inserted when the enquiry is made):
db.enquiries.aggregate([
{
$match: {
"test": true
}
},
{
$unwind: "$updates"
},
{
$group: {
"_id": "$_id",
"latest_update_id": {
$last: "$updates._id"
},
"first_update_id": {
$first: "$updates._id"
},
"update_count": {
$sum: 1
}
}
},
{
$match: {
"update_count": {
$gt: 1
}
}
}
])
This results in the following output:
{
"result" : [
{
"_id" : ObjectId("53a295ad122ea80200000005"),
"latest_update_id" : ObjectId("53a80bdc7d299cfe91bacf7e"),
"first_update_id" : ObjectId("53a295ad122ea80200000003"),
"update_count" : 2
},
{
"_id" : ObjectId("53a007db144ff47be1000003"),
"latest_update_id" : ObjectId("53a80e539b966b8da5c40c36"),
"first_update_id" : ObjectId("53a007db144ff47be1000001"),
"update_count" : 3
}
],
"ok" : 1
}
This is then passed through to our code (node.js, in this case) where we perform a few operations on it and then present some information on our dashboard.
Ideally, I'd like to add another $group pipeline aggregation to the query which would subtract the timestamp of first_update_id from the timestamp of latest_update_id to give us a timespan, which we could then use $avg on.
Can anyone tell me if this is possible? (Thank you!)
As Neil already pointed out, you can't get to the timestamp from the ObjectId in the aggregation framework.
You said that speed is not important, so using MapReduce you can get what you want:
var map = function() {
if (this.updates.length > 1) {
var first = this.updates[0];
var last = this.updates[this.updates.length - 1];
var diff = last._id.getTimestamp() - first._id.getTimestamp();
var val = {
latest_update_id : last._id,
first_update_id : first._id,
update_count : this.updates.length,
diff: diff
}
emit(this._id, val);
}
};
var reduce = function() { };
db.runCommand(
{
mapReduce: "enquiries",
map: map,
reduce: reduce,
out: "mrresults",
query: { test : true}
}
);
This are the results:
{
"_id" : ObjectId("53a007db144ff47be1000003"),
"value" : {
"latest_update_id" : ObjectId("53a80e539b966b8da5c40c36"),
"first_update_id" : ObjectId("53a007db144ff47be1000001"),
"update_count" : 3,
"diff" : 525944000
}
}
Edit:
If you want to get the average diff for all documents you can do it like this:
var map = function() {
if (this.updates.length > 1) {
var first = this.updates[0];
var last = this.updates[this.updates.length - 1];
var diff = last._id.getTimestamp() - first._id.getTimestamp();
emit("1", {diff : diff});
}
};
var reduce = function(key, values) {
var reducedVal = { count: 0, sum: 0 };
for (var idx = 0; idx < values.length; idx++) {
reducedVal.count += 1;
reducedVal.sum += values[idx].diff;
}
return reducedVal;
};
var finalize = function (key, reducedVal) {
reducedVal.avg = reducedVal.sum/reducedVal.count;
return reducedVal;
};
db.runCommand(
{
mapReduce: "y",
map: map,
reduce: reduce,
finalize : finalize,
out: "mrtest",
query: { test : true}
}
);
And the example output:
> db.mrtest.find().pretty()
{
"_id" : "1",
"value" : {
"count" : 2,
"sum" : 1051888000,
"avg" : 525944000
}
}

mongodb: reduce function not action when only one result is there

I need to do the weighted average.
Did the coding as below
db.runCommand(
{ mapreduce : "<collecton>" ,
map: function ()
{
emit ({nkey: this.nkey}, {price: this.tags["31"], qty: this.tags["32"]});
},
reduce: function(key, vals)
{
var ret = {wavg:0};
var mul1 = 0.0;
var sum1 = 0.0;
for (var i=0;i<vals.length;i++)
{ mul1 += vals[i].price * vals[i].qty;
sum1 += vals[i].qty;
}
ret.wavg = mul1/sum1;
return ret;
},
out: 'res2', verbose: true
}
);
> db.res2.find()
{ "_id" : { "nkey" : "key1" }, "value" : { "wavg" : 311.7647058823529 } }
{ "_id" : { "nkey" : "ke2" }, "value" : { "wavg" : 585.7142857142857 } }
{ "_id" : { "nkey" : "key3" }, "value" : { "price" : 1000, "qty" : 110 } }
{ "_id" : { "nkey" : "key4" }, "value" : { "wavg" : 825 } }
If you notice, in the final reducer output(third row), it dint actually go thru the reduce functionality. The key occur only once, hence one result will be emitted. But I still want the reduce function to be acting on that to get the weighted average. I can't just go ahead with price and qty wherever I have only one occurence of the key, where I need weighted average for that key also.
Is there any way to achieve this ?
This is essentially how mapReduce works in that the reducer is never called when you only have one result. But you can always alter such results with a finalize stage:
db.runCommand({
"mapreduce" : "<collecton>" ,
"map": function () {
emit (
{ "nkey": this.nkey},
{ "price": this.tags["31"], qty: this.tags["32"]}
);
},
"reduce": function(key, vals) {
var ret = { "wavg": 0 };
var mul1 = 0.0;
var sum1 = 0.0;
for ( var i=0; i<vals.length; i++ ) {
mul1 += vals[i].price * vals[i].qty;
sum1 += vals[i].qty;
}
ret.wavg = mul1/sum1;
return ret;
},
"finalize": function(key,value) {
if (value.hasOwnProperty("price") {
value.wavg = value.price;
delete value["price"];
}
return value;
},
"out": 'res2',
"verbose": true
});
Or otherwise alternately just sum your keys in the reduce stage and do all the division in the finalize stage if that suits you thinking better. But then you would need to do your "multiplication" part in the "mapper" for that to work.

MongoDB count number of new documents per minute based on _id

I want to create a statistic on how many new documents are stored each minute.
Since the _id field with standard ObjectID contains already the timestamp of the document creation I think it should be possible to somehow use it.
On Stackoverflow i found the following map reduce code to get it done when there is a dedicated field for the creation data
Map-Reduce count number of documents in each minute MongoDB
map = function() {
var created_at_minute = new Date(this.created_at.getFullYear(),
this.created_at.getMonth(),
this.created_at.getDate(),
this.created_at.getHours(),
this.created_at.getMinutes());
emit(created_at_minute, {count: 1});
}
reduce = function(key, values) {
var total = 0;
for(var i = 0; i < values.length; i++) { total += values[i].count; }
return {count: total};
}
According to the Mongo DB Documentation (http://docs.mongodb.org/manual/reference/object-id/) it should be possible to get the timestamp from the _id by calling ObjectId("507f191e810c19729de860ea").getTimestamp().
Right now I have no idea if it is possible at all to use this getTimestamp() inside of the map function.
Has anybody an idea how to do it or is there a better way ?
I need it to be implementable in python or php
You can do this with M/R indeed. getTimestamp() works in M/R as it runs in JavaScript on the server, it doesn't matter whether your client language is PHP or Python:
map = function() {
var datetime = this._id.getTimestamp();
var created_at_minute = new Date(datetime.getFullYear(),
datetime.getMonth(),
datetime.getDate(),
datetime.getHours(),
datetime.getMinutes());
emit(created_at_minute, {count: 1});
}
reduce = function(key, values) {
var total = 0;
for(var i = 0; i < values.length; i++) { total += values[i].count; }
return {count: total};
}
db.so.mapReduce( map, reduce, { out: 'inline' } );
db.inline.find();
Which outputs something like:
{ "_id" : ISODate("2013-08-05T15:24:00Z"), "value" : { "count" : 9 } }
{ "_id" : ISODate("2013-08-05T15:26:00Z"), "value" : { "count" : 2 } }
However, I would suggest you don't use M/R but instead turn to the aggregation framework as it's much faster because can use indexes and run concurrently. Right now, the A/F does not have an operator to get the timestamp out of an ObjectID field yet though so you will have to store the time at the moment of insertion as well. F.e. with documents like this:
db.so.drop();
db.so.insert( { date: new ISODate( "2013-08-05T15:24:15" ) } );
db.so.insert( { date: new ISODate( "2013-08-05T15:24:19" ) } );
db.so.insert( { date: new ISODate( "2013-08-05T15:24:25" ) } );
db.so.insert( { date: new ISODate( "2013-08-05T15:24:32" ) } );
db.so.insert( { date: new ISODate( "2013-08-05T15:24:45" ) } );
db.so.insert( { date: new ISODate( "2013-08-05T15:25:15" ) } );
db.so.insert( { date: new ISODate( "2013-08-05T15:25:15" ) } );
db.so.aggregate( [
{ $group: {
_id: {
y: { '$year': '$date' },
m: { '$month': '$date' },
d: { '$dayOfMonth': '$date' },
h: { '$hour': '$date' },
i: { '$minute': '$date' },
},
count: { $sum : 1 }
} }
] );
Which outputs:
{
"result" : [
{
"_id" : {
"y" : 2013,
"m" : 8,
"d" : 5,
"h" : 15,
"i" : 25
},
"count" : 2
},
{
"_id" : {
"y" : 2013,
"m" : 8,
"d" : 5,
"h" : 15,
"i" : 24
},
"count" : 5
}
],
"ok" : 1
}

SQL to MapReduce: how sql avg function could be done in MongoDB?

How could write the sql avg function in MapReduce MongoDB?
I've tried the following, sum the values and divide for a count. But the problem is where i do it, in reduce function or finalize function?
For example:
i have the following document
{
"_id" : ObjectId("511b7d1b3daee1b1446ecdfe"),
"l_linenumber" : 1,
"l_quantity" : 17,
"l_extendedprice" : 21168.23,
"l_discount" : 0.04,
"l_tax" : 0.02,
"l_returnflag" : "N",
"l_linestatus" : "O",
"l_shipdate" : ISODate("1996-03-13T03:00:00Z"),
"l_commitdate" : ISODate("1996-02-12T03:00:00Z"),
"l_receiptdate" : ISODate("1996-03-22T03:00:00Z"),
"l_shipinstruct" : "DELIVER IN PERSON",
"l_shipmode" : "TRUCK",
"l_comment" : "blithely regular ideas caj",
}
And the SQL query is:
select
l_returnflag,
l_linestatus,
sum(l_quantity) as sum_qty,
sum(l_extendedprice) as sum_base_price,
sum(l_extendedprice*(1-l_discount)) as sum_disc_price,
sum(l_extendedprice*(1-l_discount)*(1+l_tax)) as sum_charge,
avg(l_quantity) as avg_qty,
avg(l_extendedprice) as avg_price,
avg(l_discount) as avg_disc,
count(*) as count_order
from
lineitem
where
l_shipdate <= DATE_SUB('1998-12-01',INTERVAL 90 DAY)
group by
l_returnflag,
l_linestatus
order by
l_returnflag,
l_linestatus;
I done this mapreduce function:
db.runCommand({
mapreduce: "lineitem",
map: function Map() {
var dataInicial = new Date("Dec 1, 1998");
var dataFinal = new Date();
dataFinal.setDate(dataInicial.getDate()-90);
if( this.l_shipdate<=dataFinal) {
emit(
{returnflag: this.l_returnflag, linestatus: this.l_linestatus},
{
sum_qty: this.l_quantity,
sum_base_price: this.l_extendedprice,
sum_disc_price: this.l_extendedprice*(1-this.l_discount),
sum_charge: this.l_extendedprice*(1-this.l_discount)*(1+this.l_tax),
avg_qty: this.l_quantity,
avg_price: this.l_extendedprice,
avg_disc: this.l_discount,
count_order: 1
}
);
}
},
reduce: function(key, values) {
var ret = {sum_qty: 0, sum_base_price: 0, sum_disc_price: 0, sum_charge: 0,
avg_qty: 0, avg_price: 0, avg_disc: 0, count_order: 0};
for (var i = 0; i < values.length; i++) {
ret.sum_qty += values[i].sum_qty;
ret.sum_base_price += values[i].sum_base_price;
ret.sum_disc_price += values[i].sum_disc_price;
ret.sum_charge += values[i].sum_charge;
ret.avg_qty += values[i].avg_qty;
ret.avg_price += values[i].avg_price;
ret.avg_disc += values[i].avg_disc;
ret.count_order += values[i].count_order;
}
return ret;
},
finalize: function(key, value) {
value.avg_qty = value.avg_qty/value.count_order;
value.avg_price = value.avg_qty/value.count_order;
value.avg_disc = value.avg_qty/value.count_order;
return value;
},
out: 'query001'
});
The answer for avg_qty, avg_price, avg_disc are incorrect. Whats is going on? Or the sum and divide by count would be done inside reduce function?
Here is how you do it with MapReduce:
m = function (){
emit( {returnflag: this.l_returnflag, linestatus: this.l_linestatus} ,
{ sum_base_price: this.l_extendedprice, count : 1 } );
};
r = function (name, values){
var res = {sum_base_price : 0, count : 0};
values.forEach (function (v) {
res.sum_base_price += v[i].sum_base_price;
res.count += v[i].count;
}
return res;
};
f = function(key, res){
res.avg = res.sum_base_price / res.count;
return res;
};
Then, call mapReduce:
db.lineitem.mapReduce( m, r,
{ finalize : f,
out : {inline : 1},
query: {l_shipdate:{$lt:dataFinal}}
}
);
So you are not filtering in the map function, you are doing it in a query before calling map (that's more efficient).
In aggregation framework it would be:
db.lineitem.aggregate( [
{$match: {l_shipdate:{$lt:dataFinal}},
{$group: { _id: {returnflag: "$l_returnflag", linestatus: "$l_linestatus"},
sum_base_price: {$sum:"$l_extended_price"},
avg_base_price: {$avg:"$l_extended_price"},
count: {$sum: 1}
}
}
])
Add your other fields as needed...

MongoDb aggregation or mapreduce for invoicing statistics?

I'm new to MongoDb and have a job for (I suppose) MapReduce or Aggregation.
I have an "invoices" collection with documents in this format:
{
date: 'some unix timestamp',
total: 12345,
paid: true
}
I need to display a table with months (jan-dec) as columns, a row for each year and the sum of total in the month (divided in paid and unpaid) in the cell. Like this:
| Jan | Feb | ...
2013 | 1,222 / 200 | 175 / 2,122 | ...
...
Can you help me get the mongo command right?
Maybe I'm better off writing some JS code to execute in mongo?
I've now found a solution using MapReduce. Here it is in use from PHP:
$map = new MongoCode('
function() {
var d = new Date(this.date*1000);
emit({y: d.getFullYear(), m: d.getMonth()}, {
total: this.total,
notPaid: this.paid ? 0 : this.total,
count: 1
});
};
');
$reduce = new MongoCode('
function(month, values) {
result = { total: 0, notPaid: 0, count: 0 };
for (var i = 0; i < values.length; i++) {
result.total += values[i].total;
result.notPaid += values[i].notPaid;
result.count += values[i].count;
}
return result;
};
');
$result = $db->command(array(
'mapreduce' => 'invoices',
'map' => $map,
'reduce' => $reduce,
'out' => 'temp'
));
echo $result['timeMillis'];
Now the results are in the "temp" collection, one document per month. Could it be optimized or enhanced?
You can do this with aggregation framework like this:
db.invoices.aggregate( [
{
"$project" : {
"yr" : {
"$year" : "$date"
},
"mo" : {
"$month" : "$date"
},
"total" : 1,
"unpaid" : {
"$cond" : [
"$paid",
0,
"$total"
]
}
}
},
{
"$group" : {
"_id" : {
"y" : "$yr",
"m" : "$mo"
},
"total" : {
"$sum" : "$total"
},
"unpaid" : {
"$sum" : "$unpaid"
}
}
}
] )
You can use another $project at the end to pretty-up the output, and a $sort to order it, but that's the basic functioning core of it.