SQL to MapReduce: counting unique key in a many to one relationship? - mongodb

Initially, I have a relationship where an order has many line items and each line item belongs to exactly one order, as usual.
Using mongoDB, I did this document to represent it:
{
"_id" : ObjectId("511b7d1b3daee1b1446ecdfe"),
"l_order" : {
"_id" : ObjectId("511b7d133daee1b1446eb54d"),
"o_orderkey" : NumberLong(1),
"o_totalprice" : 173665.47,
"o_orderdate" : ISODate("1996-01-02T03:00:00Z"),
"o_orderpriority" : "5-LOW",
"o_shippriority" : 0,
},
"l_linenumber" : 1,
"l_shipdate" : ISODate("1996-03-13T03:00:00Z"),
"l_commitdate" : ISODate("1996-02-12T03:00:00Z"),
"l_receiptdate" : ISODate("1996-03-22T03:00:00Z"),
}
My intention is translate this sql query:
select
o_orderpriority,
count(*) as order_count
from
orders
where
o_orderdate >= date '1993-07-01'
and o_orderdate < date '1993-07-01' + interval '3' month
and exists (
select
*
from
lineitem
where
l_orderkey = o_orderkey
and l_commitdate < l_receiptdate
)
group by
o_orderpriority
order by
o_orderpriority;
For this I use two mapReduce functions:
First
db.runCommand({
mapreduce: "lineitem",
query: {
"l_order.o_orderdate": {'$gte': new Date("July 01, 1993"), '$lt': new Date("Oct 01, 1993")}
},
map: function Map() {
if(this.l_commitdate < this.l_receiptdate){
emit( this.l_order.o_orderkey, this.l_order.o_orderpriority );
}
},
out: 'query004a'
});
Second
db.runCommand({
mapreduce: "query004a",
map: function Map() {
/*Remenbering, the value here will be this.l_order.o_orderpriority from the previous mapreduce function*/
emit( this.value, 1 );
},
reduce: function(key, values) {
return Array.sum(values);
},
out: 'query004b'
});
In the first, I select the documents that fall within the date range and satisfy the comparison, grouping them by order key to avoid duplicates. In the second, I group by o_orderpriority and sum.
To my surprise, the result was bigger than I was expecting. Why, and where, does this happen?

In your first map function you should use 'orderpriority' as the key and 'orderkey' as the value - this will reduce the set to the key you want in the second mapReduce. (You need to specify a reduce function, otherwise mapReduce will return an error.)
So, this could look like this:
// Two-step mapReduce: step 1 filters line items and writes 'query004a',
// step 2 counts the documents of 'query004a' per order priority into 'query004b'.

// Date window for the first "where" (on o_orderdate).
var OrderDateMin = new Date("1996-01-01");
var OrderDateMax = new Date("1996-04-01");

// first "where": restrict on orderdate
var query = {
    "l_order.o_orderdate": { $gte: OrderDateMin, $lt: OrderDateMax }
};

// Step 1 map: keep only line items committed before they were received.
var map1 = function() {
    // second "where" on commitdate < receiptdate
    if ( this.l_commitdate < this.l_receiptdate ) {
        // emit orderpriority as key, orderkey as value
        emit( this.l_order.o_orderpriority, this.l_order.o_orderkey );
    }
};

// Step 1 reduce: collapse every value emitted for a priority to the constant 1.
// NOTE(review): this leaves a single document per priority in 'query004a', so
// step 2 emits once per priority - confirm that this yields the per-priority
// order count you expect when a priority has more than one matching order.
var reduce1 = function(key, values) {
    return 1;
};

db.runCommand({
    mapReduce: "xx",
    query: query,
    map: map1,
    reduce: reduce1,
    out: 'query004a',
});

// Step 2 map: _id of each 'query004a' document is the orderpriority.
var map2 = function() {
    emit( this._id, 1 );
};

// Step 2 reduce: count entries per orderpriority.
var reduce2 = function(key, values) {
    // declared locally so the counter does not leak into the global scope
    var count = 0;
    values.forEach( function(value) { count += value; } );
    return count;
};

db.runCommand({
    mapReduce: "query004a",
    map: map2,
    reduce: reduce2,
    out: 'query004b',
});
Now, the same can be achieved with one aggregate command, which is faster (implemented in C, not in JavaScript):
// Single aggregation equivalent: filter by order date, keep line items with
// l_commitdate < l_receiptdate, de-duplicate per (priority, order key), then
// count the distinct orders per priority.
db.xx.aggregate([
    // first "where", this will use an index, if defined
    { $match: {
        "l_order.o_orderdate": { $gte: OrderDateMin, $lt: OrderDateMax }
    }},
    // reduce to needed fields, create a flag for the decision of the second "where"
    { $project: {
        "key": "$l_order.o_orderkey",
        "pri": "$l_order.o_orderpriority",
        // field paths need the "$" prefix; without it the two names are
        // compared as plain string literals and the condition is always true
        okay: { $cond: [ { $lt: [ "$l_commitdate", "$l_receiptdate" ] }, 1, 0 ] }
    }},
    // select documents where the second "where" condition matched
    { $match: { "okay": 1 } },
    // group by priority and key (one document per distinct order)
    { $group: { _id: { "pri": "$pri", "key": "$key" } } },
    // group by priority - count entries
    { $group: { _id: "$_id.pri", "count": { $sum: 1 } } },
])
which would return something like:
{ "result" : [ { "_id" : "5-LOW", "count" : 1 } ], "ok" : 1 }
Last, but not least: a suggestion regarding design:
It would be simpler if your structure was the other way round: an "orders" collection with the order-items embedded as an array of items. This would avoid having duplicate order data throughout the collection.
Further info:
http://docs.mongodb.org/manual/reference/command/mapReduce/#mapReduce
http://docs.mongodb.org/manual/reference/aggregation
Does this help?
Cheers
Ronald

Related

How to calculate simple moving average using mongodb mapreduce?

I have time series data as the following format in a mongodb collection:
{
"Name" : "AKBNK",
"Date" : ISODate("2009-01-02T00:00:00Z"),
"Close" : 3.256746559,
}
I want to calculate simple moving average using mongodb mapreduce. I tried it as the following to perform window sliding, but it works slowly when the period is big.
var mapper = function() {
var i = 0, j = counter;
if (j < period) {
j = period;
i = period - counter;
}
for (; i < period && j <= limit; i++, j++) {
emit (j, this.Close);
}
counter++;
}
var reducer = function(key, values) {
return Array.sum(values);
}
var finalizer = function(key, reducedValue) {
return reducedValue / period;
}
var period = 730;
db.data.mapReduce(mapper, reducer, {finalize: finalizer, out: "smaOut", query: {Name: "AKBNK"}, sort: {Date: -1}, limit: period * 2 - 1, scope: {counter: 1, period: period, limit: period * 2 - 1}});
Any advice how can I do this faster? How can I map the data?
You could try using the below aggregation pipeline which seems to produce the correct results at a quick glance but at a much higher speed:
// Simple moving average via the aggregation pipeline: all "Close" values for one
// name are pushed into a single array, the array is unwound with its index, and
// for every index a window of up to 730 values starting there is averaged.
// The window size 730 is hard-coded in the $slice below (it equals the
// "period" used in the question's mapReduce attempt).
db.data.aggregate({
$match: {
"Name": "AKBNK" // this stage will use an index if you have one on the "Name" field
}
}, {
$sort: { "Date": -1 }, // this stage will also use an index if you have one on "Date"
}, {
$group: {
"_id": null, // create one single document
"allCloseValues": { $push: "$Close" } // that shall contain an array with all "Close" values
}
}, {
$addFields: {
"copyOfAllCloseValues": "$allCloseValues" // duplicate the array
}
}, {
$unwind: {
"path": "$copyOfAllCloseValues", // flatten the created single document
"includeArrayIndex": "_id" // use the "_id" field to hold the array index
}
}, {
$project: {
avg: {
$avg: { // calculate the average of some part of the array "Close"
$slice: [ "$allCloseValues", "$_id", 730 ] // which shall start at index "_id" and take up to 730 values
}
}
}
}, {
$out: "smaOut" // write the resulting documents out to the "smaOut" collection
});

Write MapReduce function to count number of posts created by various users in MongoDB

I had created a collection "posts" with title, description, by, comments in MongoDB as:
db.posts.insert({
title:'MongoDB',
description:'MongoDB is a NoSQL DB',
by:'Tom',
comments:[
{user:'ram',
message:'We use MongoDB'
}
]
}
)
Similarly, I added other two entries.
Now, I want to write MapReduce function to count number of posts created by various users in MongoDB. I used:
db.posts.mapReduce(
function() { emit(this.user_id,1); },
function(key, values) {return Array.sum(values)}, {
out:"post_total"
}
).find()
This Output:
{"id": null , "value": 3}
But, what I want to display is something like:
{ "_id" : "tom_id", "value" : 2 }
{ "_id" : "mark_id", "value" : 1 }
or
{ "by" : "tom", "value" : 2 }
{ "by" : "mark", "value" : 1 }
Finally, I solve it. I got some idea from MapReduce function in MongoDB - Grouping document by ID
db.posts.mapReduce(
function() { emit(this.by,1); },
function(key, values) {return Array.sum(values)}, {
out:"post_total"
}
).find()
What I doing wrong was, emit(this.user_id,1). I was passing the wrong key.
Re-write your mapReduce method to:
// Count posts per author and return the result inline (no output collection).
db.posts.mapReduce(
    function() {
        // key on the author field "by"; the sample documents have no
        // "user_id" field, so emitting it would group everything under null
        emit(this.by, 1);
    },
    function(key, values) {
        return Array.sum(values);
    },
    { "out": { "inline": 1 } }
)
or use the runCommand command to run a mapReduce operation with an output collection option:
// Count posts per author and store the result in the "post_total" collection.
// The command result is kept in `mr` so its `result` field can be used to
// look up the output collection afterwards.
var mr = db.runCommand({
    "mapreduce": "posts",
    "map": function() {
        // emit exactly once per post; looping over the document's fields
        // would emit once per field and inflate every count
        emit(this.by, 1);
    },
    "reduce": function(key, values) { return Array.sum(values); },
    "out": "post_total"
})
To get the results, run find() on the resulting collection:
db[mr.result].find()
Equivalent results yet much more efficient operation using the aggregation framework:
// Count posts per author with a single $group stage.
db.posts.aggregate([
    {
        "$group": {
            // group on the author field "by" (the sample documents have no "user_id")
            "_id": "$by",
            "count": { "$sum": 1 }
        }
    }
])

MongoDB: Operator $in and results order [duplicate]

When using MongoDB's $in clause, does the order of the returned documents always correspond to the order of the array argument?
As noted, the order of the arguments in the array of an $in clause does not reflect the order of how the documents are retrieved. That of course will be the natural order or by the selected index order as shown.
If you need to preserve this order, then you basically have two options.
So let's say that you were matching on the values of _id in your documents with an array that is going to be passed in to the $in as [ 4, 2, 8 ].
Approach using Aggregate
// Expanded (hand-written) form: each _id is given a "weight" equal to its
// 1-based position in the list, and the results are sorted by that weight.
var list = [ 4, 2, 8 ];
db.collection.aggregate([
    // Match the selected documents by "_id"
    { "$match": {
        "_id": { "$in": [ 4, 2, 8 ] },
    }},
    // Project a "weight" to each document
    { "$project": {
        "weight": { "$cond": [
            { "$eq": [ "$_id", 4 ] },
            1,
            { "$cond": [
                { "$eq": [ "$_id", 2 ] },
                2,
                3
            ]}
        ]}
    }},
    // Sort the results
    { "$sort": { "weight": 1 } }
])
So that would be the expanded form. What basically happens here is that just as the array of values is passed to $in you also construct a "nested" $cond statement to test the values and assign an appropriate weight. As that "weight" value reflects the order of the elements in the array, you can then pass that value to a sort stage in order to get your results in the required order.
Of course you actually "build" the pipeline statement in code, much like this:
// Build the nested "$cond" weight expression from the list instead of writing
// it by hand. The list is walked from its end towards its start so that each
// new condition wraps the one built before it.
var list = [ 4, 2, 8 ];
var stack = [];
var position = list.length - 1;
while (position > 0) {
    // innermost condition falls back to the last position; every outer
    // condition falls back to the expression built so far
    var fallback = (stack.length === 0) ? position + 1 : stack.pop();
    stack.push({
        "$cond": [
            { "$eq": [ "$_id", list[position - 1] ] },
            position,
            fallback
        ]
    });
    position--;
}

var pipeline = [
    { "$match": { "_id": { "$in": list } }},
    { "$project": { "weight": stack[0] }},
    { "$sort": { "weight": 1 } }
];
db.collection.aggregate( pipeline );
Approach using mapReduce
Of course, if that all seems too hefty for your sensibilities, then you can do the same thing using mapReduce, which looks simpler but will likely run somewhat slower.
var list = [ 4, 2, 8 ];
db.collection.mapReduce(
function () {
var order = inputs.indexOf(this._id);
emit( order, { doc: this } );
},
function() {},
{
"out": { "inline": 1 },
"query": { "_id": { "$in": list } },
"scope": { "inputs": list } ,
"finalize": function (key, value) {
return value.doc;
}
}
)
And that basically relies on the emitted "key" values being in the "index order" of how they occur in the input array.
So those essentially are your ways of maintaining the order of an input list to an $in condition where you already have that list in a determined order.
Another way, using an aggregation query, is only applicable for MongoDB version >= 3.4 -
The credit goes to this nice blog post.
Example documents to be fetched in this order -
var order = [ "David", "Charlie", "Tess" ];
The query -
var query = [
{$match: {name: {$in: order}}},
{$addFields: {"__order": {$indexOfArray: [order, "$name" ]}}},
{$sort: {"__order": 1}}
];
var result = db.users.aggregate(query);
Another quote from the post explaining these aggregation operators used -
The "$addFields" stage is new in 3.4 and it allows you to "$project" new fields to existing documents without knowing all the other existing fields. The new "$indexOfArray" expression returns position of particular element in a given array.
Basically the addFields operator appends a new order field to every document when it finds it and this order field represents the original order of our array we provided. Then we simply sort the documents based on this field.
If you don't want to use aggregate, another solution is to use find and then sort the doc results client-side using array#sort:
If the $in values are primitive types like numbers you can use an approach like:
var ids = [4, 2, 8, 1, 9, 3, 5, 6];
MyModel.find({ _id: { $in: ids } }).exec(function(err, docs) {
docs.sort(function(a, b) {
// Sort docs by the order of their _id values in ids.
return ids.indexOf(a._id) - ids.indexOf(b._id);
});
});
If the $in values are non-primitive types like ObjectIds, another approach is required as indexOf compares by reference in that case.
If you're using Node.js 4.x+, you can use Array#findIndex and ObjectID#equals to handle this by changing the sort function to:
docs.sort((a, b) => ids.findIndex(id => a._id.equals(id)) -
ids.findIndex(id => b._id.equals(id)));
Or with any Node.js version, with underscore/lodash's findIndex:
docs.sort(function (a, b) {
return _.findIndex(ids, function (id) { return a._id.equals(id); }) -
_.findIndex(ids, function (id) { return b._id.equals(id); });
});
An easy way to order the result after mongo returns the array is to make an object with id as keys and then map over the given _id's to return an array that is correctly ordered.
/**
 * Fetch the documents whose _id is in `keys` and return them in the order of `keys`.
 *
 * @param {object} Users - collection-like object exposing find(query).toArray()
 * @param {Array} keys - _id values, in the order the results are wanted
 * @returns {Promise<Array>} one entry per key; undefined where no document matched
 */
async function batchUsers(Users, keys) {
    const unorderedUsers = await Users.find({ _id: { $in: keys } }).toArray();
    // Index by the string form of _id. A Map is used instead of a plain object
    // so that keys such as "constructor" cannot resolve to inherited members.
    const usersById = new Map(unorderedUsers.map((user) => [String(user._id), user]));
    return keys.map((key) => usersById.get(String(key)));
}
Similar to JonnyHK's solution, you can reorder the documents returned from find in your client (if your client is in JavaScript) with a combination of map and the Array.prototype.find function in EcmaScript 2015:
Collection.find({ _id: { $in: idArray } }).toArray(function(err, res) {
var orderedResults = idArray.map(function(id) {
return res.find(function(document) {
return document._id.equals(id);
});
});
});
A couple of notes:
The above code is using the Mongo Node driver and not Mongoose
The idArray is an array of ObjectId
I haven't tested the performance of this method vs the sort, but if you need to manipulate each returned item (which is pretty common) you can do it in the map callback to simplify your code.
I know this question is related to Mongoose JS framework, but the duplicated one is generic, so I hope posting a Python (PyMongo) solution is fine here.
things = list(db.things.find({'_id': {'$in': id_array}}))
things.sort(key=lambda thing: id_array.index(thing['_id']))
# things are now sorted according to id_array order
Always? Never. The order is always the same: undefined (probably the physical order in which documents are stored). Unless you sort it.
For any newcomers, here is a short and elegant solution to preserve the order in such cases, as of 2021 and using MongoDB 3.6 (tested):
// Match documents by uuid, project an "index" holding each document's position
// in idList, then sort by that index so the output follows the idList order.
const idList = ['123', '124', '125']
const out = await db
.collection('YourCollection')
.aggregate([
// Change uuid to your `id` field
{ $match: { uuid: { $in: idList } } },
{
$project: {
uuid: 1,
date: 1,
someOtherFieldToPreserve: 1,
// Adding this new field called index
index: {
// If we want index to start from 1, add a dummy value to the beginning of the idList array
$indexOfArray: [[0, ...idList], '$uuid'],
// Otherwise if 0,1,2 is fine just use this line
// $indexOfArray: [idList, '$uuid'],
},
},
},
// And finally sort the output by our index
{ $sort: { index: 1 } },
])
I know this is an old thread, but if you're just returning the value of the Id in the array, you may have to opt for this syntax. As I could not seem to get indexOf value to match with a mongo ObjectId format.
obj.map = function() {
for(var i = 0; i < inputs.length; i++){
if(this._id.equals(inputs[i])) {
var order = i;
}
}
emit(order, {doc: this});
};
How to convert mongo ObjectId .toString without including 'ObjectId()' wrapper -- just the Value?
You can guarantee order with $or clause.
So use $or: _ids.map(_id => ({_id})) instead (the mapped array itself is the $or argument; it must not be wrapped in another array).
This is a code solution after the results are retrieved from Mongo. Using a map to store index and then swapping values.
catDetails := make([]CategoryDetail, 0)
err = sess.DB(mdb).C("category").
Find(bson.M{
"_id": bson.M{"$in": path},
"is_active": 1,
"name": bson.M{"$ne": ""},
"url.path": bson.M{"$exists": true, "$ne": ""},
}).
Select(
bson.M{
"is_active": 1,
"name": 1,
"url.path": 1,
}).All(&catDetails)
if err != nil{
return
}
categoryOrderMap := make(map[int]int)
for index, v := range catDetails {
categoryOrderMap[v.Id] = index
}
counter := 0
for i := 0; counter < len(categoryOrderMap); i++ {
if catId := int(path[i].(float64)); catId > 0 {
fmt.Println("cat", catId)
if swapIndex, exists := categoryOrderMap[catId]; exists {
if counter != swapIndex {
catDetails[swapIndex], catDetails[counter] = catDetails[counter], catDetails[swapIndex]
categoryOrderMap[catId] = counter
categoryOrderMap[catDetails[swapIndex].Id] = swapIndex
}
counter++
}
}
}

Does MongoDB's $in clause guarantee order

When using MongoDB's $in clause, does the order of the returned documents always correspond to the order of the array argument?
As noted, the order of the arguments in the array of an $in clause does not reflect the order of how the documents are retrieved. That of course will be the natural order or by the selected index order as shown.
If you need to preserve this order, then you basically have two options.
So let's say that you were matching on the values of _id in your documents with an array that is going to be passed in to the $in as [ 4, 2, 8 ].
Approach using Aggregate
// Expanded (hand-written) form: each _id is given a "weight" equal to its
// 1-based position in the list, and the results are sorted by that weight.
var list = [ 4, 2, 8 ];
db.collection.aggregate([
    // Match the selected documents by "_id"
    { "$match": {
        "_id": { "$in": [ 4, 2, 8 ] },
    }},
    // Project a "weight" to each document
    { "$project": {
        "weight": { "$cond": [
            { "$eq": [ "$_id", 4 ] },
            1,
            { "$cond": [
                { "$eq": [ "$_id", 2 ] },
                2,
                3
            ]}
        ]}
    }},
    // Sort the results
    { "$sort": { "weight": 1 } }
])
So that would be the expanded form. What basically happens here is that just as the array of values is passed to $in you also construct a "nested" $cond statement to test the values and assign an appropriate weight. As that "weight" value reflects the order of the elements in the array, you can then pass that value to a sort stage in order to get your results in the required order.
Of course you actually "build" the pipeline statement in code, much like this:
// Build the nested "$cond" weight expression from the list instead of writing
// it by hand. The list is walked from its end towards its start so that each
// new condition wraps the one built before it.
var list = [ 4, 2, 8 ];
var stack = [];
var position = list.length - 1;
while (position > 0) {
    // innermost condition falls back to the last position; every outer
    // condition falls back to the expression built so far
    var fallback = (stack.length === 0) ? position + 1 : stack.pop();
    stack.push({
        "$cond": [
            { "$eq": [ "$_id", list[position - 1] ] },
            position,
            fallback
        ]
    });
    position--;
}

var pipeline = [
    { "$match": { "_id": { "$in": list } }},
    { "$project": { "weight": stack[0] }},
    { "$sort": { "weight": 1 } }
];
db.collection.aggregate( pipeline );
Approach using mapReduce
Of course, if that all seems too hefty for your sensibilities, then you can do the same thing using mapReduce, which looks simpler but will likely run somewhat slower.
var list = [ 4, 2, 8 ];
db.collection.mapReduce(
function () {
var order = inputs.indexOf(this._id);
emit( order, { doc: this } );
},
function() {},
{
"out": { "inline": 1 },
"query": { "_id": { "$in": list } },
"scope": { "inputs": list } ,
"finalize": function (key, value) {
return value.doc;
}
}
)
And that basically relies on the emitted "key" values being in the "index order" of how they occur in the input array.
So those essentially are your ways of maintaining the order of an input list to an $in condition where you already have that list in a determined order.
Another way, using an aggregation query, is only applicable for MongoDB version >= 3.4 -
The credit goes to this nice blog post.
Example documents to be fetched in this order -
var order = [ "David", "Charlie", "Tess" ];
The query -
var query = [
{$match: {name: {$in: order}}},
{$addFields: {"__order": {$indexOfArray: [order, "$name" ]}}},
{$sort: {"__order": 1}}
];
var result = db.users.aggregate(query);
Another quote from the post explaining these aggregation operators used -
The "$addFields" stage is new in 3.4 and it allows you to "$project" new fields to existing documents without knowing all the other existing fields. The new "$indexOfArray" expression returns position of particular element in a given array.
Basically the addFields operator appends a new order field to every document when it finds it and this order field represents the original order of our array we provided. Then we simply sort the documents based on this field.
If you don't want to use aggregate, another solution is to use find and then sort the doc results client-side using array#sort:
If the $in values are primitive types like numbers you can use an approach like:
var ids = [4, 2, 8, 1, 9, 3, 5, 6];
MyModel.find({ _id: { $in: ids } }).exec(function(err, docs) {
docs.sort(function(a, b) {
// Sort docs by the order of their _id values in ids.
return ids.indexOf(a._id) - ids.indexOf(b._id);
});
});
If the $in values are non-primitive types like ObjectIds, another approach is required as indexOf compares by reference in that case.
If you're using Node.js 4.x+, you can use Array#findIndex and ObjectID#equals to handle this by changing the sort function to:
docs.sort((a, b) => ids.findIndex(id => a._id.equals(id)) -
ids.findIndex(id => b._id.equals(id)));
Or with any Node.js version, with underscore/lodash's findIndex:
docs.sort(function (a, b) {
return _.findIndex(ids, function (id) { return a._id.equals(id); }) -
_.findIndex(ids, function (id) { return b._id.equals(id); });
});
An easy way to order the result after mongo returns the array is to make an object with id as keys and then map over the given _id's to return an array that is correctly ordered.
/**
 * Fetch the documents whose _id is in `keys` and return them in the order of `keys`.
 *
 * @param {object} Users - collection-like object exposing find(query).toArray()
 * @param {Array} keys - _id values, in the order the results are wanted
 * @returns {Promise<Array>} one entry per key; undefined where no document matched
 */
async function batchUsers(Users, keys) {
    const unorderedUsers = await Users.find({ _id: { $in: keys } }).toArray();
    // Index by the string form of _id. A Map is used instead of a plain object
    // so that keys such as "constructor" cannot resolve to inherited members.
    const usersById = new Map(unorderedUsers.map((user) => [String(user._id), user]));
    return keys.map((key) => usersById.get(String(key)));
}
Similar to JonnyHK's solution, you can reorder the documents returned from find in your client (if your client is in JavaScript) with a combination of map and the Array.prototype.find function in EcmaScript 2015:
Collection.find({ _id: { $in: idArray } }).toArray(function(err, res) {
var orderedResults = idArray.map(function(id) {
return res.find(function(document) {
return document._id.equals(id);
});
});
});
A couple of notes:
The above code is using the Mongo Node driver and not Mongoose
The idArray is an array of ObjectId
I haven't tested the performance of this method vs the sort, but if you need to manipulate each returned item (which is pretty common) you can do it in the map callback to simplify your code.
I know this question is related to Mongoose JS framework, but the duplicated one is generic, so I hope posting a Python (PyMongo) solution is fine here.
things = list(db.things.find({'_id': {'$in': id_array}}))
things.sort(key=lambda thing: id_array.index(thing['_id']))
# things are now sorted according to id_array order
Always? Never. The order is always the same: undefined (probably the physical order in which documents are stored). Unless you sort it.
For any newcomers, here is a short and elegant solution to preserve the order in such cases, as of 2021 and using MongoDB 3.6 (tested):
// Match documents by uuid, project an "index" holding each document's position
// in idList, then sort by that index so the output follows the idList order.
const idList = ['123', '124', '125']
const out = await db
.collection('YourCollection')
.aggregate([
// Change uuid to your `id` field
{ $match: { uuid: { $in: idList } } },
{
$project: {
uuid: 1,
date: 1,
someOtherFieldToPreserve: 1,
// Adding this new field called index
index: {
// If we want index to start from 1, add a dummy value to the beginning of the idList array
$indexOfArray: [[0, ...idList], '$uuid'],
// Otherwise if 0,1,2 is fine just use this line
// $indexOfArray: [idList, '$uuid'],
},
},
},
// And finally sort the output by our index
{ $sort: { index: 1 } },
])
I know this is an old thread, but if you're just returning the value of the Id in the array, you may have to opt for this syntax. As I could not seem to get indexOf value to match with a mongo ObjectId format.
obj.map = function() {
for(var i = 0; i < inputs.length; i++){
if(this._id.equals(inputs[i])) {
var order = i;
}
}
emit(order, {doc: this});
};
How to convert mongo ObjectId .toString without including 'ObjectId()' wrapper -- just the Value?
You can guarantee order with $or clause.
So use $or: _ids.map(_id => ({_id})) instead (the mapped array itself is the $or argument; it must not be wrapped in another array).
This is a code solution after the results are retrieved from Mongo. Using a map to store index and then swapping values.
catDetails := make([]CategoryDetail, 0)
err = sess.DB(mdb).C("category").
Find(bson.M{
"_id": bson.M{"$in": path},
"is_active": 1,
"name": bson.M{"$ne": ""},
"url.path": bson.M{"$exists": true, "$ne": ""},
}).
Select(
bson.M{
"is_active": 1,
"name": 1,
"url.path": 1,
}).All(&catDetails)
if err != nil{
return
}
categoryOrderMap := make(map[int]int)
for index, v := range catDetails {
categoryOrderMap[v.Id] = index
}
counter := 0
for i := 0; counter < len(categoryOrderMap); i++ {
if catId := int(path[i].(float64)); catId > 0 {
fmt.Println("cat", catId)
if swapIndex, exists := categoryOrderMap[catId]; exists {
if counter != swapIndex {
catDetails[swapIndex], catDetails[counter] = catDetails[counter], catDetails[swapIndex]
categoryOrderMap[catId] = counter
categoryOrderMap[catDetails[swapIndex].Id] = swapIndex
}
counter++
}
}
}

Twitter data - Finding the most mentioned user in MongoDB

Lets say I have stream data from the Twitter API, and I have the data stored as documents in the MongoDB. What I'm trying to find is the count of screen_name under entities.user_mentions.
{
"_id" : ObjectId("50657d5844956d06fb5b36c7"),
"contributors" : null,
"text" : "",
"entities" : {
"urls" : [ ],
"hashtags" : [
{
"text" : "",
"indices" : [
26,
30
]
},
{
"text" : "",
"indices" : []
}
],
"user_mentions" : [
{
"name":"Twitter API",
"indices":[4,15],
"screen_name":"twitterapi",
"id":6253282, "id_str":"6253282"
}]
},
...
I have attempted to use map reduce:
map = function() {
if (!this.entities.user_mentions.screen_name) {
return;
}
for (index in this.entities.user_mentions.screen_name) {
emit(this.entities.user_mentions.screen_name[index], 1);
}
}
reduce = function(previous, current) {
var count = 0;
for (index in current) {
count += current[index];
}
return count;
}
result = db.runCommand({
"mapreduce" : "twitter_sample",
"map" : map,
"reduce" : reduce,
"out" : "user_mentions"
});
But its not quite working...
Since entities.user_mentions is an array, you want to emit a value for each screen_name in the map():
// Map step: emit one { count: 1 } record per mentioned screen_name
// found in this tweet's entities.user_mentions array.
var map = function() {
    var mentions = this.entities.user_mentions;
    for (var i = 0; i < mentions.length; i++) {
        emit(mentions[i].screen_name, { count: 1 });
    }
};
Then count the values by unique screen_name in the reduce():
// Reduce step: add up the "count" fields of all values for one screen_name.
// NB: the returned object has the same shape as the values emitted by map().
var reduce = function(key, values) {
    var total = 0;
    for (var i = 0; i < values.length; i++) {
        total += values[i].count;
    }
    return { count: total };
};
Note: to debug your map/reduce JavaScript functions, you can use print() and printjson() commands. The output will appear in your mongod log.
EDIT: For comparison, here is an example using the new Aggregation Framework in MongoDB 2.2:
db.twitter_sample.aggregate(
// Project to limit the document fields included
{ $project: {
_id: 0,
"entities.user_mentions" : 1
}},
// Split user_mentions array into a stream of documents
{ $unwind: "$entities.user_mentions" },
// Group and count the unique mentions by screen_name
{ $group : {
_id: "$entities.user_mentions.screen_name",
count: { $sum : 1 }
}},
// Optional: sort by count, descending
{ $sort : {
"count" : -1
}}
)
The original Map/Reduce approach is best suited for a large data set, as is implied with Twitter data. For a comparison of Map/Reduce vs Aggregation Framework limitations see the related discussion on the StackOverflow question MongoDB group(), $group and MapReduce.