Twitter data - Finding the most mentioned user in MongoDB - mongodb

Let's say I have stream data from the Twitter API, stored as documents in MongoDB. What I'm trying to find is the count of each screen_name under entities.user_mentions.
{
"_id" : ObjectId("50657d5844956d06fb5b36c7"),
"contributors" : null,
"text" : "",
"entities" : {
"urls" : [ ],
"hashtags" : [
{
"text" : "",
"indices" : [
26,
30
]
},
{
"text" : "",
"indices" : []
}
],
"user_mentions" : [
{
"name":"Twitter API",
"indices":[4,15],
"screen_name":"twitterapi",
"id":6253282, "id_str":"6253282"
}]
},
...
I have attempted to use map reduce:
map = function() {
if (!this.entities.user_mentions.screen_name) {
return;
}
for (index in this.entities.user_mentions.screen_name) {
emit(this.entities.user_mentions.screen_name[index], 1);
}
}
reduce = function(previous, current) {
var count = 0;
for (index in current) {
count += current[index];
}
return count;
}
result = db.runCommand({
"mapreduce" : "twitter_sample",
"map" : map,
"reduce" : reduce,
"out" : "user_mentions"
});
But it's not quite working...

Since entities.user_mentions is an array, you want to emit a value for each screen_name in the map():
// Map step: emit one { count: 1 } record per mentioned screen_name.
// Runs with `this` bound to one document of the collection. A document that
// has no entities.user_mentions array is skipped; without the guard the
// property access / forEach call would throw for such a document.
var map = function() {
    var mentions = this.entities && this.entities.user_mentions;
    if (!mentions || typeof mentions.forEach !== "function") {
        return;
    }
    mentions.forEach(function(mention) {
        emit(mention.screen_name, { count: 1 });
    });
};
Then count the values by unique screen_name in the reduce():
// Reduce step: add up every partial count received for one key.
var reduce = function(key, values) {
    var total = 0;
    for (var i = 0; i < values.length; i++) {
        total += values[i].count;
    }
    // Same { count: n } shape as the values emitted by map(), so the output
    // can itself be passed back in as one of `values`.
    return { count: total };
};
Note: to debug your map/reduce JavaScript functions, you can use print() and printjson() commands. The output will appear in your mongod log.
EDIT: For comparison, here is an example using the new Aggregation Framework in MongoDB 2.2:
db.twitter_sample.aggregate(
// Project to limit the document fields included
{ $project: {
_id: 0,
"entities.user_mentions" : 1
}},
// Split user_mentions array into a stream of documents
{ $unwind: "$entities.user_mentions" },
// Group and count the unique mentions by screen_name
{ $group : {
_id: "$entities.user_mentions.screen_name",
count: { $sum : 1 }
}},
// Optional: sort by count, descending
{ $sort : {
"count" : -1
}}
)
The original Map/Reduce approach is best suited for a large data set, as is implied with Twitter data. For a comparison of Map/Reduce vs Aggregation Framework limitations see the related discussion on the StackOverflow question MongoDB group(), $group and MapReduce.

Related

mongodb $map to get value based on the object type

Consider I have a value like this
var objList= { BMW: { type: 1, code: 'BW' },
BENZ: { type: 2, code: 'BN' },
RR: { type: 3, code: 'RR' }, };
In my collection the value is stored as 1, 2 or 3 (the type from the constant above). How can I write a query that maps the stored type to its code, e.g. returns BN when the type supplied to the filter is 2?
var objList is from my constants file
I have a long aggregate list, but I don't think its details are needed to decide anything here, and I believe my question is not related to any specific issue or application.
// Aggregate
var aggregateList = [
{ $match : matches },
{ $unwind : unwind },
{ $group : group1 },
{ $unwind : unwind },
{ $sort : sort },
{ $skip : skip },
{ $limit : limit },
];
var db_collections = Car.aggregate(aggregateList)

Write MapReduce function to count number of posts created by various users in MongoDB

I had created a collection "posts" with title, description, by, comments in MongoDB as:
db.posts.insert({
title:'MongoDB',
description:'MongoDB is a NoSQL DB',
by:'Tom',
comments:[
{user:'ram',
message:'We use MongoDB'
}
]
}
)
Similarly, I added other two entries.
Now, I want to write MapReduce function to count number of posts created by various users in MongoDB. I used:
db.posts.mapReduce(
function() { emit(this.user_id,1); },
function(key, values) {return Array.sum(values)}, {
out:"post_total"
}
).find()
This Output:
{"id": null , "value": 3}
But, what I want to display is something like:
{ "_id" : "tom_id", "value" : 2 }
{ "_id" : "mark_id", "value" : 1 }
or
{ "by" : "tom", "value" : 2 }
{ "by" : "mark", "value" : 1 }
Finally, I solve it. I got some idea from MapReduce function in MongoDB - Grouping document by ID
db.posts.mapReduce(
function() { emit(this.by,1); },
function(key, values) {return Array.sum(values)}, {
out:"post_total"
}
).find()
What I was doing wrong was emit(this.user_id,1): I was passing the wrong key.
Re-write your mapReduce method to:
db.posts.mapReduce(
    // Map: emit one counter per post, keyed by the author stored in "by".
    // The sample post has no user_id field, so keying on this.user_id would
    // put every post under the key null.
    function() {
        emit(this.by, 1);
    },
    // Reduce: add up the counters emitted for one author.
    function(key, values) {
        return Array.sum(values);
    },
    // Return the results inline instead of writing them to a collection.
    { "out": { "inline": 1 } }
)
or use the runCommand command to run a mapReduce operation with an output collection option:
mr = db.runCommand({
    "mapreduce": "posts",
    // Emit exactly one counter per post, keyed by its author ("by").
    // Emitting inside a loop over the document's fields would produce one
    // counter per field and inflate every total.
    "map" : function() {
        emit(this.by, 1);
    },
    // Add up the counters emitted for one author.
    "reduce" : function(key, values) { return Array.sum(values); },
    // Write the per-author totals to the post_total collection.
    "out": "post_total"
})
To get the results, run find() on the resulting collection:
db[mr.result].find()
Equivalent results yet much more efficient operation using the aggregation framework:
db.posts.aggregate([
    // Group the posts by author (the "by" field of each post) and count the
    // documents in every group.
    {
        "$group": {
            "_id": "$by",
            "count": { "$sum": 1 }
        }
    }
])

Mongodb: find documents with array field that contains more than one SAME specified value

There are three documents in collection test:
// document 1
{
"id": 1,
"score": [3,2,5,4,5]
}
// document 2
{
"id": 2,
"score": [5,5]
}
// document 3
{
"id": 3,
"score": [5,3,3]
}
I want to fetch documents that score field contains [5,5].
query:
db.test.find( {"score": {"$all": [5,5]}} )
will return document 1, 2 and 3, but I only want to fetch document 1 and 2.
How can I do this?
After reading your problem, I personally think MongoDB does not yet support this kind of query. If anyone knows how to do this with a Mongo query, they will definitely post an answer here.
But I think this is possible using the mongo forEach method, so the code below will match your criteria:
db.collectionName.find().forEach(function(myDoc) {
    // Tally how often each value occurs in this document's score array.
    var tally = {};
    var scores = myDoc.score;
    var idx = 0;
    while (idx < scores.length) {
        var value = scores[idx];
        tally[value] = (tally[value] || 0) + 1;
        idx += 1;
    }
    // Print only the documents in which the value 5 occurs at least twice.
    if (tally[5] >= 2) {
        printjsononeline(myDoc);
    }
});
Changed in version 2.6.
The $all is equivalent to an $and operation of the specified values; i.e. the following statement:
{ tags: { $all: [ "ssl" , "security" ] } }
is equivalent to:
{ $and: [ { tags: "ssl" }, { tags: "security" } ] }
I think you need to pass in a nested array -
So try
db.test.find( {"score": {"$all": [[5,5]]}} )
Source
Changed in version 2.6.
When passed an array of a nested array (e.g. [ [ "A" ] ] ), $all can now match documents where the field contains the nested array as an element (e.g. field: [ [ "A" ], ... ]), or the field equals the nested array (e.g. field: [ "A" ]).
http://docs.mongodb.org/manual/reference/operator/query/all/
You can do it with an aggregation. The first step can use an index on { "score" : 1 } but the rest is hard work.
db.test.aggregate([
{ "$match" : { "score" : 5 } },
{ "$unwind" : "$score" },
{ "$match" : { "score" : 5 } },
{ "$group" : { "_id" : "$_id", "sz" : { "$sum" : 1 } } }, // use $first here to include other fields in the results
{ "$match" : { "sz" : { "$gte" : 2 } } }
])

Update all documents in a collection with random numbers

I am trying to update all documents in a collection with random numbers.
Each document should have a different number.
My current code
db.myDoc.update(
{ rand : { $exists : false } },
{ $set : { rand: Math.random() } },
{ multi : true }
)
populates ALL documents with the SAME random value.
How to fix?
You can make use of the cursor.forEach() cursor method in the mongo shell to achieve this:
db.myDoc.find({rand: {$exists : false }}).forEach(function(mydoc) {
db.myDoc.update({_id: mydoc._id}, {$set: {rand: Math.random()}})
})
Starting in Mongo 4.4, the $function aggregation operator allows applying a custom javascript function to implement behaviour not supported by the MongoDB Query Language.
For instance, in order to update documents with a random value:
// { "x" : 1 }
// { "x" : 2 }
db.collection.updateMany(
{ rand: { $exists: false } },
[{ $set:
{ rand:
{ $function: {
body: function() { return Math.random(); },
args: [],
lang: "js"
}}
}
}]
)
// { "x" : 1, "rand" : 0.7012578283384967 }
// { "x" : 2, "rand" : 0.21041874709692365 }
$function takes 3 parameters:
body, which is the function to apply.
args, which contains the fields from the record that the function can take as parameter. In our case we don't need any reference to the document itself in order to compute a random value, thus the empty array.
lang, which is the language in which the body function is written. Only js is currently available.
Note that this is much more efficient than a find/forEach option since everything is done server side in one pass.
Starting in Mongo 4.4.2, the $rand aggregation operator provides a random float between 0 and 1 each time it is called:
// { "x" : 1 }
// { "x" : 2 }
db.collection.updateMany(
{ rand: { $exists: false } },
[{ $set: { rand: { $rand: {} } } }]
)
// { "x" : 1, "rand" : 0.22252333583221115 }
// { "x" : 2, "rand" : 0.9811303782541574 }

SQL to MapReduce: counting unique key in a many to one relationship?

Initially, I have a relationship where an order has many line items and each line item belongs to exactly one order, as usual.
Using mongoDB, I did this document to represent it:
{
"_id" : ObjectId("511b7d1b3daee1b1446ecdfe"),
"l_order" : {
"_id" : ObjectId("511b7d133daee1b1446eb54d"),
"o_orderkey" : NumberLong(1),
"o_totalprice" : 173665.47,
"o_orderdate" : ISODate("1996-01-02T03:00:00Z"),
"o_orderpriority" : "5-LOW",
"o_shippriority" : 0,
},
"l_linenumber" : 1,
"l_shipdate" : ISODate("1996-03-13T03:00:00Z"),
"l_commitdate" : ISODate("1996-02-12T03:00:00Z"),
"l_receiptdate" : ISODate("1996-03-22T03:00:00Z"),
}
My intention is translate this sql query:
select
o_orderpriority,
count(*) as order_count
from
orders
where
o_orderdate >= date '1993-07-01'
and o_orderdate < date '1993-07-01' + interval '3' month
and exists (
select
*
from
lineitem
where
l_orderkey = o_orderkey
and l_commitdate < l_receiptdate
)
group by
o_orderpriority
order by
o_orderpriority;
For this I use two mapreduce functions:
First
db.runCommand({
mapreduce: "lineitem",
query: {
"l_order.o_orderdate": {'$gte': new Date("July 01, 1993"), '$lt': new Date("Oct 01, 1993")}
},
map: function Map() {
if(this.l_commitdate < this.l_receiptdate){
emit( this.l_order.o_orderkey, this.l_order.o_orderpriority );
}
},
out: 'query004a'
});
Second
db.runCommand({
mapreduce: "query004a",
map: function Map() {
/*Remenbering, the value here will be this.l_order.o_orderpriority from the previous mapreduce function*/
emit( this.value, 1 );
},
reduce: function(key, values) {
return Array.sum(values);
},
out: 'query004b'
});
In the first, I selected the documents that fall within the date range and satisfy the comparison, grouping them by order key to avoid duplicates. In the second, I grouped by o_orderpriority and summed.
Well, to my surprise, the result was bigger than I was expecting. But why, and where does this occur?
in your first map function you should use 'oderpriority' as a key and 'orderkey' as value - this will reduce the set to the key you want in the second mapReduce. (You need to specify a reduce function, otherwise mapReduce will return an error).
So, this could look like this:
OrderDateMin = new Date("1996-01-01");
OrderDateMax = new Date("1996-04-01");
// first where on oderdate
query = {
"l_order.o_orderdate": {$gte: OrderDateMin, $lt: OrderDateMax}
}
// Pass 1, map: keep only line items committed before they were received
// (the second "where") and key them by (order priority, order key), so that
// all qualifying line items of one order collapse into one output document.
var map1 = function() {
    if ( this.l_commitdate < this.l_receiptdate ) {
        emit( { pri: this.l_order.o_orderpriority, key: this.l_order.o_orderkey }, 1 );
    }
};
// Pass 1, reduce: several line items of the same order share a key, but the
// order must be counted once only, so the value stays 1 (the same type and
// value that map1 emits).
var reduce1 = function(key, values) {
    return 1;
};
// Writes one document per distinct (priority, order key) pair to query004a.
db.runCommand({
    mapReduce: "xx",
    query: query,
    map: map1,
    reduce: reduce1,
    out: 'query004a'
});
// Pass 2, map: _id is the { pri, key } pair produced by pass 1; emit one
// counter per order under its priority.
var map2 = function() {
    emit( this._id.pri, 1 );
};
// Pass 2, reduce: add up the counters per order priority. The accumulator is
// a local variable, so nothing leaks into the global scope.
var reduce2 = function(key, values) {
    var count = 0;
    values.forEach( function(value) { count += value; } );
    return count;
};
// Writes the number of distinct orders per priority to query004b.
db.runCommand({
    mapReduce: "query004a",
    map: map2,
    reduce: reduce2,
    out: 'query004b'
});
Now, the same can be achieved with one aggregate command, which is faster (implemented in C, not in JavaScript):
db.xx.aggregate([
    // first "where", this will use an index, if defined
    { $match: {
        "l_order.o_orderdate": { $gte: OrderDateMin, $lt: OrderDateMax }
    }},
    // reduce to needed fields, create a field for decision of second "where"
    { $project: {
        "key": "$l_order.o_orderkey",
        "pri": "$l_order.o_orderpriority",
        // Field paths need the "$" prefix; without it $lt compares the two
        // literal strings and is true for every document.
        okay: { $cond: [ {$lt: ["$l_commitdate", "$l_receiptdate"]}, 1, 0 ] }
    }},
    // select second where condition matched
    { $match: { "okay": 1 } },
    // group by priority and key (one document per distinct order)
    { $group: { _id: { "pri": "$pri", "key": "$key" } } },
    // group by priority - count entries
    { $group: { _id: "$_id.pri", "count": { $sum: 1 } } }
])
which would return something like:
{ "result" : [ { "_id" : "5-LOW", "count" : 1 } ], "ok" : 1 }
Last, but not least: a suggestion regarding design:
It would be simpler if your structure was the other way round: an "orders" collection with the order-items embedded as an array of items. This would avoid having duplicate order data throughout the collection.
Further info:
http://docs.mongodb.org/manual/reference/command/mapReduce/#mapReduce
http://docs.mongodb.org/manual/reference/aggregation
Does this help?
Cheers
Ronald