MongoDB group by Functionalities

In MySQL
select a,b,count(1) as cnt from list group by a, b having cnt > 2;
I have to execute the group by function with a having condition in MongoDB, but I am getting the following error. Please share your input.
In MongoDB
> res = db.list.group({key:{a:true,b:true},
... reduce: function(obj,prev) {prev.count++;},
... initial: {count:0}}).limit(10);
Sat Jan 7 16:36:30 uncaught exception: group command failed: {
"errmsg" : "exception: group() can't handle more than 20000 unique keys",
"code" : 10043,
"ok" : 0
Once it has executed, we need to run the following next:
for (i in res) {if (res[i].count>2) printjson(res[i])};

MongoDB's group() is quite limited in most cases; for instance:
- the result set must contain fewer than 20,000 unique keys (hence the error above)
- it does not work in sharded environments
So it's better to use map/reduce. The query would look like this:
map = function() { emit({a:true,b:true},{count:1}); }
reduce = function(k, values) {
    var result = {count: 0};
    values.forEach(function(value) {
        result.count += value.count;
    });
    return result;
}
and then
db.list.mapReduce(map,reduce,{out: { inline : 1}})
It's an untested version; let me know if it works.
EDIT:
The earlier map function was faulty; that's why you are not getting the results. It should have been:
map = function () {
    emit({a:this.a, b:this.b}, {count:1});
}
Test data:
> db.multi_group.insert({a:1,b:2})
> db.multi_group.insert({a:2,b:2})
> db.multi_group.insert({a:3,b:2})
> db.multi_group.insert({a:1,b:2})
> db.multi_group.insert({a:3,b:2})
> db.multi_group.insert({a:7,b:2})
> db.multi_group.mapReduce(map,reduce,{out: { inline : 1}})
{
"results" : [
{
"_id" : {
"a" : 1,
"b" : 2
},
"value" : {
"count" : 2
}
},
{
"_id" : {
"a" : 2,
"b" : 2
},
"value" : {
"count" : 1
}
},
{
"_id" : {
"a" : 3,
"b" : 2
},
"value" : {
"count" : 2
}
},
{
"_id" : {
"a" : 7,
"b" : 2
},
"value" : {
"count" : 1
}
}
],
"timeMillis" : 1,
"counts" : {
"input" : 6,
"emit" : 6,
"reduce" : 2,
"output" : 4
},
"ok" : 1,
}
EDIT2:
Complete solution, including applying the having count >= 2 filter:
map = function () {
    emit({a:this.a, b:this.b}, {count:1, _id:this._id});
}
reduce = function(k, values) {
    var result = {count: 0, _id: []};
    values.forEach(function(value) {
        result.count += value.count;
        result._id.push(value._id);
    });
    return result;
}
>db.multi_group.mapReduce(map,reduce,{out: { replace : "multi_result"}})
> db.multi_result.find({'value.count' : {$gte : 2}})
{ "_id" : { "a" : 1, "b" : 2 }, "value" : { "_id" : [ ObjectId("4f0adf2884025491024f994c"), ObjectId("4f0adf3284025491024f994f") ], "count" : 2 } }
{ "_id" : { "a" : 3, "b" : 2 }, "value" : { "_id" : [ ObjectId("4f0adf3084025491024f994e"), ObjectId("4f0adf3584025491024f9950") ], "count" : 2 } }

You should use MapReduce instead; group has its limitations.
In the future you'll be able to use the Aggregation Framework, but for now, use map/reduce.
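For reference, once the Aggregation Framework is available, the equivalent of the original MySQL query should look roughly like this (an untested sketch, assuming the same list collection and a/b fields as in the question):
db.list.aggregate([
    // group on the (a, b) pair and count the documents in each group
    { $group: { _id: { a: "$a", b: "$b" }, cnt: { $sum: 1 } } },
    // the equivalent of HAVING cnt > 2
    { $match: { cnt: { $gt: 2 } } }
])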

Depending on the number of your groups, you might find a simpler and faster solution than group or MapReduce by using distinct:
var res = [];
// distinct() returns an array in the shell, so iterate it directly
var bs = db.list.distinct('b');
db.list.distinct('a').forEach(function(a) {
    bs.forEach(function(b) {
        var cnt = db.list.count({'a': a, 'b': b});
        if (cnt > 2)
            res.push({'a': a, 'b': b, 'cnt': cnt});
    });
});
It will be faster if you have an index on a and b:
db.list.ensureIndex({'a':1,'b':1})

Related

Calculate average using mapreduce in MongoDb

I have a collection of 10 million records that resemble this:
{
"_id" : ObjectId("596dd10bbd1a6628ace1c14c"),
"X" : 13212,
"Z" : 173836,
"userID" : 9354785
}
User ID is unique. I have to calculate the average of X and sum of Z. I can calculate the sum of Z using the following mapReduce function
var mapFunction1 = function() {
    emit(this.userID, this.Z);
};
var reduceFunction1 = function() {
    return Array.sum(Z);
};
db.transaction.mapReduce(
    mapfunction1,
    reduceFunction1,
    {out:"mapreduce"}
)
How do I calculate the average of X?
I tried Array.avg(Z), but it returns the same output as sum(Z).
It looks like the requirements can be expressed more simply using the Aggregation Pipeline with the $avg and $sum operators.
Input
> db.transactions.find()
{ "_id" : ObjectId("5970e59e26507421fa20bee9"), "X" : 13212, "Z" : 173836, "userID" : 9354785 }
{ "_id" : ObjectId("5970e5a426507421fa20beea"), "X" : 1234, "Z" : 5678, "userID" : 1 }
{ "_id" : ObjectId("5970e5a826507421fa20beeb"), "X" : 100, "Z" : 200, "userID" : 2 }
Aggregation Pipeline
> db.transactions.aggregate([
    {
        $group : {
            _id: "aggregates",
            avgX: {
                $avg: "$X"
            },
            sumZ: {
                $sum: "$Z"
            }
        }
    }
])
Output
{ "_id" : "aggregates", "avgX" : 4848.666666666667, "sumZ" : 179714 }
You are not passing the (key, values) pair of parameters to reduceFunction1.
Try this:
var mapFunction1 = function() {
    emit(this.userID, this.Z);
};
var reduceFunction1 = function(varKey, varZ) {
    return Array.avg(varZ);
};
db.transaction.mapReduce(
    mapFunction1,
    reduceFunction1,
    {out: "mapreduce"}
)
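If both the average of X and the sum of Z are wanted from a single map/reduce pass, a finalize step can derive the average at the end. A rough, untested sketch (the names mapBoth, reduceBoth and finalizeBoth are mine, not from the question; the collection name transaction is taken from the question):
var mapBoth = function() {
    // carry X, Z and a document counter in the emitted value
    emit("totals", { sumX: this.X, sumZ: this.Z, count: 1 });
};
var reduceBoth = function(key, values) {
    var out = { sumX: 0, sumZ: 0, count: 0 };
    values.forEach(function(v) {
        out.sumX += v.sumX;
        out.sumZ += v.sumZ;
        out.count += v.count;
    });
    return out; // same shape as the emitted value, so re-reduce stays correct
};
var finalizeBoth = function(key, value) {
    value.avgX = value.sumX / value.count;
    return value;
};
db.transaction.mapReduce(mapBoth, reduceBoth, { finalize: finalizeBoth, out: "mapreduce" });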

Is it possible to retrieve a 'time span' from a MongoDB query, using the timestamp within an ObjectId?

We have a basic enquiry management tool that we're using to track some website enquiries in our administration suite, and we're using the ObjectId of each document in our enquiries collection to sort the enquiries by the date they were added.
{
"_id" : ObjectId("53a007db144ff47be1000003"),
"comments" : "This is a test enquiry. Please ignore. We'll delete it shortly.",
"customer" : {
"name" : "Test Enquiry",
"email" : "test#test.com",
"telephone" : "07890123456",
"mobile" : "07890123456",
"quote" : false,
"valuation" : false
},
"site" : [],
"test" : true,
"updates" : [
{
"_id" : ObjectId("53a007db144ff47be1000001"),
"status" : "New",
"status_id" : ObjectId("537de7c3a5e6e668ffc2335c"),
"status_index" : 100,
"substatus" : "New Web Enquiry",
"substatus_id" : ObjectId("5396bb9fa5e6e668ffc23388"),
"notes" : "New enquiry received from website.",
},
{
"_id" : ObjectId("53a80c977d299cfe91bacf81"),
"status" : "New",
"status_id" : ObjectId("537de7c3a5e6e668ffc2335c"),
"status_index" : 100,
"substatus" : "Attempted Contact",
"substatus_id" : ObjectId("53a80e06a5e6e668ffc2339e"),
"notes" : "In this test, we pretend that we've not managed to get hold of the customer on the first attempt.",
},
{
"_id" : ObjectId("53a80e539b966b8da5c40c36"),
"status" : "Approved",
"status_id" : ObjectId("52e77a49d85e95f00ebf6c72"),
"status_index" : 200,
"substatus" : "Enquiry Confirmed",
"substatus_id" : ObjectId("53901f1ba5e6e668ffc23372"),
"notes" : "In this test, we pretend that we've got hold of the customer after failing to contact them on the first attempt.",
}
]
}
Within each enquiry is an updates array of objects which also have an ObjectId as their main identity field. We're using an $unwind and $group aggregation to pull the first and latest updates, as well as the count of updates, making sure we only take enquiries where there has been more than one update (as one is automatically inserted when the enquiry is made):
db.enquiries.aggregate([
    {
        $match: {
            "test": true
        }
    },
    {
        $unwind: "$updates"
    },
    {
        $group: {
            "_id": "$_id",
            "latest_update_id": {
                $last: "$updates._id"
            },
            "first_update_id": {
                $first: "$updates._id"
            },
            "update_count": {
                $sum: 1
            }
        }
    },
    {
        $match: {
            "update_count": {
                $gt: 1
            }
        }
    }
])
This results in the following output:
{
"result" : [
{
"_id" : ObjectId("53a295ad122ea80200000005"),
"latest_update_id" : ObjectId("53a80bdc7d299cfe91bacf7e"),
"first_update_id" : ObjectId("53a295ad122ea80200000003"),
"update_count" : 2
},
{
"_id" : ObjectId("53a007db144ff47be1000003"),
"latest_update_id" : ObjectId("53a80e539b966b8da5c40c36"),
"first_update_id" : ObjectId("53a007db144ff47be1000001"),
"update_count" : 3
}
],
"ok" : 1
}
This is then passed through to our code (node.js, in this case) where we perform a few operations on it and then present some information on our dashboard.
Ideally, I'd like to add another $group pipeline aggregation to the query which would subtract the timestamp of first_update_id from the timestamp of latest_update_id to give us a timespan, which we could then use $avg on.
Can anyone tell me if this is possible? (Thank you!)
As Neil already pointed out, you can't get to the timestamp from the ObjectId in the aggregation framework.
You said that speed is not important, so using MapReduce you can get what you want:
var map = function() {
    if (this.updates.length > 1) {
        var first = this.updates[0];
        var last = this.updates[this.updates.length - 1];
        var diff = last._id.getTimestamp() - first._id.getTimestamp();
        var val = {
            latest_update_id : last._id,
            first_update_id : first._id,
            update_count : this.updates.length,
            diff: diff
        };
        emit(this._id, val);
    }
};
var reduce = function() { };
db.runCommand(
    {
        mapReduce: "enquiries",
        map: map,
        reduce: reduce,
        out: "mrresults",
        query: { test : true }
    }
);
These are the results:
{
"_id" : ObjectId("53a007db144ff47be1000003"),
"value" : {
"latest_update_id" : ObjectId("53a80e539b966b8da5c40c36"),
"first_update_id" : ObjectId("53a007db144ff47be1000001"),
"update_count" : 3,
"diff" : 525944000
}
}
Edit:
If you want to get the average diff for all documents you can do it like this:
var map = function() {
    if (this.updates.length > 1) {
        var first = this.updates[0];
        var last = this.updates[this.updates.length - 1];
        var diff = last._id.getTimestamp() - first._id.getTimestamp();
        emit("1", {diff : diff});
    }
};
var reduce = function(key, values) {
    var reducedVal = { count: 0, sum: 0 };
    for (var idx = 0; idx < values.length; idx++) {
        reducedVal.count += 1;
        reducedVal.sum += values[idx].diff;
    }
    return reducedVal;
};
var finalize = function (key, reducedVal) {
    reducedVal.avg = reducedVal.sum / reducedVal.count;
    return reducedVal;
};
db.runCommand(
    {
        mapReduce: "enquiries",
        map: map,
        reduce: reduce,
        finalize : finalize,
        out: "mrtest",
        query: { test : true }
    }
);
And the example output:
> db.mrtest.find().pretty()
{
"_id" : "1",
"value" : {
"count" : 2,
"sum" : 1051888000,
"avg" : 525944000
}
}
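Since the getTimestamp() subtraction yields milliseconds, a small (hypothetical, untested) tweak to the finalize above could also report the average in more readable units:
var finalize = function (key, reducedVal) {
    reducedVal.avg = reducedVal.sum / reducedVal.count;            // milliseconds
    reducedVal.avgDays = reducedVal.avg / (1000 * 60 * 60 * 24);   // ~6.09 days for the sample output above
    return reducedVal;
};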

get undefined value in mongodb MapReduce

I tried to use two MapReduce aggregations to get the unique user count per month.
The first MR function produces a mr_buyer_payment collection, like this:
{ "_id" : { "u" : "01329f19-27b0-435b-9ca1-450984024a31", "tid" : ISODate("2013-09-01T00:00:00Z") }, "value" : { "payment" : 38, "count_pay" : 1 } }
{ "_id" : { "u" : "264dd104-b934-490b-988e-5822fd7970f6", "tid" : ISODate("2013-09-01T00:00:00Z") }, "value" : { "payment" : 4.99, "count_pay" : 1 } }
{ "_id" : { "u" : "27bb8f72-a13e-4676-862c-02f41fea1bc0", "tid" : ISODate("2013-09-01T00:00:00Z") }, "value" : { "payment" : 11.98, "count_pay" : 2 } }
The second MR function works well with a small data set, but once the query covers more than 100 records it gets wrong results; some values are NaN.
The debug log shows that some values in the reduce function, like v.payment and v.count_user, became undefined:
date:Sun Jun 30 2013 17:00:00 GMT-0700 (PDT) value:undefined / 162 / undefined
And the MR result info is weird:
{
"result" : "mr_buyer_all",
"timeMillis" : 29,
"counts" : {
"input" : 167,
"emit" : 167,
"reduce" : 6, // it should be 3, as same as "output" number
"output" : 3
},
"ok" : 1,
}
This is the 2nd MR function:
db.mr_buyer_payment.mapReduce(
    function(){
        var key = this._id.tid;
        var value = {
            payment:this.value.payment,
            count_pay:this.value.count_pay,
            count_user:1
        };
        if (value.count_pay>0)
        {
            print("date:"+key+" u:"+this._id.u+"value:"+value.payment+" / "+value.count_pay+" / "+value.count_user);
            emit(key,value);
        }
    },
    function(key,values){
        var result = {revenue:0,count_pay:0,user:0};
        values.forEach(function(v){
            if (!v.count_user)
            {
                print("date:"+key+" "+"value:"+v.payment+" / "+v.count_pay+" / "+v.count_user);
            } else
            {
                result.revenue += v.payment;
                result.count_pay += v.count_pay;
                result.user += v.count_user;
            }
        });
        return result;
    },
    {
        out:{replace:"mr_buyer_all"}
    }
)
The sub-document in the reduce function should use the same format as the one in the map function. So the solution is:
function(key, values) {
    // the following keys must be the same as those in the object emitted by map
    var r = {payment: 0, count_pay: 0, count_user: 0};
    values.forEach(function(v) {
        r.payment += v.payment;
        r.count_pay += v.count_pay;
        r.count_user += v.count_user;
    });
    return r;
},
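The reason the original reduce produced NaN shows up in the "reduce" : 6 vs "output" : 3 counts: reduce output is fed back into reduce (a re-reduce), so the returned object must keep the emitted field names. If the result documents should still end up with revenue/count_pay/user, a finalize step can do the renaming after all reduces have run. A hedged sketch of the full call, not part of the original answer (the debug print from the question's map is dropped here):
var map = function() {
    var value = { payment: this.value.payment, count_pay: this.value.count_pay, count_user: 1 };
    if (value.count_pay > 0) emit(this._id.tid, value);
};
var reduce = function(key, values) {
    var r = { payment: 0, count_pay: 0, count_user: 0 };
    values.forEach(function(v) {
        r.payment += v.payment;
        r.count_pay += v.count_pay;
        r.count_user += v.count_user;
    });
    return r;
};
var finalize = function(key, value) {
    // safe to rename here: finalize runs once per key, after all (re-)reduces are done
    return { revenue: value.payment, count_pay: value.count_pay, user: value.count_user };
};
db.mr_buyer_payment.mapReduce(map, reduce, { finalize: finalize, out: { replace: "mr_buyer_all" } });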

Unable to set query filter in a mongodb mapReduce command

I am trying to filter a mapReduce command with a query, but the query does not seem to be used by the mapReduce command. When I use runCommand with the same parameters, the query filter is used. I tried with MongoDB 2.2.1 and 2.0.1.
The query of my mapReduce function is not used.
m = function () {
    if (this.duration > 0) {
        emit("dur", this.duration);
    }
}
r = function (key, values) {
    var index = 0;
    var sum = 0;
    for (var i = 0; i < values.length; i++) {
        sum += values[i];
        index++;
    }
    return sum / index;
}
This command doesn't work:
res = db.movies.mapReduce(m,r, {out: { inline : 1}},{query:{kinds:'Action'}});
{
"results" : [
{
"_id" : "dur",
"value" : 5148.227224559308
}
],
"timeMillis" : 1849,
"counts" : {
"input" : 105472,
"emit" : 69602,
"reduce" : 106,
"output" : 1
},
"ok" : 1,
}
This command works:
res = db.runCommand({mapReduce : "movies", map : m, reduce : r, query : {kinds:'Action'}, out : {inline:1} })
{
"results" : [
{
"_id" : "dur",
"value" : 6134.118191572414
}
],
"timeMillis" : 238,
"counts" : {
"input" : 3577,
"emit" : 2910,
"reduce" : 4,
"output" : 1
},
"ok" : 1
}
With runCommand the query is used. Any ideas?
You need to combine the out and query options into a single object:
res = db.movies.mapReduce(m,r, {out: { inline : 1}, query: {kinds: 'Action'} });

MongoDB find where key equals string from array

I am trying to find in a collection all of the documents that have the given key equal to one of the strings in an array.
Here's an example of the collection:
{
    roomId: 'room1',
    name: 'first'
},
{
    roomId: 'room2',
    name: 'second'
},
{
    roomId: 'room3',
    name: 'third'
}
And here's an example of the array to look through:
[ 'room2', 'room3' ]
What I thought would work is...
collection.find({ roomId : { $in : [ 'room2', 'room3' ]}}, function( e, r )
{
// r should return the second and third room
});
How can I achieve this?
One way this could be solved would be to do a for loop...
var roomIds = [ 'room2', 'room3' ];
for (var i = 0; i < roomIds.length; i++)
{
    collection.find({ roomId : roomIds[i] })
}
But this is not ideal....
What you posted should work - no looping required. The $in operator does the job:
> db.Room.insert({ "_id" : 1, name: 'first'});
> db.Room.insert({ "_id" : 2, name: 'second'});
> db.Room.insert({ "_id" : 3, name: 'third'});
> // test w/ int
> db.Room.find({ "_id" : { $in : [1, 2] }});
{ "_id" : 1, "name" : "first" }
{ "_id" : 2, "name" : "second" }
> // test w/ strings
> db.Room.find({ "name" : { $in : ['first', 'third'] }});
{ "_id" : 1, "name" : "first" }
{ "_id" : 3, "name" : "third" }
Isn't that what you expect?
Tested w/ MongoDB 2.1.1
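As an aside, since the question's snippet looks like the Node.js driver: there, find() hands back a cursor rather than an array of documents, so toArray() is usually needed before you can inspect the results. A minimal sketch assuming the legacy callback API and an already-opened collection handle:
var roomIds = ['room2', 'room3'];
collection.find({ roomId: { $in: roomIds } }).toArray(function (err, rooms) {
    if (err) throw err;
    // rooms is an array of the matching documents (the second and third room here)
    console.log(rooms);
});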