MongoDB map reduce count giving more results than a query - mongodb

I have a collection users in Mongo and I execute this map reduce which I believe is the equivalent of a COUNT(*) GROUP BY origin:
> m = function() { for (i in this.membership) {
... emit( this.membership[i].platform_profile.origin, 1 );
... } }
function () {
for (i in this.membership) {
emit(this.membership[i].platform_profile.origin, 1);
}
}
> r = function( id, values ) { var result = 0;
... for ( var i = 0; i < values.length; i ++ ) { result += values[i]; }
... return result; }
function (id, values) {
var result = 0;
for (var i = 0; i < values.length; i++) {
result += values[i];
}
return result;
}
> db.users.mapReduce(m, r, {out : { inline: 1}});
{
"results" : [
{
"_id" : 0,
"value" : 15
},
{
"_id" : 1,
"value" : 449
},
...
}
But if I try to count how many documents have this field set to a specific value like 1, I get fewer results:
db.users.count({"membership.platform_profile.origin": 1});
424
What am I missing?

Are your count queries using a sparse index by any chance? My only guess there would be if some other query criteria resulted in documents absent from from index to be ignored from the count.
I recreated your schema with some fixture data and the results between map/reduce and simple count queries are in agreement:
db.users.drop();
var map = function() {
for (i in this.membership) {
emit(this.membership[i].platform_profile.origin, 1);
}
};
var reduce = function(id, values ) {
var result = 0;
for (var i = 0; i < values.length; i++) {
result += values[i];
}
return result;
}
var origins = {1: "a", 2: "b", 3: "c", 4: "d"};
for (var i = 0; i < 1000; ++i) {
var membership = [];
for (var o in origins) {
if (0 == i % o) {
membership.push({ platform_profile: { origin: origins[o] }});
}
}
db.users.save({ membership: membership });
}
db.users.mapReduce(map, reduce, {out: {inline: 1}}).results.forEach(function(result){
print(result["_id"] + ": " + result["value"]);
});
for (var o in origins) {
print(origins[o] + ": " + db.users.count({"membership.platform_profile.origin": origins[o]}));
}
Here's the output:
$ mongo --quiet mr_count.js
a: 1000
b: 500
c: 334
d: 250
a: 1000
b: 500
c: 334
d: 250

You can use the following map/reduce for the equivalent of COUNT(*) GROUP BY origin
Map/Reduce Functions :
map = function() {
if(!this.membership) return;
for (i in this.membership) {
if(!this.membership[i].platform_profile || !this.membership[i].platform_profile.origin) return;
emit(this.membership[i].platform_profile.origin, 1);
}
}
reduce = function(key, values) {
var count = 0;
for (v in values) {
count += values[v];
}
return count;
}
result = db.runCommand({
"mapreduce" : "users",
"map" : map,
"reduce" : reduce,
"out" : "users_count"
});

I had the same issue. I replaced x.length by Array.sum(x) in the reduce function (assuming you emit 1 in the map function) and it works. I agree x.length should work too, but I cannot explain why it does not.

Related

How do query among many collections then write result to corresponding collection

If I want to query users' age > 18,
and export result to corresponding collection,
How could I do it by rewriteing the following script?
The following is psuedo code
source_collections = ["user_1", "user_2", ..., "user_999"]
output_collections = ["result_1", "result_2", ..., "result_999"]
pipeline = [
{
"$match":{"age": > 18}
}
{ "$out" : output_collections }
]
cur = db[source_collections].runCommand('aggregate',
{pipeline: pipeline,allowDiskUse: true})
The script you're looking for is something like:
var prefix_source = 'user_';
var prefix_output = 'result_';
var source_collections = [];
var output_collections = [];
var numCollections = 999;
for (var i = 1; i <= numCollections; i++) {
source_collections.push(prefix_source + i);
output_collections.push(prefix_output + i);
}
var pipeline = [{'$match': {age: {'$gt': 18}}}, {'$out': ''}]; for (var currentCollection = 0; currentCollection < source_collections.length; currentCollection++) {
pipeline[pipeline.length - 1]['$out'] = output_collections[currentCollection];
var cur = db[source_collections[currentCollection]].runCommand('aggregate', {pipeline: pipeline,allowDiskUse: true});
}
And while you're at it, the var cur = ... line could be simplified to
db[source_collections[currentCollection]].aggregate(pipeline, {allowDiskUse: true});
Note: I've added a piece that generates your arrays for you, as I'm sure you're not looking to write them by hand :D

How to add or inc to an array in a update query?

If I have a doc which has an array which contains a items which represents counts for a day, perhaps like :-
{
data : [ {'20141102' : 2 },{'20141103' : 4 } ]
}
when I do an update, and I have a string '20141103' and then later a '20141104' I want to either inc the array entry or add a new array entry. Is this possible with an update?
Yes, it's feasible. I tried like this:
(run on mongo shell; both client and server are V2.6.4)
function tryAndFine(coll, key, value) {
var entry = {};
entry[key] = value;
var parent = 'data';
var prefix = parent + '.';
function incrementOnly() {
var criteria = {};
criteria[prefix + key] = {$exists : true};
var update = {};
update[prefix + "$." + key] = value;
var result = coll.update(criteria, {$inc : update});
// if increment fails, try to add a new one
if (result.nModified == 0) {
addNewElement();
}
}
function addNewElement() {
var criteria = {};
criteria[prefix + key] = {$exists : false};
var update = {};
update[parent] = entry;
var result = coll.update(criteria, {$push : update}, {upsert : true});
// if exists, try to increment the count
if (result.upserted == 0 && result.nModified == 0) {
incrementOnly();
}
}
// run entry
incrementOnly();
}
// test
var c = db.c;
c.drop();
tryAndFine(c, '20141103', 1);
tryAndFine(c, '20141103', 1);
tryAndFine(c, '20141104', 1);
tryAndFine(c, '20141105', 1);
tryAndFine(c, '20141104', 1);
// output
{
"_id" : ObjectId("54577e1a3502852bd4ad2395"),
"data" : [ {
"20141103" : 2
}, {
"20141104" : 2
}, {
"20141105" : 1
} ]
}

SQL to MapReduce: where clause with a select statement

How could i translate an sql query to mongodb map reduce when the where clause constains a select statement?
For example
select
sum(l_extendedprice) / 7.0 as avg_yearly
from
lineitem,
part
where
p_partkey = l_partkey
and p_brand = 'Brand#23'
and p_container = 'MED BOX'
and l_quantity < (
select
0.2 * avg(l_quantity)
from
lineitem
where
l_partkey = p_partkey
);
I've tried this mapreduce, but seemingly it did not work.
db.runCommand({
mapreduce: "lineitem",
query: {
"partsupp.ps_partkey.p_brand": "Brand#23",
"partsupp.ps_partkey.p_container": "MED BOX"
},
map: function() {
var data = {l_extendedprice: 0, l_quantity:0, total_l_quantity: 0 };
data.l_extendedprice = this.l_extendedprice;
data.l_quantity = this.l_quantity;
data.total_l_quantity = 1;
emit("avg_yearly", data);
},
reduce: function(key, values) {
var data = {l_extendedprice: 0, l_quantity:0, total_l_quantity: 0 };
var sum_l_quantity = 0;
/*sum the l_quantity and total_l_quantity*/
for (var i = 0; i < values.length; i++) {
sum_l_quantity += values[i].l_quantity;
data.total_l_quantity += values[i].total_l_quantity;
}
/*calculate the average l_quantity and multiply*/
var avg_l_quantity = 0.2 * (sum_l_quantity / data.total_l_quantity);
/*sum l_extendedprice and divide */
for (var i = 0; i < values.length; i++) {
if( values[i].l_quantity < avg_l_quantity ) {
data.l_extendedprice += values[i].l_extendedprice;
}
}
data.l_extendedprice = data.l_extendedprice / 7.0;
return data;
},
out: 'query017'
});
Has another way to do it? Is it possible to do it in the query clause of mapreduce? Or i only had mistaked in my code?
The schema
{
"_id" : ObjectId("511b7d1b3daee1b1446ecdfe"),
"l_quantity" : 17,
"l_extendedprice" : 21168.23,
"l_discount" : 0.04,
"l_shipdate" : ISODate("1996-03-13T03:00:00Z"),
"l_commitdate" : ISODate("1996-02-12T03:00:00Z"),
"l_receiptdate" : ISODate("1996-03-22T03:00:00Z"),
"l_shipmode" : "TRUCK",
"l_comment" : "blithely regular ideas caj",
"partsupp" : {
"ps_availqty" : 6157,
"ps_supplycost" : 719.17,
"ps_partkey" : {
"p_partkey" : NumberLong(155190),
"p_name" : "slate lavender tan lime lawn",
"p_mfgr" : "Manufacturer#4",
"p_brand" : "Brand#44",
"p_type" : "PROMO BRUSHED NICKEL",
"p_size" : 9,
"p_container" : "JUMBO JAR",
"p_retailprice" : 1245.19,
"p_comment" : "regular, final dol"
}
}
}

Count frequency of an year

I'm trying to count the frequency of an year with map/reduce, here is the code
map = %Q{
function(){
emit({}, {
year: this.birthdate.getFullYear(),
count: 1
})
}
}
reduce = %Q{
function(key, values){
var agg = { }
values.forEach( function(value){
if(agg[value.year]){
agg[value.year] += value.count
} else {
agg[key] = value.count
}
})
return agg
}
}
User.map_reduce(map, reduce).out(inline: true)
and it returns
=> [{"_id"=>{}, "value"=>{"2004"=>2.0, "2002"=>1.0, "2005"=>1.0}}]
But I have a lot of years in my database, and this is just tracking 3 of then. How can I do this?
Now its working... here is the code
map = %Q{
function(){
var today = new Date()
var age = today.getFullYear() - this.birthdate.getFullYear()
var m = today.getMonth() - this.birthdate.getMonth()
if (m < 0 || (m === 0 && today.getDate() < this.birthdate.getDate()))
age--
emit(age, { count: 1 })
}
}
reduce = %Q{
function(key, values){
var sum = 0
values.forEach( function(value){
sum += value.count
})
return { count: sum }
}
}
scoped.map_reduce(map, reduce).out(inline: true)

mongodb map emit keys from subdocument

I have a document which includes a subdocument:
{
"_id" : ObjectId("XXXXX"),
"SearchKey" : "1234",
"SearchTerms" : {
"STA" : ["1"],
"STB" : ["asdfasdf"],
"STC" : ["another"]
}
}
The SearchTerm elements are not fixed - sometimes we'll have STA without STC, for example.
I can do this:
var map = function() {
for (key in this.SearchTerms)
{
emit(key, 1);
}
}
but I can't do this:
var map = function() {
for (var i=0; i< this.SearchTerms.length; i++)
{
emit(this.SearchTerms[i], 1)
}
}
because the latter doesn't produce any results after the reduce. Why not?
As an aside - what I need to do is count the cross-product of the search terms over all documents, that is, find the incidence of (STA and STB) and (STA and STC) and (STB and STC) in the case above. If someone knows how to do that right away, that works even better.
As always, thanks for the help
The key that you emit should be a composite of both keys.
var map = function() {
if(!this.SearchTerms) return;
for(var i = 0 ; i < this.SearchTerms.length; i++){
var outerKey = this.SearchTerms[i];
for(var j = i + 1; j < this.SearchTerms.length; j++){
var innerKey = this.SearchTerms[j];
// assuming you don't care about counting (STA and STC) separately from (STC and STA),
// then order doesn't matter, lets make sure both occurences have the same key.
var compositeKey = (outerKey < innerKey) ? outerKey+"_"+innerKey : innerKey+"_"+outerKey;
emit(compositeKey, 1);
}
}
}
This is because this.SearchTerms is a dictionary/subdocument and not an array. this.SearchTerms[0] doesn't exist.
For the second question: something like this should work:
for (key1 in this.SearchTerms)
{
for (key2 in this.SearchTerms)
{
if (key1 < key2)
{
key = [key1, key2];
emit(key, 1);
}
}
}