MongoDB Map-Reduce combine document with dynamic schema - mongodb

I'm trying what I think should be a simple map reduce, but am having trouble because I can't find a reference of how to write the server side javascript.
Given two documents:
{
"_id" : ObjectId("530c8b58d95cd926144055d9"),
"atomic" : "p",
"doc" : {
"d1" : "t"
},
"array" : ["e"]
},
{
"_id" : ObjectId("530c8b71d95cd926144055da"),
"atomic" : "p",
"doc" : {
"d2" : "r"
},
"array" : ["f"]
}
I would like the result to be
{
"_id" : "p",
"value" : {
"doc" : {
"d1" : "t",
"d2" : "r"
},
"array" : ["e", "f"]
}
}
The map function is:
function () {
emit(
this.atomic,
{doc: this.doc, array: this.array}
);
}
The incorrect reduce function is:
function (key, values) {
var reduced = {doc:{}, array:[]};
values.forEach(function(val){
for(var i = 0; i < val.array.length; i++)
reduced.array.push(val.array[i]);
val.doc.forEach(function(kvp){reduced.doc.add(kvp.key, kvp.value);});
});
return reduced;
}
The part with the array is fine, it is trying to combine the documents that is messing up (i.e. not executing due to missing function). I've tried all permutations I can think off -- if I add the val.doc to an array then they all show up, it's just that I can't figure out how to merge it into a single document.
The fields in the doc will be dynamic so there is no way to reference it by name.
Any help would be appreciated.

Not sure the reduced.doc.add bit will work.
Maybe try:
function (key, values) {
var reduced = {doc:{}, array:[]};
values.forEach(function(val){
for(var i = 0; i < val.array.length; i++)
reduced.array.push(val.array[i]);
for (kvp in val.doc){
reduced.doc[kvp]=val.doc[kvp];
}
});
return reduced;
}

Related

How to query MongoDb documents using the indices of embedded arrays

I am trying to learn how to use mongo queries to reach deep into a data tree. Specifically, I'm trying to remove the object below {"object": 'to remove'}
{
"_id" : ObjectId("7840f22736341b09154f7ebf"),
"username" : "nmay",
"fname" : "Nate",
"lname" : "May",
"data" : [
{
"monthNum" : 1,
"year" : 2016,
"days" : [
{
"date" : "2016-01-01T06:00:00.000Z",
"type1" : [],
"type2" : []
},
{
"date" : "2016-01-02T06:00:00.000Z",
"type1" : [
{"object": 'to remove'}
],
"type2" : []
}
]
}
]
}
so far I know how to query for the user _id, but I'm not sure how to remove the desired object using the indices in each array. In this example I want to remove data[0].days[1].type1[0]
Here is the query that I have so far:
app.delete('/user/:id/data/:monthIndex/days/:dayIndex/type1/:type1Index', function (req, res, next) {
var monthIndex = parseInt(req.params.monthIndex); // these console the value properly
var dayIndex = parseInt(req.params.dayIndex); // -1 is applied to the parameter to translate to array position
var type1Index = parseInt(req.params.type1Index);
db.users.update(
{ _id: mongojs.ObjectId(req.params.id) },
{ $pull: data.monthIndex.days.dayIndex.type1.type1Index }
);
}
It gives me the error
ReferenceError: data is not defined
Can someone demonstrate how I can pass this query my index parameters to remove the desired object?
Unfortunately, there is no way to remove an array element by its numerical index with a single operation in MongoDB. In order to do this, you need to unset desired element(s) first, and remove the resulting null-valued fields afterwards.
Your code should look something like this:
db.users.update(
{ _id : mongojs.ObjectId(req.params.id) },
{ $unset : { 'data.0.days.1.type1.0' : 1 } }
);
db.users.update(
{ _id : mongojs.ObjectId(req.params.id) },
{ $pull : { 'data.0.days.1.type1' : null } }
);
Edit by #bob: to pass in the parameters you have to build the query string, which is ugly:
var unset = {};
unset['$unset'] = {};
unset.$unset['data.' + req.params.monthIndex + '.days.' + req.params.dayIndex + '.foods.' + req.params.foodIndex] = 1;
db.users.update( { _id : mongojs.ObjectId(req.params.id) }, unset );
var pull = {};
pull['$pull'] = {};
pull.$pull['data.' + req.params.monthIndex + '.days.' + req.params.dayIndex + '.foods'] = null;
db.users.update( { _id : mongojs.ObjectId(req.params.id) }, pull );

MongoDB query to return documents that only have keys amongst a predefined set

The MongoDB query language allows filtering documents based on the existence or absence of a given field with the $exists operator.
Is there a way, with the MongoDB syntax, and given a set K of allowed fields, to exclude documents that have fields not in K from the results, but:
not knowing in advance which extra fields (outside K) can be encountered
not using JavaScript, that is, the $where operator?
Example:
{
"Some field" : "foo"
}
{
"Some field" : "bar",
"Some other field" : "foobar"
}
With the set K = [ "Some field" ], only the first document is to be returned.
Note how this is not to be confused with a projection, which would return both documents but removing the extra field.
I'm not sure if MongoDB do support such kind of operations out of box but you can achieve so with help of mapReduce.
Assuming your sample data set;
// Variable for map
var map = function () {
var isAcceptable = true;
Object.keys(this).forEach(function (key) {
if (key != "_id" && white_list.indexOf(key) == -1) {
isAcceptable = false;
}
});
if (isAcceptable == true) {
emit(1, this);
}
};
// Variable for reduce
var reduce = function (key, values) {
return values;
};
db.collection.mapReduce(
map,
reduce,
{
scope: {"white_list": ["Some field"]},
out: {"inline": 1}
}
);
Will return:
{
"results" : [
{
"_id" : 1,
"value" : {
"_id" : ObjectId("57cd7503e55de957c62fb9c8"),
"Some field" : "foo"
}
}
],
"timeMillis" : 13,
"counts" : {
"input" : 2,
"emit" : 1,
"reduce" : 0,
"output" : 1
},
"ok" : 1
}
Desired result will be in results.values of returned document. However, keep in mind limitation of MongoDB mapReduce and maximum size of BSON document.
Given a set of known fields K, you can construct a query that takes the set as input and gives a query with the $exists operator along with the corresponding fields projection. Using an example, suppose you have the following documents in a test collection
db.test.insert({ "fieldX": "foo", "fieldY": "bar", "fieldZ": 1 })
db.test.insert({ "fieldX": "123", "fieldY": "bar", "fieldZ": 2 })
db.test.insert({ "fieldY": "abc", "fieldZ": 3 })
db.test.insert({ "fieldX": "xyz", "fieldZ": 4 })
db.test.insert({ "fieldZ": 5 })
Then you can construct a query Q and a projection P from an input set K as follows:
var K = [ "fieldX", "fieldZ" ];
var or = K.map(function(field) {
var obj = {};
obj[field] = { "$exists": true };
return obj;
});
var P = K.reduce(function(doc, field) {
doc[field] = 1;
return doc;
}, {} );
var Q = { "$or": or };
db.test.find(Q, P);
Sample Output:
/* 1 */
{
"_id" : ObjectId("57cd78322c241f5870c82b7d"),
"fieldX" : "foo",
"fieldZ" : 1
}
/* 2 */
{
"_id" : ObjectId("57cd78332c241f5870c82b7e"),
"fieldX" : "123",
"fieldZ" : 2
}
/* 3 */
{
"_id" : ObjectId("57cd78332c241f5870c82b7f"),
"fieldZ" : 3
}
/* 4 */
{
"_id" : ObjectId("57cd78332c241f5870c82b80"),
"fieldX" : "xyz",
"fieldZ" : 4
}
/* 5 */
{
"_id" : ObjectId("57cd78332c241f5870c82b81"),
"fieldZ" : 5
}

Is it possible to retrieve a 'time span' from a MongoDB query, using the timestamp within an ObjectId?

We have a basic enquiry management tool that we're using to track some website enquiries in our administration suite, and we're using the ObjectId of each document in our enquiries collection to sort the enquiries by the date they were added.
{
"_id" : ObjectId("53a007db144ff47be1000003"),
"comments" : "This is a test enquiry. Please ignore. We'll delete it shortly.",
"customer" : {
"name" : "Test Enquiry",
"email" : "test#test.com",
"telephone" : "07890123456",
"mobile" : "07890123456",
"quote" : false,
"valuation" : false
},
"site" : [],
"test" : true,
"updates" : [
{
"_id" : ObjectId("53a007db144ff47be1000001"),
"status" : "New",
"status_id" : ObjectId("537de7c3a5e6e668ffc2335c"),
"status_index" : 100,
"substatus" : "New Web Enquiry",
"substatus_id" : ObjectId("5396bb9fa5e6e668ffc23388"),
"notes" : "New enquiry received from website.",
},
{
"_id" : ObjectId("53a80c977d299cfe91bacf81"),
"status" : "New",
"status_id" : ObjectId("537de7c3a5e6e668ffc2335c"),
"status_index" : 100,
"substatus" : "Attempted Contact",
"substatus_id" : ObjectId("53a80e06a5e6e668ffc2339e"),
"notes" : "In this test, we pretend that we've not managed to get hold of the customer on the first attempt.",
},
{
"_id" : ObjectId("53a80e539b966b8da5c40c36"),
"status" : "Approved",
"status_id" : ObjectId("52e77a49d85e95f00ebf6c72"),
"status_index" : 200,
"substatus" : "Enquiry Confirmed",
"substatus_id" : ObjectId("53901f1ba5e6e668ffc23372"),
"notes" : "In this test, we pretend that we've got hold of the customer after failing to contact them on the first attempt.",
}
]
}
Within each enquiry is an updates array of objects which also have an ObjectId as their main identity field. We're using an $unwind and $group aggregation to pull the first and latest updates, as well as the count of updates, making sure we only take enquiries where there have been more than one update (as one is automatically inserted when the enquiry is made):
db.enquiries.aggregate([
{
$match: {
"test": true
}
},
{
$unwind: "$updates"
},
{
$group: {
"_id": "$_id",
"latest_update_id": {
$last: "$updates._id"
},
"first_update_id": {
$first: "$updates._id"
},
"update_count": {
$sum: 1
}
}
},
{
$match: {
"update_count": {
$gt: 1
}
}
}
])
This results in the following output:
{
"result" : [
{
"_id" : ObjectId("53a295ad122ea80200000005"),
"latest_update_id" : ObjectId("53a80bdc7d299cfe91bacf7e"),
"first_update_id" : ObjectId("53a295ad122ea80200000003"),
"update_count" : 2
},
{
"_id" : ObjectId("53a007db144ff47be1000003"),
"latest_update_id" : ObjectId("53a80e539b966b8da5c40c36"),
"first_update_id" : ObjectId("53a007db144ff47be1000001"),
"update_count" : 3
}
],
"ok" : 1
}
This is then passed through to our code (node.js, in this case) where we perform a few operations on it and then present some information on our dashboard.
Ideally, I'd like to add another $group pipeline aggregation to the query which would subtract the timestamp of first_update_id from the timestamp of latest_update_id to give us a timespan, which we could then use $avg on.
Can anyone tell me if this is possible? (Thank you!)
As Neil already pointed out, you can't get to the timestamp from the ObjectId in the aggregation framework.
You said that speed is not important, so using MapReduce you can get what you want:
var map = function() {
if (this.updates.length > 1) {
var first = this.updates[0];
var last = this.updates[this.updates.length - 1];
var diff = last._id.getTimestamp() - first._id.getTimestamp();
var val = {
latest_update_id : last._id,
first_update_id : first._id,
update_count : this.updates.length,
diff: diff
}
emit(this._id, val);
}
};
var reduce = function() { };
db.runCommand(
{
mapReduce: "enquiries",
map: map,
reduce: reduce,
out: "mrresults",
query: { test : true}
}
);
This are the results:
{
"_id" : ObjectId("53a007db144ff47be1000003"),
"value" : {
"latest_update_id" : ObjectId("53a80e539b966b8da5c40c36"),
"first_update_id" : ObjectId("53a007db144ff47be1000001"),
"update_count" : 3,
"diff" : 525944000
}
}
Edit:
If you want to get the average diff for all documents you can do it like this:
var map = function() {
if (this.updates.length > 1) {
var first = this.updates[0];
var last = this.updates[this.updates.length - 1];
var diff = last._id.getTimestamp() - first._id.getTimestamp();
emit("1", {diff : diff});
}
};
var reduce = function(key, values) {
var reducedVal = { count: 0, sum: 0 };
for (var idx = 0; idx < values.length; idx++) {
reducedVal.count += 1;
reducedVal.sum += values[idx].diff;
}
return reducedVal;
};
var finalize = function (key, reducedVal) {
reducedVal.avg = reducedVal.sum/reducedVal.count;
return reducedVal;
};
db.runCommand(
{
mapReduce: "y",
map: map,
reduce: reduce,
finalize : finalize,
out: "mrtest",
query: { test : true}
}
);
And the example output:
> db.mrtest.find().pretty()
{
"_id" : "1",
"value" : {
"count" : 2,
"sum" : 1051888000,
"avg" : 525944000
}
}

Mongodb Is it possible to aggregate an object?

I am trying to aggregate the total sum of packets in this document.
{
"_id" : ObjectId("51a6cd102769c63e65061bda"),
"capture" : "1369885967",
"packets" : {
"0" : "595",
"1" : "596",
"2" : "595",
"3" : "595",
...
}
}
The closest I can get is about
db.collection.aggregate({ $match: { capture : "1369885967" } }, {$group: { _id:null, sum: {$sum:"$packets"}}});
However it returns sum 0, which is obviously wrong.
{ "result" : [ { "_id" : null, "sum" : 0 } ], "ok" : 1 }
How do I get the sum of all the packets?
Since you have the values in an object instead of an array, you'll need to use mapReduce.
// Emit the values as integers
var mapFunction =
function() {
for (key in this.packets) {
emit(null, parseInt(this.packets[key]));
}
}
// Reduce to a simple sum
var reduceFunction =
function(key, values) {
return Array.sum(values);
}
> db.collection.mapReduce(mapFunction, reduceFunction, {out: {inline:1}})
{
"results" : [
{
"_id" : null,
"value" : 2381
}
],
"ok" : 1,
}
If at all possible, you should emit the values as an array of a numeric type instead since that gives you more options (ie aggregation) and (unless the data set is large) probably performance benefits.
If you don't know how many keys are in the packet subdocument and since you also seem to be storing counts as strings (why???) you will have to use mapReduce.
Something like:
m=function() {
for (f in "this.packets") {
emit(null, +this.packets[f]);
};
r=function(k, vals) {
int sum=0;
vals.forEach(function(v) { sum+=v; } );
return sum;
}
db.collection.mapreduce(m, r, {out:{inline:1}, query:{your query condition here}});

Get all objects for each tag

I'm new in mongodbs mapreduce and for sure I have not completely understood it for now. And I have a problem, which I try to solve for few days without success.
I have a collection of let's say posts with a tags field. Now I want to mapreduce a new collection of tags. Where every tag have an array of all posts ids that have this one particular tag assigned.
one of my attempts to do this (which doesn't do this right)
m = function() {
for (var i in this.tags) {
emit(this.tags[i], {"ids" : [this._id]});
};
}
r = function(key, emits) {
var total = {ids : []}
for (var i in emits) {
emits[i].ids.forEach(function(id) {
total.ids.push(id);
}
}
return total;
};
I know, that I have to pivot the date some how around, but I just cant get my head wrapped around it.
I think you're missing a ")" in your reduce function to close the emits[i].ids.forEach(). Is this what you're trying to do?
r = function (key, values) {
var total = {ids:[]};
for (var i in values) {
values[i].ids.forEach(
function (id){
total.ids.push(id);
}
);
}
return total;
}
input
{_id:2, tags: ["dog", "Jenna"]}
{_id:1, tags: ["cat", "Jenna"]}
result:
{"results" : [
{"_id" : "Jenna",
"value" : {"ids" : [2,1]}
},
{"_id" : "cat",
"value" : {"ids" : [1]}
},
{"_id" : "dog",
"value" : {"ids" : [2]}
}
],
"timeMillis" : 1,
"counts" : {
"input" : 2,
"emit" : 4,
"reduce" : 1,
"output" : 3
},
"ok" : 1,
}