I have 400,000 documents in my MongoDB collection. Every document has a count (a number). I want to read these documents and add up all the numbers to get a total. I fetch the collection data using Node.js and Mongoose, then calculate the total using a for loop. It takes around two minutes; I want it to take about one second. Is there any way to speed up this process? I found that mapReduce can speed this up. What is the most efficient way to speed up this process?
I query the MongoDB model like this:
/**
 * Loads every document of DailyDowloadsModel and, when at least one is found,
 * hands the result to processPD(processDW, responseObject, responseMDW).
 *
 * NOTE(review): `log`, `clientIP` and `res` are not declared in this function;
 * presumably they come from an enclosing scope — verify before relying on the
 * error path.
 */
exports.getDownloads = function(processPD,processDW,responseMDW) {
// find({}) has no filter, so the callback receives the whole collection.
DailyDowloadsModel.find({},function(err,foundData){
// `select` is never assigned in this block, so both `select == 'count'` branches below are never taken.
var select;
if (err) {
log.error(clientIP +" - DB Connection downloads failed - error");
res.status(500).send();
}
else {
if(foundData.length == 0){
// Empty result: responseObject is prepared but processPD is not called on this path.
var responseObject = null;
if(select && select == 'count'){
responseObject = {count: 0};
}
}else {
var responseObject = foundData;
if (select && select == "count") {
responseObject = {count: foundData.length};
}
processPD(processDW,responseObject,responseMDW);
}
}
});
}
sample Document
{
"_id" : ObjectId("5719ef37264f87331a3d0c54"),
"refunds" : "0",
"downloads" : "6",
"country" : "CA",
"date" : "2013-09-06",
"product_id" : "20600001319328",
"__v" : 0
}
I want to calculate total downloads.
You have two options to calculate total.
OPTION 1: aggregation framework
Performing such actions using aggregation framework will be much faster as compared to sending all documents to client and doing math over there.
Note: your downloads field is string, it should be a number.
// Put every document into a single group (_id: null) and sum "downloads" on the server.
// "downloads" must be stored as a number (not a string) for the sum to be meaningful.
db.collection.aggregate([
{$group:{_id:null, total:{$sum:"$downloads"}}}
])
On my Machine (Macbook Pro), it's returning total in under half a second. I'm running test on 400000 documents.
OPTION 2: map reduce
Though it is highly recommended to update your document structure to accept downloads as number. However, if this is not an option for whatever reason, your best bet is map reduce functionality offered by MongoDB.
// Map step: emit every document's download count under the same key (1)
// so that the reducer produces one grand total.
var map = function(){
  // "downloads" is stored as a string. Always pass the radix so values such as
  // "010" or "0x1A" are read as decimal and never as octal/hexadecimal.
  emit(1, parseInt(this.downloads, 10));
};
// Reduce step: collapse all values emitted for a key into their sum.
var reduce = function(key, values){
  return Array.sum(values);
};
// Run the job with the map and reduce functions defined above.
// out: inline keeps the result in memory instead of writing it to a collection.
db.collection.mapReduce(map, reduce, {
out: { "inline" : 1}
});
Map reduce is slower than the aggregation framework, as you can see below, but much faster than your original approach. It emits output like this:
{
"results" : [
{
"_id" : NumberInt(1),
"value" : NumberInt(2400000)
}
],
"timeMillis" : NumberInt(4112),
"counts" : {
"input" : NumberInt(400000),
"emit" : NumberInt(400000),
"reduce" : NumberInt(4000),
"output" : NumberInt(1)
},
"ok" : NumberInt(1)
}
As you can notice, it took roughly 4 seconds to complete operation.
Use Mongo DB aggregation
db.DailyDowloadsModel.aggregate([{$group:{_id:null, totalDownloads:{$sum:"$downloads"}}}]);
But before that, index the downloads field with this command in MongoDB:
db.DailyDowloadsModel.createIndex( { downloads: 1 });
Related
I have a MongoDB database storing float arrays. Assume a collection of documents in the following format:
{
"id" : 0,
"vals" : [ 0.8, 0.2, 0.5 ]
}
Having a query array, e.g., with values [ 0.1, 0.3, 0.4 ], I would like to compute for all elements in the collection a distance (e.g., sum of differences; for the given document and query it would be computed by abs(0.8 - 0.1) + abs(0.2 - 0.3) + abs(0.5 - 0.4) = 0.9).
I tried to use the aggregation function of MongoDB to achieve this, but I can't work out how to iterate over the array. (I am not using the built-in geo operations of MongoDB, as the arrays can be rather long)
I also need to sort the results and limit to the top 100, so calculation after reading the data is not desired.
Current Processing is mapReduce
If you need to execute this on the server and sort the top results and just keep the top 100, then you could use mapReduce for this like so:
db.test.mapReduce(
  // Map: compute the sum of absolute differences between this document's
  // "vals" and the query array, and emit it wrapped in a one-element array
  // under a single shared key (null) so everything reaches the same reducer.
  function() {
    var input = [0.1,0.3,0.4];
    var value = Array.sum(this.vals.map(function(el,idx) {
      return Math.abs( el - input[idx] );
    }));
    emit(null,{ "output": [{ "_id": this._id, "value": value }]});
  },
  // Reduce: merge the emitted arrays, sort by value (largest first) and keep
  // the top 100. The return value has the same shape as an emitted value, so
  // it can be fed back into a later reduce pass.
  function(key,values) {
    var output = [];
    values.forEach(function(value) {
      value.output.forEach(function(item) {
        output.push(item);
      });
    });
    // A sort comparator must return a negative, zero or positive number;
    // returning a boolean gives an engine-dependent, unreliable order.
    output.sort(function(a,b) {
      return b.value - a.value;
    });
    return { "output": output.slice(0,100) };
  },
  { "out": { "inline": 1 } }
)
So the mapper function does the calculation and outputs everything under the same key so all results are sent to the reducer. The end output is going to be contained in an array in a single output document, so it is important both that all results are emitted with the same key value and that the output of each emit is itself an array so mapReduce can work properly.
The sorting and reduction is done in the reducer itself: as each emitted document is inspected, the elements are put into a single temporary array, sorted, and the top results are returned.
That is important, and it is exactly the reason why the emitter produces this as an array even if it holds only a single element at first. MapReduce works by processing results in "chunks", so even if all emitted documents have the same key, they are not all processed at once. Rather, the reducer puts its results back into the queue of emitted results to be reduced until there is only a single document left for that particular key.
I'm restricting the "slice" output here to 10 for brevity of listing, and including the stats to make a point, as the 100 reduce cycles called on this 10000 sample can be seen:
{
"results" : [
{
"_id" : null,
"value" : {
"output" : [
{
"_id" : ObjectId("56558d93138303848b496cd4"),
"value" : 2.2
},
{
"_id" : ObjectId("56558d96138303848b49906e"),
"value" : 2.2
},
{
"_id" : ObjectId("56558d93138303848b496d9a"),
"value" : 2.1
},
{
"_id" : ObjectId("56558d93138303848b496ef2"),
"value" : 2.1
},
{
"_id" : ObjectId("56558d94138303848b497861"),
"value" : 2.1
},
{
"_id" : ObjectId("56558d94138303848b497b58"),
"value" : 2.1
},
{
"_id" : ObjectId("56558d94138303848b497ba5"),
"value" : 2.1
},
{
"_id" : ObjectId("56558d94138303848b497c43"),
"value" : 2.1
},
{
"_id" : ObjectId("56558d95138303848b49842b"),
"value" : 2.1
},
{
"_id" : ObjectId("56558d96138303848b498db4"),
"value" : 2.1
}
]
}
}
],
"timeMillis" : 1758,
"counts" : {
"input" : 10000,
"emit" : 10000,
"reduce" : 100,
"output" : 1
},
"ok" : 1
}
So this is a single document output, in the specific mapReduce format, where the "value" contains an element which is an array of the sorted and limited result.
Future Processing is Aggregate
As of writing, the current latest stable release of MongoDB is 3.0, and this lacks the functionality to make your operation possible. But the upcoming 3.2 release introduces new operators that make this possible:
db.test.aggregate([
// One output document per array element; "index" holds the element's position in "vals".
{ "$unwind": { "path": "$vals", "includeArrayIndex": "index" }},
// Regroup per original document, summing |vals[i] - query[i]| over all elements.
{ "$group": {
"_id": "$_id",
"result": {
"$sum": {
"$abs": {
"$subtract": [
"$vals",
// Look up the query value at the same index; $literal wraps the query array.
{ "$arrayElemAt": [ { "$literal": [0.1,0.3,0.4] }, "$index" ] }
]
}
}
}
}},
// Largest result first, then keep only the top 100.
{ "$sort": { "result": -1 } },
{ "$limit": 100 }
])
Also limiting to the same 10 results for brevity, you get output like this:
{ "_id" : ObjectId("56558d96138303848b49906e"), "result" : 2.2 }
{ "_id" : ObjectId("56558d93138303848b496cd4"), "result" : 2.2 }
{ "_id" : ObjectId("56558d96138303848b498e31"), "result" : 2.1 }
{ "_id" : ObjectId("56558d94138303848b497c43"), "result" : 2.1 }
{ "_id" : ObjectId("56558d94138303848b497861"), "result" : 2.1 }
{ "_id" : ObjectId("56558d96138303848b499037"), "result" : 2.1 }
{ "_id" : ObjectId("56558d96138303848b498db4"), "result" : 2.1 }
{ "_id" : ObjectId("56558d93138303848b496ef2"), "result" : 2.1 }
{ "_id" : ObjectId("56558d93138303848b496d9a"), "result" : 2.1 }
{ "_id" : ObjectId("56558d96138303848b499182"), "result" : 2.1 }
This is made possible largely due to $unwind being modified to project a field in results that contains the array index, and also due to $arrayElemAt which is a new operator that can extract an array element as a singular value from a provided index.
This allows the "look-up" of values by index position from your input array in order to apply the math to each element. The input array is facilitated by the existing $literal operator so $arrayElemAt does not complain and recognizes it as an array (this seems to be a small bug at present, as other array functions don't have the problem with direct input), and gets the appropriate matching index value by using the "index" field produced by $unwind for comparison.
The math is done by $subtract and of course another new operator in $abs to meet your functionality. Also since it was necessary to unwind the array in the first place, all of this is done inside a $group stage accumulating all array members per document and applying the addition of entries via the $sum accumulator.
Finally all result documents are processed with $sort and then the $limit is applied to just return the top results.
Summary
Even with the new functionality about to be available to the aggregation framework for MongoDB, it is debatable which approach is actually more efficient for results. This is largely due to there still being a need to $unwind the array content, which effectively produces a copy of each document per array member in the pipeline to be processed, and that generally causes an overhead.
So whilst mapReduce is the only present way to do this until a new release, it may actually outperform the aggregation statement depending on the amount of data to be processed, and despite the fact that the aggregation framework works on native coded operators rather than translated JavaScript operations.
As with all things, testing is always recommended to see which case suits your purposes better and which gives the best performance for your expected processing.
Sample
Of course the expected result for the sample document provided in the question is 0.9 by the math applied. But just for my testing purposes, here is a short listing used to generate some sample data that I wanted to at least verify the mapReduce code was working as it should:
// Generate 10000 sample documents, each with three random values rounded to
// one decimal place, and flush them to the server in batches of 1000.
var bulk = db.test.initializeUnorderedBulkOp();
var x = 10000;
while ( x-- ) {
  var vals = [0,0,0].map(function() {
    // Scale to 0..10, round to an integer, scale back to 0..1 in 0.1 steps.
    return Math.round(Math.random()*10)/10;
  });
  bulk.insert({ "vals": vals });
  if ( x % 1000 === 0 ) {
    bulk.execute();
    // A bulk object cannot be reused after execute(), so start a fresh one.
    bulk = db.test.initializeUnorderedBulkOp();
  }
}
The arrays are totally random single decimal point values, so there is not a lot of distribution in the listed results I gave as sample output.
I have a data structure like this:
We have some centers. A center has some switches. A switch has some ports.
{
"_id" : ObjectId("561ad881755a021904c00fb5"),
"Name" : "center1",
"Switches" : [
{
"Ports" : [
{
"PortNumber" : 2,
"Status" : "Empty"
},
{
"PortNumber" : 5,
"Status" : "Used"
},
{
"PortNumber" : 7,
"Status" : "Used"
}
]
}
]
}
All I want is to write an Update query to change the Status of the port that it's PortNumber is 5 to "Empty".
I can update it when I know the array index of the port (here array index is 1) with this query:
db.colection.update(
// query
{
_id: ObjectId("561ad881755a021904c00fb5")
},
// update
{
$set : { "Switches.0.Ports.1.Status" : "Empty" }
}
);
But I don't know the array index of that Port.
Thanks for help.
You would normally do this using the positional operator $, as described in the answer to this question:
Update field in exact element array in MongoDB
Unfortunately, right now the positional operator only supports one array level deep of matching.
There is a JIRA ticket for the sort of behavior that you want: https://jira.mongodb.org/browse/SERVER-831
In case you can make Switches into an object instead, you could do something like this:
db.colection.update(
{
_id: ObjectId("561ad881755a021904c00fb5"),
"Switch.Ports.PortNumber": 5
},
{
$set: {
"Switch.Ports.$.Status": "Empty"
}
}
)
Since you don't know the array index of the port, I would suggest creating the $set conditions dynamically, i.e. first finding the indexes of the matching objects and then building the update accordingly; MapReduce can be used for this.
Currently this seems to be not possible using the aggregation framework. There is an unresolved open JIRA issue linked to it. However, a workaround is possible with MapReduce. The basic idea with MapReduce is that it uses JavaScript as its query language but this tends to be fairly slower than the aggregation framework and should not be used for real-time data analysis.
In your MapReduce operation, you need to define a couple of steps i.e. the mapping step (which maps an operation into every document in the collection, and the operation can either do nothing or emit some object with keys and projected values) and reducing step (which takes the list of emitted values and reduces it to a single element).
For the map step, you ideally would want to get for every document in the collection, the index for each Switches and Ports array fields and another key that contains the $set keys.
Your reduce step would be a function (which does nothing) simply defined as var reduce = function() {};
The final step in your MapReduce operation will then create a separate collection Switches that contains the emitted Switches array object along with a field with the $set conditions. This collection can be updated periodically when you run the MapReduce operation on the original collection.
Altogether, this MapReduce method would look like:
// Map step: for every port of every switch in the document, emit one record
// keyed by (document _id, switch index, port index). The value carries the
// switch, the port and the dotted field paths needed to $set that exact port.
var map = function(){
  var center = this;
  center.Switches.forEach(function(sw, switchIdx){
    sw.Ports.forEach(function(port, portIdx){
      var portPath = "Switches." + switchIdx.toString() + ".Ports." + portIdx.toString();
      var key = {
        "_id": center._id,
        "switch_index": switchIdx,
        "port_index": portIdx
      };
      var value = {
        "index": portIdx,
        "Switches": sw,
        "Port": port,
        "update": {
          "PortNumber": portPath + ".PortNumber",
          "Status": portPath + ".Status"
        }
      };
      emit(key, value);
    });
  });
};
// Each emitted key (_id + switch_index + port_index) identifies a single port,
// so there is nothing to combine and the reducer is intentionally empty.
var reduce = function(){};
// Write the result to the "switches" collection instead of returning it inline.
db.centers.mapReduce(
map,
reduce,
{
"out": {
"replace": "switches"
}
}
);
Querying the output collection Switches from the MapReduce operation will typically give you the result:
db.switches.findOne()
Sample Output:
{
"_id" : {
"_id" : ObjectId("561ad881755a021904c00fb5"),
"switch_index" : 0,
"port_index" : 1
},
"value" : {
"index" : 1,
"Switches" : {
"Ports" : [
{
"PortNumber" : 2,
"Status" : "Empty"
},
{
"PortNumber" : 5,
"Status" : "Used"
},
{
"PortNumber" : 7,
"Status" : "Used"
}
]
},
"Port" : {
"PortNumber" : 5,
"Status" : "Used"
},
"update" : {
"PortNumber" : "Switches.0.Ports.1.PortNumber",
"Status" : "Switches.0.Ports.1.Status"
}
}
}
You can then use the cursor from the db.switches.find() method to iterate over and update your collection accordingly:
// Walk the mapReduce output for port number 5 and set the new status on the
// original "centers" documents, using the precomputed dotted path of each port.
var newStatus = "Empty";
var cur = db.switches.find({ "value.Port.PortNumber": 5 });
while (cur.hasNext()) {
  var doc = cur.next();
  // Build { "Switches.<i>.Ports.<j>.Status": newStatus } for this port.
  var setFields = {};
  setFields[doc.value.update.Status] = newStatus;
  db.centers.update(
    { "_id": doc._id._id, "Switches.Ports.PortNumber": 5 },
    { "$set": setFields }
  );
}
Problem
I have a document with a _id and a Collection of Answers I am trying to write a map-reduce function to sum the total score of answers for each id.
Document
/* 0 */
{
"_id" : ObjectId("527b6ba88d251d58a18f3f0a"),
"Answers" : [{
"Score" : 2
}, {
"Score" : 0
}, {
"Score" : 2
}, {
"Score" : 2
}]
}
Here is the Map-Reduce I thought would be correct after reading the documentation
Map
// NOTE: inside the forEach callback `this` is no longer the document being
// mapped, so `this._id` below does not yield the document's _id.
function() {
this.Answers.forEach(function(val)
{
emit(this._id, val.Score);
});
}
also tried this
// NOTE: the loop condition compares `i` with the Answers array itself rather
// than with its length; for an array of objects that comparison is false, so
// the body never runs and nothing is emitted.
function() {
for (var i = 0; i < this.Answers; i++)
{
emit(this._id, this.Answers[i].Score);
}
}
Reduce
// Reduce: add up all Score values emitted for one key.
function(key, values)
{
return Array.sum(values);
}
I am getting no information back with this, but it does appear to be processing it takes 2-5 seconds to return. I guess I am not understanding something about map-reduce.
Also I am using MongoVUE to access MongoDB.
EDIT
I just ran my map reduce through the console and got this output
{
"results" : [ ],
"timeMillis" : 2506,
"counts" : {
"input" : 1655,
"emit" : 0,
"reduce" : 0,
"output" : 0
},
"ok" : 1,
}
so it's my map function that's incorrect I guess as nothing was emitted.
EDIT 2
Updated document with output from mongovue
In JavaScript loops, adding the length property allows you to iterate by the count of the items in the array, so you can change your second attempt to:
// Map: iterate Answers by index up to its length and emit each Score under the document's _id.
function() {
for (var i = 0; i < this.Answers.length; i++)
{
emit(this._id, this.Answers[i].Score);
}
}
It should also be noted that your reduce can run multiple times per key, specifically it can repeat every 101 rows, technically this shouldn't matter since you are summing up the array values and the previous reduce value will be passed as an array element in the new reduce so it should work just fine; however, good to keep in mind.
I think the 'this' variable is not what you expect in the .forEach() function in your map method. Try this instead;
// Map: emit each answer's Score under the document's _id.
function() {
// Keep a reference to the document; `this` inside the callback below is not the document.
var row = this;
this.Answers.forEach(function(val)
{
emit(row._id, val.Score);
});
}
I'm attempting to create my own todo list using Javascript, Python and MongoDB. I'm getting stuck on how to handle the task ordering.
My current idea is to have an order field in each task document and when the order changes on the client I would grab the task list from the db and reorder each task individually/sequentially. This seems awkward because large todo lists would mean large amount of queries. Is there a way to update a field in multiple documents sequentially?
I'm also looking for advice as to whether this is the best way to do this. I want to be able to maintain the todo list order but maybe I'm going about it the wrong way.
{
"_id" : ObjectId("50a658f2cace55034c68ce95"),
"order" : 1,
"title" : "task1",
"complete" : 0
}
{
"_id" : ObjectId("50a658fecace55034c68ce96"),
"order" : 2,
"title" : "task2",
"complete" : 1
}
{
"_id" : ObjectId("50a65907cace55034c68ce97"),
"order" : 3,
"title" : "task3",
"complete" : 1
}
{
"_id" : ObjectId("50a65911cace55034c68ce98"),
"order" : 4,
"title" : "task4",
"complete" : 0
}
{
"_id" : ObjectId("50a65919cace55034c68ce99"),
"order" : 5,
"title" : "task5",
"complete" : 0
}
Mongo is very very fast with queries, you should not be as concerned with performance as if you were using a full featured relational database. If you want to be prudent, just create a todo list of 1k items and try it out, it should be pretty instant.
// Store each id's position in orderedListOfIds as its `order` (one update per task).
for (var i = 0; i < orderedListOfIds.length; i++)
{
db.collection.update({ '_id': orderedListOfIds[i] }, { $set: { order:i } })
}
then
db.collection.find( { } ).sort( { order: 1 } )
Yes, mongo allows for updating multiple documents. Just use a modifier operation and multi=True. For example, this increments order by one for all documents with order greater than five:
todos.update({'order':{'$gt':5}}, {'$inc':{'order':1}}, multi=True)
As to the best way, usually it's better to use a "natural" ordering (by name, date, priority etc) rather than create a fake field just for that.
I'm doing something similar. I added a field ind to my list items. Here's how I move a list item to a new location:
moveItem: function (sourceIndex, targetIndex) {
var id = Items.findOne({ind:sourceIndex})._id;
var movinUp = targetIndex > sourceIndex;
shift = movinUp ? -1 : 1;
lowerIndex = Math.min(sourceIndex, targetIndex);
lowerIndex += movinUp ? 1 : 0;
upperIndex = Math.max(sourceIndex, targetIndex);
upperIndex -= movinUp ? 0 : 1;
console.log("Shifting items from "+lowerIndex+" to "+upperIndex+" by "+shift+".");
Items.update({ind: {$gte: lowerIndex,$lte: upperIndex}}, {$inc: {ind:shift}},{multi:true});
Items.update(id, {$set: {ind:targetIndex}});
}
if you're using native promises (es6) in mongoose mongoose.Promise = global.Promise you can do the following to batch:
/**
 * Express-style handler that stores each id's position in `req.body.ids`
 * as that document's `order`, running all updates in parallel.
 *
 * Express invokes handlers as (req, res, next), so the parameters are
 * declared in that order.
 *
 * @param {object} req - request; `req.body.ids` must be an array of document ids
 * @param {object} res - response; answered with status 200 once every update succeeded
 * @param {function} next - called with the error if validation or any update fails
 * @returns {Promise|undefined} settles when the response has been sent or `next` was called
 */
function batchUpdate(req, res, next){
  const ids = req.body && req.body.ids;
  if (!Array.isArray(ids)) {
    return next(new TypeError('req.body.ids must be an array'));
  }
  const updates = ids.map((id, position) =>
    db.collection.findOneAndUpdate({ _id: id }, { $set: { order: position } }));
  return Promise.all(updates)
    .then(() => res.status(200).send())
    .catch(next);
}
If I have a collection with thousands of elements, is there a way I can easily find which elements are taking up the most space (in terms of MB)?
There's no built-in query for this, you have to iterate the collection, gather size for each document, and sort afterwards. Here's how it'd work:
// Walk the whole collection client-side and record each document's BSON size keyed by its _id.
var cursor = db.coll.find();
var doc_size = {};
cursor.forEach(function (x) {
var size = Object.bsonsize(x);
doc_size[x._id] = size;
});
At this point you'll have a hashmap with document ids as keys and their sizes as values.
Note that with this approach you will be fetching the entire collection over the wire. An alternative is to use MapReduce and do this server-side (inside mongo):
> function mapper() {emit(this._id, Object.bsonsize(this));}
> function reducer(obj, size_in_b) { return { id : obj, size : size_in_b}; }
>
> var results = db.coll.mapReduce(mapper, reducer, {out : {inline : 1 }}).results
> results.sort(function(r1, r2) { return r2.value - r1.value; })
inline:1 tells mongo not to create a temporary collection for results, everything will be kept in RAM.
And a sample output from one of my collections:
[
{
"_id" : ObjectId("4ce9339942a812be22560634"),
"value" : 1156115
},
{
"_id" : ObjectId("4ce9340442a812be24560634"),
"value" : 913413
},
{
"_id" : ObjectId("4ce9340642a812be26560634"),
"value" : 866833
},
{
"_id" : ObjectId("4ce9340842a812be28560634"),
"value" : 483614
},
...
{
"_id" : ObjectId("4ce9340742a812be27560634"),
"value" : 61268
}
]
>
Figured this out! I did this in two steps using Object.bsonsize():
// Step 1: record the BSON size of every document in a helper collection
// (objectSizes) so the largest ones can be found with an ordinary sorted query.
db.myCollection.find().forEach(function(myObject) {
  // Both the id and the size must come from the document passed to the callback.
  db.objectSizes.save({object_id: myObject._id, size: Object.bsonsize(myObject)});
});
db.objectSizes.find().sort({size: -1}).limit(5).pretty();