I'm performing an incremental map reduce on a 2.6 mongod instance and everything worked fine and dandy until recently.
// Incremental map-reduce: every "timespanaggregations" document whose `start`
// is later than `previousRun` is folded into the per-user totals stored in
// "lifetime_agg" (out: { reduce: ... } re-reduces new results with stored ones).
db.runCommand({ mapreduce: "timespanaggregations",
    query: {
        "start" : {
            $gt: previousRun,
        }
    },
    // Emits the document, minus its bookkeeping fields, keyed by its user.
    map : function Map() {
        delete this.start;
        delete this.end;
        delete this._id;
        var key = this.user,
            value = this;
        delete value.user;
        emit(key, value);
    },
    // Merges all values of one user field by field. The merge rule is chosen
    // by looking for a marker ('Last', 'Count', 'Sum', 'Min', 'Max') within
    // the last five characters of the field name.
    reduce : function Reduce(user, aggregationData) {
        var result = {};
        aggregationData.forEach(function(timespan){
            Object.keys(timespan).forEach(function(field){
                // Test for presence, not truthiness: a stored 0 is a real
                // value and must take part in the Min/Max comparisons below
                // instead of being overwritten.
                var seen = Object.prototype.hasOwnProperty.call(result, field);
                if (!seen || (field.indexOf('Last', field.length - 5) != -1)) {
                    // First occurrence, or a "Last" field: the most recent value wins.
                    result[field] = timespan[field];
                } else if ((field.indexOf('Count', field.length - 5) != -1) || (field.indexOf('Sum', field.length - 5) != -1)) {
                    result[field] += timespan[field];
                } else if (field.indexOf('Min', field.length - 5) != -1) {
                    result[field] = (result[field] > timespan[field]) ? timespan[field] : result[field];
                } else if (field.indexOf('Max', field.length - 5) != -1) {
                    result[field] = (result[field] < timespan[field]) ? timespan[field] : result[field];
                }
            });
        });
        return result;
    },
    sort : { "user" : 1, "start" : 1 },
    out : { reduce: "lifetime_agg" },
    jsMode: true
});
I'm pretty sure that I'm not breaking any requirements, and nothing is anywhere near the limits. But if I don't use the query to make the chunks small enough, the command does nothing at all. It simply responds with:
counts: {
input: 0,
emit: 0,
reduce: 0,
output: number of records already in lifetime_agg
}
I would have expected some sort of an error message. The whole thing still works, if I force it to run with a smaller query result.
jsMode is set to true because I was tweaking with it and the number of records is nowhere near 500,000. It behaves the same way with jsMode set to false.
Related
I'm trying to implement a "range query" in MongoDB using Mongoose, ordered by a 'criteria' and then by '_id'.
And I would like to return to the client a string containing both cursors.
I was trying to implement something like the code below, with the commented block 2. However, I'm getting an error. Not even the log messages are being printed.
In my test, the query is empty, because the collection is empty.
I suspected that I was not getting the cursor, so I've tested with 'block 1' instead of block 2, and it worked.
But since I need the last cursor, I guess what I really need to use is the .toArray method, right?
What am I doing wrong?
// Fetch the page that follows the cursor: documents whose criteria and _id are
// both below the cursor values, sorted descending, at most 50 of them.
const pageFilter = {
    "criteria": {$lt: cursorCriteria},
    "_id": {$lt: cursorId}
};
const descendingOrder = {
    criteria: -1,
    _id: -1
};
Feed.find(pageFilter)
    .sort(descendingOrder)
    .limit( 50 )
    // block 1: just to test if I'm getting the cursor
    .then( items => {
        for (const item of items) {
            console.log('an item');
        }
    })
/* block 2: if I try this block instead of block 1, I get an error
.toArray( items => {
if (items.length > 0) {
console.log('not empty);
} else {
console.log('empty');
}
var nextCursor = '${item.criteria}_${item._id}';
res.status(200).json({item, nextCursor});
})
*/
Mongoose doesn't have a toArray() method.
This worked fine:
.then( items => {
if (items.length > 0) {
console.log('not empty');
} else {
console.log('empty');
}
var nextCursor;
if (items.length > 0) {
nextCursor = '' + items[items.length-1].criteria + "_" + items[items.length-1]._id;
} else {
nextCursor = '';
}
res.status(200).json({items, nextCursor});
})
Here is the dataset
// Data 1
{ name : 111,
factors : [
{name:"f1", value:"dog", unit : "kg"},
{name:"f2", value:"0"}
]
},// data2
{ name : 112,
factors :
[
{name:"f1", value:"cat", unit : "g"},
{name:"f2", value:"13"}
]
}
// 100,000 more data ...
I would like to convert the value of factor f2 to be number.
// Converts the string value of factor "f2" to an integer in every case
// document that has an "f2" factor whose value is stored as a string.
db.getCollection('cases').find({
    factors : {
        $elemMatch : {
            name : "f2",
            value : {$type : 2}   // BSON type 2 = string
        }
    }
}).forEach(function(doc){
    doc.factors.forEach(function(factor){
        if(factor.name == "f2"){
            // Radix 10 so that values with a leading zero are never read as octal.
            factor.value = !isNaN(factor.value) ? parseInt(factor.value, 10) : factor.value;
        }
    });
    // Save the whole modified document. `factor` only exists inside the inner
    // callback, so saving it here was a ReferenceError.
    db.cases.save(doc);
});
However, it only updates about 75–77 documents per execution. I am not sure why; my guess is that save() is asynchronous, so we cannot start too many save() calls at the same time.
What should I do?
The concept here is to loop through your collection with a cursor and for each document within the cursor, gather data about the index position of the factors array elements.
You will then use this data later on in the loop as the update operation parameters to correctly identify the desired field to update.
Supposing your collection is not that humongous, the intuition above can be implemented using the forEach() method of the cursor as you have done in your attempts to do the iteration and getting the index data for all the arrays involved.
The following demonstrates this approach for small datasets:
// Small-dataset version: one updateOne() per document, setting each "f2"
// factor value by its array index.
db.cases.find({"factors.value": { "$exists": true, "$type": 2 }}).forEach(function(doc){
    var factors = doc.factors,
        updateOperatorDocument = {},
        hasChanges = false;
    for (var idx = 0; idx < factors.length; idx++){
        if(factors[idx].name == "f2"){
            // Radix 10 so that values with a leading zero are never read as octal.
            var val = !isNaN(factors[idx].value) ? parseInt(factors[idx].value, 10) : factors[idx].value;
            updateOperatorDocument["factors."+ idx +".value"] = val;
            hasChanges = true;
        }
    }
    // The query matches any string-valued factor, so a document may have no
    // "f2" entry at all. Skip it rather than send an empty $set, which the
    // server rejects.
    if (!hasChanges) {
        return;
    }
    db.cases.updateOne(
        { "_id": doc._id },
        { "$set": updateOperatorDocument }
    );
});
Now for improved performance especially when dealing with large collections, take advantage of using a Bulk() API for updating the collection in bulk.
This is quite efficient compared with the above operations because with the bulk API you send the operations to the server in batches (for example, a batch size of 1000). This gives you much better
performance, since you are not sending every request to the server individually but only once per 1000 requests, making your updates more efficient and quicker.
The following examples demonstrate using the Bulk() API available in MongoDB versions >= 2.6 and < 3.2.
// Bulk() API version (MongoDB >= 2.6 and < 3.2): queues one update per
// document and sends the queue to the server every 1000 operations.
// The shell method is spelled initializeUnorderedBulkOp (lower-case "o").
var bulkUpdateOps = db.cases.initializeUnorderedBulkOp(),
    counter = 0;   // number of operations queued so far

db.cases.find({"factors.value": { "$exists": true, "$type": 2 }}).forEach(function(doc){
    var factors = doc.factors,
        updateOperatorDocument = {},
        hasChanges = false;
    for (var idx = 0; idx < factors.length; idx++){
        if(factors[idx].name == "f2"){
            // Radix 10 so that values with a leading zero are never read as octal.
            var val = !isNaN(factors[idx].value) ? parseInt(factors[idx].value, 10) : factors[idx].value;
            updateOperatorDocument["factors."+ idx +".value"] = val;
            hasChanges = true;
        }
    }
    // Documents without an "f2" factor would produce an empty $set, which the
    // server rejects; do not queue anything for them.
    if (!hasChanges) {
        return;
    }

    bulkUpdateOps.find({ "_id": doc._id }).update({ "$set": updateOperatorDocument });

    counter++;  // increment counter for batch limit
    if (counter % 1000 == 0) {
        // execute the bulk update operation in batches of 1000
        bulkUpdateOps.execute();
        // Re-initialize the bulk update operations object
        bulkUpdateOps = db.cases.initializeUnorderedBulkOp();
    }
});

// Clean up remaining operations in the queue (executing an empty queue throws)
if (counter % 1000 != 0) { bulkUpdateOps.execute(); }
The next example applies to the new MongoDB version 3.2 which has since deprecated the Bulk() API and provided a newer set of apis using bulkWrite().
It uses the same cursor as above but builds an array of bulk operations, using the same forEach() cursor method to push each bulk write document onto the array. Because write commands can accept no more than 1000 operations, you need to group your operations into batches of at most 1000 and re-initialise the array each time the loop reaches 1000 operations:
// bulkWrite() version (MongoDB 3.2+): collects updateOne operations in an
// array and flushes the array to the server every 1000 operations.
var cursor = db.cases.find({"factors.value": { "$exists": true, "$type": 2 }}),
    bulkUpdateOps = [];

cursor.forEach(function(doc){
    var factors = doc.factors,
        updateOperatorDocument = {},
        hasChanges = false;
    for (var idx = 0; idx < factors.length; idx++){
        if(factors[idx].name == "f2"){
            // Radix 10 so that values with a leading zero are never read as octal.
            var val = !isNaN(factors[idx].value) ? parseInt(factors[idx].value, 10) : factors[idx].value;
            updateOperatorDocument["factors."+ idx +".value"] = val;
            hasChanges = true;
        }
    }
    // Documents without an "f2" factor would produce an empty $set, which the
    // server rejects; do not queue anything for them.
    if (!hasChanges) {
        return;
    }

    bulkUpdateOps.push({
        "updateOne": {
            "filter": { "_id": doc._id },
            "update": { "$set": updateOperatorDocument }
        }
    });

    if (bulkUpdateOps.length == 1000) {
        db.cases.bulkWrite(bulkUpdateOps);
        bulkUpdateOps = [];
    }
});

// Flush whatever is left over from the last partial batch
if (bulkUpdateOps.length > 0) { db.cases.bulkWrite(bulkUpdateOps); }
Write Result for Sample data
{
"acknowledged" : true,
"deletedCount" : 0,
"insertedCount" : 0,
"matchedCount" : 2,
"upsertedCount" : 0,
"insertedIds" : {},
"upsertedIds" : {}
}
I have an assignment comparing SQL and NoSQL databases for a college course report. I'm testing an inner join, and in MongoDB I implemented the equivalent with map-reduce, but I'm having problems.
Finding all records works, and fetching the row for a specific id also works, but I cannot search by text or by any other attribute.
My collections:
y_um_milhao{id_y_um_milhao, col_descricao}
x_um_milhao{id_x_um_milhao, col_decimal, fk_y_um_milhao}
My map/reduce:
// Map step for x_um_milhao: emits the decimal column and the row id, keyed by
// the row id. The property must be named colDecimal because that is the name
// reduce_um_milhao reads.
var mapX_um_milhao = function(){
    var output = {colDecimal: this.col_decimal, id_x: this.id_x_um_milhao};
    emit(this.id_x_um_milhao, output);
};
// Map step for y_um_milhao: emits the description column and the row id
// (plus whatever `y_um_milhao` holds on the document), keyed by the row id.
var mapY_um_milhao = function(){
    var row = this;
    var projected = {
        y_id: row.y_um_milhao,
        colDescricao: row.col_descricao,
        id_y: row.id_y_um_milhao
    };
    emit(row.id_y_um_milhao, projected);
};
// Reduce step shared by both map-reduce jobs: for each of the four joined
// fields, keeps the first value that is neither null nor undefined.
var reduce_um_milhao = function(key, values){
    var merged = {colDescricao:null, id_y:null, colDecimal:null, id_x:null};
    var fieldNames = Object.keys(merged);
    values.forEach(function(candidate){
        for (var i = 0; i < fieldNames.length; i++) {
            var name = fieldNames[i];
            // A field that is still unset takes this candidate's value
            if (merged[name] == null) {
                merged[name] = candidate[name];
            }
        }
    });
    return merged;
};
// Run both jobs into the same output collection. With out: {reduce: 'x_y'},
// a key that already exists in x_y is passed through reduce_um_milhao again
// together with the new value, which is what merges the x and y sides.
result = db.x_um_milhao.mapReduce(mapX_um_milhao, reduce_um_milhao, {out: {reduce: 'x_y'}});
result = db.y_um_milhao.mapReduce(mapY_um_milhao, reduce_um_milhao, {out: {reduce: 'x_y'}});
Seeking all records:
db.x_y.find()
Seeking registration by id:
db.x_y.find({_id:1})
Result:
{ "_id" : 1, "value" : { "colDescricao" : "Teste TCC1", "id_y" : 1, "colDecimal" : 13.38, "id_x" : 1 } }
Now, I cannot search by "colDescricao". How can I do that?
PS: Sorry for the English, not speak and used the google translator
You can search for "colDescricao" like this:
// A dotted path to a nested field must be written as a quoted key
db.x_y.find({"value.colDescricao" : "Teste TCC1"})
I'm currently working on a project where I'm using keyword queries against a MongoDB database. If I search for things that exist in the database everything works fine, but if I search for things that don't exist, or I have a typo in my query, the application basically crashes.
The query is as simple as this:
// Matches documents whose "text" field matches the search text
var query = Query.And(Query.Matches("text", searchText));
Where searchText is what's being written into the searchbox in the UI.
To check the size of the cursor I've tried implementing this:
// Tell the user when the search matched nothing and stop processing.
if ( cursor.Size() == 0)
{
    // Kept on one line: a regular C# string literal cannot span lines.
    MessageBox.Show("Your search did not return a match. Please search for something else.");
    return database;
}
But the system takes 10-15 minutes to evaluate that the size is 0, compared to the 0.5 seconds if the size is 1 or more.
So does anyone have any suggestions? Either a better way of checking the size of the cursor, or some kind of function that makes the method time out and tells the user that no match was found?
Thanks in advance.
Update:
As requested added the explain for something that should and something that shouldn't exist
db.docs.find( {text: "a"}).explain
// NOTE: this is the source of the cursor's explain helper as echoed by the
// shell. The command above references `explain` without parentheses, so the
// function was printed rather than called and no query plan is shown here.
function (verbose) {
/* verbose=true --> include allPlans, oldPlan fields */
// Work on a copy of the query so the original cursor is left untouched
var n = this.clone();
n._ensureSpecial();
// Flag the copied query as an explain request
n._query.$explain = true;
// Force the limit to be negative (or zero)
n._limit = Math.abs(n._limit) * -1;
// The first result of the copied query is the explain document
var e = n.next();
// Recursively removes the allPlans and oldPlan fields from obj, descending
// into array-like elements, per-shard results and clauses
function cleanup(obj){
if (typeof(obj) != 'object'){
return;
}
delete obj.allPlans;
delete obj.oldPlan;
if (typeof(obj.length) == 'number'){
for (var i=0; i < obj.length; i++){
cleanup(obj[i]);
}
}
if (obj.shards){
for (var key in obj.shards){
cleanup(obj.shards[key]);
}
}
if (obj.clauses){
cleanup(obj.clauses);
}
}
// Strip the extra plan fields unless verbose output was requested
if (!verbose)
cleanup(e);
return e;
}
db.docs.find( {text: "fgrgfk"}).explain
// NOTE: this is the source of the cursor's explain helper as echoed by the
// shell. The command above references `explain` without parentheses, so the
// function was printed rather than called and no query plan is shown here.
function (verbose) {
/* verbose=true --> include allPlans, oldPlan fields */
// Work on a copy of the query so the original cursor is left untouched
var n = this.clone();
n._ensureSpecial();
// Flag the copied query as an explain request
n._query.$explain = true;
// Force the limit to be negative (or zero)
n._limit = Math.abs(n._limit) * -1;
// The first result of the copied query is the explain document
var e = n.next();
// Recursively removes the allPlans and oldPlan fields from obj, descending
// into array-like elements, per-shard results and clauses
function cleanup(obj){
if (typeof(obj) != 'object'){
return;
}
delete obj.allPlans;
delete obj.oldPlan;
if (typeof(obj.length) == 'number'){
for (var i=0; i < obj.length; i++){
cleanup(obj[i]);
}
}
if (obj.shards){
for (var key in obj.shards){
cleanup(obj.shards[key]);
}
}
if (obj.clauses){
cleanup(obj.clauses);
}
}
// Strip the extra plan fields unless verbose output was requested
if (!verbose)
cleanup(e);
return e;
}
Update 2: Overview of indexes:
db.docs.getIndexes()
{
"v" : 1,
"key" : {
"_id" : 1
},
"ns" : "tweet_database.docs",
"name" : "_id_"
}
I have a MongoDB collection containing history data with id and timestamp.
I want to delete data from the collection older than a specific
timestamp. But for every id at least one
document (the newest) must stay in the collection.
Suppose I have the following documents in my collection ...
{"id" : "11", "timestamp" : ISODate("2011-09-09T10:27:34.785Z")} //1
{"id" : "11", "timestamp" : ISODate("2011-09-08T10:27:34.785Z")} //2
{"id" : "22", "timestamp" : ISODate("2011-09-05T10:27:34.785Z")} //3
{"id" : "22", "timestamp" : ISODate("2011-09-01T10:27:34.785Z")} //4
... and I want to delete documents having a timestamp older than
2011-09-07 then
1 and 2 should not be deleted because they are newer.
4 should be deleted because it is older, but 3 should not be deleted
(although it is older) because
at least one document per id should stay in the collection.
Does anyone know how I can do this with casbah and/or on the mongo
console?
Regards,
Christian
I can think of a couple of ways. First, try this:
// Deletes every document older than the cutoff, except the newest document
// of each id, which always stays in the collection.
var cutoff = new ISODate("2011-09-07T00:00:00.000Z");
// Only documents older than the cutoff can ever be deleted, so let the server
// select them instead of pulling the whole collection to the client. This
// also skips documents that have no timestamp at all.
db.testdata.find({"timestamp": { $lt: cutoff }}).forEach(function(data) {
    // A candidate is removed only when a newer document with the same id
    // exists; the newest document of an id therefore always survives.
    var newerCount = db.testdata.find({"id": data.id, "timestamp": { $gt: data.timestamp }}).count();
    if (newerCount > 0) {
        db.testdata.remove({"_id" : data._id});
    }
});
This does the job you want. Or you can use a MapReduce job to do it as well. Load this into a text file:
// Map step: groups documents by their `id` field, emitting each document's
// _id (as `ref`) together with its timestamp.
var map = function() {
    var groupKey = this.id;
    var entry = { ref: this._id, timestamp: this.timestamp };
    emit(groupKey, entry);
};
/**
 * Reduce step: keeps the newest document of one id and deletes every other
 * document of that id whose timestamp is older than the cutoff.
 *
 * NOTE(review): this function deletes documents as a side effect of reducing.
 * MongoDB's documentation says reduce functions should not access the
 * database, and newer servers do not expose `db` inside them. Confirm that
 * the target server allows this before relying on it.
 *
 * @param key    the `id` shared by all values
 * @param values array of { ref: <document _id>, timestamp: <Date> }
 * @returns { ref, timestamp } of the newest document seen, in the same shape
 *          as the emitted values so the result can be reduced again
 */
var reduce = function(key, values) {
    var cutoff = new ISODate("2011-09-07T00:00:00.000Z");
    var newest = null;   // timestamp of the document currently being kept
    var ref = null;      // _id of the document currently being kept
    var i;
    for (i = 0; i < values.length; ++i) {
        var candidate = values[i];
        if (ref == null) {
            // Nothing kept yet: the first value becomes the keeper
            ref = candidate.ref;
            newest = candidate.timestamp;
            continue;
        }
        // Of the current keeper and the candidate, the strictly newer one is
        // kept and the other becomes eligible for deletion. Comparing against
        // the keeper regardless of the cutoff makes sure an old keeper is
        // dropped when a newer document shows up later in the list.
        var loser;
        if (candidate.timestamp.valueOf() > newest.valueOf()) {
            loser = { ref: ref, timestamp: newest };
            ref = candidate.ref;
            newest = candidate.timestamp;
        } else {
            loser = candidate;
        }
        // Only documents that fall into the delete range are removed
        if (loser.timestamp.valueOf() < cutoff.valueOf()) {
            db.testdata.remove({_id : loser.ref});
        }
    }
    return { ref: ref, timestamp: newest };
};
Load the above file into the shell: load("file.js");
Then run it: db.testdata.mapReduce(map, reduce, {out: "results"});
Then remove the mapReduce output: db.results.drop();