Insert field with array size in mongo

I have documents in MongoDB containing an array. I need a field holding the number of items in that array, so I have to update the documents to add this field.
Naively, I thought this would work:
db.myDocument.update({
    "itemsTotal": { $exists: false },
    "items": { $exists: true }
}, {
    $set: {
        itemsTotal: { $size: "$items" }
    }
}, {
    multi: true
})
But it fails with "not okForStorage".
I also tried an aggregation, but it throws an exception:
"errmsg" : "exception: invalid operator '$size'",
"code" : 15999,
"ok" : 0
What is the best solution, and what am I doing wrong? I'm starting to think about writing a Java tool to calculate the totals and update the documents with it.

You can use the .aggregate() method to $project your documents and return the $size of the items array. After that you will need to loop through the aggregation result with .forEach() and $set the itemsTotal field on each document, using "Bulk" operations for maximum efficiency.
var bulkOp = db.myDocument.initializeUnorderedBulkOp();
var count = 0;

db.myDocument.aggregate([
    { "$match": {
        "itemsTotal": { "$exists": false },
        "items": { "$exists": true }
    }},
    { "$project": { "itemsTotal": { "$size": "$items" } } }
]).forEach(function(doc) {
    bulkOp.find({ "_id": doc._id }).updateOne({
        "$set": { "itemsTotal": doc.itemsTotal }
    });
    count++;
    if (count % 200 === 0) {
        // Execute per 200 operations and re-init
        bulkOp.execute();
        bulkOp = db.myDocument.initializeUnorderedBulkOp();
    }
})

// Flush the remaining queued operations (but not an empty batch,
// which would throw if count landed exactly on a multiple of 200)
if (count % 200 !== 0) {
    bulkOp.execute();
}

You could initialise a Bulk() operations builder to update the documents in a loop as follows:
var bulk = db.collection.initializeOrderedBulkOp(),
    count = 0;

db.collection.find({
    "itemsTotal": { "$exists": false },
    "items": { "$exists": true }
}).forEach(function(doc) {
    var items_size = doc.items.length;
    bulk.find({ "_id": doc._id }).updateOne({
        "$set": { "itemsTotal": items_size }
    });
    count++;
    if (count % 100 == 0) {
        bulk.execute();
        bulk = db.collection.initializeOrderedBulkOp();
    }
});
if (count % 100 != 0) { bulk.execute(); }

This is much easier starting with MongoDB v3.4, which introduced the $addFields aggregation pipeline operator. We'll also use the $out operator to output the result of the aggregation to the same collection (replacing the existing collection is atomic).
db.myDocuments.aggregate([
    {
        $addFields: {
            itemsTotal: { $size: "$items" }
        }
    },
    {
        $out: "myDocuments"
    }
])
WARNING: this solution requires all documents to have the items field. If some documents don't have it, aggregate will fail with
"The argument to $size must be an array, but was of type: missing"
You might think you could add a $match stage to the aggregation to filter only documents containing items, but then any document without items would not be output back to the myDocuments collection by $out, so you would lose those documents permanently.
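One workaround, if some documents lack items, is to substitute an empty array with $ifNull so $size never sees "missing". A sketch; note it writes itemsTotal: 0 for documents that have no items array, which may or may not be what you want:

db.myDocuments.aggregate([
    {
        $addFields: {
            // treat a missing "items" as an empty array so $size cannot fail
            itemsTotal: { $size: { $ifNull: [ "$items", [] ] } }
        }
    },
    { $out: "myDocuments" }
])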

Related

Robomongo : Exceeded memory limit for $group

I'm using a script to remove duplicates in MongoDB. It worked on a test collection with 10 items, but when I ran it against the real collection with 6 million documents, I got an error.
This is the script which I ran in Robomongo (now known as Robo 3T):
var bulk = db.getCollection('RAW_COLLECTION').initializeOrderedBulkOp();
var count = 0;

db.getCollection('RAW_COLLECTION').aggregate([
    // Group on unique value storing _id values to array and count
    { "$group": {
        "_id": { RegisterNumber: "$RegisterNumber", Region: "$Region" },
        "ids": { "$push": "$_id" },
        "count": { "$sum": 1 }
    }},
    // Only return things that matched more than once, i.e. a duplicate
    { "$match": { "count": { "$gt": 1 } } }
]).forEach(function(doc) {
    var keep = doc.ids.shift();                        // takes the first _id from the array
    bulk.find({ "_id": { "$in": doc.ids } }).remove(); // remove all remaining _id matches
    count++;
    if ( count % 500 == 0 ) { // only actually write per 500 operations
        bulk.execute();
        bulk = db.getCollection('RAW_COLLECTION').initializeOrderedBulkOp(); // re-init after execute
    }
});

// Clear any queued operations
if ( count % 500 != 0 )
    bulk.execute();
This is the error message:
Error: command failed: {
"errmsg" : "exception: Exceeded memory limit for $group, but didn't allow external sort. Pass allowDiskUse:true to opt in.",
"code" : 16945,
"ok" : 0
} : aggregate failed :
_getErrorWithCode#src/mongo/shell/utils.js:23:13
doassert#src/mongo/shell/assert.js:13:14
assert.commandWorked#src/mongo/shell/assert.js:266:5
DBCollection.prototype.aggregate#src/mongo/shell/collection.js:1215:5
#(shell):1:1
So I need to set allowDiskUse: true to make this work? Where do I do that in the script, and is there any problem with doing this?
{ allowDiskUse: true }
should be passed as an option right after the aggregation pipeline array.
In your code it goes like this:
db.getCollection('RAW_COLLECTION').aggregate([
    // Group on unique value storing _id values to array and count
    { "$group": {
        "_id": { RegisterNumber: "$RegisterNumber", Region: "$Region" },
        "ids": { "$push": "$_id" },
        "count": { "$sum": 1 }
    }},
    // Only return things that matched more than once, i.e. a duplicate
    { "$match": { "count": { "$gt": 1 } } }
], { allowDiskUse: true } )
Note: Using { allowDiskUse: true } may affect performance, since the aggregation pipeline will then read data from temporary files on disk. How much depends on disk performance and the size of your working set. Test performance for your use case.
It is generally better to use $match before $group when you have large data: if $match filters the input down first, $group is far less likely to hit the memory limit, and you won't run into this problem.
db.getCollection('sample').aggregate([
    { $match: { State: 'TAMIL NADU' } },
    { $group: {
        _id: { DiseCode: "$code", State: "$State" },
        totalCount: { $sum: 1 }
    }},
    { $project: {
        Code: "$_id.DiseCode",
        totalCount: "$totalCount",
        _id: 0
    }}
])
If you really can't avoid this issue with $match, then the solution is { allowDiskUse: true }.
Here is a simple, undocumented trick that can help avoid disk usage in a lot of cases.
You can use an intermediate $project stage to reduce the size of the records passed into the $group stage.
In this example that gives:
var bulk = db.getCollection('RAW_COLLECTION').initializeOrderedBulkOp();
var count = 0;

db.getCollection('RAW_COLLECTION').aggregate([
    // here is the important stage
    { "$project": { "_id": 1, "RegisterNumber": 1, "Region": 1 } }, // this will reduce the record size
    { "$group": {
        "_id": { RegisterNumber: "$RegisterNumber", Region: "$Region" },
        "ids": { "$push": "$_id" },
        "count": { "$sum": 1 }
    }},
    { "$match": { "count": { "$gt": 1 } } }
]).forEach(function(doc) {
    var keep = doc.ids.shift();                        // takes the first _id from the array
    bulk.find({ "_id": { "$in": doc.ids } }).remove(); // remove all remaining _id matches
    count++;
    if ( count % 500 == 0 ) { // only actually write per 500 operations
        bulk.execute();
        bulk = db.getCollection('RAW_COLLECTION').initializeOrderedBulkOp(); // re-init after execute
    }
});

// Clear any queued operations
if ( count % 500 != 0 )
    bulk.execute();
See the first $project stage, which is there only to avoid the disk usage.
This is especially useful for collections with large records where most of the data is unused in the aggregation.
From the MongoDB docs:
The $group stage has a limit of 100 megabytes of RAM. By default, if the stage exceeds this limit, $group will produce an error. However, to allow for the handling of large datasets, set the allowDiskUse option to true to enable $group operations to write to temporary files. See the db.collection.aggregate() method and the aggregate command for details.

MongoDB: update all documents on one field

{
    "_id" : 1,
    "users" : 2329255
},
{
    "_id" : 2,
    "users" : 2638831
}
How do I update the users field of all documents, dividing its value by 100?
The result should be:
{
    "_id" : 1,
    "users" : 23292.55
},
{
    "_id" : 2,
    "users" : 26388.31
}
db.coll.update({}, {$set: {'users': {'$divide': ['$users', 100]}}})
-- but it's not working.
Try the query below:
db.coll.find().snapshot().forEach(
    function (e) {
        e.users = e.users / 100;
        // save the updated document
        db.coll.save(e);
    }
)
The query above will change/update the data in the DB. If you only want to fetch records with the divided value, use $project:
db.coll.aggregate([
    { $project: { users: { $divide: [ "$users", 100 ] } } }
])
This will not update the data but will return the desired value. Use whichever fits your requirement.
The $divide operator is only valid in the aggregate() function, not the update() function. What you want to do is use the aggregate() method to create a computed field, then iterate the results from the aggregate() cursor to create bulk update operations that you can send to the server in one request, rather than sending each update request with each item in the result.
The following example demonstrates this:
var bulkUpdateOps = [];

db.coll.aggregate([
    { "$match": { "users": { "$exists": true } } },
    { "$project": {
        "computed_field": { "$divide": ["$users", 100] }
    }}
]).forEach(function(doc) {
    bulkUpdateOps.push({
        "updateOne": {
            "filter": { "_id": doc._id },
            "update": { "$set": { "users": doc.computed_field } }
        }
    });
    if (bulkUpdateOps.length === 500) {
        db.coll.bulkWrite(bulkUpdateOps);
        bulkUpdateOps = [];
    }
});

if (bulkUpdateOps.length > 0) db.coll.bulkWrite(bulkUpdateOps);
Or for MongoDB 2.6.x and 3.0.x releases, use this version of Bulk operations:
var bulk = db.coll.initializeUnorderedBulkOp(),
    counter = 0;

db.coll.aggregate([
    { "$match": { "users": { "$exists": true } } },
    { "$project": {
        "computed_field": { "$divide": ["$users", 100] }
    }}
]).forEach(function(doc) {
    bulk.find({ "_id": doc._id })
        .updateOne({ "$set": { "users": doc.computed_field } });
    counter++; // count each queued op, or the batches would never fire
    if (counter % 500 === 0) {
        bulk.execute();
        bulk = db.coll.initializeUnorderedBulkOp();
    }
});

if (counter % 500 !== 0) bulk.execute();
In both cases the Bulk operations API helps reduce the IO load on the server by sending a request only once for every 500 documents processed.
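As a side note, if your server is MongoDB 4.2 or newer (an assumption about your environment), the update itself can take an aggregation pipeline, so no client-side loop is needed at all. A minimal sketch:

db.coll.updateMany(
    { "users": { "$exists": true } },
    // the pipeline form of update lets $divide reference the existing field value
    [ { "$set": { "users": { "$divide": [ "$users", 100 ] } } } ]
)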

How to sum up all the employee salaries and update one attribute with the total in an embedded-array doc in MongoDB

I want to sum up all the EMP_SALARY values (2000 + 3000 + 4000 = 9000) and update the TOTAL_EMPLOYEES_SALARY attribute with the resulting 9000. How can I do that in the mongo shell? Can anyone please help me out with this?
{
    "_id" : ObjectId("571898dbc000041fe0b921eb"),
    "ORGANIZATION" : "abc",
    "TOTAL_EMPLOYEES" : 10,
    "TOTAL_EMPLOYEES_SALARY" : 0,
    "employees" : [
        {
            "EMP_NAME" : "vijay",
            "EMP_SALARY" : 2000
        },
        {
            "EMP_NAME" : "vishnu",
            "EMP_SALARY" : 3000
        },
        {
            "EMP_NAME" : "vishal",
            "EMP_SALARY" : 4000
        }
    ]
}
If you are doing this in bulk for your collection, then the best way is to iterate the cursor and use .bulkWrite() to write back:
var ops = [];

db.collection.find().forEach(function(doc) {
    ops.push({
        "updateOne": {
            "filter": { "_id": doc._id },
            "update": {
                "$set": {
                    // sum the salaries client-side while reading the document
                    "TOTAL_EMPLOYEES_SALARY": Array.sum(doc.employees.map(function(emp) {
                        return emp.EMP_SALARY;
                    }))
                }
            }
        }
    });
    if ( ops.length == 1000 ) {
        db.collection.bulkWrite(ops);
        ops = [];
    }
})

if ( ops.length > 0 ) {
    db.collection.bulkWrite(ops);
}
For "super safe" code though, you probably should be using $inc on iteration of each array element instead:
var ops = [];

db.collection.find().forEach(function(doc) {
    doc.employees.forEach(function(emp) {
        ops.push({
            "updateOne": {
                "filter": { "_id": doc._id },
                "update": {
                    "$inc": { "TOTAL_EMPLOYEES_SALARY": emp.EMP_SALARY }
                }
            }
        });
        if ( ops.length == 1000 ) {
            db.collection.bulkWrite(ops);
            ops = [];
        }
    })
});

if ( ops.length > 0 ) {
    db.collection.bulkWrite(ops);
}
In earlier shell releases you do it using the "bulk" operations builder directly:
var bulk = db.collection.initializeOrderedBulkOp(),
    count = 0;

db.collection.find().forEach(function(doc) {
    bulk.find({ "_id": doc._id }).updateOne({
        "$set": {
            "TOTAL_EMPLOYEES_SALARY": Array.sum(doc.employees.map(function(emp) {
                return emp.EMP_SALARY;
            }))
        }
    });
    count++;
    if ( count % 1000 == 0 ) {
        bulk.execute();
        bulk = db.collection.initializeOrderedBulkOp();
    }
})

if ( count % 1000 != 0 ) {
    bulk.execute();
}
But what you really should be doing in all cases is keeping the shell on your system up to date, regardless of the server version used, just as you would keep the driver for your programming language of choice up to date.
You need to iterate the documents anyway in order to update each one, so you might as well just sum the contents of the array while reading each document.
Just for trivia's sake, in modern MongoDB releases the $sum operator works both as an accumulator and as an expression that adds up the items of an array. So now you can do this:
db.collection.aggregate([
    { "$project": {
        "TOTAL_EMPLOYEES_SALARY": { "$sum": "$employees.EMP_SALARY" }
    }}
])
And that will give the total of the array in each document.
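If you also want to write that computed total back without a client-side loop, and your server is MongoDB 4.2 or newer (an assumption), a $merge stage can fold the projected field back into the same collection. A sketch, assuming your collection is literally named collection:

db.collection.aggregate([
    { "$project": {
        "TOTAL_EMPLOYEES_SALARY": { "$sum": "$employees.EMP_SALARY" }
    }},
    // merge the computed field back into the matching documents by _id
    { "$merge": {
        "into": "collection",
        "on": "_id",
        "whenMatched": "merge",
        "whenNotMatched": "discard"
    }}
])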
In versions earlier than MongoDB 3.2, though, you need to $unwind the array and $group instead:
db.collection.aggregate([
    { "$unwind": "$employees" },
    { "$group": {
        "_id": "$_id",
        "TOTAL_EMPLOYEES_SALARY": { "$sum": "$employees.EMP_SALARY" }
    }}
])
To get a single grand total across all documents, group on null and nest $sum, so the inner one sums each document's array and the outer one accumulates across documents:
db.collection.aggregate([
    {
        $group: {
            _id: null,
            "sum": { $sum: { $sum: "$employees.EMP_SALARY" } }
        }
    }
])

Remove all Duplicates except the most recent document

I would like to clear all duplicates of a specific field in a collection, leaving only the most recent entry among the duplicates.
Here is my aggregate query which works great for finding the duplicates:
db.History.aggregate([
    { $group: {
        _id: { name: "$sessionId" },
        uniqueIds: { $addToSet: "$_id" },
        count: { $sum: 1 }
    }},
    { $match: {
        count: { $gte: 2 }
    }},
    { $sort: { count: -1 } }
], { allowDiskUse: true, cursor: {} });
The only problem is that I need to execute a remove query as well and, for each set of duplicates, keep the youngest entry (determined by the field timeCreated):
"timeCreated" : ISODate("2016-03-07T10:48:43.251+02:00")
How exactly do I do that?
Personally I would take advantage of the fact that ObjectId values are themselves "monotonic", i.e. "ever increasing in value", which means that the "youngest" or "most recent" comes at the end of a naturally sorted list.
So rather than force the aggregation pipeline to do the sorting, the most logical and efficient thing to do is simply sort the list of unique _id values returned per document as you process each response.
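As a quick illustration of why this works (a shell sketch, not part of the fix itself): every ObjectId embeds its creation time, which you can inspect with getTimestamp(), so lexically sorted ObjectIds are also in chronological order.

var doc = db.History.findOne();   // any document will do
doc._id.getTimestamp();           // returns an ISODate with the creation time encoded in the ObjectId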
So, basically working from the listing that you must have found:
Remove Duplicates from MongoDB
(which is actually my answer, and you're the second person to reference it this week, and yet no votes received for being useful! Hmm!), it's just a simple .sort() applied within the cursor iteration for the returned array:
Using the _id Value
var bulk = db.History.initializeOrderedBulkOp(),
    count = 0;

// List "all" fields that make a document "unique" in the `_id`
// I am only listing some for example purposes to follow
db.History.aggregate([
    { "$group": {
        "_id": "$sessionId",
        "ids": { "$push": "$_id" },   // _id values are already unique, so $addToSet gains nothing
        "count": { "$sum": 1 }
    }},
    { "$match": { "count": { "$gt": 1 } } }
], { "allowDiskUse": true }).forEach(function(doc) {
    doc.ids.sort().reverse();   // <-- this is the only real change
    doc.ids.shift();            // remove first match, which is now the youngest
    bulk.find({ "_id": { "$in": doc.ids } }).remove();  // removes everything in the $in list
    count++;
    // Execute once per 1000 and re-init
    if ( count % 1000 == 0 ) {
        bulk.execute();
        bulk = db.History.initializeOrderedBulkOp();
    }
});

if ( count % 1000 != 0 )
    bulk.execute();
Using a specific field
If you "really" are set on adding another date value on which to determine which is youngest then just add to the array in $push first, then apply the client side sort function. Again just a really simple change:
var bulk = db.History.initializeOrderedBulkOp(),
    count = 0;

// List "all" fields that make a document "unique" in the `_id`
// I am only listing some for example purposes to follow
db.History.aggregate([
    { "$group": {
        "_id": "$sessionId",
        "ids": { "$push": {
            "_id": "$_id",
            "created": "$timeCreated"
        }},
        "count": { "$sum": 1 }
    }},
    { "$match": { "count": { "$gt": 1 } } }
], { "allowDiskUse": true }).forEach(function(doc) {
    doc.ids = doc.ids.sort(function(a, b) {  // sort by date, youngest first, and keep just the _id
        return b.created.valueOf() - a.created.valueOf();
    }).map(function(el) { return el._id });
    doc.ids.shift();            // remove first match, which is the youngest
    bulk.find({ "_id": { "$in": doc.ids } }).remove();  // removes everything in the $in list
    count++;
    // Execute once per 1000 and re-init
    if ( count % 1000 == 0 ) {
        bulk.execute();
        bulk = db.History.initializeOrderedBulkOp();
    }
});

if ( count % 1000 != 0 )
    bulk.execute();
So it's a really simple process, with no "real" alteration to the original process used to identify the duplicates before removing all but one of them.
The best approach here is always to let the server do the job of finding the duplicates; then, client side, as you iterate the cursor, you can work out from the returned array which document will be kept and which ones will be removed.

Update MongoDB collection with $toLower or $toUpper

I would like to convert the 'state' field for all 'Organization' to all UPPER case. So
'Ky' becomes 'KY'
'tX' becomes 'TX'
'ca' becomes 'CA'
Why doesn't this work?
db.organizations.update({ state: { $exists: true } }, { $set: { state: { $toUpper: state } } }, false, true)
The $toLower and $toUpper operators you reference are for use with the aggregation framework only, and by themselves do not alter documents in a collection the way an .update() statement does. Also, it is not presently possible to reference the value of an existing field within an update statement to produce a new value.
What you need to do is "loop" the collection and make your changes:
db.organizations.find({ "state": { "$exists": true } }).forEach(function(doc) {
    db.organizations.update(
        { "_id": doc._id },
        { "$set": { "state": doc.state.toUpperCase() } }
    );
});
With MongoDB 2.6 or greater you can make this a bit better with the bulk operations API:
var bulk = db.organizations.initializeOrderedBulkOp();
var count = 0;

db.organizations.find({ "state": { "$exists": true } }).forEach(function(doc) {
    bulk.find({ "_id": doc._id }).updateOne({
        "$set": { "state": doc.state.toUpperCase() }
    });
    count++;
    if ( count % 500 == 0 ) {
        bulk.execute();
        bulk = db.organizations.initializeOrderedBulkOp();
        count = 0;
    }
});

if ( count > 0 )
    bulk.execute();
While this still basically loops the results, the writes are only sent to the database once every 500 documents, or however many you choose to set, staying under the 16MB BSON limit for the operation.
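As an aside, if your server is MongoDB 4.2 or newer (an assumption), the update command itself can take an aggregation pipeline, which makes $toUpper usable directly in the update. A minimal sketch:

db.organizations.updateMany(
    { "state": { "$exists": true } },
    // the pipeline form of update can reference the existing field value
    [ { "$set": { "state": { "$toUpper": "$state" } } } ]
)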
You have to put toUpperCase() like this:
"$set": { "state": doc.state.toUpperCase() }