Efficient MongoDB query to split a field into an array

This code splits the nicknames field in the cities collection into an array, but it's way too slow:
db.cities
  .find()
  .snapshot()
  .forEach(function(el) {
    el.nicknames = el.nicknames.split('->')
    db.cities.save(el)
  })
This code also splits the nicknames field in the cities collection into an array and it's much faster, but it temporarily causes the database size to double, which crashes my database.
db.cities.aggregate([
  { "$addFields": {
    "nicknames": { "$split": [ "$nicknames", "->" ] }
  }},
  { "$out": "cities" }
])
This seems like a trivial database task. There has to be a better way... right?

Yes, take advantage of the bulkWrite() method for efficient bulk updates, and split the update operation into batches for large collections.
Using the cursor from the aggregate operation (minus the final $out stage), you can compose the bulk update operations as:
let bulkUpdateOps = [];
const cursor = db.cities.aggregate([
  { "$project": { "nicknames": { "$split": [ "$nicknames", "->" ] } } }
]);

cursor.forEach(doc => {
  const { _id, nicknames } = doc;
  bulkUpdateOps.push({
    "updateOne": {
      "filter": { _id },
      "update": { "$set": { nicknames } },
      "upsert": true
    }
  });

  if (bulkUpdateOps.length === 1000) {
    db.cities.bulkWrite(bulkUpdateOps);
    bulkUpdateOps = [];
  }
});

if (bulkUpdateOps.length > 0) {
  db.cities.bulkWrite(bulkUpdateOps);
}
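As an aside, if you happen to be on MongoDB 4.2 or newer (which the approach above does not require), update commands accept an aggregation pipeline, so the split can be done entirely server-side and in place, without rewriting the collection through $out. A minimal sketch, assuming the nicknames field is still a string:
// Sketch only: needs MongoDB 4.2+, where updateMany() accepts a pipeline.
// The $type filter skips documents whose nicknames field was already converted.
db.cities.updateMany(
  { "nicknames": { "$type": "string" } },
  [ { "$set": { "nicknames": { "$split": [ "$nicknames", "->" ] } } } ]
)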

Related

Upsert issue when updating multiple documents using an array of IDs with $in

This query does the job fine:
db.collection.update(
{ "_id": oneIdProvided },
{ $inc: { "field": 5 } },{ upsert: true }
)
Now I would like to do the same operation multiple times with different IDs. I thought the right way was to use $in, so I tried:
db.collection.update(
{ "_id": { $in: oneArrayOfIds} },
{ $inc: { "field": 5 } },{ upsert: true }
)
Problem is: if one of the IDs provided in the array does not exist in the collection, a new document is created (which is what I want), but it is given an automatically generated ID rather than the ID I provided and was looking for.
One solution I can see would be to first run an insert query with my array of IDs (the documents that already exist would not be modified) and then run my update query with upsert: false.
Do you see a way of doing that in only one query?
We can do this by performing multiple write operations using the bulkWrite() method.
// helper generator to produce batch offsets
function* range(start, end, step) {
  for (let val = start; val < end; val += step)
    yield val;
}

let oneArrayOfIds; // For example [1, 2, 3, 4]
let bulkOp = oneArrayOfIds.map(id => {
  return {
    "updateOne": {
      "filter": { "_id": id },
      "update": { "$set": { "field": 5 } },
      "upsert": true
    }
  };
});

const limit = 1000;
const len = bulkOp.length;
if (len > limit) {
  for (let index of range(0, len, limit)) {
    db.collection.bulkWrite(bulkOp.slice(index, index + limit));
  }
} else {
  db.collection.bulkWrite(bulkOp);
}

Find and change all date type fields in mongodb collection

I have a collection with multiple date type fields. I know I can change them based on their key, but is there a way to find all fields that have date as a type and change all of them in one script?
UPDATE
Many thanks to chridam for helping me out. Based upon his code I came up with this solution. (Note: I have mongo 3.2.9, and some code snippets from chridam's answer just wouldn't run. It might be valid but it didn't work for me.)
map = function() {
for (var key in this) {
if (key != null && this[key] != null && this[key] instanceof Date){
emit(key, null);
}
}
}
collectionName = "testcollection_copy";
mr = db.runCommand({
"mapreduce": collectionName,
"map": map,
"reduce": function() {},
"out": "map_reduce_test" // out is required
})
dateFields = db[mr.result].distinct("_id")
printjson(dateFields)
//updating documents
db[collectionName].find().forEach(function (document){
for(var i=0;i<dateFields.length;i++){
document[dateFields[i]] = new NumberLong(document[dateFields[i]].getTime());
}
db[collectionName].save(document);
});
Since projection didn't work, I used the above code for updating the documents.
My only question is: why use bulkWrite?
(Also, getTime() seemed better than subtracting dates.)
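For reference, the two approaches give the same millisecond value; a quick shell check (illustrative only, the variable names are made up):
// Both expressions yield milliseconds since the Unix epoch.
var d = new Date("2016-09-01T00:00:00Z");
var viaGetTime = d.getTime();                 // 1472688000000
var viaSubtract = d - new Date("1970-01-01"); // date subtraction coerces to ms
print(viaGetTime === viaSubtract);            // true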
An operation like this would involve two tasks: one to get a list of fields with the Date type via MapReduce, and the other to update the collection via aggregation or bulk write operations.
NB: The following methodology assumes all the date fields are at the root level of the document, not inside embedded documents or arrays.
MapReduce
The first thing you need to do is run the following mapReduce operation. This will check whether each property of every document in the collection is of Date type and return a distinct list of the date fields:
// map function: define the Date type check inside map so that it is
// available on the server, where mapReduce actually executes
map = function() {
  // helper to determine if a value is of Date type
  var isDate = function(dt) {
    return dt && dt instanceof Date && !isNaN(dt.valueOf());
  };

  for (var key in this) {
    if (isDate(this[key]))
      emit(key, null);
  }
}

// variable with collection name
collectionName = "yourCollectionName";

mr = db.runCommand({
  "mapreduce": collectionName,
  "map": map,
  "reduce": function() {},
  "out": "map_reduce_test" // out is required by the mapreduce command
})

dateFields = db[mr.result].distinct("_id")
printjson(dateFields)
//output: [ "validFrom", "validTo", "registerDate"" ]
Option 1: Update collection via aggregation framework
You can use the aggregation framework to update your collection, in particular the $addFields operator available in MongoDB version 3.4 and newer. If your MongoDB server version does not support this, you can update your collection with the other workaround (as described in the next option).
The timestamp is calculated by using the $subtract arithmetic aggregation operator with the date field as the minuend and the epoch date new Date("1970-01-01") as the subtrahend.
The resulting documents of the aggregation pipeline are then written to the same collection via the $out operator thus updating the collection with the new fields.
In essence, you'd want to end up running the following aggregation pipeline which converts the date fields to timestamps using the above algorithm:
pipeline = [
  {
    "$addFields": {
      "validFrom": { "$subtract": [ "$validFrom", new Date("1970-01-01") ] },
      "validTo": { "$subtract": [ "$validTo", new Date("1970-01-01") ] },
      "registerDate": { "$subtract": [ "$registerDate", new Date("1970-01-01") ] }
    }
  },
  { "$out": collectionName }
]
db[collectionName].aggregate(pipeline)
You can dynamically create the above pipeline array given the list of the date fields as follows:
var addFields = { "$addFields": { } },
    output = { "$out": collectionName };

dateFields.forEach(function(key){
  var subtr = ["$"+key, new Date("1970-01-01")];
  addFields["$addFields"][key] = { "$subtract": subtr };
});

db[collectionName].aggregate([addFields, output])
Option 2: Update collection via Bulk
This option is a workaround for when the $addFields operator above is not supported. You can use a $project pipeline to create the new timestamp fields with the same $subtract implementation, but instead of writing the results back to the same collection, you iterate the cursor from the aggregate results with the forEach() method and, for each document, update the collection using the bulkWrite() method.
The following example shows this approach:
ops = []
pipeline = [
  {
    "$project": {
      "validFrom": { "$subtract": [ "$validFrom", new Date("1970-01-01") ] },
      "validTo": { "$subtract": [ "$validTo", new Date("1970-01-01") ] },
      "registerDate": { "$subtract": [ "$registerDate", new Date("1970-01-01") ] }
    }
  }
]

db[collectionName].aggregate(pipeline).forEach(function(doc) {
  ops.push({
    "updateOne": {
      "filter": { "_id": doc._id },
      "update": {
        "$set": {
          "validFrom": doc.validFrom,
          "validTo": doc.validTo,
          "registerDate": doc.registerDate
        }
      }
    }
  });

  if (ops.length === 500) {
    db[collectionName].bulkWrite(ops);
    ops = [];
  }
})

if (ops.length > 0)
  db[collectionName].bulkWrite(ops);
Using the same approach as Option 1 above to create the pipeline and the bulk write operations dynamically:
var ops = [],
    project = { "$project": { } };

dateFields.forEach(function(key){
  var subtr = ["$"+key, new Date("1970-01-01")];
  project["$project"][key] = { "$subtract": subtr };
});

setDocFields = function(doc, keysList) {
  var setObj = { "$set": { } };
  return keysList.reduce(function(obj, key) {
    obj["$set"][key] = doc[key];
    return obj;
  }, setObj)
}

db[collectionName].aggregate([project]).forEach(function(doc) {
  ops.push({
    "updateOne": {
      "filter": { "_id": doc._id },
      "update": setDocFields(doc, dateFields)
    }
  });

  if (ops.length === 500) {
    db[collectionName].bulkWrite(ops);
    ops = [];
  }
})

if (ops.length > 0)
  db[collectionName].bulkWrite(ops);

MongoDB: update all documents on one field

{
"_id" : 1,
"users" : 2329255
},
{
"_id" :2,
"users" : 2638831
}
How do I update all documents so that the users field is divided by 100?
The result should be:
{
"_id" : 1,
"users" : 23292.55
},
{
"_id" : 2,
"users" : 26388.31
}
db.coll.update({}, {$set: {'users': {'$divide': ['$users', 100]}}})
But it's not working.
Try the below query:
db.coll.find().snapshot().forEach(
  function (e) {
    e.users = e.users / 100;
    // save the updated document
    db.coll.save(e);
  }
)
The above query will change/update the data in the DB. If you want to just fetch records with the divided value, then use $project in an aggregation:
db.coll.aggregate(
  [
    { $project: { users: { $divide: [ "$users", 100 ] } } }
  ]
)
This will not update the data but will return the desired values.
Use whichever fits your requirement.
The $divide operator is only valid in the aggregate() function, not the update() function. What you want to do is use the aggregate() method to create a computed field, then iterate the results from
the aggregate() cursor to create bulk update operations that you can send to the server in one request, rather than sending each update request for each item in the result.
The following example demonstrates this:
var bulkUpdateOps = [];
db.coll.aggregate([
  { "$match": { "users": { "$exists": true } } },
  {
    "$project": {
      "computed_field": {
        "$divide": ["$users", 100]
      }
    }
  }
]).forEach(function(doc){
  bulkUpdateOps.push({
    "updateOne": {
      "filter": { "_id": doc._id },
      "update": { "$set": { "users": doc.computed_field } }
    }
  });

  if (bulkUpdateOps.length === 500) {
    db.coll.bulkWrite(bulkUpdateOps);
    bulkUpdateOps = [];
  }
});

if (bulkUpdateOps.length > 0) db.coll.bulkWrite(bulkUpdateOps);
Or for MongoDB 2.6.x and 3.0.x releases, use this version of Bulk operations:
var bulk = db.coll.initializeUnorderedBulkOp(),
    counter = 0;

db.coll.aggregate([
  { "$match": { "users": { "$exists": true } } },
  {
    "$project": {
      "computed_field": {
        "$divide": ["$users", 100]
      }
    }
  }
]).forEach(function(doc) {
  bulk.find({ "_id": doc._id })
      .updateOne({ "$set": { "users": doc.computed_field } });
  counter++;

  if (counter % 500 === 0) {
    bulk.execute();
    bulk = db.coll.initializeUnorderedBulkOp();
  }
});

if (counter % 500 !== 0) bulk.execute();
The bulk operations API in both cases helps reduce the IO load on the server by sending a request only once for every 500 documents in the collection to process.

Transform Multiple Array Elements with update

I have a collection of documents like
doc:{
"_id":6,
item1:"something"
item2:[
{
subitem1:value1,
subitem2:value2
},
{
subitem1:value3,
subitem2:value4
}
]
}
And I want to insert a field with the data of the other two fields and then delete them, so as to have this:
doc:{
"_id":6,
item1:"something"
item2:[
{
subitem:{field:value1,field2:value2}
},
{
subitem:{field:value3,field2:value4}
}
]
}
I have to update all the documents of the collection with one script.
I have tried several things like $set and $push, but nothing works (and no error when executed).
My last script is
db.docs.find({}).update.forEach(
function(doc){
doc.item2.forEach(
function(item){
{ $set : {item.subitem = {field:item.subitem1,field2:item.subitem2}}}
}
)
db.docs.save(doc);
}
,false,true)
This doesn't generate an error but does nothing.
And I didn't even find how to delete a field.
Please help me!
You should be looping with .bulkWrite() to commit the updates. The main thing to note here is what you are actually iterating, which is the collection items as well as the members of the target array to transform.
Either blow away the entire array with its replacement:
var ops = [];

db.docs.find({
  "item2": {
    "$elemMatch": {
      "subitem1": { "$exists": true },
      "subitem2": { "$exists": true }
    }
  }
}).forEach(function(doc) {
  doc.item2 = doc.item2.map(function(el) {
    return { "subitem": { "field1": el.subitem1, "field2": el.subitem2 } };
  });

  ops.push({
    "updateOne": {
      "filter": { "_id": doc._id },
      "update": { "$set": { "item2": doc.item2 } }
    }
  });

  // Test outside array looping
  if ( ops.length == 1000 ) {
    db.docs.bulkWrite(ops);
    ops = []
  }
});

if ( ops.length > 0 )
  db.docs.bulkWrite(ops);
Or preferably use positional matches as updates:
var ops = [];

db.docs.find({
  "item2": {
    "$elemMatch": {
      "subitem1": { "$exists": true },
      "subitem2": { "$exists": true }
    }
  }
}).forEach(function(doc) {
  doc.item2.forEach(function(item) {
    var updoc = { "subitem": { "field1": item.subitem1, "field2": item.subitem2 } };

    ops.push({
      "updateOne": {
        "filter": {
          "_id": doc._id,
          "item2": {
            "$elemMatch": {
              "subitem1": item.subitem1,
              "subitem2": item.subitem2
            }
          }
        },
        "update": { "$set": { "item2.$": updoc } }
      }
    });

    // Test inside the array looping
    if ( ops.length == 1000 ) {
      db.docs.bulkWrite(ops);
      ops = []
    }
  });
});

if ( ops.length > 0 )
  db.docs.bulkWrite(ops);
The reason the latter case is better is that the writes are actually atomic for each element, so in high-volume environments you would not get conflicting writes from other processes.
That's the speedy and safe way to transform your current array content. The first way will run a bit faster, but I really would not recommend it on a live system. The second will still be very quick, but since it's updating one array element at a time there is a bit more work to do.
In both cases the actual "wire communication" with the server happens only once in every one thousand operations, so this removes the overhead of sending a request and waiting for the response for every single update.

Insert field with array size in mongo

I have documents in MongoDB containing an array. Now I need a field containing the number of items in this array, so I need to update the documents to add this field.
I simply thought this would work:
db.myDocument.update({
"itemsTotal": {
$exists: false
},
"items": {
$exists: true
}
}, {
$set: {
itemsTotal: {
$size: "$items"
}
}
}, {
multi: true
})
But it completes with "not okForStorage".
Also I tried to make an aggregation, but it throws an exception:
"errmsg" : "exception: invalid operator '$size'",
"code" : 15999,
"ok" : 0
What is the best solution, and what am I doing wrong? I'm starting to think about writing a Java tool to calculate the totals and update the documents with it.
You can use the .aggregate() method to $project your documents and return the $size of the items array. After that you will need to loop through your aggregation result using the .forEach loop and $set the itemsTotal field for each document using "Bulk" operations for maximum efficiency.
var bulkOp = db.myDocument.initializeUnorderedBulkOp();
var count = 0;

db.myDocument.aggregate([
  { "$match": {
    "itemsTotal": { "$exists": false },
    "items": { "$exists": true }
  }},
  { "$project": { "itemsTotal": { "$size": "$items" } } }
]).forEach(function(doc) {
  bulkOp.find({ "_id": doc._id }).updateOne({
    "$set": { "itemsTotal": doc.itemsTotal }
  });
  count++;

  if (count % 200 === 0) {
    // Execute per 200 operations and re-init
    bulkOp.execute();
    bulkOp = db.myDocument.initializeUnorderedBulkOp();
  }
})

// Clean up remaining queued operations
if (count % 200 !== 0) {
  bulkOp.execute();
}
You could initialise a Bulk() operations builder to update the documents in a loop as follows:
var bulk = db.collection.initializeUnorderedBulkOp(),
    count = 0;

db.collection.find({
  "itemsTotal": { "$exists": false },
  "items": { "$exists": true }
}).forEach(function(doc) {
  var items_size = doc.items.length;
  bulk.find({ "_id": doc._id }).updateOne({
    "$set": { "itemsTotal": items_size }
  });
  count++;

  if (count % 100 == 0) {
    bulk.execute();
    bulk = db.collection.initializeUnorderedBulkOp();
  }
});

if (count % 100 != 0) { bulk.execute(); }
This is much easier starting with MongoDB v3.4, which introduced the $addFields aggregation pipeline operator. We'll also use the $out operator to output the result of the aggregation to the same collection (replacing the existing collection is atomic).
db.myDocuments.aggregate([
  {
    $addFields: {
      itemsTotal: { $size: "$items" }
    }
  },
  {
    $out: "myDocuments"
  }
])
WARNING: this solution requires all documents to have the items field. If some documents don't have it, aggregate will fail with:
"The argument to $size must be an array, but was of type: missing"
You might think you could add a $match to the aggregation to filter only the documents containing items, but that means all documents not containing items would not be output back to the myDocuments collection by $out, so you would lose those permanently.
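One way around that trade-off (not part of the original answer, just a sketch) is to compute the size conditionally, so documents without an items array still flow through the pipeline and get a default count instead of breaking $size; $cond and $isArray are available on any server new enough for $addFields:
// Sketch: guard $size with $cond/$isArray so documents lacking an items
// array get itemsTotal: 0 instead of failing, and are still written back by $out.
db.myDocuments.aggregate([
  {
    $addFields: {
      itemsTotal: {
        $cond: [ { $isArray: "$items" }, { $size: "$items" }, 0 ]
      }
    }
  },
  { $out: "myDocuments" }
])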