How do you consistently migrate a large MongoDB collection? - mongodb

I was trying to migrate a large MongoDB collection of ~600k documents, like so:
for await (const doc of db.collection('collection').find({
  legacyProp: { $exists: true },
})) {
  // additional data fetching from separate collections here
  const newPropValue = await fetchNewPropValue(doc._id)
  await db.collection('collection').findOneAndUpdate(
    { _id: doc._id },
    [{ $set: { newProp: newPropValue } }, { $unset: ['legacyProp'] }]
  )
}
After the migration script finished, data was still being updated for about 30 minutes or so. I concluded this by counting the documents that still contain the legacyProp property:
db.collection.countDocuments({ legacyProp: { $exists: true } })
The count was decreasing on subsequent calls. After a while the updates stopped, and the final count of documents containing legacyProp was around 300k, so the update failed silently, resulting in data loss. I'm curious what exactly happened and, most importantly, how do you update large MongoDB collections without any data loss? Keep in mind that there is additional data fetching involved before every update operation.

My first attempt would be to express the logic of fetchNewPropValue() in an aggregation pipeline.
Have a look at the Aggregation Pipeline Operators.
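For example, if fetchNewPropValue() only reads a value from one other collection, the whole migration could stay server-side with $lookup and $merge. A minimal sketch, assuming a hypothetical sourceCollection keyed by the same _id with the new value in a value field ($merge back into the collection being aggregated requires MongoDB 4.4+):
db.getCollection('collection').aggregate([
  { $match: { legacyProp: { $exists: true } } },
  // hypothetical collection standing in for whatever fetchNewPropValue() reads
  { $lookup: { from: 'sourceCollection', localField: '_id', foreignField: '_id', as: 'src' } },
  { $set: { newProp: { $first: '$src.value' } } },
  { $unset: ['legacyProp', 'src'] },
  // write each transformed document back over the original
  { $merge: { into: 'collection', on: '_id', whenMatched: 'replace', whenNotMatched: 'discard' } }
])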
If this is not possible, then you can try to put all the newPropValue entries into an array and use it like this. 600k values should fit easily into your RAM.
const newPropValues = await fetchNewPropValue() // all new properties as an array [{ _id: ..., val: ... }, { _id: ..., val: ... }, ...]
db.getCollection('collection').updateMany(
  { legacyProp: { $exists: true } },
  [
    {
      $set: {
        newProp: {
          $first: {
            $filter: { input: newPropValues, cond: { $eq: ["$_id", "$$this._id"] } }
          }
        }
      }
    },
    { $set: { legacyProp: "$$REMOVE", newProp: "$newProp.val" } }
  ]
)
Or you can try bulkWrite:
let bulkOperations = []
const cursor = db.collection('collection').find({ legacyProp: { $exists: true } })
for await (const doc of cursor) {
  const newPropValue = await fetchNewPropValue(doc._id)
  bulkOperations.push({
    updateOne: {
      filter: { _id: doc._id },
      update: {
        $set: { newProp: newPropValue },
        $unset: { legacyProp: "" }
      }
    }
  })
  // flush the batch every 10k operations
  if (bulkOperations.length > 10000) {
    await db.collection('collection').bulkWrite(bulkOperations, { ordered: false })
    bulkOperations = []
  }
}
if (bulkOperations.length > 0) {
  await db.collection('collection').bulkWrite(bulkOperations, { ordered: false })
}
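Whichever variant you use, it is worth re-running the count from the question once the script completes, so a silent partial failure is noticed immediately:
const remaining = await db.collection('collection').countDocuments({ legacyProp: { $exists: true } })
if (remaining > 0) console.warn(`${remaining} documents still contain legacyProp`)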

Related

MongoDB: How to speed up my data reorganisation query/operation?

I'm trying to analyse some data, and I thought my queries would ultimately be faster if I stored a relationship between my collections instead. So I wrote something to do the data normalisation, which is as follows:
var count = 0;
db.Interest.find({ 'PersonID': { $exists: false }, 'Data.DateOfBirth': { $ne: null } })
  .toArray()
  .forEach(function (x) {
    if (null != x.Data.DateOfBirth) {
      var peep = { 'Name': x.Data.Name, 'BirthMonth': x.Data.DateOfBirth.Month, 'BirthYear': x.Data.DateOfBirth.Year };
      var person = db.People.findOne(peep);
      if (null == person) {
        peep._id = db.People.insertOne(peep).insertedId;
        //print(peep._id);
      } else {
        peep._id = person._id; // reuse the existing person's _id
      }
      db.Interest.updateOne({ '_id': x._id }, { $set: { 'PersonID': peep._id } })
      ++count;
      if ((count % 1000) == 0) {
        print(count + ' updated');
      }
    }
  })
This script is just passed to mongo.exe.
Basically, I attempt to find an existing person, if they don't exist create them. In either case, link the originating record with the individual person.
However this is very slow! There's about 10 million documents and at the current rate it will take about 5 days to complete.
Can I speed this up simply? I know I can multithread it to cut it down, but have I missed something?
In order to insert new persons into the People collection, use this pipeline:
db.Interest.aggregate([
  {
    $project: {
      Name: "$Data.Name",
      BirthMonth: "$Data.DateOfBirth.Month",
      BirthYear: "$Data.DateOfBirth.Year",
      _id: 0
    }
  },
  {
    $merge: {
      into: "People",
      // requires a unique index on { Name: 1, BirthMonth: 1, BirthYear: 1 }
      on: ["Name", "BirthMonth", "BirthYear"]
    }
  }
])
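The unique index mentioned in the comment has to exist before the $merge runs; it can be created like this:
db.People.createIndex({ Name: 1, BirthMonth: 1, BirthYear: 1 }, { unique: true })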
For updating PersonID in the Interest collection, use this pipeline:
db.Interest.aggregate([
  {
    $lookup: {
      from: "People",
      let: {
        name: "$Data.Name",
        month: "$Data.DateOfBirth.Month",
        year: "$Data.DateOfBirth.Year"
      },
      pipeline: [
        {
          $match: {
            $expr: {
              $and: [
                { $eq: ["$Name", "$$name"] },
                { $eq: ["$BirthMonth", "$$month"] },
                { $eq: ["$BirthYear", "$$year"] }
              ]
            }
          }
        },
        { $project: { _id: 1 } }
      ],
      as: "interests"
    }
  },
  {
    $set: {
      PersonID: { $first: "$interests._id" },
      interests: "$$REMOVE"
    }
  },
  { $merge: { into: "Interest" } }
])
Mongo Playground
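If only the unprocessed documents should be touched, the filter from the original script can be added as a first stage to both pipelines, e.g.:
{ $match: { PersonID: { $exists: false }, "Data.DateOfBirth": { $ne: null } } }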

Delete document that has size greater than a specific value

I have a collection which contains multiple documents whose size has grown to 16 MB or is about to reach 16 MB.
I want a query that finds documents whose size is greater than 10 MB and deletes all of them.
I am using the following to find the size of a document.
Object.bsonsize(db.test.findOne({type:"auto"}))
Is there a way to embed this query inside a db.test.deleteMany() query?
The following query deletes documents with a size greater than the specified size (in bytes). This query is valid for MongoDB v4.4 or higher.
db.collection.deleteMany( {
  $expr: { $gt: [ { $bsonSize: "$$ROOT" }, SIZE_LIMIT ] },
  type: "auto"
} )
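With the 10 MB threshold from the question, that becomes:
db.test.deleteMany( {
  type: "auto",
  $expr: { $gt: [ { $bsonSize: "$$ROOT" }, 10 * 1024 * 1024 ] }
} )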
The following script works with MongoDB v4.2 or earlier:
const SIZE_LIMIT = 75 // substitute your value here in bytes
let idsToDelete = [ ]
let crsr = db.collection.find()
while (crsr.hasNext()) {
  let doc = crsr.next()
  if (Object.bsonsize(doc) > SIZE_LIMIT) {
    idsToDelete.push(doc._id)
  }
}
db.collection.deleteMany( { _id: { $in: idsToDelete } } )
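If idsToDelete could grow very large, the deletes can also be flushed in batches from inside the loop instead of in one final call, e.g. (a sketch; the batch size of 10,000 is arbitrary):
if (idsToDelete.length >= 10000) {
  db.collection.deleteMany( { _id: { $in: idsToDelete } } )
  idsToDelete = [ ]
}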
I think you have to do it like this:
db.test.aggregate([
{ $match: { type: "auto" } },
{ $project: { bsonSize: { $bsonSize: "$$ROOT" } } },
{ $match: { bsonSize: { $gt: 16e6 } } },
]).forEach(function (doc) {
db.test.deleteOne({ _id: doc._id });
})
Or if you prefer deleteMany:
var ids = db.test.aggregate([
  { $match: { type: "auto" } },
  { $project: { bsonSize: { $bsonSize: "$$ROOT" } } },
  { $match: { bsonSize: { $gt: 16e6 } } }
]).toArray().map(x => x._id);
db.test.deleteMany({ _id: { $in: ids } });

Mongodb update duplicate array value

I am getting in an array of data and want to insert it into my MongoDB. I want to overwrite any duplicate values with the new one (the one I'm uploading), and if there are no duplicates just add it onto the current array.
I currently have:
db.cases.updateOne(
  { companyID: 218 },
  {
    $addToSet: {
      cases: [AN ARRAY OF CASES]
    },
    $currentDate: { lastModified: true }
  })
The collection has multiple companies and each has an array of cases.
The other thing that doesn't seem to work is that the $currentDate field (lastModified) doesn't change whenever I update the cases; I'm not sure if that's down to the way I have written the query?
Thank you.
You can use $addToSet (https://docs.mongodb.com/manual/reference/operator/update/addToSet/) together with the filtered positional operator $[<identifier>] (https://docs.mongodb.com/manual/reference/operator/update/positional-filtered/) to accomplish this.
I scripted an example in Node.js to show you what I mean:
const { MongoClient } = require('mongodb');

async function main() {
  /**
   * Connection URI. Update <username>, <password>, and <your-cluster-url> to reflect your cluster.
   */
  const uri = "mongodb+srv://<username>:<password>@<your-cluster-url>?retryWrites=true&w=majority";

  /**
   * The Mongo Client you will use to interact with your database
   */
  const client = new MongoClient(uri, { useUnifiedTopology: true });

  try {
    // Connect to the MongoDB cluster
    await client.connect();

    // Make the appropriate DB calls
    await updateArray(client, 'id1')
  } finally {
    // Close the connection to the MongoDB cluster
    await client.close();
  }
}

main().catch(console.error);

async function updateArray(client, id) {
  const docId = "299";
  const mynewdoc = {
    id: docId,
    priority: 5,
    casenumber: 40,
    new: "field"
  }

  // First update: overwrite an existing case whose id matches docId
  const result = await client.db("NameOfYourDb").collection("NameOfYourCollection").updateOne(
    { _id: id },
    {
      $set: {
        "cases.$[element]": mynewdoc
      }
    },
    {
      arrayFilters: [{ "element.id": docId }]
    }
  )

  if (result) {
    console.log(result);
  } else {
    console.log(`Not found '${id}'`);
  }

  // Second update: add the case if it is not already present in the array
  const result2 = await client.db("NameOfYourDb").collection("NameOfYourCollection").updateOne(
    { _id: id },
    {
      $addToSet: {
        "cases": mynewdoc
      }
    }
  )

  if (result2) {
    console.log(result2);
  } else {
    console.log(`Not found '${id}'`);
  }
}
See https://www.mongodb.com/blog/post/quick-start-nodejs-mongodb--how-to-get-connected-to-your-database for an explanation of how the Node.js code is structured.
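Since the question starts from an array of incoming cases, the two updates could be run once per element, e.g. by looping over the upload (a sketch, assuming updateArray is extended to accept the case object as a third parameter and incomingCases is the uploaded array):
for (const incomingCase of incomingCases) {
  await updateArray(client, 'id1', incomingCase)
}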
So I have figured out a workaround, but it is not exactly what I wanted - maybe someone can use this and expand upon it to make it work better:
db.cases.aggregate([
  { $match: { 'companyID': 218 } },
  { $unwind: '$cases' },
  { $group: { "_id": "$cases.casenumber", cases: { $push: '$cases' }, "count": { $sum: 1 } } },
  { $match: { "count": { "$gt": 1 } } },
  { $project: { "cases": { $slice: [ "$cases", -1, { $subtract: [ { $size: "$cases" }, 1 ] } ] } } },
  { $project: { "cases": { $arrayElemAt: ['$cases', 0] } } },
  { $group: { _id: 1, cases: { $push: '$cases' } } },
  { $out: 'cases' }
])
The only problem is that using $out overwrites the cases collection, so I need to figure out a way to write the result back into the cases array inside the company document (companyID 218).
Hopefully this can help someone though.
Thanks.

Mongodb $expr query is very slow

I have a Mongo 4.2.0 instance here on my development environment with a simple collection of only 300 entries.
I've built some basic queue handling, juggling a few date fields.
To get a document that should be updated I have the following $expr query, which runs very slowly imho.
db.collection("myupdates").findOneAndUpdate({
$expr: {
$and: [
{ $gt: ["$shouldUpdate", "$updatedAt"] },
{ $gt: ["$shouldUpdate", "$isUpdatingAt"] },
{ $gt: ["$shouldUpdate", "$updateErroredAt"] },
]
},
}, {
$set: {
isUpdatingAt: new Date(),
},
});
This query takes around ~120ms after warmup on my standard 2019 laptop, whereas my other simple queries only take ~3ms.
Although indexes shouldn't really matter with 300 documents, I have of course tried setting them all, from single-field to compound indexes. That does not do the trick.
It's also not the findOneAndUpdate itself; with countDocuments I get the same slow speed.
Is this the normal speed of $expr / aggregation syntax? What did I do wrong? Is there a better way to achieve this? Do I have to use Redis for this use case?
Possible solution
As @Neil Lunn pointed out in the answers, calculated conditions do not utilize an index and should be a last resort.
So I got rid of the calculated condition by splitting the query into 2 queries. The first query fetches an actual value I can match against.
These 2 queries boil down to ~10ms total, which is much better than 120ms.
const shouldUpdateDateResult = await mongo.db.collection("myupdates").findOne({
  shouldUpdate: { $exists: true }
}, {
  projection: { shouldUpdate: 1 },
});
const shouldUpdateDate = shouldUpdateDateResult && shouldUpdateDateResult.shouldUpdate;
const result = await mongo.db.collection("myupdates").findOneAndUpdate({
  $and: [
    { shouldUpdate: shouldUpdateDate },
    { $or: [
      { updatedAt: { $eq: null } },
      { updatedAt: { $exists: false } },
      { updatedAt: { $lte: shouldUpdateDate } }
    ] },
    { $or: [
      { isUpdatingAt: { $eq: null } },
      { isUpdatingAt: { $exists: false } },
      { isUpdatingAt: { $lte: shouldUpdateDate } }
    ] },
    { $or: [
      { updateErroredAt: { $eq: null } },
      { updateErroredAt: { $exists: false } },
      { updateErroredAt: { $lte: shouldUpdateDate } }
    ] },
  ],
}, {
  $set: {
    isUpdatingAt: new Date(),
  },
});
The whole idea behind this is a processing queue usable by multiple workers.
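Since the rewritten queries no longer rely on computed conditions, a regular index can now back them, for example (an assumption; pick whatever matches your actual query shape):
await mongo.db.collection("myupdates").createIndex({ shouldUpdate: 1 })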

Update Multiple Sub Doc By array of sub doc _id's in mongodb

I am trying to update multiple sub-documents given an array of sub-document ids. I tried multiple approaches, but none of them worked.
In my scenario I need to update multiple sub-documents matching a given array of ids. Here are my queries:
Approach 1 (no elements were updated):
var updated = await ModelName.update(
  {
    'subDocArray._id': { $in: req.body.elementId }
  },
  {
    $set: {
      'subDocArray.$[elem].abc': req.body.abcValue,
      'subDocArray.$[elem].xyz': req.body.xyzValue
    },
  },
  { "arrayFilters": [{ "elem._id": { $in: req.body.elementId } }], "multi": true, "upsert": true }
).lean().exec();
Approach 2 (only the first matching element is updated):
var updated = await ModelName.update(
  {
    'subDocArray._id': { $in: req.body.elementId }
  },
  {
    $set: {
      'subDocArray.$.abc': req.body.abcValue,
      'subDocArray.$.xyz': req.body.xyzValue
    },
  },
  { multi: true }
).exec();
Here req.body.elementId is an array of sub-document ids.
Approach 1 was almost right. I was passing an array of elementIds in string format, so I converted them to ObjectId form and then it worked.
var arrOfObjectId = [];
req.body.elementId.forEach(elem => {
arrOfObjectId.push(Types.ObjectId(elem))
});
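As a side note, the same conversion can be written as a one-liner with map:
var arrOfObjectId = req.body.elementId.map(elem => Types.ObjectId(elem));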
To see the difference between the two arrays, I printed both to the console, which showed the following:
console.log(req.body.elementId)
Result: ['xxxxxxxxxxxxxxxxxxxxxxxx','yyyyyyyyyyyyyyyyyyyyyyyy'] //WRONG
console.log(arrOfObjectId)
Result: [ObjectId('xxxxxxxxxxxxxxxxxxxxxxxx'),ObjectId('yyyyyyyyyyyyyyyyyyyyyyyy')] //RIGHT
var updated = await ModelName.update(
  {
    'subDocArray._id': { $in: arrOfObjectId }
  },
  {
    $set: {
      'subDocArray.$[elem].abc': req.body.abcValue,
      'subDocArray.$[elem].xyz': req.body.xyzValue
    },
  },
  { "arrayFilters": [{ "elem._id": { $in: arrOfObjectId } }], "multi": true, "upsert": true }
).lean().exec();