I am using mongoskin in my Node.js application to insert data into MongoDB. I have a requirement to insert an array of documents into the database and send the ids of the inserted records back to the client. I am able to insert the data, but I am unable to locate the ids of the inserted records in the result object. I need help locating the inserted ids in the result. I'm using the code below to bulk insert.
// Queue one insert per entry of dataArray on a single unordered bulk
// operation, then execute the whole batch.
db.collection('myCollection', function (err, collection) {
  const bulk = collection.initializeUnorderedBulkOp();
  for (const doc of dataArray) {
    bulk.insert(doc);
  }
  bulk.execute(function (err, result) {
    //TODO: return the Ids of inserted records to the client
    //Client will use these Ids to perform subsequent calls to the nodejs service
  });
});
My result is a BatchWriteResult Object type.
I would suggest using the other bulk API method, upsert(), which lets you get the _id values of the inserted documents from your BatchWriteResult() object by calling its getUpsertedIds() method. The result object is in the same format as given in the documentation for BulkWriteResult.
The update operation with the Bulk.find.upsert() option will perform an insert when there are no matching documents for the Bulk.find() condition. If the update document does not specify an _id field, MongoDB adds the _id field and thus you can retrieve the id's of the inserted document
within your BatchWriteResult().
Also, the way you are queuing up your bulk insert operations is not usually recommended, since this basically builds up in memory; you'd want a bit more control over managing the queues and memory resources rather than relying on the driver's default way of limiting the batches to 1000 at a time, as well as the complete batch being under 16MB. The way you can do this is to use a forEach() loop over your data array with a counter that will help limit the batches to 1000 at a time.
The following shows the above approach
/**
 * Extracts the upserted ids from a bulk write result.
 *
 * @param {Object} [result] - Result handed to the `bulk.execute()` callback;
 *   it must expose `getUpsertedIds()`.
 * @returns {Array} Whatever `result.getUpsertedIds()` returns, or an empty
 *   array when no result was supplied.
 */
function getInsertedIds(result) {
  // The callers pass the callback's result straight through without checking
  // `err`, so tolerate a missing result instead of throwing a TypeError.
  if (result == null) {
    return [];
  }
  var ids = result.getUpsertedIds();
  console.log(ids); // an array of upserted ids
  return ids;
}
// Upserts every entry of dataArray in batches of 1000 operations and gathers
// the ids reported by getInsertedIds() for each batch.
db.collection('myCollection', function (err, collection) {
  if (err) {
    console.error(err);
    return;
  }

  const BATCH_SIZE = 1000;
  let bulk = collection.initializeUnorderedBulkOp();
  let insertedIds = [];
  let counter = 0;
  let pendingBatches = 0;
  let allQueued = false;

  // Logs the accumulated ids once every batch has been dispatched and has
  // reported back.
  function reportWhenDone() {
    if (allQueued && pendingBatches === 0) {
      console.log(insertedIds);
    }
  }

  // Executes one batch and appends its ids to the running list.
  function executeBatch(batch) {
    pendingBatches++;
    batch.execute(function (err, result) {
      pendingBatches--;
      if (err) {
        console.error(err);
      } else {
        // Append: assigning here would discard the ids of earlier batches.
        insertedIds = insertedIds.concat(getInsertedIds(result));
      }
      reportWhenDone();
    });
  }

  dataArray.forEach(function (data) {
    bulk.find(data).upsert().updateOne(data);
    counter++;
    if (counter % BATCH_SIZE === 0) {
      executeBatch(bulk);
      // Start a fresh batch right away. forEach keeps running before the
      // execute callback fires, so resetting inside the callback would queue
      // the next operations on the batch that is already executing.
      bulk = collection.initializeUnorderedBulkOp();
    }
  });

  // Clean up the remaining operations in the queue which were
  // cut off in the loop - counter not a round divisor of BATCH_SIZE
  if (counter % BATCH_SIZE !== 0) {
    executeBatch(bulk);
  }

  allQueued = true;
  reportWhenDone();
});
Related
I have a number in the usersnumber collection (counter document), and I want to take that number and put it in the users collection in the number field (as you can see in the photo). Is there any way I can get the data from usersnumber and update the users > collection > document > number?
I expect having the number 10 from the usersnumber collection in the collection users > document > number: 0
a transaction is a set of read and write operations on one or more
documents.
Using the Cloud Firestore client libraries, you can group multiple
operations into a single transaction. Transactions are useful when you
want to update a field's value based on its current value, or the
value of some other field.
https://firebase.google.com/docs/firestore/manage-data/transactions#transactions
import { runTransaction } from "firebase/firestore";
// Reads a document and writes back a value derived from it inside a single
// transaction, logging whether the transaction committed or failed.
try {
  await runTransaction(db, async (transaction) => {
    const sfDoc = await transaction.get(sfDocRef);
    if (!sfDoc.exists()) {
      // Throw an Error object rather than a bare string so the failure
      // carries a stack trace when it is logged below.
      throw new Error("Document does not exist!");
    }
    const newPopulation = sfDoc.data().population + 1;
    transaction.update(sfDocRef, { population: newPopulation });
  });
  console.log("Transaction successfully committed!");
} catch (e) {
  console.log("Transaction failed: ", e);
}
I'm creating a pre-trigger for a Cosmos DB container. The pre-trigger is supposed to fetch all data related to the triggering document id. The incoming_document.items is always returning 100 when there are more than 100 documents expected (which seems to be limited by the query). I tried to set the pageSize property to -1 in the FeedOptions parameters and to use continuation, but it is still giving me 100. How can I fix this to give the total count?
Here is a simplified version of the code (without the continuation, I used a similar code to here):
/**
 * Pre-trigger: counts the documents that share the incoming document's
 * `customer` value and stores the total (plus 1, the initial value) in
 * `items` on the request body.
 *
 * The query is read page by page: while the response carries a continuation
 * token, the next page is requested, so the count is not capped at one page.
 *
 * @throws {Error} When a query page fails or is not accepted.
 */
function trgAddStats() {
  var context = getContext();
  var request = context.getRequest();
  var incoming_document = request.getBody();
  var container = context.getCollection();
  // Plain property assignment; `var incoming_document.items` is not valid JavaScript.
  incoming_document.items = 1;
  var filterQuery = {
    // Parameter names are written with a leading "@".
    "query": `SELECT t.customer, t.amount FROM Transactions_ds t WHERE t.customer = @customer`,
    "parameters": [{
        "name": "@customer",
        "value": incoming_document.customer
      }
    ]
  };

  queryNextPage(undefined);

  // Requests one page of results, adds its size to the running count and
  // either follows the continuation token or writes the final body.
  function queryNextPage(continuation) {
    var isAccepted = container.queryDocuments(container.getSelfLink(), filterQuery, { continuation: continuation },
      function (err, items, responseOptions) {
        if (err) throw new Error("Error" + err.message);
        incoming_document.items += items.length;
        if (responseOptions && responseOptions.continuation) {
          queryNextPage(responseOptions.continuation);
        } else {
          request.setBody(incoming_document);
        }
      }
    );
    if (!isAccepted) throw new Error("Unable to update transaction, abort");
  }
}
To get more than 100 documents per page in Cosmos DB, we can make use of x-ms-max-item-count.
The maximum number of items that a query execution can return per page is set by the x-ms-max-item-count header.
The default is 100 items per page, and it can be configured from 1–1000 using this header.
For more details, see the section on pagination of query results in the Microsoft documentation.
You can Customize the number for Items per page in Query Explorer too like here.
I have cases where I build bulkWrite operations where some documents have the same update object, is there any performance benefit to merging the filters and send one updateMany with those filters instead of multiple updateOnes in the same bulkWrite?
It's obviously better to use updateMany over multiple updateOnes when using the normal methods, but with bulkWrite, since it's a single command, are there any significant gains of preferring one over the other?
Example:
I have 200k documents that I need to update, I have 10 total unique status field for all 200K documents, so my options are:
Solutions:
A) Send one single bulkWrite with 10 updateMany operations, and each one of those operations will affect 20K documents.
B) Send one single bulkWrite with 200K updateOne each operations holding its filter and status.
As @AlexBlex noted, I have to look out for accidentally updating more than one document with the same filter. In my case I use _id as my filter, so accidentally updating other documents is not a concern, but it is definitely something to look out for when considering the updateMany option.
Thanks @AlexBlex.
Short answer:
Using updateMany is at least twice as fast, but it might accidentally update more documents than you intended; keep reading to learn how to avoid this and gain the performance benefits.
Long answer:
We ran the following experiment to know the answer for that, the following are the steps:
Create a bankaccounts mongodb collection, each document contains only one field (balance).
Insert 1 million documents into the bankaccounts collection.
Randomize the order in memory of all 1 million documents to avoid any possible optimizations from the database using ids that are inserted in the same sequence, simulating a real-world scenario.
Build write operations for bulkWrite from the documents with a random number between 0 and 100.
Execute the bulkWrite.
Log the time the bulkWrite took.
Now, the experiment lies in the 4th step.
In one variation of the experiment we build an array consisting of 1 million updateOne operations; each updateOne has a filter for a single document and its respective update object.
In the second variation, we build 100 updateMany operations, each including filter for 10K documents ids, and their respective update.
Results:
updateMany with multiple document ids is about 2.4 times as fast as multiple updateOnes (roughly 59% less time). This cannot be used everywhere, though; please read "The risk" section to learn when it should be used.
Details:
We ran the script 5 times for each variation, the detailed results are as follows:
With updateOne: 51.28 seconds on average.
With updateMany: 21.04 seconds on average.
The risk:
As many people have already pointed out, updateMany is not a direct substitute to updateOne, since it can incorrectly update multiple documents when our intention was to really update only one document.
This approach is only valid when you're using a field that is unique such as _id or any other field that is unique, if the filter is depending on fields that are not unique, multiple documents will be updated and the results will not be equivalent.
65831219.js
// 65831219.js
'use strict';
const mongoose = require('mongoose');
const { Schema } = mongoose;
// Number of accounts created for every benchmark cycle.
const DOCUMENTS_COUNT = 1_000_000;
// Divisor used to derive the chunk size of the updateMany variation
// (accounts.length / UPDATE_MANY_OPERATIONS_COUNT accounts per operation).
const UPDATE_MANY_OPERATIONS_COUNT = 100;
// Inclusive bounds passed to getRandomInteger() for generated balances.
const MINIMUM_BALANCE = 0;
const MAXIMUM_BALANCE = 100;
// Number of iterations of the main loop; each iteration runs both variations once.
const SAMPLES_COUNT = 10;
// Schema with a single numeric `balance` field.
const bankAccountSchema = new Schema({
balance: { type: Number }
});
const BankAccount = mongoose.model('BankAccount', bankAccountSchema);
// mainRunner is a function declaration, so it is hoisted and can be called
// before its definition; a rejection is reported through console.error.
mainRunner().catch(console.error);
/**
 * Runs SAMPLES_COUNT rounds of the benchmark. Every round times the
 * updateMany variation first and the updateOne variation second, prints a
 * separator line, and the process exits with code 0 after the last round.
 * A failing cycle is reported through console.error and does not stop the run.
 */
async function mainRunner () {
  const builders = [buildUpdateManyWriteOperations, buildUpdateOneWriteOperations];
  let sample = 0;
  while (sample < SAMPLES_COUNT) {
    for (const builder of builders) {
      try {
        await runOneCycle(builder);
      } catch (error) {
        console.error(error);
      }
    }
    console.log('-'.repeat(80));
    sample += 1;
  }
  process.exit(0);
}
/**
 * Runs one benchmark cycle: connects, drops the database, creates
 * DOCUMENTS_COUNT accounts, builds the write operations with the given
 * builder, and logs how long the bulkWrite call alone took.
 *
 * @param {buildUpdateManyWriteOperations|buildUpdateOneWriteOperations} buildBulkWrite
 */
async function runOneCycle (buildBulkWrite) {
  const connectionOptions = {
    useNewUrlParser: true,
    useUnifiedTopology: true
  };
  await mongoose.connect('mongodb://localhost:27017/test', connectionOptions);
  await mongoose.connection.dropDatabase();

  const seeded = await createAccounts({ accountsCount: DOCUMENTS_COUNT });
  const built = buildBulkWrite({ accounts: seeded.accounts });

  // Only the bulkWrite call sits between the two clock reads.
  const startedAt = Date.now();
  await BankAccount.bulkWrite(built.writeOperations);
  const finishedAt = Date.now();

  const elapsedSeconds = (finishedAt - startedAt) / 1000;
  console.log(`Write operations took ${elapsedSeconds} seconds with \`${buildBulkWrite.name}\`.`);
}
/**
 * Inserts `accountsCount` accounts, each with a random balance between
 * MINIMUM_BALANCE and MAXIMUM_BALANCE.
 *
 * @param {{ accountsCount: number }} params
 * @returns {Promise<{ accounts: Array }>} The value resolved by insertMany, wrapped in an object.
 */
async function createAccounts ({ accountsCount }) {
  const makeRawAccount = () => {
    return { balance: getRandomInteger(MINIMUM_BALANCE, MAXIMUM_BALANCE) };
  };
  const rawAccounts = Array.from({ length: accountsCount }, makeRawAccount);
  return { accounts: await BankAccount.insertMany(rawAccounts) };
}
/**
 * Builds one updateOne operation per account, each filtering on the
 * account's _id and setting a fresh random balance.
 * The accounts array is shuffled in place first.
 *
 * @param {{ accounts: Array }} params
 * @returns {{ writeOperations: Array }}
 */
function buildUpdateOneWriteOperations ({ accounts }) {
  const shuffled = shuffleArray(accounts);
  const writeOperations = [];
  for (const { _id } of shuffled) {
    const balance = getRandomInteger(MINIMUM_BALANCE, MAXIMUM_BALANCE);
    writeOperations.push({
      updateOne: {
        filter: { _id },
        update: { balance }
      }
    });
  }
  return { writeOperations };
}
/**
 * Builds updateMany operations, each targeting one chunk of account ids
 * (via `$in`) and setting a single random balance for the whole chunk.
 * The accounts array is shuffled in place first.
 *
 * @param {{ accounts: Array }} params
 * @returns {{ writeOperations: Array }}
 */
function buildUpdateManyWriteOperations ({ accounts }) {
  shuffleArray(accounts);
  // Round up: when the account count is not a multiple of
  // UPDATE_MANY_OPERATIONS_COUNT the plain division is fractional, and a
  // fractional chunk size is never matched by chunkArray's length check,
  // which would put every account into one chunk.
  const chunkSize = Math.ceil(accounts.length / UPDATE_MANY_OPERATIONS_COUNT);
  const accountsChunks = chunkArray(accounts, chunkSize);
  const writeOperations = accountsChunks.map((accountsChunk) => ({
    updateMany: {
      filter: { _id: { $in: accountsChunk.map(account => account._id) } },
      update: { balance: getRandomInteger(MINIMUM_BALANCE, MAXIMUM_BALANCE) }
    }
  }));
  return { writeOperations };
}
/**
 * Returns a random integer between `min` (rounded up) and `max` (rounded
 * down), both bounds included.
 *
 * @param {number} [min=0]
 * @param {number} [max=1]
 * @returns {number}
 */
function getRandomInteger (min = 0, max = 1) {
  const lower = Math.ceil(min);
  const upper = Math.floor(max);
  const span = upper - lower + 1;
  return lower + Math.floor(Math.random() * span);
}
/**
 * Shuffles `array` in place (Fisher-Yates) and returns the same array.
 *
 * @param {Array} array
 * @returns {Array} The array that was passed in.
 */
function shuffleArray (array) {
  // Walk from the end: pick a random slot among the not-yet-fixed elements
  // and swap it into the last open position.
  for (let remaining = array.length; remaining !== 0;) {
    const pick = Math.floor(Math.random() * remaining);
    remaining -= 1;
    [array[remaining], array[pick]] = [array[pick], array[remaining]];
  }
  return array;
}
/**
 * Splits `array` into consecutive chunks of at most `sizeOfTheChunkedArray`
 * elements; the last chunk may be shorter.
 *
 * A fractional size behaves like the next whole number, and a size below 1
 * yields one-element chunks.
 *
 * @param {Array} array
 * @param {number} sizeOfTheChunkedArray
 * @returns {Array<Array>} New array of chunks; `array` itself is not modified.
 */
function chunkArray (array, sizeOfTheChunkedArray) {
  const chunked = [];
  for (const element of array) {
    const last = chunked[chunked.length - 1];
    // `>=` rather than `===`: an exact match never happens for a
    // non-integer size, which would let the first chunk grow without bound.
    if (!last || last.length >= sizeOfTheChunkedArray) {
      chunked.push([element]);
    } else {
      last.push(element);
    }
  }
  return chunked;
}
Output
$ node 65831219.js
Write operations took 20.803 seconds with `buildUpdateManyWriteOperations`.
Write operations took 50.84 seconds with `buildUpdateOneWriteOperations`.
----------------------------------------------------------------------------------------------------
Tests were run using MongoDB version 4.0.4.
At high level, if you have same update object, then you can do updateMany rather than bulkWrite
Reason:
bulkWrite is designed to send multiple different commands to the server as mentioned here
If you have same update object, updateMany is best suited.
Performance:
If you have 10k update commands in bulkWrite, they will be executed in batches internally, which may impact the execution time.
Exact lines from the reference about batching:
Each group of operations can have at most 1000 operations. If a group exceeds this limit, MongoDB will divide the group into smaller groups of 1000 or less. For example, if the bulk operations list consists of 2000 insert operations, MongoDB creates 2 groups, each with 1000 operations.
Thanks @Alex
Sometimes I create a temporary collection to aggregate data from multiple collections into a single collection for reporting. I need to drop this temporary collection shortly after a report is created to avoid filling up disk space with temporary collections from many report requests.
Currently I execute this from the application
// Drop the one-off temporary collection by name. No callback is passed, so
// the outcome of the drop is not inspected at this call site.
db.dropCollection('tempCollection564e1f5a4abea9100523ade5');
but the results are not consistent each time it runs. Sometimes the collection drops successfully, but other times the collection fails to drop with this error message:
MongoError: exception: cannot perform operation: a background operation is currently running for collection databaseName.tempCollectionName
code: 12587
What is a best practice for deleting temporary collections in MongoDB? I currently name the collection with a UUID to avoid name collisions, and the collection is only used once before I attempt to destroy the temporary collection.
Is there a way to check if operations are in progress for a collection, and then drop the collection when the operations complete?
note: I do not believe this is an issue with javascript async code in the application. I call the dropCollection() after the aggregation query completes.
I ended up creating this mongoose plugin, and it's been running great in production for over a year. I create a temporary collection, then use setTimeout() to drop the collection 1 minute later. 1 minute is sufficient to query the collection, so the collection is no longer in use.
This creates collections with unique names, such as z_tempCollection_595820e4ae61ecc89635f794, so there is never a name collision.
var mongoose = require('mongoose');
var _ = require('lodash');
var util1 = require(global.appRootPath + '/lib/util1_lib.js');
/**
 * Mongoose plugin that adds a `resultIntoTempCollection` static to a schema.
 *
 * @param {Object} persistantSchema - Schema that receives the static.
 */
function tempCollection(persistantSchema) {
  /**
   * Saves every entry of `tempCollectionDataArray` into a uniquely named
   * temporary collection and resolves with the model bound to it.
   *
   * @param {Array<Object>} tempCollectionDataArray - Documents to save.
   * @param {Object} [options]
   * @param {boolean} [options.strict=false] - `strict` setting of the temporary schema.
   * @param {Object} [options.schema={}] - Definition of the temporary schema.
   * @param {Function} [callback] - Accepted but never invoked; use the returned promise.
   * @returns {Promise} Resolves with the temporary model once every save has finished.
   */
  persistantSchema.statics.resultIntoTempCollection = function (tempCollectionDataArray, options, callback) {
    var timestampSeconds = Math.round(Date.now() / 1000);
    var tmpCollectionName = 'z_tempCollection_' + (new mongoose.mongo.ObjectId().toString()) + '_' + timestampSeconds;
    var registeredModelName = 'tempModel' + tmpCollectionName;

    // Resolve the defaults into locals instead of writing them back onto the
    // caller's options object.
    var settings = options || {};
    var strict = _.isUndefined(settings.strict) ? false : settings.strict;
    var schemaDefinition = _.isUndefined(settings.schema) ? {} : settings.schema;

    var tmpSchema = new mongoose.Schema(schemaDefinition, { strict: strict, collection: tmpCollectionName });

    /**
     * Unregisters the temporary model immediately and drops its collection
     * one minute later. A failed drop is recorded through util1.saveError.
     *
     * @param {Object} tempModel - Unused; kept so existing callers keep working.
     */
    tmpSchema.statics.removeTempCollection = function (tempModel) {
      delete mongoose.models[registeredModelName];
      // NOTE(review): `mongoose.modelSchemas` may not exist in every Mongoose
      // version; guard so the delete cannot throw on an undefined registry.
      if (mongoose.modelSchemas) {
        delete mongoose.modelSchemas[registeredModelName];
      }

      setTimeout(function () {
        mongoose.connection.db.dropCollection(tmpCollectionName, function (err, result) {
          if (err) {
            util1.saveError(err, 'server', null);
          }
        });
      }, 60 * 1000);
    };

    // tempModel variable ref is overwritten on each subsequent run of resultIntoTempCollection
    var tempModel = mongoose.model(registeredModelName, tmpSchema);

    var promises = tempCollectionDataArray.map(function (doc) {
      return new tempModel(doc).save();
    });

    return Promise.all(promises).then(function () {
      return tempModel;
    });
  };
}

module.exports = tempCollection;
There are a number of questions and answers about randomly ordering results or randomly getting a single record. The answers recommend adding a random field, creating an index on that field, and then doing a random draw. It looks like:
// Visit every document and issue one update per document, giving each its
// own random `rand` value.
db.myindex.find().forEach(function (record) {
  var selector = { _id: record._id };
  var change = { $set: { rand: Math.random() } };
  db.myindex.update(selector, change);
});
This works great, but it takes several hours (lots and lots of data). It looks like it is limited by write locking, which makes sense since the update is happening for each record. How do I do this in bulk? I tried:
// Queue a single update that matches every document ({} filter) and run it.
var bulk = db.myindex.initializeUnorderedBulkOp();
// Math.random() is evaluated once, here, while the update document is built,
// so that one number is the value the update carries for all matched documents.
bulk.find({}).update( { $set: { rand: Math.random() } } );
bulk.execute();
But it sets the rand field to the same value for every record! How do I fix this?
Edit: By the way, the reason that I need to do this is because I get a huge bson file from someone else and I need to import it frequently, so can't wait multiple hours to get it updated.
Introduce a loop with the bulk operations sent to the server once per 1000 documents, or as many modifications as you can fit under the 16MB BSON limit:
// Queue one updateOne per document and send the queue to the server every
// 1000 operations, starting a new bulk operation after each send.
var bulk = db.myindex.initializeOrderedBulkOp();
var counter = 0;

db.myindex.find().forEach(function (record) {
  var change = { "$set": { "rand": Math.random() } };
  bulk.find({ "_id": record._id }).updateOne(change);
  counter += 1;

  // Not at a batch boundary yet: keep queueing.
  if (counter % 1000 !== 0) {
    return;
  }
  bulk.execute();
  bulk = db.myindex.initializeOrderedBulkOp();
});

// Send whatever is left when the total is not a multiple of 1000.
if (counter % 1000 > 0) {
  bulk.execute();
}
If the collection is just static data, and you're getting a BSON file from someone else, it might be quicker to stream the BSON file through a filter to generate a new BSON file that you can then import using mongoimport.
Here is one that I wrote using nodeJS that can process a BSON file at around 1GB/min.
// Streams tweets.bson, adds a random `rand` field to every document and
// appends the re-serialized documents to tweets_random.bson.
var bson = require('bson');
var BSON = new bson.BSONPure.BSON();
var BSONStream = require('bson-stream');
var fs = require('fs');
var sb = require('stream-buffers');

// Buffered bytes above which the buffer is written out to the file.
var FLUSH_THRESHOLD = 1024 * 1024;

var rs = fs.createReadStream('tweets.bson');
var ws = fs.createWriteStream('tweets_random.bson', { flags: 'a' });
var writeBuffer = new sb.WritableStreamBuffer({
  initialSize: (1024 * 1024),
  incrementAmount: (10 * 1024)
});

// Writes everything currently buffered to the output file; does nothing
// when the buffer is empty.
function flushBuffer() {
  var size = writeBuffer.size();
  if (size === 0) {
    return;
  }
  ws.write(writeBuffer.getContents(), function () {
    console.log("Wrote", size, "bytes");
    console.log("Buffer has:", writeBuffer.size(), "bytes left");
  });
}

rs.pipe(new BSONStream())
  .on('data', function (obj) {
    obj.rand = Math.random();
    writeBuffer.write(BSON.serialize(obj));
    if (writeBuffer.size() > FLUSH_THRESHOLD) {
      flushBuffer();
    }
  })
  .on('end', function () {
    // Without this final flush the documents still sitting in the buffer
    // (anything below the threshold) would never reach the output file.
    flushBuffer();
    ws.end();
  });
It might go faster if you modify the buffer size/increment parameters.
This is of course assuming that you have the luxury of reimporting your data.