MongoDB bulkWrite multiple updateOne vs updateMany - mongodb

I have cases where I build bulkWrite operations in which some documents share the same update object. Is there any performance benefit to merging their filters and sending one updateMany with those filters, instead of multiple updateOnes in the same bulkWrite?
It's obviously better to use updateMany over multiple updateOnes with the normal methods, but since bulkWrite is a single command, are there any significant gains from preferring one over the other?
Example:
I have 200K documents that I need to update, with only 10 unique status values across all 200K documents, so my options are:
Solutions:
A) Send one single bulkWrite with 10 updateMany operations, and each one of those operations will affect 20K documents.
B) Send one single bulkWrite with 200K updateOne operations, each holding its own filter and status.
As @AlexBlex noted, I have to look out for accidentally updating more than one document with the same filter. In my case I use _id as my filter, so accidentally updating other documents is not a concern here, but it is definitely something to look out for when considering the updateMany option.
Thanks @AlexBlex.

Short answer:
Using updateMany is at least twice as fast, but it might accidentally update more documents than you intended. Keep reading to learn how to avoid this and gain the performance benefits.
Long answer:
We ran the following experiment to find the answer. These are the steps:
Create a bankaccounts mongodb collection, each document contains only one field (balance).
Insert 1 million documents into the bankaccounts collection.
Randomize the order in memory of all 1 million documents to avoid any possible optimizations from the database using ids that are inserted in the same sequence, simulating a real-world scenario.
Build the bulkWrite write operations from the documents, each setting balance to a random number between 0 and 100.
Execute the bulkWrite.
Log the time the bulkWrite took.
Now, the experiment lies in the 4th step.
In one variation of the experiment we build an array of 1 million updateOne operations, each with a filter for a single document and its respective update object.
In the second variation, we build 100 updateMany operations, each with a filter for 10K document ids and their respective update.
Results:
updateMany with multiple document ids is roughly 2.4x as fast as multiple updateOnes. It cannot be used everywhere though, so please read "The risk" section to learn when it should be used.
Details:
We ran the script 5 times for each variation, the detailed results are as follows:
With updateOne: 51.28 seconds on average.
With updateMany: 21.04 seconds on average.
The risk:
As many people have already pointed out, updateMany is not a direct substitute for updateOne, since it can incorrectly update multiple documents when the intention was to update only one document.
This approach is only valid when you're filtering on a unique field such as _id or any other unique field. If the filter depends on fields that are not unique, multiple documents will be updated and the results will not be equivalent.
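For reference, option (A) from the question can be built by grouping the _ids that share a status and emitting one updateMany per group. The following is only a sketch of that idea (the docs array and its { _id, newStatus } shape are illustrative and not part of the experiment below):
// Sketch: group ids by their target status, one updateMany per group.
// `docs` is assumed to be an array of { _id, newStatus } pairs.
function buildGroupedOperations (docs) {
  const idsByStatus = new Map();
  for (const { _id, newStatus } of docs) {
    if (!idsByStatus.has(newStatus)) idsByStatus.set(newStatus, []);
    idsByStatus.get(newStatus).push(_id);
  }
  return [...idsByStatus.entries()].map(([status, ids]) => ({
    updateMany: {
      filter: { _id: { $in: ids } }, // _id is unique, so this stays safe
      update: { $set: { status } }
    }
  }));
}
// await MyModel.bulkWrite(buildGroupedOperations(docs), { ordered: false });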
65831219.js
// 65831219.js
'use strict';
const mongoose = require('mongoose');
const { Schema } = mongoose;
const DOCUMENTS_COUNT = 1_000_000;
const UPDATE_MANY_OPERATIONS_COUNT = 100;
const MINIMUM_BALANCE = 0;
const MAXIMUM_BALANCE = 100;
const SAMPLES_COUNT = 10;
const bankAccountSchema = new Schema({
balance: { type: Number }
});
const BankAccount = mongoose.model('BankAccount', bankAccountSchema);
mainRunner().catch(console.error);
async function mainRunner () {
for (let i = 0; i < SAMPLES_COUNT; i++) {
await runOneCycle(buildUpdateManyWriteOperations).catch(console.error);
await runOneCycle(buildUpdateOneWriteOperations).catch(console.error);
console.log('-'.repeat(80));
}
process.exit(0);
}
/**
*
* @param {buildUpdateManyWriteOperations|buildUpdateOneWriteOperations} buildBulkWrite
*/
async function runOneCycle (buildBulkWrite) {
await mongoose.connect('mongodb://localhost:27017/test', {
useNewUrlParser: true,
useUnifiedTopology: true
});
await mongoose.connection.dropDatabase();
const { accounts } = await createAccounts({ accountsCount: DOCUMENTS_COUNT });
const { writeOperations } = buildBulkWrite({ accounts });
const writeStartedAt = Date.now();
await BankAccount.bulkWrite(writeOperations);
const writeEndedAt = Date.now();
console.log(`Write operations took ${(writeEndedAt - writeStartedAt) / 1000} seconds with \`${buildBulkWrite.name}\`.`);
}
async function createAccounts ({ accountsCount }) {
const rawAccounts = Array.from({ length: accountsCount }, () => ({ balance: getRandomInteger(MINIMUM_BALANCE, MAXIMUM_BALANCE) }));
const accounts = await BankAccount.insertMany(rawAccounts);
return { accounts };
}
function buildUpdateOneWriteOperations ({ accounts }) {
const writeOperations = shuffleArray(accounts).map((account) => ({
updateOne: {
filter: { _id: account._id },
update: { balance: getRandomInteger(MINIMUM_BALANCE, MAXIMUM_BALANCE) }
}
}));
return { writeOperations };
}
function buildUpdateManyWriteOperations ({ accounts }) {
shuffleArray(accounts);
const accountsChunks = chunkArray(accounts, accounts.length / UPDATE_MANY_OPERATIONS_COUNT);
const writeOperations = accountsChunks.map((accountsChunk) => ({
updateMany: {
filter: { _id: { $in: accountsChunk.map(account => account._id) } },
update: { balance: getRandomInteger(MINIMUM_BALANCE, MAXIMUM_BALANCE) }
}
}));
return { writeOperations };
}
function getRandomInteger (min = 0, max = 1) {
min = Math.ceil(min);
max = Math.floor(max);
return min + Math.floor(Math.random() * (max - min + 1));
}
function shuffleArray (array) {
let currentIndex = array.length;
let temporaryValue;
let randomIndex;
// While there remain elements to shuffle...
while (0 !== currentIndex) {
// Pick a remaining element...
randomIndex = Math.floor(Math.random() * currentIndex);
currentIndex -= 1;
// And swap it with the current element.
temporaryValue = array[currentIndex];
array[currentIndex] = array[randomIndex];
array[randomIndex] = temporaryValue;
}
return array;
}
function chunkArray (array, sizeOfTheChunkedArray) {
const chunked = [];
for (const element of array) {
const last = chunked[chunked.length - 1];
if (!last || last.length === sizeOfTheChunkedArray) {
chunked.push([element]);
} else {
last.push(element);
}
}
return chunked;
}
Output
$ node 65831219.js
Write operations took 20.803 seconds with `buildUpdateManyWriteOperations`.
Write operations took 50.84 seconds with `buildUpdateOneWriteOperations`.
----------------------------------------------------------------------------------------------------
Tests were run using MongoDB version 4.0.4.

At a high level, if you have the same update object, you can use updateMany rather than bulkWrite.
Reason:
bulkWrite is designed to send multiple different commands to the server, as mentioned here.
If you have the same update object, updateMany is best suited.
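For illustration, when every matched document receives exactly the same update, a plain updateMany (no bulkWrite at all) is enough. A minimal sketch with the Node.js driver; the collection and field names are made up:
// One shared filter, one shared update object - no bulkWrite needed.
const result = await db.collection('orders').updateMany(
  { status: 'pending' },
  { $set: { status: 'processed' } }
);
console.log(result.modifiedCount);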
Performance:
If you have 10k update commands in a bulkWrite, they will be executed in batches internally, which may impact the execution time.
Exact lines from the reference about batching:
Each group of operations can have at most 1000 operations. If a group exceeds this limit, MongoDB will divide the group into smaller groups of 1000 or less. For example, if the bulk operations list consists of 2000 insert operations, MongoDB creates 2 groups, each with 1000 operations.
Thanks @Alex


How to make a complex query to count nested objects that match with a query on firestore? [duplicate]

Is it possible to count how many items a collection has using the new Firebase database, Cloud Firestore?
If so, how do I do that?
2023 Update
Firestore now supports aggregation queries.
Node SDK
const collectionRef = db.collection('cities');
const snapshot = await collectionRef.count().get();
console.log(snapshot.data().count);
Web v9 SDK
const coll = collection(db, "cities");
const snapshot = await getCountFromServer(coll);
console.log('count: ', snapshot.data().count);
Notable Limitation - You cannot currently use count() queries with real-time listeners and offline queries. (See below for alternatives)
Pricing - Pricing depends on the number of matched index entries rather than the number of documents. One index entry covers multiple documents, making this cheaper than reading the documents individually.
Old Answer
As with many questions, the answer is - It depends.
You should be very careful when handling large amounts of data on the front end. On top of making your front end feel sluggish, Firestore also charges you $0.60 per million reads you make.
Small collection (less than 100 documents)
Use with care - Frontend user experience may take a hit
Handling this on the front end should be fine as long as you are not doing too much logic with this returned array.
db.collection('...').get().then(snap => {
size = snap.size // will return the collection size
});
Medium collection (100 to 1000 documents)
Use with care - Firestore read invocations may cost a lot
Handling this on the front end is not feasible as it has too much potential to slow down the user's system. We should handle this logic server side and return only the size.
The drawback to this method is you are still invoking Firestore reads (equal to the size of your collection), which in the long run may end up costing you more than expected.
Cloud Function:
db.collection('...').get().then(snap => {
res.status(200).send({length: snap.size});
});
Front End:
yourHttpClient.post(yourCloudFunctionUrl).toPromise().then(snap => {
size = snap.length // will return the collection size
})
Large collection (1000+ documents)
Most scalable solution
FieldValue.increment()
As of April 2019 Firestore now allows incrementing counters, completely atomically, and without reading the data prior. This ensures we have correct counter values even when updating from multiple sources simultaneously (previously solved using transactions), while also reducing the number of database reads we perform.
By listening to any document deletes or creates we can add to or remove from a count field that is sitting in the database.
See the firestore docs - Distributed Counters
Or have a look at Data Aggregation by Jeff Delaney. His guides are truly fantastic for anyone using AngularFire but his lessons should carry over to other frameworks as well.
Cloud Function:
export const documentWriteListener = functions.firestore
.document('collection/{documentUid}')
.onWrite((change, context) => {
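// NOTE: `docRef` below is assumed to point at the document that holds the
// numberOfDocs counter; it is not defined in this snippet.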
if (!change.before.exists) {
// New document Created : add one to count
db.doc(docRef).update({ numberOfDocs: FieldValue.increment(1) });
} else if (change.before.exists && change.after.exists) {
// Updating existing document : Do nothing
} else if (!change.after.exists) {
// Deleting document : subtract one from count
db.doc(docRef).update({ numberOfDocs: FieldValue.increment(-1) });
}
return;
});
Now on the frontend you can just query this numberOfDocs field to get the size of the collection.
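A minimal sketch of that frontend read (Web v9 style; the counters/myCollection path is an assumption, use whatever document your trigger actually writes to, and db is assumed to be an initialized Firestore instance):
import { doc, getDoc } from "firebase/firestore";
const counterSnap = await getDoc(doc(db, "counters", "myCollection"));
const size = counterSnap.exists() ? counterSnap.data().numberOfDocs : 0;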
Simplest way to do so is to read the size of a "querySnapshot".
db.collection("cities").get().then(function(querySnapshot) {
console.log(querySnapshot.size);
});
You can also read the length of the docs array inside "querySnapshot".
querySnapshot.docs.length;
Or if a "querySnapshot" is empty by reading the empty value, which will return a boolean value.
querySnapshot.empty;
As far as I know there is no built-in solution for this and it is only possible in the Node SDK right now.
If you have a
db.collection('someCollection')
you can use
.select([fields])
to define which field you want to select. If you do an empty select() you will just get an array of document references.
example:
db.collection('someCollection').select().get().then(
(snapshot) => console.log(snapshot.docs.length)
);
This solution is only an optimization for the worst case of downloading all documents and does not scale to large collections!
Also have a look at this:
How to get a count of number of documents in a collection with Cloud Firestore
Aggregate count query just landed as a preview in Firestore.
Announced at the 2022 Firebase Summit: https://firebase.blog/posts/2022/10/whats-new-at-Firebase-Sumit-2022
Excerpt:
[Developer Preview] Count() function: With the new count function in
Firstore [sic], you can now get the count of the matching documents when you
run a query or read from a collection, without loading the actual
documents, which saves you a lot of time.
Code sample they showed at the summit:
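The slide itself is not reproduced here; a sketch of what that preview API looks like in the Web v9 SDK (consistent with the other snippets on this page; collection and field names are illustrative):
const coll = collection(db, "cities");
const q = query(coll, where("state", "==", "CA"));
const snapshot = await getCountFromServer(q);
console.log(snapshot.data().count);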
During the Q&A, someone asked about pricing for aggregated queries, and the answer the Firebase team provided was that it'll cost 1 / 1000th of the price of a read (rounded up to the nearest read, see comments below for more details), but will count all records that are part of the aggregate.
Be careful counting number of documents for large collections. It is a little bit complex with firestore database if you want to have a precalculated counter for every collection.
Code like this doesn't work in this case:
export const customerCounterListener =
functions.firestore.document('customers/{customerId}')
.onWrite((change, context) => {
// on create
if (!change.before.exists && change.after.exists) {
return firestore
.collection('metadatas')
.doc('customers')
.get()
.then(docSnap =>
docSnap.ref.set({
count: docSnap.data().count + 1
}))
// on delete
} else if (change.before.exists && !change.after.exists) {
return firestore
.collection('metadatas')
.doc('customers')
.get()
.then(docSnap =>
docSnap.ref.set({
count: docSnap.data().count - 1
}))
}
return null;
});
The reason is that every Cloud Firestore trigger has to be idempotent, as the Firestore documentation says: https://firebase.google.com/docs/functions/firestore-events#limitations_and_guarantees
Solution
So, in order to prevent multiple executions of your code, you need to manage events and transactions. This is my particular way of handling large collection counters:
const executeOnce = (change, context, task) => {
const eventRef = firestore.collection('events').doc(context.eventId);
return firestore.runTransaction(t =>
t
.get(eventRef)
.then(docSnap => (docSnap.exists ? null : task(t)))
.then(() => t.set(eventRef, { processed: true }))
);
};
const documentCounter = collectionName => (change, context) =>
executeOnce(change, context, t => {
// on create
if (!change.before.exists && change.after.exists) {
return t
.get(firestore.collection('metadatas')
.doc(collectionName))
.then(docSnap =>
t.set(docSnap.ref, {
count: ((docSnap.data() && docSnap.data().count) || 0) + 1
}));
// on delete
} else if (change.before.exists && !change.after.exists) {
return t
.get(firestore.collection('metadatas')
.doc(collectionName))
.then(docSnap =>
t.set(docSnap.ref, {
count: docSnap.data().count - 1
}));
}
return null;
});
Use cases here:
/**
* Count documents in articles collection.
*/
exports.articlesCounter = functions.firestore
.document('articles/{id}')
.onWrite(documentCounter('articles'));
/**
* Count documents in customers collection.
*/
exports.customersCounter = functions.firestore
.document('customers/{id}')
.onWrite(documentCounter('customers'));
As you can see, the key to preventing multiple executions is the property called eventId in the context object. If the function has been invoked several times for the same event, the event id will be the same in all cases. Unfortunately, you must have an "events" collection in your database.
In 2020 this is still not available in the Firebase SDK, however it is available in Firebase Extensions (Beta); it's pretty complex to set up and use...
A reasonable approach
Helpers... (create/delete seems redundant but is cheaper than onUpdate)
export const onCreateCounter = () => async (
change,
context
) => {
const collectionPath = change.ref.parent.path;
const statsDoc = db.doc("counters/" + collectionPath);
const countDoc = {};
countDoc["count"] = admin.firestore.FieldValue.increment(1);
await statsDoc.set(countDoc, { merge: true });
};
export const onDeleteCounter = () => async (
change,
context
) => {
const collectionPath = change.ref.parent.path;
const statsDoc = db.doc("counters/" + collectionPath);
const countDoc = {};
countDoc["count"] = admin.firestore.FieldValue.increment(-1);
await statsDoc.set(countDoc, { merge: true });
};
export interface CounterPath {
watch: string;
name: string;
}
Exported Firestore hooks
export const Counters: CounterPath[] = [
{
name: "count_buildings",
watch: "buildings/{id2}"
},
{
name: "count_buildings_subcollections",
watch: "buildings/{id2}/{id3}/{id4}"
}
];
Counters.forEach(item => {
exports[item.name + '_create'] = functions.firestore
.document(item.watch)
.onCreate(onCreateCounter());
exports[item.name + '_delete'] = functions.firestore
.document(item.watch)
.onDelete(onDeleteCounter());
});
In action
The building root collection and all sub collections will be tracked.
Here under the /counters/ root path
Now collection counts will update automatically and eventually! If you need a count, just use the collection path and prefix it with counters.
const collectionPath = 'buildings/138faicnjasjoa89/buildingContacts';
const collectionCount = await db
.doc('counters/' + collectionPath)
.get()
.then(snap => snap.get('count'));
Limitations
As this approach uses a single database and document, it is limited to the Firestore constraint of 1 Update per Second for each counter. It will be eventually consistent, but in cases where large amounts of documents are added/removed the counter will lag behind the actual collection count.
I agree with @Matthew, it will cost a lot if you perform such a query.
[ADVICE FOR DEVELOPERS BEFORE STARTING THEIR PROJECTS]
Since we have foreseen this situation at the beginning, we can actually make a collection named counters with a document that stores all the counters in fields of type number.
For example:
For each CRUD operation on the collection, update the counter document:
When you create a new collection/subcollection: (+1 in the counter) [1 write operation]
When you delete a collection/subcollection: (-1 in the counter) [1 write operation]
When you update an existing collection/subcollection, do nothing on the counter document: (0)
When you read an existing collection/subcollection, do nothing on the counter document: (0)
Next time, when you want to get the number of documents in a collection, you just need to query/point to that document field. [1 read operation]
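A rough sketch of that setup with Cloud Functions and the Admin SDK (collection and field names are only examples, not prescribed by this answer):
const functions = require('firebase-functions');
const admin = require('firebase-admin');
admin.initializeApp();
const db = admin.firestore();
// one counters document, one numeric field per collection
const countersRef = db.doc('counters/all');
exports.onItemCreate = functions.firestore
  .document('items/{id}')
  .onCreate(() => countersRef.set(
    { items: admin.firestore.FieldValue.increment(1) }, { merge: true }));
exports.onItemDelete = functions.firestore
  .document('items/{id}')
  .onDelete(() => countersRef.set(
    { items: admin.firestore.FieldValue.increment(-1) }, { merge: true }));
// later, one read returns the count:
// const itemsCount = (await countersRef.get()).get('items');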
In addition, you can store the collection names in an array, but this will be tricky; the behaviour of arrays in Firebase is shown below:
// we send this
['a', 'b', 'c', 'd', 'e']
// Firebase stores this
{0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'}
// since the keys are numeric and sequential,
// if we query the data, we get this
['a', 'b', 'c', 'd', 'e']
// however, if we then delete a, b, and d,
// they are no longer mostly sequential, so
// we do not get back an array
{2: 'c', 4: 'e'}
So, if you are not going to delete collections, you can actually use an array to store the list of collection names instead of querying all the collections every time.
Hope it helps!
As of October 2022, Firestore has introduced a count() method on the client SDKs. Now you can get the count for a query without downloading the documents.
For 1000 matched documents, it will charge you for 1 document read.
Web (v9)
Introduced in Firebase 9.11.0:
const collectionRef = collection(db, "cities");
const snapshot = await getCountFromServer(collectionRef);
console.log('count: ', snapshot.data().count);
Web V8
Not Available.
Node (Admin)
const collectionRef = db.collection('cities');
const snapshot = await collectionRef.count().get();
console.log(snapshot.data().count);
Android (Kotlin)
Introduced in firestore v24.4.0 (BoM 31.0.0):
val query = db.collection("cities")
val countQuery = query.count()
countQuery.get(AggregateSource.SERVER).addOnCompleteListener { task ->
if (task.isSuccessful) {
val snapshot = task.result
Log.d(TAG, "Count: ${snapshot.count}")
} else {
Log.d(TAG, "Count failed: ", task.getException())
}
}
Apple Platforms (Swift)
Introduced in Firestore v10.0.0:
do {
let query = db.collection("cities")
let countQuery = query.countAggregateQuery
let snapshot = try await countQuery.aggregation(source: AggregateSource.server)
print(snapshot.count)
} catch {
print(error)
}
Increment a counter using admin.firestore.FieldValue.increment:
exports.onInstanceCreate = functions.firestore.document('projects/{projectId}/instances/{instanceId}')
.onCreate((snap, context) =>
db.collection('projects').doc(context.params.projectId).update({
instanceCount: admin.firestore.FieldValue.increment(1),
})
);
exports.onInstanceDelete = functions.firestore.document('projects/{projectId}/instances/{instanceId}')
.onDelete((snap, context) =>
db.collection('projects').doc(context.params.projectId).update({
instanceCount: admin.firestore.FieldValue.increment(-1),
})
);
In this example we increment an instanceCount field in the project each time a document is added to the instances sub collection. If the field doesn't exist yet it will be created and incremented to 1.
The incrementation is transactional internally but you should use a distributed counter if you need to increment more frequently than every 1 second.
It's often preferable to implement onCreate and onDelete rather than onWrite as you will call onWrite for updates which means you are spending more money on unnecessary function invocations (if you update the docs in your collection).
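If you do hit that write-rate limit, a sharded counter along the lines of the distributed-counters pattern mentioned above might look like this sketch (the shard count, paths, and the already-initialized admin/db objects are assumptions):
const NUM_SHARDS = 10;
function incrementCounter(counterRef) {
  // write to a random shard so no single document receives every write
  const shardId = Math.floor(Math.random() * NUM_SHARDS).toString();
  return counterRef.collection('shards').doc(shardId)
    .set({ count: admin.firestore.FieldValue.increment(1) }, { merge: true });
}
async function getCount(counterRef) {
  // the total is the sum over all shards
  const shards = await counterRef.collection('shards').get();
  return shards.docs.reduce((sum, shard) => sum + (shard.get('count') || 0), 0);
}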
No, there is no built-in support for aggregation queries right now. However there are a few things you could do.
The first is documented here. You can use transactions or cloud functions to maintain aggregate information:
This example shows how to use a function to keep track of the number of ratings in a subcollection, as well as the average rating.
exports.aggregateRatings = firestore
.document('restaurants/{restId}/ratings/{ratingId}')
.onWrite(event => {
// Get value of the newly added rating
var ratingVal = event.data.get('rating');
// Get a reference to the restaurant
var restRef = db.collection('restaurants').doc(event.params.restId);
// Update aggregations in a transaction
return db.runTransaction(transaction => {
return transaction.get(restRef).then(restDoc => {
// Compute new number of ratings
var newNumRatings = restDoc.data().numRatings + 1;
// Compute new average rating
var oldRatingTotal = restDoc.data().avgRating * restDoc.data().numRatings;
var newAvgRating = (oldRatingTotal + ratingVal) / newNumRatings;
// Update restaurant info
return transaction.update(restRef, {
avgRating: newAvgRating,
numRatings: newNumRatings
});
});
});
});
The solution that jbb mentioned is also useful if you only want to count documents infrequently. Make sure to use the select() statement to avoid downloading all of each document (that's a lot of bandwidth when you only need a count). select() is only available in the server SDKs for now so that solution won't work in a mobile app.
UPDATE 11/20
I created an npm package for easy access to a counter function: https://code.build/p/9DicAmrnRoK4uk62Hw1bEV/firestore-counters
I created a universal function using all these ideas to handle all counter situations (except queries).
The only exception would be when doing so many writes a second, it
slows you down. An example would be likes on a trending post. It is
overkill on a blog post, for example, and will cost you more. I
suggest creating a separate function in that case using shards:
https://firebase.google.com/docs/firestore/solutions/counters
// trigger collections
exports.myFunction = functions.firestore
.document('{colId}/{docId}')
.onWrite(async (change: any, context: any) => {
return runCounter(change, context);
});
// trigger sub-collections
exports.mySubFunction = functions.firestore
.document('{colId}/{docId}/{subColId}/{subDocId}')
.onWrite(async (change: any, context: any) => {
return runCounter(change, context);
});
// add change the count
const runCounter = async function (change: any, context: any) {
const col = context.params.colId;
const eventsDoc = '_events';
const countersDoc = '_counters';
// ignore helper collections
if (col.startsWith('_')) {
return null;
}
// simplify event types
const createDoc = change.after.exists && !change.before.exists;
const updateDoc = change.before.exists && change.after.exists;
if (updateDoc) {
return null;
}
// check for sub collection
const isSubCol = context.params.subDocId;
const parentDoc = `${countersDoc}/${context.params.colId}`;
const countDoc = isSubCol
? `${parentDoc}/${context.params.docId}/${context.params.subColId}`
: `${parentDoc}`;
// collection references
const countRef = db.doc(countDoc);
const countSnap = await countRef.get();
// increment size if doc exists
if (countSnap.exists) {
// createDoc or deleteDoc
const n = createDoc ? 1 : -1;
const i = admin.firestore.FieldValue.increment(n);
// create event for accurate increment
const eventRef = db.doc(`${eventsDoc}/${context.eventId}`);
return db.runTransaction(async (t: any): Promise<any> => {
const eventSnap = await t.get(eventRef);
// do nothing if event exists
if (eventSnap.exists) {
return null;
}
// add event and update size
await t.update(countRef, { count: i });
return t.set(eventRef, {
completed: admin.firestore.FieldValue.serverTimestamp()
});
}).catch((e: any) => {
console.log(e);
});
// otherwise count all docs in the collection and add size
} else {
const colRef = db.collection(change.after.ref.parent.path);
return db.runTransaction(async (t: any): Promise<any> => {
// update size
const colSnap = await t.get(colRef);
return t.set(countRef, { count: colSnap.size });
}).catch((e: any) => {
console.log(e);
});;
}
}
This handles events, increments, and transactions. The beauty of this is that if you are not sure about the accuracy of a counter document (probably while still in beta), you can delete it and it will automatically be recalculated on the next trigger. Yes, this costs reads, so don't delete it otherwise.
Same kind of thing to get the count:
const collectionPath = 'buildings/138faicnjasjoa89/buildingContacts';
const colSnap = await db.doc('_counters/' + collectionPath).get();
const count = colSnap.get('count');
Also, you may want to create a cron job (scheduled function) to remove old events to save money on database storage. You need at least the Blaze plan, and there may be some more configuration. You could run it every Sunday at 11pm, for example.
https://firebase.google.com/docs/functions/schedule-functions
This is untested, but should work with a few tweaks:
exports.scheduledFunctionCrontab = functions.pubsub.schedule('5 11 * * *')
.timeZone('America/New_York')
.onRun(async (context) => {
// get yesterday
const yesterday = new Date();
yesterday.setDate(yesterday.getDate() - 1);
const eventFilter = db.collection('_events').where('completed', '<=', yesterday);
const eventFilterSnap = await eventFilter.get();
eventFilterSnap.forEach(async (doc: any) => {
await doc.ref.delete();
});
return null;
});
And last, don't forget to protect the collections in firestore.rules:
match /_counters/{document} {
allow read;
allow write: if false;
}
match /_events/{document} {
allow read, write: if false;
}
Update: Queries
Adding to my other answer if you want to automate query counts as well, you can use this modified code in your cloud function:
if (col === 'posts') {
// counter reference - user doc ref
const userRef = after ? after.userDoc : before.userDoc;
// query reference
const postsQuery = db.collection('posts').where('userDoc', "==", userRef);
// add the count - postsCount on userDoc
await addCount(change, context, postsQuery, userRef, 'postsCount');
}
return delEvents();
This will automatically update the postsCount on the user document. You could easily add other one-to-many counts this way. This just gives you ideas of how you can automate things. I also gave you another way to delete the events. You have to read each event document to delete it, so deleting them later won't really save you anything; it just makes the function slower.
/**
* Adds a counter to a doc
* @param change - change ref
* @param context - context ref
* @param queryRef - the query ref to count
* @param countRef - the counter document ref
* @param countName - the name of the counter on the counter document
*/
const addCount = async function (change: any, context: any,
queryRef: any, countRef: any, countName: string) {
// events collection
const eventsDoc = '_events';
// simplify event type
const createDoc = change.after.exists && !change.before.exists;
// doc references
const countSnap = await countRef.get();
// increment size if field exists
if (countSnap.get(countName)) {
// createDoc or deleteDoc
const n = createDoc ? 1 : -1;
const i = admin.firestore.FieldValue.increment(n);
// create event for accurate increment
const eventRef = db.doc(`${eventsDoc}/${context.eventId}`);
return db.runTransaction(async (t: any): Promise<any> => {
const eventSnap = await t.get(eventRef);
// do nothing if event exists
if (eventSnap.exists) {
return null;
}
// add event and update size
await t.set(countRef, { [countName]: i }, { merge: true });
return t.set(eventRef, {
completed: admin.firestore.FieldValue.serverTimestamp()
});
}).catch((e: any) => {
console.log(e);
});
// otherwise count all docs in the collection and add size
} else {
return db.runTransaction(async (t: any): Promise<any> => {
// update size
const colSnap = await t.get(queryRef);
return t.set(countRef, { [countName]: colSnap.size }, { merge: true });
}).catch((e: any) => {
console.log(e);
});;
}
}
/**
* Deletes events over a day old
*/
const delEvents = async function () {
// get yesterday
const yesterday = new Date();
yesterday.setDate(yesterday.getDate() - 1);
const eventFilter = db.collection('_events').where('completed', '<=', yesterday);
const eventFilterSnap = await eventFilter.get();
eventFilterSnap.forEach(async (doc: any) => {
await doc.ref.delete();
});
return null;
}
I should also warn you that universal functions will run on every
onWrite call period. It may be cheaper to only run the function on
onCreate and on onDelete instances of your specific collections. Like
the noSQL database we are using, repeated code and data can save you
money.
There is no direct option available. You can't do db.collection("CollectionName").count().
Below are two ways by which you can find the number of documents within a collection.
1 :- Get all the documents in the collection and then read its size. (Not the best solution)
db.collection("CollectionName").get().subscribe(doc=>{
console.log(doc.size)
})
With the above code your document reads will be equal to the number of documents in the collection, which is the reason why one must avoid this solution.
2 :- Create a separate document within your collection which will store the count of the number of documents in the collection. (Best solution)
db.collection("CollectionName").doc("counts").get().subscribe(doc=>{
console.log(doc.count)
})
Above we created a document named counts to store all the count information. You can update the count document in the following way:
Create a firestore triggers on the document counts
Increment the count property of counts document when a new document is created.
Decrement the count property of counts document when a document is deleted.
With respect to price (Document Read = 1) and fast data retrieval, the above solution is good.
A workaround is to:
write a counter in a firebase doc, which you increment within a transaction every time you create a new entry
You store the count in a field of your new entry (i.e: position: 4).
Then you create an index on that field (position DESC).
You can do a skip+limit with a query.Where("position", "<", x).OrderBy("position", DESC), as sketched below.
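A small sketch of those steps in the Web v9 SDK (collection names and the entries shape are illustrative):
import { runTransaction, doc, collection } from "firebase/firestore";
async function addEntryWithPosition(db, data) {
  const counterRef = doc(db, "counters", "entries");
  const newRef = doc(collection(db, "entries")); // new auto-id document
  await runTransaction(db, async (tx) => {
    const counterSnap = await tx.get(counterRef);
    const position = (counterSnap.exists() ? counterSnap.data().position : 0) + 1;
    tx.set(counterRef, { position });        // position doubles as the count
    tx.set(newRef, { ...data, position });   // indexed field for range queries
  });
}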
Hope this helps!
I have tried a lot of different approaches.
Finally, I improved one of the methods.
First you need to create a separate collection and save all events there.
Second you need to create a new lambda to be triggered on a schedule. This lambda will count the events in the event collection and clear the event documents.
Code details are in the article:
https://medium.com/@ihor.malaniuk/how-to-count-documents-in-google-cloud-firestore-b0e65863aeca
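A rough sketch of that scheduled function (the names, schedule, and already-initialized functions/admin/db objects are assumptions; the linked article has the author's actual implementation):
exports.flushEventCounts = functions.pubsub.schedule('every 60 minutes')
  .onRun(async () => {
    const events = await db.collection('counter_events').get();
    if (events.empty) return null;
    // fold the buffered events into the persistent counter...
    await db.doc('counters/items').set(
      { count: admin.firestore.FieldValue.increment(events.size) },
      { merge: true });
    // ...then clear them (batches are limited to 500 ops, chunk for more)
    const batch = db.batch();
    events.docs.forEach((d) => batch.delete(d.ref));
    return batch.commit();
  });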
One fast + money-saving trick is this:
make a doc and store a 'count' variable in Firestore; when a user adds a new doc to the collection, increase that variable, and when a user deletes a doc, decrease the variable. e.g.
updateDoc(doc(db, "Count_collection", "Count_Doc"), {count: increment(1)});
note: use (-1) for decreasing, (1) for increasing the count
How it saves money and time:
you (Firebase) don't need to loop through the collection, nor does the browser need to load the whole collection to count the number of docs.
all the counts are saved in a doc with only one variable named "count" or whatever, so less than 1 KB of data is used, and it takes only 1 read in Firebase Firestore.
Solution using pagination with offset & limit:
public int collectionCount(String collection) {
Integer page = 0;
List<QueryDocumentSnapshot> snaps = new ArrayList<>();
findDocsByPage(collection, page, snaps);
return snaps.size();
}
public void findDocsByPage(String collection, Integer page,
List<QueryDocumentSnapshot> snaps) {
try {
Integer limit = 26000;
FieldPath[] selectedFields = new FieldPath[] { FieldPath.of("id") };
List<QueryDocumentSnapshot> snapshotPage;
snapshotPage = fireStore()
.collection(collection)
.select(selectedFields)
.offset(page * limit)
.limit(limit)
.get().get().getDocuments();
if (snapshotPage.size() > 0) {
snaps.addAll(snapshotPage);
page++;
findDocsByPage(collection, page, snaps);
}
} catch (InterruptedException | ExecutionException e) {
e.printStackTrace();
}
}
findDocsByPage is a recursive method to find all pages of the collection
selectedFields optimizes the query by fetching only the id field instead of the full document body
limit is the max size of each query page
page defines the initial page for the pagination
From the tests I did it worked well for collections with up to approximately 120k records!
Firestore is introducing a new Query.count() that fetches the count of a query without fetching the docs.
This would allow to simply query all collection items and get the count of that query.
Ref:
Firebase 10 iOS SDK
JS SDK PR: https://github.com/firebase/firebase-js-sdk/pull/6608
There's a new built-in function since version 9.11.0 called getCountFromServer(), which fetches the number of documents in the result set without actually downloading the documents.
https://firebase.google.com/docs/reference/js/firestore_#getcountfromserver
Took me a while to get this working based on some of the answers above, so I thought I'd share it for others to use. I hope it's useful.
'use strict';
const functions = require('firebase-functions');
const admin = require('firebase-admin');
admin.initializeApp();
const db = admin.firestore();
exports.countDocumentsChange = functions.firestore.document('library/{categoryId}/documents/{documentId}').onWrite((change, context) => {
const categoryId = context.params.categoryId;
const categoryRef = db.collection('library').doc(categoryId)
let FieldValue = require('firebase-admin').firestore.FieldValue;
if (!change.before.exists) {
// new document created : add one to count
categoryRef.update({numberOfDocs: FieldValue.increment(1)});
console.log("%s numberOfDocs incremented by 1", categoryId);
} else if (change.before.exists && change.after.exists) {
// updating existing document : Do nothing
} else if (!change.after.exists) {
// deleting document : subtract one from count
categoryRef.update({numberOfDocs: FieldValue.increment(-1)});
console.log("%s numberOfDocs decremented by 1", categoryId);
}
return 0;
});
This uses a counter to create a numeric unique ID. In my use case, I will never be decrementing, even when the document that the ID was needed for is deleted.
Upon a collection creation that needs unique numeric value
Designate a collection appData with one document, set with .doc id only
Set uniqueNumericIDAmount to 0 in the firebase firestore console
Use doc.data().uniqueNumericIDAmount + 1 as the unique numeric id
Update appData collection uniqueNumericIDAmount with firebase.firestore.FieldValue.increment(1)
firebase
.firestore()
.collection("appData")
.doc("only")
.get()
.then(doc => {
var foo = doc.data();
foo.id = doc.id;
// your collection that needs a unique ID
firebase
.firestore()
.collection("uniqueNumericIDs")
.doc(user.uid)// user id in my case
.set({// I use this in login, so this document doesn't
// exist yet, otherwise use update instead of set
phone: this.state.phone,// whatever else you need
uniqueNumericID: foo.uniqueNumericIDAmount + 1
})
.then(() => {
// upon success of new ID, increment uniqueNumericIDAmount
firebase
.firestore()
.collection("appData")
.doc("only")
.update({
uniqueNumericIDAmount: firebase.firestore.FieldValue.increment(
1
)
})
.catch(err => {
console.log(err);
});
})
.catch(err => {
console.log(err);
});
});
var variable=0
variable=variable+querySnapshot.count
then if you are to use it on a String variable then
let stringVariable= String(variable)
Along with my npm package adv-firestore-functions above, you can also just use firestore rules to force a good counter:
Firestore Rules
function counter() {
let docPath = /databases/$(database)/documents/_counters/$(request.path[3]);
let afterCount = getAfter(docPath).data.count;
let beforeCount = get(docPath).data.count;
let addCount = afterCount == beforeCount + 1;
let subCount = afterCount == beforeCount - 1;
let newId = getAfter(docPath).data.docId == request.path[4];
let deleteDoc = request.method == 'delete';
let createDoc = request.method == 'create';
return (newId && subCount && deleteDoc) || (newId && addCount && createDoc);
}
function counterDoc() {
let doc = request.path[4];
let docId = request.resource.data.docId;
let afterCount = request.resource.data.count;
let beforeCount = resource.data.count;
let docPath = /databases/$(database)/documents/$(doc)/$(docId);
let createIdDoc = existsAfter(docPath) && !exists(docPath);
let deleteIdDoc = !existsAfter(docPath) && exists(docPath);
let addCount = afterCount == beforeCount + 1;
let subCount = afterCount == beforeCount - 1;
return (createIdDoc && addCount) || (deleteIdDoc && subCount);
}
and use them like so:
match /posts/{document} {
allow read;
allow update;
allow create: if counter();
allow delete: if counter();
}
match /_counters/{document} {
allow read;
allow write: if counterDoc();
}
Frontend
Replace your set and delete functions with these:
set
async setDocWithCounter(
ref: DocumentReference<DocumentData>,
data: {
[x: string]: any;
},
options: SetOptions): Promise<void> {
// counter collection
const counterCol = '_counters';
const col = ref.path.split('/').slice(0, -1).join('/');
const countRef = doc(this.afs, counterCol, col);
const countSnap = await getDoc(countRef);
const refSnap = await getDoc(ref);
// don't increase count if edit
if (refSnap.exists()) {
await setDoc(ref, data, options);
// increase count
} else {
const batch = writeBatch(this.afs);
batch.set(ref, data, options);
// if count exists
if (countSnap.exists()) {
batch.update(countRef, {
count: increment(1),
docId: ref.id
});
// create count
} else {
// will only run once, should not use
// for mature apps
const colRef = collection(this.afs, col);
const colSnap = await getDocs(colRef);
batch.set(countRef, {
count: colSnap.size + 1,
docId: ref.id
});
}
batch.commit();
}
}
delete
async delWithCounter(
ref: DocumentReference<DocumentData>
): Promise<void> {
// counter collection
const counterCol = '_counters';
const col = ref.path.split('/').slice(0, -1).join('/');
const countRef = doc(this.afs, counterCol, col);
const countSnap = await getDoc(countRef);
const batch = writeBatch(this.afs);
// if count exists
batch.delete(ref);
if (countSnap.exists()) {
batch.update(countRef, {
count: increment(-1),
docId: ref.id
});
}
/*
if ((countSnap.data() as any).count == 1) {
batch.delete(countRef);
}*/
batch.commit();
}
see here for more info...
J
This feature is now supported in FireStore, albeit in Beta.
Here are the official Firebase docs
With the new version of Firebase, you can now run aggregated queries!
Simply write
.count().get();
after your query.
As it stands, firebase only allows server-side count, like this
const collectionRef = db.collection('cities');
const snapshot = await collectionRef.count().get();
console.log(snapshot.data().count);
Please note this is for Node.js
New feature available in Firebase/Firestore provides a count of documents in a collection:
See this thread to see how to achieve it, with an example.
How To Count Number of Documents in a Collection in Firebase Firestore With a WHERE query in react.js
According to this documentation Cloud Firestore supports the count() aggregation query and is available in preview.
The Flutter/Dart code was missing (at the time of writing this) so I played around with it and the following function seems to work:
Future<int> getCount(String path) async {
var collection = _fireStore.collection(path);
var countQuery = collection.count();
var snapShot = await countQuery.get(source: AggregateSource.server);
return snapShot.count;
}
firebaseFirestore.collection("...").addSnapshotListener(new EventListener<QuerySnapshot>() {
@Override
public void onEvent(QuerySnapshot documentSnapshots, FirebaseFirestoreException e) {
int Counter = documentSnapshots.size();
}
});
So my solution for this problem is a bit non-technical, not super precise, but good enough for me.
Those are my documents. As I have a lot of them (100k+), the 'law of large numbers' kicks in. I can assume that there is a more-or-less equal number of items whose id starts with 0, 1, 2, etc.
So what I do is scroll my list until I get to ids starting with 1, or with 01, depending on how far you have to scroll.
👆 here we are.
Now, having scrolled that far, I open the inspector, see how far I scrolled, and divide it by the height of a single element.
I had to scroll 82000px to get to items with ids starting with 1. The height of a single element is 32px.
It means I have roughly 2500 items with ids starting with 0, so now I multiply that by the number of possible 'starting chars'. In Firebase ids that can be A-Z, a-z, 0-9, which means 26 + 26 + 10 = 62.
That gives ~2500*62, so roughly 155000 items in my collection.
Summarizing: What is wrong with you firebase?

Mongodb/Meteor: combine documents with ids in many arrays(returned by another query)

I am using meteor to make a query( for a publication) to find all the Results generated by Workers which has finished their work:
Results has the following structure:
result example 1:
{
_id: "sldf234sdf"
result_a:0,
result_b:0
}
result example 2:
{
_id: "ghjwef23qql"
result_a:0,
result_b:0
}
Workers is defined as:
{
_id: "iweyr23s"
results:["sldf234sdf", "ghjwef23qql"], //here is a list of
tag:'running'
}
Here is what I am trying to do:
// 1), I want to find all the workers which is finished with their tag
const workers = Workers.find({tag:'done'});
// 2), I want to get the resultid arrays in all the workers, then combine it into a big array
const results_id_arrays = workers[0].results + workers[1].results + ...
const results = Results.find({_id:{$in: results_id_arrays }});
So, my question is, how to make a mongodb query to implement the second step?
Is this what you are looking for?
you can get results ids like this:
const results_id_arrays = Workers.aggregate({$project:{a:'$results'}},
{$unwind:'$a'},
{$unwind:'$a'},
{$group:{_id:'a',res:{$addToSet:'$a'}}}).map(function(e) {return e.res})
and then
const results = Results.find({_id:{$in: results_id_arrays }});
does this work?
You can use the mighty underscorejs for this. There is also a meteor package for it. Check documentation here.
//fetch results
const workers = Workers.find({tag:'done'}, {fields: {results: 1, _id: 0}}).fetch();
//pluck the results arrays and flatten them into a single array of ids
const plucked = _.flatten(_.pluck(workers, 'results'));
//use plucked ids to find results
return Results.find({_id:{$in: plucked }});

Get inserted Ids for Bulk.Insert() -Mongoskin

I am using mongoskin in my Node.js application to insert data into MongoDB. I have a requirement to insert an array of documents into the database and send back the ids of the inserted records to the client. I am able to insert data, however I am unable to locate the ids of the inserted records in the result object. I need help locating the insertedIds in the result. I'm using the below code to bulk insert.
db.collection('myCollection', function (err, collection) {
var bulk = collection.initializeUnorderedBulkOp();
for (var i = 0; i < dataArray.length; i++) {
bulk.insert(dataArray[i]);
}
bulk.execute(function (err, result) {
//TODO: return the Ids of inserted records to the client
//Client will use these Ids to perform subsequent calls to the nodejs service
});
});
My result is a BatchWriteResult Object type.
I would suggest using the other bulk API method upsert(), which lets you get, in your BatchWriteResult() object, the _id values of the inserted documents by calling its getUpsertedIds() method. The result object is in the same format as given in the documentation for BulkWriteResult.
The update operation with the Bulk.find.upsert() option will perform an insert when there are no matching documents for the Bulk.find() condition. If the update document does not specify an _id field, MongoDB adds the _id field, and thus you can retrieve the ids of the inserted documents
within your BatchWriteResult().
Also, the way you are queuing up your bulk insert operations is not usually recommended since it basically builds up in memory; you'd want a bit more control over managing the queues and memory resources, rather than relying on the driver's default way of limiting the batches to 1000 at a time and keeping the complete batch under 16MB. The way to do this is to use a forEach() loop over your data array with a counter that limits the batches to 1000 at a time.
The following shows the above approach
function getInsertedIds(result){
var ids = result.getUpsertedIds();
console.log(ids); // an array of upserted ids
return ids;
}
db.collection('myCollection',function(err,collection) {
var bulk = collection.initializeUnorderedBulkOp(),
insertedIds = [],
counter = 0;
dataArray.forEach(function (data){
bulk.find(data).upsert().updateOne(data);
counter++;
if (counter % 1000 == 0) {
bulk.execute(function(err, result) {
insertedIds = insertedIds.concat(getInsertedIds(result));
bulk = collection.initializeUnorderedBulkOp(); // reset after execute
});
}
});
// Clean up the remaining operations in the queue which were
// cut off in the loop - counter not a round divisor of 1000
if (counter % 1000 != 0 ) {
bulk.execute(function(err, result) {
insertedIds = insertedIds.concat(getInsertedIds(result));
console.log(insertedIds);
});
}
});

mapreduce between consecutive documents

Setup:
I got a large collection with the following entries
Name - String
Begin - time stamp
End - time stamp
Problem:
I want to get the gaps between consecutive documents, using the map-reduce paradigm.
Approach:
I'm trying to build a new collection of pairs, mid; after that I can compute the differences from it using $unwind and Pair[1].Begin - Pair[0].End
function map(){
emit(0, this)
}
function reduce(){
var i = 0;
var pairs = [];
while ( i < values.length -1){
pairs.push([values[i], values[i+1]]);
i = i + 1;
}
return {"pairs":pairs};
}
db.collection.mapReduce(map, reduce, { sort: { Begin: 1 }, out: { replace: "mid" } })
This works with a limited number of documents because of the 16MB document cap. I'm not sure if I need to pull the collection into memory and do it there. How else can I approach this problem?
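For reference, the follow-up step described above (computing the differences with $unwind) might look like the sketch below, assuming the map-reduce succeeded and mid holds a single document of the form { value: { pairs: [[docA, docB], ...] } }:
db.mid.aggregate([
  { $unwind: "$value.pairs" },
  { $project: {
      _id: 0,
      // each pairs entry is [previousDoc, nextDoc]; a path into an array of
      // documents yields the array of that field's values
      gap: { $subtract: [
        { $arrayElemAt: ["$value.pairs.Begin", 1] },
        { $arrayElemAt: ["$value.pairs.End", 0] }
      ] }
  } }
])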
The mapReduce function of MongoDB handles what you propose differently from the method you are using to solve it. The key factor here is "keeping" the "previous" document in order to make the comparison to the next.
The actual mechanism that supports this is the "scope" functionality, which allows a sort of "global" variable approach to be used in the overall code. As you will see, what you are asking, when that is considered, requires no "reduction" at all, as there is no "grouping", just emission of document "pair" data:
db.collection.mapReduce(
function() {
if ( last == null ) {
last = this;
} else {
emit(
{
"start_id": last._id,
"end_id": this._id
},
this.Begin - last.End
);
last = this;
}
},
function() {}, // no reduction required
{
"out": { "inline": 1 },
"scope": { "last": null }
}
)
Output to a collection instead of inline if your result size requires it.
But this way, by using a "global" to keep the last document, the code is both simple and efficient.

mongodb move documents from one collection to another collection

How can documents be moved from one collection to another collection in MongoDB? For example: I have a lot of documents in collection A and I want to move all documents older than 1 month to collection B (these old documents should no longer be in collection A).
Using aggregation we can do a copy. But what I am trying to do is move documents.
What method can be used to move documents?
The bulk operations @markus-w-mahlberg showed (and @mark-mullin refined) are efficient but unsafe as written. If the bulkInsert fails, the bulkRemove will still continue. To make sure you don't lose any records when moving, use this instead:
function insertBatch(collection, documents) {
var bulkInsert = collection.initializeUnorderedBulkOp();
var insertedIds = [];
var id;
documents.forEach(function(doc) {
id = doc._id;
// Insert without raising an error for duplicates
bulkInsert.find({_id: id}).upsert().replaceOne(doc);
insertedIds.push(id);
});
bulkInsert.execute();
return insertedIds;
}
function deleteBatch(collection, documents) {
var bulkRemove = collection.initializeUnorderedBulkOp();
documents.forEach(function(doc) {
bulkRemove.find({_id: doc._id}).removeOne();
});
bulkRemove.execute();
}
function moveDocuments(sourceCollection, targetCollection, filter, batchSize) {
print("Moving " + sourceCollection.find(filter).count() + " documents from " + sourceCollection + " to " + targetCollection);
var count;
while ((count = sourceCollection.find(filter).count()) > 0) {
print(count + " documents remaining");
sourceDocs = sourceCollection.find(filter).limit(batchSize);
idsOfCopiedDocs = insertBatch(targetCollection, sourceDocs);
targetDocs = targetCollection.find({_id: {$in: idsOfCopiedDocs}});
deleteBatch(sourceCollection, targetDocs);
}
print("Done!")
}
Update 2
Please do NOT upvote this answer any more. As written, @jasongarber's answer is better in every aspect.
Update
This answer by @jasongarber is a safer approach and should be used instead of mine.
Provided I got you right and you want to move all documents older than 1 month, and you use MongoDB 2.6, there is no reason not to use bulk operations, which are the most efficient way of doing multiple operations I am aware of:
> var bulkInsert = db.target.initializeUnorderedBulkOp()
> var bulkRemove = db.source.initializeUnorderedBulkOp()
> var date = new Date()
> date.setMonth(date.getMonth() -1)
> db.source.find({"yourDateField":{$lt: date}}).forEach(
function(doc){
bulkInsert.insert(doc);
bulkRemove.find({_id:doc._id}).removeOne();
}
)
> bulkInsert.execute()
> bulkRemove.execute()
This should be pretty fast and it has the advantage that in case something goes wrong during the bulk insert, the original data still exists.
Edit
In order to prevent too much memory from being utilized, you can execute the bulk operations every x docs processed:
> var bulkInsert = db.target.initializeUnorderedBulkOp()
> var bulkRemove = db.source.initializeUnorderedBulkOp()
> var x = 10000
> var counter = 0
> var date = new Date()
> date.setMonth(date.getMonth() -1)
> db.source.find({"yourDateField":{$lt: date}}).forEach(
function(doc){
bulkInsert.insert(doc);
bulkRemove.find({_id:doc._id}).removeOne();
counter ++
if( counter % x == 0){
bulkInsert.execute()
bulkRemove.execute()
bulkInsert = db.target.initializeUnorderedBulkOp()
bulkRemove = db.source.initializeUnorderedBulkOp()
}
}
)
> bulkInsert.execute()
> bulkRemove.execute()
Insert and remove:
var documentsToMove = db.collectionA.find({});
documentsToMove.forEach(function(doc) {
db.collectionB.insert(doc);
db.collectionA.remove(doc);
});
note: this method might be quite slow for large collections or collections holding large documents.
$out is used to create the new collection with the data, so use $out
db.oldCollection.aggregate([{$out : "newCollection"}])
then use drop
db.oldCollection.drop()
You can use a range query to get the data from sourceCollection, keep the cursor data in a variable, loop over it, and insert into the target collection:
var doc = db.sourceCollection.find({
"Timestamp":{
$gte:ISODate("2014-09-01T00:00:00Z"),
$lt:ISODate("2014-10-01T00:00:00Z")
}
});
doc.forEach(function(doc){
db.targetCollection.insert(doc);
})
Hope so it helps!!
First option (Using mongo dump)
1. Get a dump of the source collection
mongodump -d db -c source_collection
2. Restore into the target collection
mongorestore -d db -c target_collection dump/db_name/source_collection.bson
Second Option
Running aggregate
db.getCollection('source_collection').aggregate([ { $match: {"emailAddress" : "apitester@mailinator.com"} }, { $out: "target_collection" } ])
Third Option (Slowest)
Running a through for loop
db.getCollection('source_collection').find().forEach(function(docs){ db.getCollection('target_collection').insert(docs); }); print("Rollback Completed!");
Maybe, from a performance point of view, it's better to remove a lot of documents using one command (especially if you have indexes for the query part) rather than deleting them one by one.
For example:
db.source.find({yourDateField: {$gte: start, $lt: end}}).forEach(function(doc){
db.target.insert(doc);
});
db.source.remove({yourDateField: {$gte: start, $lt: end}});
This is a restatement of @Markus W Mahlberg's answer.
Returning the favor - as a function:
function moveDocuments(sourceCollection,targetCollection,filter) {
var bulkInsert = targetCollection.initializeUnorderedBulkOp();
var bulkRemove = sourceCollection.initializeUnorderedBulkOp();
sourceCollection.find(filter)
.forEach(function(doc) {
bulkInsert.insert(doc);
bulkRemove.find({_id:doc._id}).removeOne();
}
)
bulkInsert.execute();
bulkRemove.execute();
}
An example use
var x = {dsid:{$exists: true}};
moveDocuments(db.pictures,db.artifacts,x)
to move all documents that have top level element dsid from the pictures to the artifacts collection
Here's an update to @jasongarber's answer which uses the more recent mongo 'bulkWrite' operation (read the docs here), and also keeps the whole process asynchronous so you can run it as part of a wider script which depends on its completion.
async function moveDocuments (sourceCollection, targetCollection, filter) {
const sourceDocs = await sourceCollection.find(filter)
console.log(`Moving ${await sourceDocs.count()} documents from ${sourceCollection.collectionName} to ${targetCollection.collectionName}`)
const idsOfCopiedDocs = await insertDocuments(targetCollection, sourceDocs)
const targetDocs = await targetCollection.find({_id: {$in: idsOfCopiedDocs}})
await deleteDocuments(sourceCollection, targetDocs)
console.log('Done!')
}
async function insertDocuments (collection, documents) {
const insertedIds = []
const bulkWrites = []
await documents.forEach(doc => {
const {_id} = doc
insertedIds.push(_id)
bulkWrites.push({
replaceOne: {
filter: {_id},
replacement: doc,
upsert: true,
},
})
})
if (bulkWrites.length) await collection.bulkWrite(bulkWrites, {ordered: false})
return insertedIds
}
async function deleteDocuments (collection, documents) {
const bulkWrites = []
await documents.forEach(({_id}) => {
bulkWrites.push({
deleteOne: {
filter: {_id},
},
})
})
if (bulkWrites.length) await collection.bulkWrite(bulkWrites, {ordered: false})
}
From MongoDB 3.0 up, you can use the copyTo command with the following syntax:
db.source_collection.copyTo("target_collection")
Then you can use the drop command to remove the old collection:
db.source_collection.drop()
I do like the response from @markus-w-mahlberg, however at times I have seen the need to keep it a bit simpler for people. As such I have a couple of functions below. You could naturally wrap things here with bulk operators as he did, but this code works with new and old Mongo systems equally.
function parseNS(ns){
//Expects we are forcing people to not violate the rules and not doing "foodb.foocollection.month.day.year" if they do they need to use an array.
if (ns instanceof Array){
database = ns[0];
collection = ns[1];
}
else{
tNS = ns.split(".");
if (tNS.length > 2){
print('ERROR: NS had more than 1 period in it, please pass as an [ "dbname","coll.name.with.dots"] !');
return false;
}
database = tNS[0];
collection = tNS[1];
}
return {database: database,collection: collection};
}
function insertFromCollection( sourceNS, destNS, query, batchSize, pauseMS){
//Parse and check namespaces
srcNS = parseNS(sourceNS);
destNS = parseNS(destNS);
if ( srcNS == false || destNS == false){return false;}
batchBucket = new Array();
totalToProcess = db.getSiblingDB(srcNS.database).getCollection(srcNS.collection).find(query,{_id:1}).count();
currentCount = 0;
print("Processed "+currentCount+"/"+totalToProcess+"...");
db.getSiblingDB(srcNS.database).getCollection(srcNS.collection).find(query).addOption(DBQuery.Option.noTimeout).forEach(function(doc){
batchBucket.push(doc);
if ( batchBucket.length > batchSize){
db.getSiblingDB(destNS.database).getCollection(destNS.collection).insert(batchBucket);
currentCount += batchBucket.length;
batchBucket = [];
sleep (pauseMS);
print("Processed "+currentCount+"/"+totalToProcess+"...");
}
});
// flush any documents left in the final, partial batch
if ( batchBucket.length > 0){
db.getSiblingDB(destNS.database).getCollection(destNS.collection).insert(batchBucket);
currentCount += batchBucket.length;
print("Processed "+currentCount+"/"+totalToProcess+"...");
}
print("Completed");
}
/** Example Usage:
insertFromCollection("foo.bar","foo2.bar",{"type":"archive"},1000,20);
*/
You could obviously add a db.getSiblingDB(srcNS.database).getCollection(srcNS.collection).remove(query,true)
If you wanted to also remove the records after they are copied to the new location. The code can easily be built like that to make it restartable.
I had 2297 collections holding 15 million documents, but some collections were empty.
Using only copyTo the script failed, but with this script optimization:
db.getCollectionNames().forEach(function(collname) {
var c = db.getCollection(collname).count();
if(c!==0){
db.getCollection(collname).copyTo('master-collection');
print('Copied collection ' + collname);
}
});
everything works fine for me.
NB: copyTo is deprecated because it blocks read/write operations, so I think it is fine if you know that the database is not usable during this operation.
In my case forEach didn't work, so I had to make some changes.
var kittySchema = new mongoose.Schema({
name: String
});
var Kitten = mongoose.model('Kitten', kittySchema);
var catSchema = new mongoose.Schema({
name: String
});
var Cat = mongoose.model('Cat', catSchema);
This is Model for both the collection
function Recursion(){
Kitten.findOne().lean().exec(function(error, results){
if(!error){
var objectResponse = results;
var RequiredId = objectResponse._id;
delete objectResponse._id;
var swap = new Cat(objectResponse);
swap.save(function (err) {
if (err) {
return err;
}
else {
console.log("SUCCESSFULL");
Kitten.deleteOne({ _id: RequiredId }, function(err) {
if (!err) {
console.log('notification!');
}
else {
return err;
}
});
Recursion();
}
});
}
if (error) {
console.log("No object found");
// return err;
}
})
}
I planned to archive 1000 records at a time using the bulk insert and bulk delete methods of pymongo.
For both source and target
create mongodb objects to connect to the database.
instantiate the bulk objects. Note: I created a backup of bulk objects too. This will help me to rollback the insertion or removal when an error occurs.
example:
For source
// replace this with mongodb object creation logic
source_db_obj = db_help.create_db_obj(source_db, source_col)
source_bulk = source_db_obj.initialize_ordered_bulk_op()
source_bulk_bak = source_db_obj.initialize_ordered_bulk_op()
For target
// replace this with mongodb object creation logic
target_db_obj = db_help.create_db_obj(target_db, target_col)
target_bulk = target_db_obj.initialize_ordered_bulk_op()
target_bulk_bak = target_db_obj.initialize_ordered_bulk_op()
Obtain the source records that matches the filter criteria
source_find_results = source_db_obj.find(filter)
Loop through the source records
create target and source bulk operations
Append archived_at field with the current datetime to the target collection
//replace this with the logic to obtain the UTCtime.
doc['archived_at'] = db_help.getUTCTime()
target_bulk.insert(document)
source_bulk.remove(document)
for rollback in case of any errors or exceptions, create target_bulk_bak and source_bulk_bak operations.
target_bulk_bak.find({'_id':doc['_id']}).remove_one()
source_bulk_bak.insert(doc)
//remove the extra column
doc.pop('archived_at', None)
When the record count reaches 1000, execute the target bulk insertion and source bulk removal. Note: this method takes the target_bulk and source_bulk objects for execution.
execute_bulk_insert_remove(source_bulk, target_bulk)
When an exception occurs, execute the target_bulk_bak removals and source_bulk_bak insertions. This rolls back the changes. Since MongoDB doesn't have rollback, I came up with this hack.
execute_bulk_insert_remove(source_bulk_bak, target_bulk_bak)
Finally re-initialize the source and target bulk and bulk_bak objects. This is necessary because you can use them only once.
Complete code
def execute_bulk_insert_remove(source_bulk, target_bulk):
try:
target_bulk.execute()
source_bulk.execute()
except BulkWriteError as bwe:
raise Exception(
"could not archive document, reason: {}".format(bwe.details))
def archive_bulk_immediate(filter, source_db, source_col, target_db, target_col):
"""
filter: filter criteria for backup
source_db: source database name
source_col: source collection name
target_db: target database name
target_col: target collection name
"""
count = 0
bulk_count = 1000
source_db_obj = db_help.create_db_obj(source_db, source_col)
source_bulk = source_db_obj.initialize_ordered_bulk_op()
source_bulk_bak = source_db_obj.initialize_ordered_bulk_op()
target_db_obj = db_help.create_db_obj(target_db, target_col)
target_bulk = target_db_obj.initialize_ordered_bulk_op()
target_bulk_bak = target_db_obj.initialize_ordered_bulk_op()
source_find_results = source_db_obj.find(filter)
start = datetime.now()
for doc in source_find_results:
doc['archived_at'] = db_help.getUTCTime()
target_bulk.insert(doc)
source_bulk.find({'_id': doc['_id']}).remove_one()
target_bulk_bak.find({'_id': doc['_id']}).remove_one()
doc.pop('archived_at', None)
source_bulk_bak.insert(doc)
count += 1
if count % 1000 == 0:
logger.info("count: {}".format(count))
try:
execute_bulk_insert_remove(source_bulk, target_bulk)
except BulkWriteError as bwe:
execute_bulk_insert_remove(source_bulk_bak, target_bulk_bak)
logger.info("Bulk Write Error: {}".format(bwe.details))
raise
source_bulk = source_db_obj.initialize_ordered_bulk_op()
source_bulk_bak = source_db_obj.initialize_ordered_bulk_op()
target_bulk = target_db_obj.initialize_ordered_bulk_op()
target_bulk_bak = target_db_obj.initialize_ordered_bulk_op()
end = datetime.now()
logger.info("archived {} documents to {} in ms.".format(
count, target_col, (end - start)))