MongoDB - loop through each database and run a command - mongodb

My MongoDB instance has several databases of the same design. I want to loop through each database and apply unique indexes to several collections in each database.
I understand how to apply the new unique index to the collection, but how do I loop through each database and run the command on each?
// Create a unique index on `email` with a strength-2 "en" collation,
// i.e. uniqueness is enforced case-insensitively.
db.collection.createIndex (
{ email : 1 },
{ unique : true, collation : { locale : "en", strength : 2 } }
)

The general idea is to loop through the list returned from the commands
db.adminCommand('listDatabases')
or
db.getMongo().getDBNames()
Within the loop, create the db object with getDB() method and loop through each db's collection with db.getCollectionNames(), get the collection object with db.getCollection() or db[collectionName] bracket syntax and create the index, something like the following:
// For every database on this mongod, ensure each collection has the
// unique case-insensitive index on `email`, skipping collections that
// already have it.
mongo = db.getMongo(); // or mongo = new Mongo();
mongo.getDBNames().forEach(function (dbname) {
    db = mongo.getDB(dbname);
    db.getCollectionNames().forEach(function (collectionName) {
        collection = db.getCollection(collectionName); // or db[collectionName]
        indexes = collection.getIndexes();
        print("Indexes for " + collectionName + ":");
        printjson(indexes);
        // BUG FIX: `idx.key === { "email": 1 }` compares object identity and is
        // always false in JavaScript; compare the key pattern structurally.
        emailIdx = indexes.filter(function (idx) {
            return JSON.stringify(idx.key) === JSON.stringify({ "email": 1 });
        });
        if (emailIdx.length < 1) {
            collection.createIndex(
                { email: 1 },
                { unique: true, collation: { locale: "en", strength: 2 } }
            );
        }
    });
});
A useful Mongo shell cheat sheet can be found here.

Related

Mongodb: Loop through all database and collection to find a field ( key ) in used?

I have a set of databases in mongo and each has a set of collections. I need to iterate through all the databases and collections to find whether a certain key is being used in any collection. With some googling I have managed to iterate through the databases and collections, but I'm unable to find whether a certain key is used in any collection. The following is my script, executed in the mongodb shell.
// Switch to admin so listDatabases is permitted.
db = db.getSiblingDB("admin");
var dbs = db.runCommand({ "listDatabases": 1 }).databases;
dbs.forEach(function(database) {
db = db.getSiblingDB(database.name);
cols = db.getCollectionNames();
cols.forEach(function(col) {
// BUG (the subject of this question): find() yields whole documents, so
// `field` is a document object here, and comparing a document to the
// string "testKey" with == is always false. The keys of each document
// must be iterated instead — see the answer below.
db[col].find({}).forEach(function(field) {
//print(field);
if( field == "testKey"){
print(col + " has column")
}
});
});
});
How can I correct this script so that, if the field equals "testKey", it prints that database and collection? Currently it does not go inside the if condition even though I have "testKey" in some collections.
Note that find returns a list of documents, so you actually need another iteration over the keys of each document.
// Iterate every database/collection/document and report where the key
// "testKey" appears.
db = db.getSiblingDB("admin");
var dbs = db.runCommand({ "listDatabases": 1 }).databases;
dbs.forEach(function (database) {
    db = db.getSiblingDB(database.name);
    cols = db.getCollectionNames();
    cols.forEach(col =>
        db[col].find({}).forEach(doc =>
            Object.keys(doc).forEach(key => {
                // FIX: print which database/collection contains the key
                // (as the question asks), rather than printing a
                // true/false line for every key of every document.
                if (key === 'testKey') {
                    print(database.name + "." + col + " has testKey");
                }
            })
        )
    )
});

How to compare all documents in two collections with millions of doc and write the diff in a third collection in MongoDB

I have two collections (coll_1, coll_2) with a million documents each.
These two collections are actually created by running two versions of a piece of code from the same data source, so both collections will have the same number of documents, but a document may have an extra field, a missing sub-document, or different values between the two; both collections' documents will have the same primary_key_id, which is indexed.
I have this javascript function saved on the db to get the diff
// Store a recursive shallow-diff helper on the server (db.system.js).
// For each key of obj1: records obj2[key] when the values differ, and
// recurses when both sides are objects (arrays included, since
// typeof [] === 'object').
db.system.js.save({
    _id: "diffJSON", value:
    // FIX: named function expression so recursion does not rely on
    // arguments.callee (deprecated; a TypeError in strict mode).
    function diffJSON(obj1, obj2) {
        var result = {};
        for (var key in obj1) { // `var` added — was an implicit global
            if (obj2[key] != obj1[key]) result[key] = obj2[key];
            // The old `typeof x == 'array'` branch was dead code: typeof
            // never returns 'array'. The object branch below already
            // recurses into arrays.
            // NOTE(review): typeof null === 'object', so a null/object pair
            // recurses and yields {} — behavior preserved from the original.
            if (typeof obj2[key] == 'object' && typeof obj1[key] == 'object')
                result[key] = diffJSON(obj1[key], obj2[key]);
        }
        return result;
    }
});
Which runs fine like this
diffJSON(testObj1, testObj2);
Question: How to run diffJSON on coll1 and coll2, and output diffJSON result into coll3 along with primary_key_id.
I am new to MongoDB, and I understand that JOINs don't work the same way as in an RDBMS, so I wonder if I have to copy the two documents being compared into a single collection and then run the diffJSON function.
Also, most of the time (say 90%) documents in two collections will be identical, I would need to know about only 10% of docs which have any diff.
Here is a simple example document:
(but real doc is around 15k in size, just so you know the scale)
var testObj1 = { test:"1",test1: "2", tt:["td","ax"], tr:["Positive"] ,tft:{test:["a"]}};
var testObj2 = { test:"1",test1: "2", tt:["td","ax"], tr:["Negative"] };
If you know a better way to diff the documents, please feel free to suggest.
you can use a simple shell script to achieve this. First create a file named script.js and paste this code in it :
// load previously saved diffJSON() function
db.loadServerScripts();
// Walk every document of coll1, look up its counterpart in coll2 by _id,
// and store any non-empty diff in coll3 under the same _id.
db.coll1.find().forEach(function (sourceDoc) {
    var key = sourceDoc._id;
    // fetch the matching document from coll2
    var counterpart = db.coll2.findOne({ _id: key });
    // compute the difference between the two documents
    var delta = diffJSON(counterpart, sourceDoc);
    // only record documents that actually differ
    if (Object.keys(delta).length > 0) {
        delta._id = key;
        db.coll3.insert(delta);
    }
});
In this script I assume that your primary_key is the _id field.
then execute it from your shell like this:
mongo --host hostName --port portNumber databaseName < script.js
where databaseName is the name of the database containing the collections coll1 and coll2.
for this samples documents (just added an _id field to your docs):
var testObj1 = { _id: 1, test:"1",test1: "2", tt:["td","ax"], tr:["Positive"] ,tft:{test:["a"]}};
var testObj2 = { _id: 1, test:"1",test1: "2", tt:["td","ax"], tr:["Negative"] };
the script will save the following doc in coll3 :
{ "_id" : 1, "tt" : { }, "tr" : { "0" : "Positive" } }
This solution builds upon the one proposed by felix (I don't have the necessary reputation to comment on it). I made a few small changes to his script that bring important performance improvements:
// load previously saved diffJSON() function
db.loadServerScripts();
// get all the documents from coll1 and coll2, sorted by primary key so
// the two cursors advance in lockstep over matching documents
var cursor1 = db.coll1.find().sort({ '_id': 1 });
var cursor2 = db.coll2.find().sort({ '_id': 1 });
// iterate over both cursors in parallel
while (cursor1.hasNext() && cursor2.hasNext()) {
    var doc1 = cursor1.next();
    var doc2 = cursor2.next();
    var pk = doc1._id;
    // FIX: the lockstep pairing silently breaks if either collection is
    // missing a document (the exact risk described below); fail loudly
    // instead of diffing mismatched documents.
    if (String(pk) !== String(doc2._id)) {
        throw new Error("primary key mismatch: " + pk + " vs " + doc2._id);
    }
    // compute the diff
    var diff = diffJSON(doc2, doc1);
    // if there is a difference between the two objects
    if (Object.keys(diff).length > 0) {
        diff._id = pk;
        // insert the diff in coll3 with the same _id
        db.coll3.insert(diff);
    }
}
Two cursors are used for fetching all the entries in the database sorted by the primary key. This is a very important aspect and brings most of the performance improvement. By retrieving the documents sorted by primary key, we make sure we match them correctly by the primary key. This is based on the fact that the two collections hold the same data.
This way we avoid making a call to coll2 for each document in coll1. It might seem as something insignificant, but we're talking about 1 million calls which put a lot of stress on the database.
Another important assumption is that the primary key field is _id. If it's not the case, it is crucial to have an unique index on the primary key field. Otherwise, the script might mismatch documents with the same primary key.

Query or command to find a Document, given an ObjectID but NOT a collection

So I have a document that has references to foreign ObjectIDs that may point to other documents or collections.
For example this is the pseudo-structure of the document
{
_id: ObjectID(xxxxxxxx),
....
reference: ObjectID(yyyyyyyy)
}
I can't find anything that does not involve providing the collection and given that I don't know for sure on which collection to search, I am wondering if there is a way for me to find the document in the entire database and find the collection ObjectID(yyyyyyyy) belongs to.
The only possible way to do this is by listing every collection in the database and performing a db.collection.find() on each one.
E.g. in the Mongo shell I would do something like
// Probe every collection in the current database for a document with the
// given _id, collecting any matches.
var result = [];
var collections = db.getCollectionNames();
collections.forEach(function (collectionName) {
    var found = db.getCollection(collectionName).findOne({ "_id" : ObjectId("yyyyyyyy") });
    if (found) {
        result.push(found);
    }
});
print(result);
You need to run your query on all collections in your database.
// Print every document, in any collection, that either is the target
// ObjectId or references it.
var targetId = ObjectId("535372b537e6210c53005ee5");
db.getCollectionNames().forEach(function (name) {
    db[name]
        .find({ $or: [{ _id: targetId }, { reference: targetId }] })
        .forEach(printjson);
});

Delete all _id field from subdocuments

I have been using Mongoose to insert a large amount of data into a mongodb database. I noticed that by default, Mongoose adds _id fields to all subdocuments, leaving me with documents which look like this (I've removed many fields for brevity - I've also shrunken each array to one entry, they generally have more)
{
"start_time" : ISODate("2013-04-05T02:30:28Z"),
"match_id" : 165816931,
"players" : [
{
"account_id" : 4294967295,
"_id" : ObjectId("51daffdaa78cee5c36e29fba"),
"additional_units" : [ ],
"ability_upgrades" : [
{
"ability" : 5155,
"time" : 141,
"level" : 1,
"_id" : ObjectId("51daffdaa78cee5c36e29fca")
},
]
},
],
"_id" : ObjectId("51daffdca78cee5c36e2a02e")
}
I have found how to prevent Mongoose adding these by default (http://mongoosejs.com/docs/guide.html, see option: id), however I now have 95 million records with these extraneous _id fields on all subdocuments. I am interested in finding the best way of deleting all of these fields (leaving the _id on the top level document). My initial thoughts are to use a bunch of for...in loops on each object but this seems very inefficient.
Given Derick's answer, I have created a function to do this:
// Recursively removes the `_id` field from every sub-document of `obj`
// (arrays included), keeping the `_id` on the root document itself when
// isRoot is true. Mutates `obj` in place and returns it.
// FIX: the original snippet was missing the function's closing brace.
var deleteIdFromSubdocs = function (obj, isRoot) {
    for (var key in obj) {
        if (isRoot === false && key === "_id") {
            delete obj[key];
        } else if (typeof obj[key] === "object") {
            // typeof null === 'object' too, but a for..in over null is a
            // harmless no-op, so recursing is safe.
            deleteIdFromSubdocs(obj[key], false);
        }
    }
    return obj;
};
And run it against a test collection using:
db.testobjects.find().forEach(function (x){ y = deleteIdFromSubdocs(x, true); db.testobjects.save(y); } )
This appears to work for my test collection. I'd like to see if anyone has any opinions on how this could be done better/any risks involved before I run it against the 95 million document collection.
The players._id could be removed using an update operation, like the following:
db.collection.update({'players._id': {$exists : 1}}, { $unset : { 'players.$._id' : 1 } }, false, true)
However, it's not possible to use the positional operator in nested arrays. So, one solution is to run a script directly against the database:
// Strip _id from every players element and from each of its nested
// ability_upgrades entries, then persist the cleaned document.
db.collection.find({'players.ability_upgrades._id': {$exists : 1}}).forEach(function (doc) {
    doc.players.forEach(function (player) {
        delete player._id;
        player.ability_upgrades.forEach(function (upgrade) {
            delete upgrade._id;
        });
    });
    db.collection.save(doc);
});
Save the script to a file and call mongo with the file as parameter:
> mongo remove_oid.js --shell
The only solution is to do this one by one, exactly with a for...in loop as you described.
Just another version, try this with AngularJS and MongoDB ;-)
// Strip `_id` from a Mongoose document's backing store (`_doc`) and
// recurse into any array-valued fields; the root `_id` is kept when
// isRoot is true. Mutates in place and returns the document.
function removeIds (obj, isRoot) {
    var backing = obj._doc;
    for (var field in backing) {
        if (isRoot === false && field === "_id") {
            delete backing._id;
            continue;
        }
        var candidate = obj[field];
        if (Array.isArray(candidate)) {
            candidate.forEach(function (element) {
                removeIds(element, false);
            });
        }
    }
    return obj;
}
Usage:
// Strip sub-document _ids, then drop the root _id separately.
var newObj = removeIds(oldObj, true);
delete newObj._id;

In MongoDB mapreduce, how can I flatten the values object?

I'm trying to use MongoDB to analyse Apache log files. I've created a receipts collection from the Apache access logs. Here's an abridged summary of what my models look like:
db.receipts.findOne()
{
"_id" : ObjectId("4e57908c7a044a30dc03a888"),
"path" : "/videos/1/show_invisibles.m4v",
"issued_at" : ISODate("2011-04-08T00:00:00Z"),
"status" : "200"
}
I've written a MapReduce function that groups all data by the issued_at date field. It summarizes the total number of requests, and provides a breakdown of the number of requests for each unique path. Here's an example of what the output looks like:
db.daily_hits_by_path.findOne()
{
"_id" : ISODate("2011-04-08T00:00:00Z"),
"value" : {
"count" : 6,
"paths" : {
"/videos/1/show_invisibles.m4v" : {
"count" : 2
},
"/videos/1/show_invisibles.ogv" : {
"count" : 3
},
"/videos/6/buffers_listed_and_hidden.ogv" : {
"count" : 1
}
}
}
}
How can I make the output look like this instead:
{
"_id" : ISODate("2011-04-08T00:00:00Z"),
"count" : 6,
"paths" : {
"/videos/1/show_invisibles.m4v" : {
"count" : 2
},
"/videos/1/show_invisibles.ogv" : {
"count" : 3
},
"/videos/6/buffers_listed_and_hidden.ogv" : {
"count" : 1
}
}
}
It's not currently possible, but I would suggest voting for this case: https://jira.mongodb.org/browse/SERVER-2517.
Taking the best from previous answers and comments:
// Replace each map-reduce result document with its `value` contents,
// hinting the _id index so the cursor does not revisit rewritten docs.
db.items
    .find()
    .hint({ _id: 1 })
    .forEach(function (doc) {
        db.items.update({ _id: doc._id }, doc.value);
    });
From http://docs.mongodb.org/manual/core/update/#replace-existing-document-with-new-document
"If the update argument contains only field and value pairs, the update() method replaces the existing document with the document in the update argument, except for the _id field."
So you need neither to $unset value, nor to list each field.
From https://docs.mongodb.com/manual/core/read-isolation-consistency-recency/#cursor-snapshot
"MongoDB cursors can return the same document more than once in some situations. ... use a unique index on this field or these fields so that the query will return each document no more than once. Query with hint() to explicitly force the query to use that index."
AFAIK, by design Mongo's map reduce will spit results out in "value tuples" and I haven't seen anything that will configure that "output format". Maybe the finalize() method can be used.
You could try running a post-process that will reshape the data using
// Reshape each result document, hoisting count/paths out of `value`.
results.find({}).forEach(function (doc) {
    var reshaped = { count: doc.value.count, paths: doc.value.paths };
    results.update({ _id: doc._id }, reshaped);
});
Yep, that looks ugly. I know.
You can do Dan's code with a collection reference:
// Flatten map-reduce output on an arbitrary collection reference:
// replace each document with its `value` contents (minus value._id),
// then drop the leftover `value` field.
// FIX: the second update used `result.id` (always undefined) instead of
// `result._id`, so the $unset targeted {_id: undefined}.
function clean(collection) {
    collection.find().forEach(function (result) {
        var value = result.value;
        delete value._id;
        collection.update({ _id: result._id }, value);
        collection.update({ _id: result._id }, { $unset: { value: 1 } });
    });
}
A similar approach to that of #ljonas but no need to hardcode document fields:
// Same reshape without hardcoding the document fields: replace each doc
// with its `value` contents, then drop the leftover `value` field.
// FIX: the second update used `result.id` (always undefined) instead of
// `result._id`, so the $unset never matched the intended document.
db.results.find().forEach(function (result) {
    var value = result.value;
    delete value._id;
    db.results.update({ _id: result._id }, value);
    db.results.update({ _id: result._id }, { $unset: { value: 1 } });
});
All the proposed solutions are far from optimal. The fastest you can do so far is something like:
// Bulk-flatten a map-reduce collection: replace every document that has
// a `value` field with that value, batching replaceOne ops 1000 at a time.
// FIX: when the document count is an exact multiple of 1000, the original
// trailing bulk.execute() ran on an empty bulk op, which throws in the
// shell; the final flush is now guarded by a pending-op counter.
var flattenMRCollection = function (dbName, collectionName) {
    var collection = db.getSiblingDB(dbName)[collectionName];
    var i = 0;
    var pendingOps = 0;
    var bulk = collection.initializeUnorderedBulkOp();
    collection.find({ value: { $exists: true } }).addOption(16).forEach(function (result) {
        print(++i);
        bulk.find({ _id: result._id }).replaceOne(result.value);
        pendingOps++;
        if (i % 1000 == 0) {
            print("Executing bulk...");
            bulk.execute();
            bulk = collection.initializeUnorderedBulkOp();
            pendingOps = 0;
        }
    });
    // Flush the final partial batch only if something is queued.
    if (pendingOps > 0) {
        bulk.execute();
    }
};
Then call it:
flattenMRCollection("MyDB","MyMRCollection")
This is WAY faster than doing sequential updates.
While experimenting with Vincent's answer, I found a couple of problems. Basically, if you perform updates within a foreach loop, this will move the document to the end of the collection and the cursor will reach that document again (example). This can be circumvented if $snapshot is used. Hence, I am providing a Java example below.
// Batched replace of each document with its `value` sub-document,
// using $snapshot so updated documents are not revisited by the cursor.
final List<WriteModel<Document>> bulkUpdate = new ArrayList<>();
// You should enable $snapshot if performing updates within foreach
collection.find(new Document().append("$query", new Document()).append("$snapshot", true)).forEach(new Block<Document>() {
    @Override // FIX: original had "#Override", which is not valid Java
    public void apply(final Document document) {
        // Note that I used incrementing long values for '_id'. Change to String if
        // you used string '_id's
        long docId = document.getLong("_id");
        Document subDoc = (Document) document.get("value");
        WriteModel<Document> m = new ReplaceOneModel<>(new Document().append("_id", docId), subDoc);
        bulkUpdate.add(m);
        // If you used non-incrementing '_id's, then you need to use a final object with a counter.
        if (docId % 1000 == 0 && !bulkUpdate.isEmpty()) {
            collection.bulkWrite(bulkUpdate);
            bulkUpdate.clear(); // clear() is the idiomatic form of removeAll(self)
        }
    }
});
// Flush the final partial batch.
if (!bulkUpdate.isEmpty()) {
    collection.bulkWrite(bulkUpdate);
    bulkUpdate.clear();
}
Note : This snippet takes an average of 7.4 seconds to execute on my machine with 100k records and 14 attributes (IMDB dataset). Without batching, it takes an average of 25.2 seconds.