Delete all _id field from subdocuments - mongodb

I have been using Mongoose to insert a large amount of data into a mongodb database. I noticed that by default, Mongoose adds _id fields to all subdocuments, leaving me with documents which look like this (I've removed many fields for brevity - I've also shrunken each array to one entry, they generally have more)
{
"start_time" : ISODate("2013-04-05T02:30:28Z"),
"match_id" : 165816931,
"players" : [
{
"account_id" : 4294967295,
"_id" : ObjectId("51daffdaa78cee5c36e29fba"),
"additional_units" : [ ],
"ability_upgrades" : [
{
"ability" : 5155,
"time" : 141,
"level" : 1,
"_id" : ObjectId("51daffdaa78cee5c36e29fca")
},
]
},
],
"_id" : ObjectId("51daffdca78cee5c36e2a02e")
}
I have found how to prevent Mongoose adding these by default (http://mongoosejs.com/docs/guide.html, see option: id), however I now have 95 million records with these extraneous _id fields on all subdocuments. I am interested in finding the best way of deleting all of these fields (leaving the _id on the top level document). My initial thoughts are to use a bunch of for...in loops on each object but this seems very inefficient.

Given Derick's answer, I have created a function to do this:
// Recursively strips the "_id" field from every subdocument while keeping
// the top-level document's _id intact.
//   obj    - document (or nested value) to clean; mutated in place.
//   isRoot - pass true for the outermost call so the root _id survives.
// Returns the same (mutated) object for convenient chaining.
var deleteIdFromSubdocs = function (obj, isRoot) {
    for (var key in obj) {
        if (isRoot === false && key === "_id") {
            delete obj[key];
        } else if (obj[key] !== null && typeof obj[key] === "object") {
            // typeof null is "object", so guard against it explicitly.
            // Arrays also report "object"; for...in walks their indices,
            // so array elements are descended into as well.
            deleteIdFromSubdocs(obj[key], false);
        }
    }
    return obj;
};
And run it against a test collection using:
db.testobjects.find().forEach(function (x){ y = deleteIdFromSubdocs(x, true); db.testobjects.save(y); } )
This appears to work for my test collection. I'd like to see if anyone has any opinions on how this could be done better/any risks involved before I run it against the 95 million document collection.

The players._id could be removed using an update operation, like the following:
db.collection.update({'players._id': {$exists : 1}}, { $unset : { 'players.$._id' : 1 } }, false, true)
However, it's not possible to use the positional operator in nested arrays, so one solution is to run a script directly on your database:
// Only visit documents that actually contain nested _id fields to remove.
var cursor = db.collection.find({'players.ability_upgrades._id': {$exists : 1}});
cursor.forEach(function(doc) {
    for (var i = 0; i < doc.players.length; i++) {
        var player = doc.players[i];
        delete player['_id'];
        // The query guarantees that at least one player has an
        // ability_upgrades array, but not that every player does -
        // guard so a missing array does not abort the whole run.
        if (player.ability_upgrades) {
            for (var j = 0; j < player.ability_upgrades.length; j++) {
                delete player.ability_upgrades[j]['_id'];
            }
        }
    }
    // Write the cleaned document back in full.
    db.collection.save(doc);
});
Save the script to a file and call mongo with the file as parameter:
> mongo remove_oid.js --shell

The only solution is to do this one by one, exactly with a for...in loop as you described.

Just another version, try this with AngularJS and MongoDB ;-)
// Walks a Mongoose document through its internal _doc store and deletes the
// _id field at every nested (non-root) level. Only array-valued fields are
// descended into, matching the shape of subdocument arrays.
function removeIds (obj, isRoot) {
    for (var key in obj._doc) {
        var value = obj[key];
        if (isRoot === false && key === "_id") {
            delete obj._doc._id;
        } else if (Array.isArray(value)) {
            for (var idx = 0; idx < value.length; idx++) {
                removeIds(value[idx], false);
            }
        }
    }
    return obj;
}
Usage:
var newObj = removeIds(oldObj, true);
delete newObj._id;

Related

MongoDB - loop through each database and run a command

My MongoDB instance have several databases of same design. I want to loop through each database and apply unique indexes to several collections in each database.
I understand how to apply the new unique index to the collection, but how do I loop through each database and run the command on each?
db.collection.createIndex (
{ email : 1 },
{ unique : true, collation : { locale : "en", strength : 2 } }
)
The general idea is to loop through the list returned from the commands
db.adminCommand('listDatabases')
or
db.getMongo().getDBNames()
Within the loop, create the db object with getDB() method and loop through each db's collection with db.getCollectionNames(), get the collection object with db.getCollection() or db[collectionName] bracket syntax and create the index, something like the following:
// Iterate every database on the server and make sure each collection has a
// unique, case-insensitive index on { email: 1 }.
var mongo = db.getMongo(); // or mongo = new Mongo();
mongo.getDBNames().forEach(function(dbname) {
    // NOTE(review): this also visits system databases (admin/local/config);
    // filter dbname here if those should be excluded.
    var currentDb = mongo.getDB(dbname);
    currentDb.getCollectionNames().forEach(function(collectionName) {
        var collection = currentDb.getCollection(collectionName); // or currentDb[collectionName]
        var indexes = collection.getIndexes();
        print("Indexes for " + collectionName + ":");
        printjson(indexes);
        // `idx.key === { "email": 1 }` compares object identity and is
        // always false, so the index would have been recreated every run.
        // Inspect the key pattern's fields instead.
        var emailIdx = indexes.filter(function(idx) {
            return idx.key && idx.key.email === 1 && Object.keys(idx.key).length === 1;
        });
        if (emailIdx.length < 1) {
            collection.createIndex(
                { email : 1 },
                { unique : true, collation : { locale : "en", strength : 2 } }
            );
        }
    });
});
A useful Mongo shell cheat sheet can be found here.

MongoDB - reduce function does not work properly

My map function returns key-value pairs where key is the name of a field and the value is an object {type: <field type>, count : 1}.
For example suppose I have these documents:
{
"_id" : ObjectId("57611ad6bcc0d7e01be886c8"),
"index" : NumberInt(0)
}
{
"_id" : ObjectId("57611ad6bcc0d7e01be886c9"),
"index" : NumberInt(7)
}
{
"_id" : ObjectId("57611ad6bcc0d7e01be886c7"),
"index" : NumberInt(9)
}
I have to retrieve the name of each field, its type and the number of occurrences of the field in my collection.
My map function works and I get:
"_id", [{type:"ObjectId", count:1},{type:"ObjectId", count:1},{type:"ObjectId", count:1}]
"index",[{type:"number", count:1},{type:"number", count:1},{type:"number", count:1}]
I want to delete duplicates from type.
I have the following reduce function:
// Reduce function as posted in the question. Known problems, left intact:
//   * the indexOf test is inverted - it pushes a type only when it is
//     ALREADY present, so `array` always stays empty; the intended guard
//     is `indexOf(...) === -1` (add only unseen types);
//   * as pasted, the body is truncated: there is no `return reduceVal;`
//     and no closing brace.
function (key, stuff) {
reduceVal = {type:"", count:0};  // NOTE(review): implicit global, should be `var`
var array = [];
for(var idx =0; idx < stuff.length; idx++) {
reduceVal.count += stuff[idx].count;  // sum the per-chunk counts
if(array.indexOf(stuff[idx].type) > -1) {  // BUG: condition is inverted
array.push(stuff[idx].type);
}
}
reduceVal.type = array.toString();
The if clause does not work. My target is to add an element to my array just if it is not a duplicate.
Expected output:
"_id", {type:"ObjectId", count:3}
"index", {type:"number", count:3}
How can I fix?
The reduce function works. The if statement was wrong: I have to add an element to my array when
if(array.indexOf(stuff[idx].type) === -1).
It looks like you just jumbled up your reduce function. As far as I can interpret this, you assume that the reducer is called once globally. This is not the case. Instead, it is called per key, i.e. the input to the reducer is something like:
First call:
key = "ObjectId", val = [{type:"ObjectId", count:1},{type:"ObjectId", count:1},{type:"ObjectId", count:1}]
Second call:
key = "number", val = [{type:"number", count:1},...]
Therefore, you need to sum up, knowing that the key is already set (this code is not tested and will have its shortcomings):
function(key, vals) {
var sum = 0;
for(var i = 0; i < vals.length; i++) {
sum += vals[i].count;
}
return { "type" : key, "count" : sum };
}

update multi level document in mongodb [duplicate]

I have document like
{
id : 100,
heros:[
{
nickname : "test",
spells : [
{spell_id : 61, level : 1},
{spell_id : 1, level : 2}
]
}
]
}
I can't $set spell's level : 3 with spell_id : 1 inside spells that is inside heros with nickname "test". I tried this query:
db.test.update({"heros.nickname":"test", "heros.spells.spell_id":1},
{$set:{"heros.spells.$.level":3}});
The error I see is:
can't append to array using string field name [spells]
Thanks for help.
You can only use the $ positional operator for single-level arrays. In your case, you have a nested array (heros is an array, and within that each hero has a spells array).
If you know the indexes of the arrays, you can use explicit indexes when doing an update, like:
> db.test.update({"heros.nickname":"test", "heros.spells.spell_id":1}, {$set:{"heros.0.spells.1.level":3}});
Try something like this:
// Work around the nested-positional-operator limitation: fetch each
// matching document, patch the nested spell in memory, and save it back.
db.test.find({"heros.nickname":"test"}).forEach(function(x) {
    var match = false; // was `bool match`, which is not valid JavaScript
    for (var i = 0; i < x.heros[0].spells.length; i++) { // `var` avoids an implicit global
        if (x.heros[0].spells[i].spell_id == 1) {
            x.heros[0].spells[i].level = 3;
            match = true;
        }
    }
    // Only write the document back when something actually changed.
    if (match === true) db.test.update( { id: x.id }, x );
});
Apparently someone opened a ticket to add the ability to put a function inside the update clause, but it hasn't been addressed yet: https://jira.mongodb.org/browse/SERVER-458

Update embedded object inside array inside array in MongoDB

I have document like
{
id : 100,
heros:[
{
nickname : "test",
spells : [
{spell_id : 61, level : 1},
{spell_id : 1, level : 2}
]
}
]
}
I can't $set spell's level : 3 with spell_id : 1 inside spells that is inside heros with nickname "test". I tried this query:
db.test.update({"heros.nickname":"test", "heros.spells.spell_id":1},
{$set:{"heros.spells.$.level":3}});
The error I see is:
can't append to array using string field name [spells]
Thanks for help.
You can only use the $ positional operator for single-level arrays. In your case, you have a nested array (heros is an array, and within that each hero has a spells array).
If you know the indexes of the arrays, you can use explicit indexes when doing an update, like:
> db.test.update({"heros.nickname":"test", "heros.spells.spell_id":1}, {$set:{"heros.0.spells.1.level":3}});
Try something like this:
// Work around the nested-positional-operator limitation: fetch each
// matching document, patch the nested spell in memory, and save it back.
db.test.find({"heros.nickname":"test"}).forEach(function(x) {
    var match = false; // was `bool match`, which is not valid JavaScript
    for (var i = 0; i < x.heros[0].spells.length; i++) { // `var` avoids an implicit global
        if (x.heros[0].spells[i].spell_id == 1) {
            x.heros[0].spells[i].level = 3;
            match = true;
        }
    }
    // Only write the document back when something actually changed.
    if (match === true) db.test.update( { id: x.id }, x );
});
Apparently someone opened a ticket to add the ability to put a function inside the update clause, but it hasn't been addressed yet: https://jira.mongodb.org/browse/SERVER-458

In MongoDB mapreduce, how can I flatten the values object?

I'm trying to use MongoDB to analyse Apache log files. I've created a receipts collection from the Apache access logs. Here's an abridged summary of what my models look like:
db.receipts.findOne()
{
"_id" : ObjectId("4e57908c7a044a30dc03a888"),
"path" : "/videos/1/show_invisibles.m4v",
"issued_at" : ISODate("2011-04-08T00:00:00Z"),
"status" : "200"
}
I've written a MapReduce function that groups all data by the issued_at date field. It summarizes the total number of requests, and provides a breakdown of the number of requests for each unique path. Here's an example of what the output looks like:
db.daily_hits_by_path.findOne()
{
"_id" : ISODate("2011-04-08T00:00:00Z"),
"value" : {
"count" : 6,
"paths" : {
"/videos/1/show_invisibles.m4v" : {
"count" : 2
},
"/videos/1/show_invisibles.ogv" : {
"count" : 3
},
"/videos/6/buffers_listed_and_hidden.ogv" : {
"count" : 1
}
}
}
}
How can I make the output look like this instead:
{
"_id" : ISODate("2011-04-08T00:00:00Z"),
"count" : 6,
"paths" : {
"/videos/1/show_invisibles.m4v" : {
"count" : 2
},
"/videos/1/show_invisibles.ogv" : {
"count" : 3
},
"/videos/6/buffers_listed_and_hidden.ogv" : {
"count" : 1
}
}
}
It's not currently possible, but I would suggest voting for this case: https://jira.mongodb.org/browse/SERVER-2517.
Taking the best from previous answers and comments:
// Force the _id index with hint() so documents moved by our own updates are
// not revisited, then replace each document's body with its map-reduce value.
db.items.find().hint({_id: 1}).forEach(function(doc) {
    db.items.update({_id: doc._id}, doc.value);
});
From http://docs.mongodb.org/manual/core/update/#replace-existing-document-with-new-document
"If the update argument contains only field and value pairs, the update() method replaces the existing document with the document in the update argument, except for the _id field."
So you need neither to $unset value, nor to list each field.
From https://docs.mongodb.com/manual/core/read-isolation-consistency-recency/#cursor-snapshot
"MongoDB cursors can return the same document more than once in some situations. ... use a unique index on this field or these fields so that the query will return each document no more than once. Query with hint() to explicitly force the query to use that index."
AFAIK, by design Mongo's map reduce will spit results out in "value tuples" and I haven't seen anything that will configure that "output format". Maybe the finalize() method can be used.
You could try running a post-process that will reshape the data using
// Reshape each map-reduce result in place: hoist value.count and
// value.paths to the top level (the replacement keeps the original _id).
results.find({}).forEach(function(result) {
    var flattened = {count: result.value.count, paths: result.value.paths};
    results.update({_id: result._id}, flattened);
});
Yep, that looks ugly. I know.
You can do Dan's code with a collection reference:
// Flatten map-reduce output for an arbitrary collection reference: replace
// each document with its `value` sub-document (minus that sub-document's
// own _id), then drop any leftover `value` field.
function clean(collection) {
    collection.find().forEach(function(result) {
        var value = result.value;
        delete value._id;
        collection.update({_id: result._id}, value);
        // The original queried `result.id` (undefined) here, so the $unset
        // never matched anything; `_id` is the correct field.
        collection.update({_id: result._id}, {$unset: {value: 1}});
    });
}
A similar approach to that of #ljonas but no need to hardcode document fields:
// Flatten the db.results map-reduce output without hardcoding any document
// fields: replace each document with its `value` sub-document, then remove
// any leftover `value` field.
db.results.find().forEach( function(result) {
    var value = result.value;
    delete value._id;
    db.results.update({_id: result._id}, value);
    // Was `result.id` (undefined) in the original - _id is the key field.
    db.results.update({_id: result._id}, {$unset: {value: 1}});
} );
All the proposed solutions are far from optimal. The fastest you can do so far is something like:
// Fast map-reduce flattening: stream every document that still has a
// `value` wrapper and replace it via unordered bulk writes, flushed in
// batches of 1000.
var flattenMRCollection=function(dbName,collectionName) {
    var collection=db.getSiblingDB(dbName)[collectionName];
    var i=0;
    var bulk=collection.initializeUnorderedBulkOp();
    // addOption(16) = DBQuery.Option.noTimeout, so the cursor survives a long run.
    collection.find({ value: { $exists: true } }).addOption(16).forEach(function(result) {
        print((++i));
        bulk.find({_id: result._id}).replaceOne(result.value);
        if(i%1000==0)
        {
            print("Executing bulk...");
            bulk.execute();
            bulk=collection.initializeUnorderedBulkOp();
        }
    });
    // Flush the remainder. Executing an empty bulk op throws, so skip the
    // final execute when the last batch was already flushed (or no
    // documents matched at all).
    if(i%1000!=0)
    {
        bulk.execute();
    }
};
Then call it:
flattenMRCollection("MyDB","MyMRCollection")
This is WAY faster than doing sequential updates.
While experimenting with Vincent's answer, I found a couple of problems. Basically, if you perform updates within a foreach loop, this will move the document to the end of the collection and the cursor will reach that document again (example). This can be circumvented if $snapshot is used. Hence, I am providing a Java example below.
// Flatten map-reduce output with the Java driver: $snapshot the query so
// documents moved by our own replacements are not returned twice, replace
// each document with its `value` sub-document, and batch the writes.
final List<WriteModel<Document>> bulkUpdate = new ArrayList<>();
// You should enable $snapshot if performing updates within foreach
collection.find(new Document().append("$query", new Document()).append("$snapshot", true)).forEach(new Block<Document>() {
    @Override // was `#Override` in the original, which does not compile
    public void apply(final Document document) {
        // Note that I used incrementing long values for '_id'. Change to String if
        // you used string '_id's
        final long docId = document.getLong("_id");
        final Document subDoc = (Document) document.get("value");
        bulkUpdate.add(new ReplaceOneModel<>(new Document().append("_id", docId), subDoc));
        // Flushing on docId % 1000 relies on '_id' being a dense incrementing
        // sequence. If you used non-incrementing '_id's, keep a separate
        // counter (e.g. a final holder object) instead.
        if (docId % 1000 == 0 && !bulkUpdate.isEmpty()) {
            collection.bulkWrite(bulkUpdate);
            bulkUpdate.clear();
        }
    }
});
// Flush whatever is left once the cursor is exhausted.
if (!bulkUpdate.isEmpty()) {
    collection.bulkWrite(bulkUpdate);
    bulkUpdate.clear();
}
Note : This snippet takes an average of 7.4 seconds to execute on my machine with 100k records and 14 attributes (IMDB dataset). Without batching, it takes an average of 25.2 seconds.