How can i remove empty string from a mongodb collection? - mongodb

I have a "mongodb colllenctions" and I'd like to remove the "empty strings"with keys from it.
From this:
{
"_id" : ObjectId("56323d975134a77adac312c5"),
"year" : "15",
"year_comment" : "",
}
{
"_id" : ObjectId("56323d975134a77adac312c5"),
"year" : "",
"year_comment" : "asd",
}
I'd like to gain this result:
{
"_id" : ObjectId("56323d975134a77adac312c5"),
"year" : "15",
}
{
"_id" : ObjectId("56323d975134a77adac312c5"),
"year_comment" : "asd",
}
How could I solve it?

Please try executing following code snippet in Mongo shell which strips fields with empty or null values
var result=new Array();
db.getCollection('test').find({}).forEach(function(data)
{
for(var i in data)
{
if(data[i]==null || data[i]=='')
{
delete data[i]
}
}
result.push(data)
})
print(tojson(result))

Would start with getting a distinct list of all the keys in the collection, use those keys as your query basis and do an ordered bulk update using the Bulk API operations. The update statement uses the $unset operator to remove the fields.
The mechanism to get distinct keys list that you need to assemble the query is possible through Map-Reduce. The following mapreduce operation will populate a separate collection with all the keys as the _id values:
mr = db.runCommand({
"mapreduce": "my_collection",
"map" : function() {
for (var key in this) { emit(key, null); }
},
"reduce" : function(key, stuff) { return null; },
"out": "my_collection" + "_keys"
})
To get a list of all the dynamic keys, run distinct on the resulting collection:
db[mr.result].distinct("_id")
// prints ["_id", "year", "year_comment", ...]
Now given the list above, you can assemble your query by creating an object that will have its properties set within a loop. Normally your query will have this structure:
var keysList = ["_id", "year", "year_comment"];
var query = keysList.reduce(function(obj, k) {
var q = {};
q[k] = "";
obj["$or"].push(q);
return obj;
}, { "$or": [] });
printjson(query); // prints {"$or":[{"_id":""},{"year":""},{"year_comment":""}]}
You can then use the Bulk API (available with MongoDB 2.6 and above) as a way of streamlining your updates for better performance with the query above. Overall, you should be able to have something working as:
var bulk = db.collection.initializeOrderedBulkOp(),
counter = 0,
query = {"$or":[{"_id":""},{"year":""},{"year_comment":""}]},
keysList = ["_id", "year", "year_comment"];
db.collection.find(query).forEach(function(doc){
var emptyKeys = keysList.filter(function(k) { // use filter to return an array of keys which have empty strings
return doc[k]==="";
}),
update = emptyKeys.reduce(function(obj, k) { // set the update object
obj[k] = "";
return obj;
}, { });
bulk.find({ "_id": doc._id }).updateOne({
"$unset": update // use the $unset operator to remove the fields
});
counter++;
if (counter % 1000 == 0) {
// Execute per 1000 operations and re-initialize every 1000 update statements
bulk.execute();
bulk = db.collection.initializeOrderedBulkOp();
}
})

If you need to update a single blank parameter or you prefer to do parameter by parameter, you can use the mongo updateMany functionality:
db.comments.updateMany({year: ""}, { $unset : { year : 1 }})

Related

Add field if not exist to document in Mongo

Source Doc
{
"_id" : "12345",
"LastName" : "Smith",
"FirstName" : "Fred",
"ProfileCreated" : NumberLong(1447118831860),
"DropOut" : false,
}
New Doc
{
"_id" : "12345",
"LastName" : "Smith",
"FirstName" : "Fred",
"ProfileCreated" : NumberLong(1447118831860),
"DropOut" : true,
"LatestConsultation" : false,
}
I have two collections which share a lot of the same document ID's and fields but over time the new documents will have fields added to them and or completely new documents with new ID's will get created.
I think I know how to handle new documents using $setOnInsert and upsert = true but I'm not sure how best to handle the addition of new fields. The behavior I require for documents that exists in both collection matched on _id with new fields is to add the new field to the document without modifying the values of any of the other fields even if they have changed as in the example where the DropOut value has changed. The resulting document I require is.
Result document
{
"_id" : "12345",
"LastName" : "Smith",
"FirstName" : "Fred",
"ProfileCreated" : NumberLong(1447118831860),
"DropOut" : false,
"LatestConsultation" : false,
}
What is the best and most performatic way to achive this? Also if this can somehow be combined into a single statement that also includes the addition of documents that exists in the new collection but not in the source collection that would be amazing :-)
PS. I am using Pymongo so a Pymongo example would be even better but I can translate a mongo shell example.
Not sure is this is possible with an atomic update. However, you could string in some mixed operations and tackle this in such a way that you iterate the new collection and for each document in the new collection:
Use the _id field to query the old collection. Use the findOne() method to return a document from the old collection that matches on the _id from the new collection.
Extend the new doc with the old doc by adding the new fields which do not exist in the old document.
Update the new collection with this merged document.
The following basic mongo shell example demonstrates the algorithm above:
function merge(from, to) {
var obj = {};
if (!from) {
from = {};
} else {
obj = from;
}
for (var key in to) {
if (!from.hasOwnProperty(key)) {
obj[key] = to[key];
}
}
return obj;
}
db.new_collection.find({}).snapshot().forEach(function(doc){
var old_doc = db.old_collection.findOne({ "_id": doc._id }),
merged_doc = merge(old_doc, doc);
db.new_collection.update(
{ "_id": doc._id },
{ "$set": merged_doc }
);
});
For dealing with large collections, better leverage your updates using the bulk API which offers better performance and efficient update operations done through
sending the update requests in bulk rather than each update operation for every request (which is slow). The method to use is the bulkWrite() function, which can be applied in the above example as:
function merge(from, to) {
var obj = {};
if (!from) {
from = {};
} else {
obj = from;
}
for (var key in to) {
if (!from.hasOwnProperty(key)) {
obj[key] = to[key];
}
}
return obj;
}
var ops = [];
db.new_collection.find({}).snapshot().forEach(function(doc){
var old_doc = db.old_collection.findOne({ "_id": doc._id }),
merged_doc = merge(old_doc, doc);
ops.push({
"updateOne": {
"filter": { "_id": doc._id },
"update": { "$set": merged_doc }
}
});
if (ops.length === 1000) {
db.new_collection.bulkWrite(ops);
ops = [];
}
});
if (ops.length > 0) db.new_collection.bulkWrite(ops);
Or for MongoDB 2.6.x and 3.0.x releases use this version of Bulk operations:
var bulk = db.new_collection.initializeUnorderedBulkOp(),
counter = 0;
db.new_collection.find({}).snapshot().forEach(function(doc){
var old_doc = db.old_collection.findOne({ "_id": doc._id }),
merged_doc = merge(old_doc, doc);
bulk.find({ "_id": doc._id }).updateOne({ "$set": merged_doc });
if (counter % 1000 === 0) {
bulk.execute();
bulk = db.new_collection.initializeUnorderedBulkOp();
}
});
if (counter % 1000 !== 0 ) bulk.execute();
The Bulk operations API in both cases will help reduce the IO load on the server by sending the requests only once in every 1000 documents in the collection to process.

MongoDB Shell Script Update all field Names where there is space in field name

Using MongoDB shell script 3.2, how can I update all fields where field names have a space replace those with underscore?
{
"Some Field": "value",
"OtherField" :"Value",
"Another Field" : "Value"
}
update the above document as below
{
"Some_Field": "value",
"OtherField" :"Value",
"Another_Field" : "Value"
}
rename field can be done with something like this
db.CollectionName.update( { _id: 1 }, { $rename: { 'nickname': 'alias', 'cell': 'mobile' } } )
Challenging part here is filter, how to come up with a filter where there is a space in field name
This needs a two-step approach. First, you need a mechanism to get a list of all the keys with a space in your collection. Once you get the list, construct an object that maps those keys to their renamed values. You can then use that object as your $rename operator document. Consider using mapReduce to get the list of keys with spaces.
The following mapReduce operation will populate a separate collection with all the filtered keys as the _id values:
mr = db.runCommand({
"mapreduce": "CollectionName",
"map": function() {
var regxp = /\s/;
for (var key in this) {
if (key.match(regxp)) {
emit(key, null);
}
}
},
"reduce": function() {},
"out": "filtered_keys"
})
To get a list of all the spaced keys, run distinct on the resulting collection:
db[mr.result].distinct("_id")
["Some Field", "Another Field"]
Now given the list above, you can assemble your update document by creating an object that will have its properties set within a loop. Normally your update document will have this structure:
var update = {
"$rename": {
"Some Field": "Some_Field",
"Another Field": "Another_Field"
}
}
Thus
var update = { "$rename": {} };
db[mr.result].distinct("_id").forEach(function (key){
update["$rename"][key] = key.replace(/ /g,"_");
});
which you can then use in your update as
db.CollectionName.update({ }, update, false, true );
Thanks to #chridam that was a excellent query.
Had to make small changes to run query, Full working query.
mr = db.runCommand({
"mapreduce": "MyCollectionName",
"map": function() {
var regxp = /\s/;
for (var key in this) {
if (key.match(regxp)) {
emit(key, null);
}
}
},
"reduce": function() {},
"out": "filtered_keys"
})
db[mr.result].distinct("_id")
var update = { "$rename": {} };
db[mr.result].distinct("_id").forEach(function (key){
update["$rename"][key] = key.replace(/\s+/g, "_");
});
//print(update)
db.MyCollectionName.update({ }, update, false, true );

Complex mongodb document search

I'm attempting to write a find query where one of the keys is unknown at the time the query is run, for example on the following document I'm interested in returning the document if "setup" is true:
{
"a": {
"randomstringhere": {
"setup": true
}
}
}
However I can't work how to wildcard the "randomstringhere" field as it changes for each document in the collection.
Can somebody help?
There is not much you can do with that. But you can modify your collection schema like
{
"a": [
{
"keyName": "randomstringhere",
"setup": true
},
//...
]
}
you can than write query to look
{
'a' : { $elemMatch: { setup: true } ,
}
You can't do this with a single query, as with the current design you would need a mechanism to get all the random keys that you need and then assemble the query document that uses the $or operator in the event that you get a list of variable key name.
The first part of your operation is possible using Map-Reduce. The following mapreduce operation will populate a separate collection called collectionKeys with all the random keys as the _id values:
mr = db.runCommand({
"mapreduce": "collection",
"map" : function() {
for (var key in this.a) { emit(key, null); }
},
"reduce" : function() { },
"out": "collectionKeys"
})
To get a list of all the random keys, run distinct on the resulting collection:
db[mr.result].distinct("_id")
Example Ouput
["randomstring_1", "randomstring_2", "randomstring_3", "randomstring_4", ...]
Now given the list above, you can assemble your query by creating an object that will have its properties set within a loop. Normally your query document will have this structure:
var query = {
"$or": [
{ "a.randomstring_1.setup": true },
{ "a.randomstring_2.setup": true },
{ "a.randomstring_3.setup": true }
]
};
which you can then use in your query:
db.collection.find(query)
So using the above list of subdocument keys, you can dynamically construct the above using JavaScript's map() method:
mr = db.runCommand({
"mapreduce": "collection", // your collection name
"map" : function() { // map function
for (var key in this.a) { emit(key, null); }
},
"reduce" : function() { }, // empty reducer that doesn't do anything
"out": "collectionKeys" // output collection with results
})
var randomstringKeysList = db[mr.result].distinct("_id"),
orOperator = randomstringKeysList.map(function (key){
var o = {};
o["a."+ key +".setup"] = true;
return o;
}),
query = { "$or": orOperator };
db.collection.find(query);

mongodb delete nested object without knowledge of object nodes

For the below document, I am trying to delete the node which contains id = 123
{
'_id': "1234567890",
"image" : {
"unknown-node-1" : {
"id" : 123
},
"unknown-node-2" : {
"id" : 124
}
}
}
Result should be as below.
{
'_id': "1234567890",
"image" : {
"unknown-node-2" : {
"id" : 124
}
}
}
The below query achieves the result. But i have to know the unknown-node-1 in advance. How can I achieve the results without pre-knowledge of node, but only
info that I have is image.*.id = 123
(* means unknown node)
Is it possible in mongo? or should I do these find on my app code.
db.test.update({'_id': "1234567890"}, {$unset: {'image.unknown-node-1': ""}})
Faiz,
There is no operator to help match and project a single key value pair without knowing the key. You'll have to write post processing code to scan each one of the documents to find the node with the id and then perform your removal.
If you have the liberty of changing your schema, you'll have more flexibilty. With a document design like this:
{
'_id': "1234567890",
"image" : [
{"id" : 123, "name":"unknown-node-1"},
{"id" : 124, "name":"unknown-node-2"},
{"id" : 125, "name":"unknown-node-3"}
]
}
You could remove documents from the array like this:
db.collectionName.update(
{'_id': "1234567890"},
{ $pull: { image: { id: 123} } }
)
This would result in:
{
'_id': "1234567890",
"image" : [
{"id" : 124, "name":"unknown-node-2"},
{"id" : 125, "name":"unknown-node-3"}
]
}
With your current schema, you will need a mechanism to get a list of the dynamic keys that you need to assemble the query before doing the update and one way of doing this would be with MapReduce. Take for instance the following map-reduce operation which will populate a separate collection with all the keys as the _id values:
mr = db.runCommand({
"mapreduce": "test",
"map" : function() {
for (var key in this.image) { emit(key, null); }
},
"reduce" : function(key, stuff) { return null; },
"out": "test_keys"
})
To get a list of all the dynamic keys, run distinct on the resulting collection:
> db[mr.result].distinct("_id")
[ "unknown-node-1", "unknown-node-2" ]
Now given the list above, you can assemble your query by creating an object that will have its properties set within a loop. Normally if you knew the keys beforehand, your query will have this structure:
var query = {
"image.unknown-node-1.id": 123
},
update = {
"$unset": {
"image.unknown-node-1": ""
}
};
db.test.update(query, update);
But since the nodes are dynamic, you will have to iterate the list returned from the mapReduce operation and for each element, create the query and update parameters as above to update the collection. The list could be huge so for maximum efficiency and if your MongoDB server is 2.6 or newer, it would be better to take advantage of using a write commands Bulk API that allow for the execution of bulk update operations which are simply abstractions on top of the server to make it easy to build bulk operations and thus get perfomance gains with your update over large collections. These bulk operations come mainly in two flavours:
Ordered bulk operations. These operations execute all the operation in order and error out on the first write error.
Unordered bulk operations. These operations execute all the operations in parallel and aggregates up all the errors. Unordered bulk operations do not guarantee order of execution.
Note, for older servers than 2.6 the API will downconvert the operations. However it's not possible to downconvert 100% so there might be some edge cases where it cannot correctly report the right numbers.
In your case, you could implement the Bulk API update operation like this:
mr = db.runCommand({
"mapreduce": "test",
"map" : function() {
for (var key in this.image) { emit(key, null); }
},
"reduce" : function(key, stuff) { return null; },
"out": "test_keys"
})
// Get the dynamic keys
var dynamic_keys = db[mr.result].distinct("_id");
// Get the collection and bulk api artefacts
var bulk = db.test.initializeUnorderedBulkOp(), // Initialize the Unordered Batch
counter = 0;
// Execute the each command, triggers for each key
dynamic_keys.forEach(function(key) {
// Create the query and update documents
var query = {},
update = {
"$unset": {}
};
query["image."+ key +".id"] = 123;
update["$unset"]["image." + key] = ";"
bulk.find(query).update(update);
counter++;
if (counter % 100 == 0 ) {
bulk.execute() {
// re-initialise batch operation
bulk = db.test.initializeUnorderedBulkOp();
}
});
if (counter % 100 != 0) { bulk.execute(); }

How to get average value from a hashmap in MongoDB?

I have a time data in my Mongo database. Each document equal a minute and contain 60 seconds as objects with value for each. How to get average value of all seconds in one minute?
A document looking like that:
{
"_id" : ObjectId("55575e4062771c26ec5f2287"),
"timestamp" : "2015-05-16T18:12:00.000Z",
"values" : {
"0" : "26.17",
"1" : "26.17",
"2" : "26.17",
...
"58" : "24.71",
"59" : "25.20"
}
}
You could take two approaches here:
Changing the schema and use the aggregation framework to get the average by using the $avg operator OR
Apply Map-Reduce.
Let's look at the first option. Currently as it is, the schema will not make it possible to use the aggregation framework because of the dynamic keys in the values subdocument. The ideal schema that would favour the aggregation framework would have the values field be an array which contains embedded key/value documents like this:
/* 0 */
{
"_id" : ObjectId("5559d66c9bbec0dd0344e4b0"),
"timestamp" : "2015-05-16T18:12:00.000Z",
"values" : [
{
"k" : "0",
"v" : 26.17
},
{
"k" : "1",
"v" : 26.17
},
{
"k" : "2",
"v" : 26.17
},
...
{
"k" : "58",
"v" : 24.71
},
{
"k" : "59",
"v" : 25.20
}
]
}
With MongoDB 3.6 and newer, use the aggregation framework to tranform the hashmaps to an array by using the $objectToArray operator then use $avg to calculate the average.
Consider running the following aggregate pipeline:
db.test.aggregate([
{
"$addFields": {
"values": { "$objectToArray": "$values" }
}
}
])
Armed with this new schema, you would then need to update your collection to change the string values to int by iterating the cursor returned from the aggregate method and using bulkWrite as follows:
var bulkUpdateOps = [],
cursor = db.test.aggregate([
{
"$addFields": {
"values": { "$objectToArray": "$values" }
}
}
]);
cursor.forEach(doc => {
const { _id, values } = doc;
let temp = values.map(item => {
item.key = item.k;
item.value = parseFloat(item.v) || 0;
delete item.k;
delete item.v;
return item;
});
bulkUpdateOps.push({
"updateOne": {
"filter": { _id },
"update": { "$set": { values: temp } },
"upsert": true
}
});
if (bulkUpdateOps.length === 1000) {
db.test.bulkWrite(bulkUpdateOps);
bulkUpdateOps = [];
}
});
if (bulkUpdateOps.length > 0) {
db.test.bulkWrite(bulkUpdateOps);
}
If your MongoDB version does not support the $objectToArray operator in the aggregation framework, then to convert the current schema into the one above takes a bit of native JavaScript functions with the MongoDB find() cursor's forEach() function as follows (assuming you have a test collection):
var bulkUpdateOps = [],
cursor = db.test.find();
cursor.forEach(doc => {
const { _id, values } = doc;
let temp = Object.keys(values).map(k => {
let obj = {};
obj.key = k;
obj.value = parseFloat(doc.values[k]) || 0;
return obj;
});
bulkUpdateOps.push({
"updateOne": {
"filter": { _id },
"update": { "$set": { values: temp } },
"upsert": true
}
});
if (bulkUpdateOps.length === 1000) {
db.test.bulkWrite(bulkUpdateOps);
bulkUpdateOps = [];
}
});
if (bulkUpdateOps.length > 0) {
db.test.bulkWrite(bulkUpdateOps);
}
or
db.test.find().forEach(function (doc){
var keys = Object.keys(doc.values),
values = keys.map(function(k){
var obj = {};
obj.key = k;
obj.value = parseFloat(doc.values[k]) || 0;
return obj;
});
doc.values = values;
db.test.save(doc);
});
The collection will now have the above schema and thus follows the aggregation pipeline that will give you the average time in one minute:
db.test.aggregate([
{
"$fields": {
"average": { "$avg": "$values.value" }
}
}
])
Or for MongoDB 3.0 and lower
db.test.aggregate([
{ "$unwind": "$values" },
{
"$group": {
"_id": "$timestamp",
"average": {
"$avg": "$values.value"
}
}
}
])
For the above document, the output would be:
/* 0 */
{
"result" : [
{
"_id" : "2015-05-16T18:12:00.000Z",
"average" : 25.684
}
],
"ok" : 1
}
As for the other Map-Reduce option, the intuition behind the operation is you would use JavaScript to make the necessary transformations and calculate the final average. You would need to define three functions:
Map
When you tell Mongo to MapReduce, the function you provide as the map function will receive each document as the this parameter. The purpose of the map is to exercise whatever logic you need in JavaScript and then call emit 0 or more times to produce a reducible value.
var map = function(){
var obj = this.values;
var keys = Object.keys(obj);
var values = [];
keys.forEach(function(key){
var val = parseFloat(obj[key]);
var value = { count: 1, qty: val };
emit(this.timestamp, value);
});
};
For each document you need to emit a key and a value. The key is the first parameter to the emit function and represents how you want to group the values (in this case you will be grouping by the timestamp). The second parameter to emit is the value, which in this case is a little object containing the count of documents (always 1) and total value of each individual value object key i.e. for each second within the minute.
Reduce
Next you need to define the reduce function where Mongo will group the items you emit and pass them as an array to this reduce function It's inside the reduce function where you want to do the aggregation calculations and reduce all the objects to a single object.
var reduce = function(key, values) {
var result = {count: 0, total: 0 };
values.forEach(function(value){
result.count += value.count;
result.total += value.qty;
});
return result;
};
This reduce function returns a single result. It's important for the return value to have the same shape as the emitted values. It's also possible for MongoDB to call the reduce function multiple times for a given key and ask you to process a partial set of values, so if you need to perform some final calculation, you can also give MapReduce a finalize function.
Finalize
The finalize function is optional, but if you need to calculate something based on a fully reduced set of data, you'll want to use a finalize function. Mongo will call the finalize function after all the reduce calls for a set are complete. This would be the place to calculate the average of all the second values in a document/timestamp:
var finalize = function (key, value) {
value.average = value.total / value.count;
return value;
};
Putting It Together
With the JavaScript in place, all that is left is to tell MongoDB to execute a MapReduce:
var map = function(){
var obj = this.values;
var keys = Object.keys(obj);
var values = [];
keys.forEach(function(key){
var val = parseFloat(obj[key]);
var value = { count: 1, qty: val };
emit(this.timestamp, value);
});
};
var reduce = function(key, values) {
var result = {count: 0, total: 0 };
values.forEach(function(value){
result.count += value.count;
result.total += value.qty;
});
return result;
};
var finalize = function (key, value) {
value.average = value.total / value.count;
return value;
};
db.collection.mapReduce(
map,
reduce,
{
out: { merge: "map_reduce_example" },
finalize: finalize
}
)
And when you query the output collection map_reduce_example, db.map_reduce_example.find(), you get the result:
/* 0 */
{
"_id" : null,
"value" : {
"count" : 5,
"total" : 128.42,
"average" : 25.684
}
}
References:
A Simple MapReduce with MongoDB and C#
MongoDB docuumentation on mapReduce
This kind of data structure creates lots of conflicts and difficult to handled mongo operations. This case either you changed your schema design. But, if you not able to changed this schema then follow this :
In your schema having two major problem 1> keys dynamic and 2> values of given keys in string so you should use some programming code to calculating avg check below scripts
From ref this first calculated size of values
Object.size = function(obj) {
var size = 0,
key;
for (key in obj) {
if (obj.hasOwnProperty(key)) size++;
}
return size;
};
db.collectionName.find().forEach(function(myDoc) {
var objects = myDoc.values;
var value = 0;
// Get the size of an object
var size = Object.size(objects);
for (var key in objects) {
value = value + parseFloat(objects[key]); // parse string values to float
}
var avg = value / size
print(value);
print(size);
print(avg);
});