Sum of Substrings in mongodb - mongodb

We have field(s) in mongodb which has numbers in string form, values such as "$123,00,89.00" or "1234$" etc
Is it possible to customize $sum accumulators in mongodb, so that, certain processing can be done at each field value while the sum is performed. Such as substring or reg-ex processing etc.

The .mapReduce() method is what you need here. You cannot "cast" values in the aggregation framework from one "type" to another ( with the exception of "to string" or from Date to numeric ).
The JavaScript processing means that you can convert a string into a value for "summing". Something like this (with a bit more work on a "safe" regex for the required "currency" values):
// Map step: strip "$", "," and "." from the string amount and divide the
// remaining digits by 100, emitting everything under a single null key.
var mapAmount = function() {
    var digits = this.amount.replace(/\$|,|\./g,"");
    emit(null, digits / 100);
};
// Reduce step: add up all emitted values for the key.
var sumAmounts = function(key,values) {
    return Array.sum(values);
};
db.collection.mapReduce(mapAmount, sumAmounts, { "out": { "inline": 1 } })
Or with .group(), which also uses JavaScript processing but is a bit more restrictive in its requirements:
// Single group (null key) whose running "total" starts at 0 and grows by each
// document's amount after "$", "," and "." are stripped and the rest is divided by 100.
db.collection.group({
    "key": null,
    "initial": { "total": 0 },
    "reduce": function( curr,result ) {
        var digits = curr.amount.replace(/\$|,|\./g,"");
        result.total += digits /100;
    }
});
So JavaScript processing is your only option, as these sorts of operations are not supported in the aggregation framework.
A number can be a string:
// $substr applied to the numeric literal 1 (start 0, length 1) projects field "a"
db.junk.aggregate([{ "$project": { "a": { "$substr": [ 1,0,1 ] } } }])
{ "_id" : ObjectId("55a458c567446a4351c804e5"), "a" : "1" }
And a Date can become a number:
// $subtract of the epoch date (new Date(0)) from the current date projects field "a"
db.junk.aggregate([{ "$project": { "a": { "$subtract": [ new Date(), new Date(0) ] } } }])
{ "_id" : ObjectId("55a458c567446a4351c804e5"), "a" : NumberLong("1436835669446") }
But there are no other operators to "cast" a "string" to "numeric", or even anything to do a regex replace as shown above.
If you want to use .aggregate() then you need to fix your data into a format that will support it, thus "numeric":
// Rewrite string "amount" values as numbers, sending the updates to the
// server in batches of 1000 through the ordered Bulk operations builder.
var bulk = db.collection.initializeOrderedBulkOp(),
    count = 0;

// Select every document whose amount still contains "$", "," or "." —
// the same characters the replace() below strips. (The "g" flag has no
// meaning in a query and is omitted.)
db.collection.find({ "amount": /\$|,|\./ }).forEach(function(doc) {
    // Strip the currency punctuation; dividing the digit string by 100
    // coerces it to a number with two implied decimal places.
    doc.amount = doc.amount.replace(/\$|,|\./g,"") /100;
    bulk.find({ "_id": doc._id }).updateOne({
        "$set": { "amount": doc.amount }
    });
    count++;

    // execute once in 1000 operations, then start a fresh batch
    if ( count % 1000 == 0 ) {
        bulk.execute();
        bulk = db.collection.initializeOrderedBulkOp();
    }
});

// clean up queued operations (only when a partial batch is pending)
if ( count % 1000 != 0 )
    bulk.execute();
Then you can use .aggregate() on your "numeric" data:
// Group every document under a single null _id and $sum the "amount" field into "total"
db.collection.aggregate([
{ "$group": { "_id": null, "total": { "$sum": "$amount" } } }
])

Related

Most efficient way to change a string field value to its substring

I have a collection filled with documents that look like this:
{
data: 11,
version: "0.0.32"
}
and some have a test suffix to version:
{
data: 55,
version: "0.0.42-test"
}
The version field has different values but it always conforms to the pattern: 0.0.XXX. I would like to update all the documents to look like this:
{
data: 11,
version: 32
}
and the suffixed version (for test documents - version should be negative):
{
data: 55,
version: -42
}
The collection with these documents is used by our critical system, that needs to be turned off while updating the data - so I want the update/change to be as fast as possible. There are about 66_000_000 documents in this collection, and it's about 100GB in size.
Which type of mongodb operation would be the most efficient one?
The most efficient way to do this is in the upcoming release of MongoDB as of this writing using the $split operator to split our string as shown here then assign the last element in the array to a variable using the $let variable operator and the $arrayElemAt operators.
Next, we use the $switch operator to perform a logical condition processing or case statement against that variable.
The condition here is $gt which returns true if the value contains "test", and in which case in the in expression we split that string and simply return the $concatenated value of the first element in the newly computed array and the -. If the condition evaluates to false, we just return the variable.
Of course in our case statement, we use the $indexOfCP which returns -1 if there were no occurrences of "test".
// Project "data" unchanged and rewrite "version" from its last "."-separated
// segment; segments containing "test" become "-" plus the part before the "-".
let cursor = db.collection.aggregate(
[
{ "$project": {
"data": 1,
"version": {
"$let": {
"vars": {
// v = element at index -1 of "version" split on "."
"v": {
"$arrayElemAt": [
{ "$split": [ "$version", "." ] },
-1
]
}
},
"in": {
"$switch": {
"branches": [
{
// case is true when $indexOfCP of "test" in v is greater than -1
"case": {
"$gt": [
{ "$indexOfCP": [ "$$v", "test" ] },
-1
]
},
// concatenate "-" with the first element of v split on "-"
"then": {
"$concat": [
"-",
"",
{ "$arrayElemAt": [
{ "$split": [ "$$v", "-" ] },
0
]}
]
}
}
],
// otherwise keep v as it is
"default": "$$v"
}
}
}
}
}}
]
)
The aggregation query produces something like this:
{ "_id" : ObjectId("57a98773cbbd42a2156260d8"), "data" : 11, "version" : "32" }
{ "_id" : ObjectId("57a98773cbbd42a2156260d9"), "data" : 55, "version" : "-42" }
As you can see, the "version" field data are string. If the data type for that field does not matter then, you can simply use the $out aggregation pipeline stage operator to write the result into a new collection or replace your collection.
{ "out": "collection" }
If you need to convert your data to a floating point number, then the only way to do this — simply because MongoDB does not provide a way to do type conversion out of the box, except for integer to string — is to iterate the aggregation Cursor object and convert your value using parseFloat or Number, then update your documents using the $set operator and the bulkWrite() method for maximum efficiency.
// Walk the aggregation cursor and write the converted values back in
// batches of 1000 updateOne requests via bulkWrite().
let requests = [];
cursor.forEach(doc => {
    requests.push({
        "updateOne": {
            "filter": { "_id": doc._id },
            "update": {
                // Only the fields produced by the pipeline are written back;
                // "version" is converted from string to number here.
                "$set": {
                    "data": doc.data,
                    "version": parseFloat(doc.version)
                }
            }
        }
    });
    if ( requests.length === 1000 ) {
        // Execute per 1000 ops and re-init
        db.collection.bulkWrite(requests);
        requests = [];
    }
});

// Clean up queues: flush the final partial batch to the same collection
if ( requests.length > 0 ) {
    db.collection.bulkWrite(requests);
}
While the aggregation query will perfectly work in MongoDB 3.4 or newer our best bet from MongoDB 3.2 backwards is mapReduce with the bulkWrite() method.
// Map emits, keyed by document _id, the third "."-separated segment of "version";
// when that segment contains "-", its non-digits are stripped and "-" is prefixed.
var results = db.collection.mapReduce(
function() {
var v = this.version.split(".")[2];
emit(this._id, v.indexOf("-") > -1 ? "-"+v.replace(/\D+/g, '') : v)
},
// Empty reduce — presumably never invoked because each _id is emitted only once; verify
function(key, value) {},
{ "out": { "inline": 1 } }
)["results"];
results looks like this:
[
{
"_id" : ObjectId("57a98773cbbd42a2156260d8"),
"value" : "32"
},
{
"_id" : ObjectId("57a98773cbbd42a2156260d9"),
"value" : "-42"
}
]
From here you use the previous .forEach loop to update your documents.
From MongoDB 2.6 to 3.0 you will need to use the now deprecated Bulk() API and its associated methods, as shown in my answer here.

How to sum up all the employee salaries and update it to one attribute in the array embedded doc in mongodb

I want to sum up all the EMP_SALARY = (9000)2000+3000+4000 and I'm trying to update the value 9000 to total_employee_salary attribute.How can I do it in mongo shell.Can anyone please help me out regarding this ...
{
"_id" : ObjectId("571898dbc000041fe0b921eb"),
"ORGANIZATION" : "abc",
"TOTAL_EMPLOYEES" : 10,
"TOTAL_EMPLOYEES_SALARY" : 0,
"employees" : [
{
"EMP_NAME" : "vijay",
"EMP_SALARY" : 2000,
},
{
"EMP_NAME" : "vishnu",
"EMP_SALARY" : 3000,
},
{
"EMP_NAME" : "vishal",
"EMP_SALARY" : 4000,
}
]
}
If you are doing this in bulk for your collection, then the best way to do this is iterate with .bulkWrite() to write back:
// For every organisation document, sum EMP_SALARY over the embedded
// "employees" array and store it in the document's existing
// TOTAL_EMPLOYEES_SALARY field, writing back in batches of 1000.
var ops = [];
db.collection.find().forEach(function(doc) {
    // A document without an "employees" array contributes a total of 0
    var salaries = (doc.employees || []).map(function(emp) {
        return emp.EMP_SALARY
    });
    ops.push({
        "updateOne": {
            "filter": { "_id": doc._id },
            "update": {
                "$set": {
                    "TOTAL_EMPLOYEES_SALARY": Array.sum(salaries)
                }
            }
        }
    });
    // Send a full batch and start a new one
    if ( ops.length == 1000 ) {
        db.collection.bulkWrite(ops);
        ops = [];
    }
})

// Flush whatever is left over
if ( ops.length > 0 ) {
    db.collection.bulkWrite(ops);
}
For "super safe" code though, you probably should be using $inc on iteration of each array element instead:
// Variant that queues one $inc per employee, adding each EMP_SALARY onto the
// document's existing TOTAL_EMPLOYEES_SALARY field, in batches of 1000.
// NOTE(review): $inc adds to the stored value, so this presumably assumes the
// field starts at 0 and the script runs once — confirm before re-running.
var ops = [];
db.collection.find().forEach(function(doc) {
    doc.employees.forEach(function(emp) {
        ops.push({
            "updateOne": {
                "filter": { "_id": doc._id },
                "update": {
                    "$inc": {
                        "TOTAL_EMPLOYEES_SALARY": emp.EMP_SALARY
                    }
                }
            }
        });
        // Send a full batch and start a new one
        if ( ops.length == 1000 ) {
            db.collection.bulkWrite(ops);
            ops = [];
        }
    })
});

// Flush whatever is left over
if ( ops.length > 0 ) {
    db.collection.bulkWrite(ops);
}
In earlier shell releases you do it using the "bulk" operations builder directly:
// Same update expressed with the Bulk operations builder: sum EMP_SALARY over
// "employees" and $set the document's existing TOTAL_EMPLOYEES_SALARY field.
var bulk = db.collection.initializeOrderedBulkOp(),
    count = 0;

db.collection.find().forEach(function(doc) {
    // A document without an "employees" array contributes a total of 0
    var salaries = (doc.employees || []).map(function(emp) {
        return emp.EMP_SALARY
    });
    bulk.find({ "_id": doc._id }).updateOne({
        "$set": {
            "TOTAL_EMPLOYEES_SALARY": Array.sum(salaries)
        }
    });
    count++;

    // Execute every 1000 queued operations and start a fresh batch
    if ( count % 1000 == 0 ) {
        bulk.execute();
        bulk = db.collection.initializeOrderedBulkOp();
    }
})

// Flush the final partial batch, if any
if ( count % 1000 != 0 ) {
    bulk.execute();
}
But what you really should be doing in all instances is updating the mongodb-shell package on your system, regardless of the server version used. A modern shell should really be updated just as with a modern API version with your programming language of choice.
You need to iterate documents anyway in order to update each one, so you might as well just sum the content from the array by reading each document.
Just for trivia sake, in modern MongoDB releases the $sum operator works both as an accumulator as well as now adding items in an array. So now you can do this:
// Project, per document, the $sum of EMP_SALARY across its "employees" array
db.collection.aggregate([
{ "$project": {
"TOTAL_EMPLOYEE_SALARY": {
"$sum": "$employees.EMP_SALARY"
}
}}
])
And that will give the total of the array in each document.
In earlier versions than MongoDB 3.2 though, you need to $unwind the array and $group instead:
// Flatten "employees" into one document per employee, then regroup by the
// original _id and $sum the salaries
db.collection.aggregate([
{ "$unwind": "$employees" },
{ "$group": {
"_id": "$_id",
"TOTAL_EMPLOYEE_SALARY": { "$sum": "$employees.EMP_SALARY" }
}}
])
// Grand total of EMP_SALARY across every employee of every document.
db.collection.aggregate([
    // As a $group accumulator, $sum does not traverse into arrays, so the
    // "employees" array has to be flattened before grouping.
    { $unwind: "$employees" },
    {
        $group: {
            _id: null,
            "sum": {
                $sum: "$employees.EMP_SALARY"
            }
        }
    }
])

mongodb count two condition one time?

I need to count two query like below:
1. data[k]['success'] = common.find({'url': {"$regex": k}, 'result.titile':{'$ne':''}}).count()
2. data[k]['fail'] = common.find({'url': {"$regex": k}, 'result.titile':''}).count()
I think it would be more efficient if mongodb can work like below:
result = common.find({'url': {"$regex": k})
count1 = result.find({'result.titile':{'$ne':''}})
count2 = result.count() - count1
//result do not have find or count method, just for example
Two count are basing same search condition{'url': {"$regex": k}, splited by {'result.titile':{'$ne':''}} or not.
Is there some build-in way to do these without writing custom js?
The async method would be the preferred one if at all your client supports it.
You could also aggregate as below:
$match the docs which have the urls.
$group by the _id as null, and take the $sum of all documents. We need those documents, to get the sum of those which do not have a title, so just accumulate them using the $push operator.
$unwind the documents.
$match those which do not have a title.
$group, and get the $sum.
$project the desired result.
sample code:
// Count url-matching documents and, among them, those with a non-empty title
db.t.aggregate([
// keep only documents whose url matches the regex k
{$match:{"url":{"$regex":k}}},
// count all matches and keep every document in "docs"
{$group:{"_id":null,
"count_of_url_matching_docs":{$sum:1},
"docs":{$push:"$$ROOT"}}},
// back to one document per pushed entry
{$unwind:"$docs"},
// keep entries whose result.titile is not the empty string
{$match:{"docs.result.titile":{$ne:""}}},
// count the remaining entries, carrying the overall count along
{$group:{"_id":null,
"count_of_url_matching_docs":{$first:"$count_of_url_matching_docs"},
"count_of_docs_with_titles":{$sum:1}}},
// report the titled count and the difference from the overall count
{$project:{"_id":0,
"count_of_docs_with_titles":"$count_of_docs_with_titles",
"count_difference":{$subtract:[
"$count_of_url_matching_docs",
"$count_of_docs_with_titles"]}}}
])
Test data:
// Four sample documents with the same url: two with a non-empty result.titile, two with ""
db.t.insert([
{"url":"s","result":{"titile":1}},
{"url":"s","result":{"titile":""}},
{"url":"s","result":{"titile":""}},
{"url":"s","result":{"titile":2}}
])
Test Result:
{ "count_of_docs_with_titles" : 2, "count_difference" : 2 }
Use .aggregate() with a conditional key for grouping via $cond:
// One pass over the url-matching documents, grouped into "success" / "fail"
// by whether result.titile is non-empty.
common.aggregate([
    { "$match": { "url": { "$regex": k } } },
    { "$group": {
        "_id": {
            "$cond": {
                // The field is spelled "titile" in the stored documents; any
                // other spelling resolves to a missing field, which never
                // equals "" and would put every document under "success".
                "if": { "$ne": [ "$result.titile", "" ] },
                "then": "success",
                "else": "fail"
            }
        },
        "count": { "$sum": 1 }
    }}
])
However it is actually more efficient to run both queries in parallel if your environment supports it, such as with nodejs
// Issue both counts at the same time and collect the two results.
// The field is spelled "titile" in the stored documents, so the filters
// must use that spelling to match anything.
async.parallel(
    [
        // documents with a non-empty title
        function(callback) {
            common.count({
                "url": { "$regex": k },
                "result.titile": { "$ne": "" }
            }, function(err,count) {
                callback(err,{ "success": count });
            });
        },
        // documents with an empty title
        function(callback) {
            common.count({
                "url": { "$regex": k },
                "result.titile": ""
            }, function(err,count) {
                callback(err,{ "fail": count });
            });
        }
    ],
    // results holds the two objects in task order
    function(err,results) {
        if (err) throw err;
        console.log(results);
    }
)
Which makes sense really since each item is not being tested and each result can actually run on the server at the same time.

How to select custom data in mongo

Is there a way how to include custom data in the mongo query response?
What I mean is a mongo alternative for something like this in MySQL code:
-- Return each matching value together with a constant column min_value
SELECT
    value,
    '7' AS min_value
FROM
    my_table
WHERE
    value >= 7
...while the 7 should probably be a variable in the language where the mongo query is being called from.
Try the $literal operator if using the aggregation framework with a $match pipeline step as your query filter. For example, create a sample collection in mongo shell that has 10 test documents with the value field as an increasing integer (0 to 9):
// Insert ten documents with value 0..9; the loop variable is declared so it
// does not leak as an implicit global.
for (var x = 0; x < 10; x++) { db.my_table.insert({value: x }) }
Running the following aggregation pipeline:
// Keep documents with value >= base and add base to each as a constant "min_value"
var base = 7;
db.my_table.aggregate([
{
"$match": {
"value": { "$gte": base }
}
},
{
"$project": {
"value": 1,
// $literal returns the value as-is instead of evaluating it as an expression
"min_value": { "$literal": base }
}
}
])
would produce the result:
/* 0 */
{
"result" : [
{
"_id" : ObjectId("561e2bcc3d8f561c1548d39b"),
"value" : 7,
"min_value" : 7
},
{
"_id" : ObjectId("561e2bcc3d8f561c1548d39c"),
"value" : 8,
"min_value" : 7
},
{
"_id" : ObjectId("561e2bcc3d8f561c1548d39d"),
"value" : 9,
"min_value" : 7
}
],
"ok" : 1
}
The only things in MongoDB query actions that actually "modify" the results returned, other than the original document or "field selection", are the .aggregate() method or the JavaScript manipulation alternative in mapReduce.
Otherwise documents are returned "as is", or at least with just the selected fields or array entry specified.
So if you want something else returned from the server, then you need to use one of those methods:
// Return matching values together with the constant threshold as "min_value".
var seven = 7;
db.collection.aggregate([
    // Inclusive lower bound (value >= seven), so a value equal to the
    // threshold is returned as well.
    { "$match": {
        "value": { "$gte": seven }
    }},
    { "$project": {
        "value": 1,
        "min_value": { "$literal": seven }
    }}
])
Where the $literal operator comes into play, or in versions prior to 2.6 and greater or equal to 2.2 ( aggregation framework introduced ) can use $const instead:
// Same query using $const in place of $literal.
var seven = 7;
db.collection.aggregate([
    // Inclusive lower bound (value >= seven), so a value equal to the
    // threshold is returned as well.
    { "$match": {
        "value": { "$gte": seven }
    }},
    { "$project": {
        "value": 1,
        "min_value": { "$const": seven }
    }}
])
Or just use mapReduce and it's JavaScript translation:
// mapReduce alternative: emit each matching document's value alongside the
// constant threshold. mapReduce is a collection method, so it is called on
// db.collection rather than on db.
var seven = 7;
db.collection.mapReduce(
    function() {
        // "seven" is available here through the "scope" option below
        emit(this._id,{ "value": this.value, "min_value": seven });
    },
    function() {}, // no reduce at all since all _id unique
    {
        "out": { "inline": 1 },
        // Inclusive lower bound (value >= seven)
        "query": { "value": { "$gte": seven } },
        "scope": { "seven": seven }
    }
);
Those are basically your options.

Removing white spaces (leading and trailing) from string value

I have imported a csv file in mongo using mongoimport and I want to remove leading and trailing white spaces from my string value.
Is it possible directly in mongo to use a trim function for all collection or do I need to write a script for that?
My collection contains elements such as:
{
"_id" : ObjectId("53857680f7b2eb611e843a32"),
"category" : "Financial & Legal Services "
}
I want to apply trim function for all the collection so that "category" should not contain any leading and trailing spaces.
It is not currently possible for an update in MongoDB to refer to the existing value of a current field when applying the update. So you are going to have to loop:
// Fetch only "category" for every document and write the trimmed value back
// one update at a time.
db.collection.find({},{ "category": 1 }).forEach(function(doc) {
    var trimmed = doc.category.trim();
    var byId = { "_id": doc._id };
    db.collection.update(byId, { "$set": { "category": trimmed } });
})
Note the use of the $set operator there, and that only the "category" field is projected in order to reduce network traffic.
You might limit what that processes with a $regex to match:
// NOTE(review): both conditions must hold under $and, so this selects only
// values with leading AND trailing white-space — a value with white-space on
// one side only is not matched.
db.collection.find({
"$and": [
{ "category": /^\s+/ },
{ "category": /\s+$/ }
]
})
Or even as pure $regex without the use of $and which you only need in MongoDB where multiple conditions would be applied to the same field. Otherwise $and is implicit to all arguments:
// Single regex with alternation: leading white-space OR trailing white-space
db.collection.find({ "category": /^\s+|\s+$/ })
Which restricts the matched documents to process to only those with leading or trailing white-space.
If you are worried about the number of documents to look, bulk updating should help if you have MongoDB 2.6 or greater available:
// Trim "category" using the "update" database command, sending the
// statements to the server in batches of 1000.
var batch = [];
db.collection.find({ "category": /^\s+|\s+$/ },{ "category": 1 }).forEach(
    function(doc) {
        // One update statement per document: q = query, u = update
        batch.push({
            "q": { "_id": doc._id },
            "u": { "$set": { "category": doc.category.trim() } }
        });
        if ( batch.length === 1000 ) {
            // The command document names the target collection and carries
            // the statements under "updates".
            db.runCommand({ "update": "collection", "updates": batch });
            batch = [];
        }
    }
);

// Flush the final partial batch
if ( batch.length > 0 )
    db.runCommand({ "update": "collection", "updates": batch });
Or even with the bulk operations API for MongoDB 2.6 and above:
// Trim "category" through the Bulk operations builder, executing every
// 1000 queued updates.
var counter = 0;
var bulk = db.collection.initializeOrderedBulkOp();
db.collection.find({ "category": /^\s+|\s+$/ },{ "category": 1}).forEach(
    function(doc) {
        bulk.find({ "_id": doc._id }).update({
            "$set": { "category": doc.category.trim() }
        });
        counter = counter + 1;

        // Send a full batch and start a fresh builder
        if ( counter % 1000 == 0 ) {
            bulk.execute();
            bulk = db.collection.initializeOrderedBulkOp();
        }
    }
);

// Flush only when operations are still queued: this covers a single leftover
// update and avoids executing an empty batch after an exact multiple of 1000.
if ( counter % 1000 != 0 )
    bulk.execute();
Best done with bulkWrite() for modern API's which uses the Bulk Operations API ( technically everything does now ) but actually in a way that is safely regressive with older versions of MongoDB. Though in all honesty that would mean prior to MongoDB 2.6 and you would be well out of coverage for official support options using such a version. The coding is somewhat cleaner for this:
// Trim "category" with bulkWrite(), flushing the queued updateOne requests
// whenever the queue length reaches a multiple of 1000.
var batch = [];
var whitespaceEdges = { "category": /^\s+|\s+$/ };
db.collection.find(whitespaceEdges,{ "category": 1}).forEach(
    function(doc) {
        var request = {
            "updateOne": {
                "filter": { "_id": doc._id },
                "update": { "$set": { "category": doc.category.trim() } }
            }
        };
        batch.push(request);
        if ( batch.length % 1000 != 0 ) {
            return;
        }
        db.collection.bulkWrite(batch);
        batch = [];
    }
);

// Send the remainder, if any, and leave the queue empty
if ( batch.length > 0 ) {
    db.collection.bulkWrite(batch);
    batch = [];
}
Which all only send operations to the server once per 1000 documents, or as many modifications as you can fit under the 64MB BSON limit.
As just a few ways to approach the problem. Or update your CSV file first before importing.
Starting Mongo 4.2, db.collection.update() can accept an aggregation pipeline, finally allowing the update of a field based on its own value.
Starting Mongo 4.0, the $trim operator can be applied on a string to remove its leading/trailing white spaces:
// { category: "Financial & Legal Services " }
// { category: " IT " }
// Empty filter selects every document; the array form of the update is an
// aggregation pipeline that sets "category" to its own $trim-med value
db.collection.updateMany(
{},
[{ $set: { category: { $trim: { input: "$category" } } } }]
)
// { category: "Financial & Legal Services" }
// { category: "IT" }
Note that:
The first part {} is the match query, filtering which documents to update (in this case all documents).
The second part [{ $set: { category: { $trim: { input: "$category" } } } }] is the update aggregation pipeline (note the squared brackets signifying the use of an aggregation pipeline):
$set is a new aggregation operator which in this case replaces the value for "category".
With $trim we modify and trim the value for "category".
Note that $trim can take an optional parameter chars which allows specifying which characters to trim.
Small correction to the answer from Neil for bulk operations api
it is
initializeOrderedBulkOp
not
initializeBulkOrderedOp
also you missed to
counter++;
inside the forEach, so in summary
// Trim "category" through the Bulk operations builder, executing every
// 1000 queued updates.
var counter = 0;
var bulk = db.collection.initializeOrderedBulkOp();
db.collection.find({ "category": /^\s+|\s+$/ },{ "category": 1}).forEach(
    function(doc) {
        bulk.find({ "_id": doc._id }).update({
            "$set": { "category": doc.category.trim() }
        });
        counter++;

        if ( counter % 1000 == 0 ) {
            bulk.execute();
            // An executed builder must not be reused; start a new one
            bulk = db.collection.initializeOrderedBulkOp();
        }
    }
);

// Flush only when operations are still queued, so an exact multiple of
// 1000 documents does not trigger execute() on an empty batch.
if ( counter % 1000 != 0 )
    bulk.execute();
Note: I don't have enough reputation to comment, hence adding an answer
You can execute javascript in an MongoDB update command when it's in a cursor method:
// Visit every document (fetching only "category") and store the trimmed
// value with an individual update.
var trimCategory = function(doc) {
    var selector = { "_id": doc._id };
    var change = { "$set": { "category": doc.category.trim() } };
    db.collection.update(selector, change);
};
db.collection.find({},{ "category": 1 }).forEach(trimCategory)
If you have a ton of records and need to batch process, you might want to look at the other answers here.