Distinct/Aggregation query Mongodb array, trim trailing space - mongodb

I have a MongoDB collection which contains a colours array like :
myCollection :
{
_id : ...,
"colours" : [
{
"colourpercentage" : "42",
"colourname" : "Blue"
},
{
"colourpercentage" : "32",
"colourname" : "Red"
},
{
"colourpercentage" : "10",
"colourname" : "Green "
}
]
}
I would like to retrieve every distinct colourname of every entry of this collection, and be able to filter it with a search.
I tried with distinct but without success. I searched further and found that an aggregation could help me. For the moment I have :
db.getCollection('myCollection').aggregate([
{ "$match": { "colours.colourname": /Gre/ } }, # Gre is my search
{ "$unwind": "$colours" },
{ "$match": { "colours.colourname": /search/ } },
{ "$group": {
"_id": "$colours.colourname"
}}
])
It is working, but I get an array like :
{
"result" : [
{
"_id" : "Grey"
},
{
"_id" : "Light Green "
},
{
"_id" : "Light Green"
},
{
"_id" : "Green "
},
{
"_id" : "Green"
}
],
"ok" : 1.0000000000000000
}
And I would like to remove duplicate entries which have a space in the end and displays them like :
["Grey","Light Green","Green"]

One approach you could take is the Map-Reduce way even though the JavaScript interpreter driven mapReduce takes a bit longer than the aggregation framework but will work since you will be using some very useful native JavaScript functions that are lacking in the aggregation framework. For instance, in the map function you could use the trim() function to remove any trailing spaces in your colourname fields so that you can emit the "cleansed" keys.
The Map-Reduce operation would typically have the following map and reduce functions:
var map = function() {
if (!this.colours) return;
this.colours.forEach(function (c){
emit(c.colourname.trim(), 1)
});
};
var reduce = function(key, values) {
var count = 0;
for (index in values) {
count += values[index];
}
return count;
};
db.runCommand( { mapreduce : "myCollection", map : map , reduce : reduce , out : "map_reduce_result" } );
You can then query map_reduce_result collection with the regex to have the result:
var getDistinctKeys = function (doc) { return doc._id };
var result = db.map_reduce_result.find({ "_id": /Gre/ }).map(getDistinctKeys);
print(result); // prints ["Green", "Grey", "Light Green"]
-- UPDATE --
To implement this in Python, PyMongo's API supports all of the features of MongoDB’s map/reduce engine thus you could try the following:
import pymongo
import re
from bson.code import Code
client = pymongo.MongoClient("localhost", 27017)
db = client.test
map = Code("function () {"
" if (!this.colours) return;"
" this.colours.forEach(function (c){"
" emit(c.colourname.trim(), 1)"
" });"
"};")
reduce = Code("function (key, values) {"
" var count = 0;"
" for (index in values) {"
" count += values[index];"
" }"
" return count;"
" };")
result = db.myCollection.map_reduce(map, reduce, "map_reduce_result")
regx = re.compile("Gre", re.IGNORECASE)
for doc in result.find({"_id": regx}):
print(doc)

Related

How to query MongoDb documents using the indices of embedded arrays

I am trying to learn how to use mongo queries to reach deep into a data tree. Specifically, I'm trying to remove the object below {"object": 'to remove'}
{
"_id" : ObjectId("7840f22736341b09154f7ebf"),
"username" : "nmay",
"fname" : "Nate",
"lname" : "May",
"data" : [
{
"monthNum" : 1,
"year" : 2016,
"days" : [
{
"date" : "2016-01-01T06:00:00.000Z",
"type1" : [],
"type2" : []
},
{
"date" : "2016-01-02T06:00:00.000Z",
"type1" : [
{"object": 'to remove'}
],
"type2" : []
}
]
}
]
}
so far I know how to query for the user _id, but I'm not sure how to remove the desired object using the indices in each array. In this example I want to remove data[0].days[1].type1[0]
Here is the query that I have so far:
app.delete('/user/:id/data/:monthIndex/days/:dayIndex/type1/:type1Index', function (req, res, next) {
var monthIndex = parseInt(req.params.monthIndex); // these console the value properly
var dayIndex = parseInt(req.params.dayIndex); // -1 is applied to the parameter to translate to array position
var type1Index = parseInt(req.params.type1Index);
db.users.update(
{ _id: mongojs.ObjectId(req.params.id) },
{ $pull: data.monthIndex.days.dayIndex.type1.type1Index }
);
}
It gives me the error
ReferenceError: data is not defined
Can someone demonstrate how I can pass this query my index parameters to remove the desired object?
Unfortunately, there is no way to remove an array element by its numerical index with a single operation in MongoDB. In order to do this, you need to unset desired element(s) first, and remove the resulting null-valued fields afterwards.
Your code should look something like this:
db.users.update(
{ _id : mongojs.ObjectId(req.params.id) },
{ $unset : { 'data.0.days.1.type1.0' : 1 } }
);
db.users.update(
{ _id : mongojs.ObjectId(req.params.id) },
{ $pull : { 'data.0.days.1.type1' : null } }
);
Edit by #bob: to pass in the parameters you have to build the query string, which is ugly:
var unset = {};
unset['$unset'] = {};
unset.$unset['data.' + req.params.monthIndex + '.days.' + req.params.dayIndex + '.foods.' + req.params.foodIndex] = 1;
db.users.update( { _id : mongojs.ObjectId(req.params.id) }, unset );
var pull = {};
pull['$pull'] = {};
pull.$pull['data.' + req.params.monthIndex + '.days.' + req.params.dayIndex + '.foods'] = null;
db.users.update( { _id : mongojs.ObjectId(req.params.id) }, pull );

MongoDB query to return documents that only have keys amongst a predefined set

The MongoDB query language allows filtering documents based on the existence or absence of a given field with the $exists operator.
Is there a way, with the MongoDB syntax, and given a set K of allowed fields, to exclude documents that have fields not in K from the results, but:
not knowing in advance which extra fields (outside K) can be encountered
not using JavaScript, that is, the $where operator?
Example:
{
"Some field" : "foo"
}
{
"Some field" : "bar",
"Some other field" : "foobar"
}
With the set K = [ "Some field" ], only the first document is to be returned.
Note how this is not to be confused with a projection, which would return both documents but removing the extra field.
I'm not sure if MongoDB do support such kind of operations out of box but you can achieve so with help of mapReduce.
Assuming your sample data set;
// Variable for map
var map = function () {
var isAcceptable = true;
Object.keys(this).forEach(function (key) {
if (key != "_id" && white_list.indexOf(key) == -1) {
isAcceptable = false;
}
});
if (isAcceptable == true) {
emit(1, this);
}
};
// Variable for reduce
var reduce = function (key, values) {
return values;
};
db.collection.mapReduce(
map,
reduce,
{
scope: {"white_list": ["Some field"]},
out: {"inline": 1}
}
);
Will return:
{
"results" : [
{
"_id" : 1,
"value" : {
"_id" : ObjectId("57cd7503e55de957c62fb9c8"),
"Some field" : "foo"
}
}
],
"timeMillis" : 13,
"counts" : {
"input" : 2,
"emit" : 1,
"reduce" : 0,
"output" : 1
},
"ok" : 1
}
Desired result will be in results.values of returned document. However, keep in mind limitation of MongoDB mapReduce and maximum size of BSON document.
Given a set of known fields K, you can construct a query that takes the set as input and gives a query with the $exists operator along with the corresponding fields projection. Using an example, suppose you have the following documents in a test collection
db.test.insert({ "fieldX": "foo", "fieldY": "bar", "fieldZ": 1 })
db.test.insert({ "fieldX": "123", "fieldY": "bar", "fieldZ": 2 })
db.test.insert({ "fieldY": "abc", "fieldZ": 3 })
db.test.insert({ "fieldX": "xyz", "fieldZ": 4 })
db.test.insert({ "fieldZ": 5 })
Then you can construct a query Q and a projection P from an input set K as follows:
var K = [ "fieldX", "fieldZ" ];
var or = K.map(function(field) {
var obj = {};
obj[field] = { "$exists": true };
return obj;
});
var P = K.reduce(function(doc, field) {
doc[field] = 1;
return doc;
}, {} );
var Q = { "$or": or };
db.test.find(Q, P);
Sample Output:
/* 1 */
{
"_id" : ObjectId("57cd78322c241f5870c82b7d"),
"fieldX" : "foo",
"fieldZ" : 1
}
/* 2 */
{
"_id" : ObjectId("57cd78332c241f5870c82b7e"),
"fieldX" : "123",
"fieldZ" : 2
}
/* 3 */
{
"_id" : ObjectId("57cd78332c241f5870c82b7f"),
"fieldZ" : 3
}
/* 4 */
{
"_id" : ObjectId("57cd78332c241f5870c82b80"),
"fieldX" : "xyz",
"fieldZ" : 4
}
/* 5 */
{
"_id" : ObjectId("57cd78332c241f5870c82b81"),
"fieldZ" : 5
}

how to query for exact mach in unknown number of subfields in mongodb

I have a collection where documents can have an unknown number of sub documents:
"agent_id": {
"0":"1234",
"1":"2234",...etc
How do I search for an exact match in all the agent_id sub-fields?
You need to dynamically create an object with properties that are a concatenation of the embedded document name agent_id with the dot (.) and the field name, enclosed in quotes, something like this:
var query = {
"agent_id.0": "78343",
"agent_id.1": "78343",
"agent_id.2": "78343",
"agent_id.3": "78343",
...
"agent_id.n": "78343"
}
One way to create the object is generate the sub-documents keys with mapReduce. The following demonstrates this approach. In the Map-Reduce operation, an array of keys in the agent_id subdocument is generated to an output collection "collection_keys" and then used to produce the find() query expression:
Suppose you populate a sample collection
db.collection.insert([
{
"agent_id": {
"0":"1234",
"1":"2234",
"56":"8451",
"74":"1475",
"10":"1234"
}
},
{
"agent_id": {
"5":"5874",
"18":"2351"
}
}
])
Running the following mapReduce operation
var mr = db.runCommand({
"mapreduce" : "collection",
"map" : function() {
for (var key in this.agent_id) { emit(key, null); }
},
"reduce" : function(key, stuff) {
return null
},
"out": "collection" + "_keys"
});
var query = { "$or": [] },
value = "1234";
db[mr.result].distinct("_id").forEach(function (key){
var obj = {};
obj["agent_id." + key] = value;
query["$or"].push(obj)
});
printjson(query);
will produce:
{
"$or" : [
{
"agent_id.0" : "1234"
},
{
"agent_id.1" : "1234"
},
{
"agent_id.10" : "1234"
},
{
"agent_id.18" : "1234"
},
{
"agent_id.5" : "1234"
},
{
"agent_id.56" : "1234"
},
{
"agent_id.74" : "1234"
}
]
})
You can then use the query document in your find() query:
db.collection.find(query)
which will produce the result:
/* 0 */
{
"_id" : ObjectId("561d5312cd05efc95a1ea1f4"),
"agent_id" : {
"0" : "1234",
"1" : "2234",
"56" : "8451",
"74" : "1475",
"10" : "1234"
}
}

MongoDB: Update a field of an item in array with matching another field of that item

I have a data structure like this:
We have some centers. A center has some switches. A switch has some ports.
{
"_id" : ObjectId("561ad881755a021904c00fb5"),
"Name" : "center1",
"Switches" : [
{
"Ports" : [
{
"PortNumber" : 2,
"Status" : "Empty"
},
{
"PortNumber" : 5,
"Status" : "Used"
},
{
"PortNumber" : 7,
"Status" : "Used"
}
]
}
]
}
All I want is to write an Update query to change the Status of the port that it's PortNumber is 5 to "Empty".
I can update it when I know the array index of the port (here array index is 1) with this query:
db.colection.update(
// query
{
_id: ObjectId("561ad881755a021904c00fb5")
},
// update
{
$set : { "Switches.0.Ports.1.Status" : "Empty" }
}
);
But I don't know the array index of that Port.
Thanks for help.
You would normally do this using the positional operator $, as described in the answer to this question:
Update field in exact element array in MongoDB
Unfortunately, right now the positional operator only supports one array level deep of matching.
There is a JIRA ticket for the sort of behavior that you want: https://jira.mongodb.org/browse/SERVER-831
In case you can make Switches into an object instead, you could do something like this:
db.colection.update(
{
_id: ObjectId("561ad881755a021904c00fb5"),
"Switch.Ports.PortNumber": 5
},
{
$set: {
"Switch.Ports.$.Status": "Empty"
}
}
)
Since you don't know the array index of the Port, I would suggest you dynamically create the $set conditions on the fly i.e. something which would help you get the indexes for the objects and then modify accordingly, then consider using MapReduce.
Currently this seems to be not possible using the aggregation framework. There is an unresolved open JIRA issue linked to it. However, a workaround is possible with MapReduce. The basic idea with MapReduce is that it uses JavaScript as its query language but this tends to be fairly slower than the aggregation framework and should not be used for real-time data analysis.
In your MapReduce operation, you need to define a couple of steps i.e. the mapping step (which maps an operation into every document in the collection, and the operation can either do nothing or emit some object with keys and projected values) and reducing step (which takes the list of emitted values and reduces it to a single element).
For the map step, you ideally would want to get for every document in the collection, the index for each Switches and Ports array fields and another key that contains the $set keys.
Your reduce step would be a function (which does nothing) simply defined as var reduce = function() {};
The final step in your MapReduce operation will then create a separate collection Switches that contains the emitted Switches array object along with a field with the $set conditions. This collection can be updated periodically when you run the MapReduce operation on the original collection.
Altogether, this MapReduce method would look like:
var map = function(){
for(var i = 0; i < this.Switches.length; i++){
for(var j = 0; j < this.Switches[i].Ports.length; j++){
emit(
{
"_id": this._id,
"switch_index": i,
"port_index": j
},
{
"index": j,
"Switches": this.Switches[i],
"Port": this.Switches[i].Ports[j],
"update": {
"PortNumber": "Switches." + i.toString() + ".Ports." + j.toString() + ".PortNumber",
"Status": "Switches." + i.toString() + ".Ports." + j.toString() + ".Status"
}
}
);
}
}
};
var reduce = function(){};
db.centers.mapReduce(
map,
reduce,
{
"out": {
"replace": "switches"
}
}
);
Querying the output collection Switches from the MapReduce operation will typically give you the result:
db.switches.findOne()
Sample Output:
{
"_id" : {
"_id" : ObjectId("561ad881755a021904c00fb5"),
"switch_index" : 0,
"port_index" : 1
},
"value" : {
"index" : 1,
"Switches" : {
"Ports" : [
{
"PortNumber" : 2,
"Status" : "Empty"
},
{
"PortNumber" : 5,
"Status" : "Used"
},
{
"PortNumber" : 7,
"Status" : "Used"
}
]
},
"Port" : {
"PortNumber" : 5,
"Status" : "Used"
},
"update" : {
"PortNumber" : "Switches.0.Ports.1.PortNumber",
"Status" : "Switches.0.Ports.1.Status"
}
}
}
You can then use the cursor from the db.switches.find() method to iterate over and update your collection accordingly:
var newStatus = "Empty";
var cur = db.switches.find({ "value.Port.PortNumber": 5 });
// Iterate through results and update using the update query object set dynamically by using the array-index syntax.
while (cur.hasNext()) {
var doc = cur.next();
var update = { "$set": {} };
// set the update query object
update["$set"][doc.value.update.Status] = newStatus;
db.centers.update(
{
"_id": doc._id._id,
"Switches.Ports.PortNumber": 5
},
update
);
};

Mongo: count the number of word occurrences in a set of documents

I have a set of documents in Mongo. Say:
[
{ summary:"This is good" },
{ summary:"This is bad" },
{ summary:"Something that is neither good nor bad" }
]
I'd like to count the number of occurrences of each word (case insensitive), then sort in descending order. The result should be something like:
[
"is": 3,
"bad": 2,
"good": 2,
"this": 2,
"neither": 1,
"nor": 1,
"something": 1,
"that": 1
]
Any idea how to do this? Aggregation framework would be preferred, as I understand it to some degree already :)
MapReduce might be a good fit that can process the documents on the server without doing manipulation on the client (as there isn't a feature to split a string on the DB server (open issue).
Start with the map function. In the example below (which likely needs to be more robust), each document is passed to the map function (as this). The code looks for the summary field and if it's there, lowercases it, splits on a space, and then emits a 1 for each word found.
var map = function() {
var summary = this.summary;
if (summary) {
// quick lowercase to normalize per your requirements
summary = summary.toLowerCase().split(" ");
for (var i = summary.length - 1; i >= 0; i--) {
// might want to remove punctuation, etc. here
if (summary[i]) { // make sure there's something
emit(summary[i], 1); // store a 1 for each word
}
}
}
};
Then, in the reduce function, it sums all of the results found by the map function and returns a discrete total for each word that was emitted above.
var reduce = function( key, values ) {
var count = 0;
values.forEach(function(v) {
count +=v;
});
return count;
}
Finally, execute the mapReduce:
> db.so.mapReduce(map, reduce, {out: "word_count"})
The results with your sample data:
> db.word_count.find().sort({value:-1})
{ "_id" : "is", "value" : 3 }
{ "_id" : "bad", "value" : 2 }
{ "_id" : "good", "value" : 2 }
{ "_id" : "this", "value" : 2 }
{ "_id" : "neither", "value" : 1 }
{ "_id" : "or", "value" : 1 }
{ "_id" : "something", "value" : 1 }
{ "_id" : "that", "value" : 1 }
A basic MapReduce example
var m = function() {
var words = this.summary.split(" ");
if (words) {
for(var i=0; i<words.length; i++) {
emit(words[i].toLowerCase(), 1);
}
}
}
var r = function(k, v) {
return v.length;
};
db.collection.mapReduce(
m, r, { out: { merge: "words_count" } }
)
This will insert word counts into a collection name words_count which you can sort (and index)
Note that it doesn't use stemming, omit punctuation, handles stop words etc.
Also note you can optimize the map function by accumulating repeating word(s) occurrences and emitting the count, not just 1
You can use #split.
Try Below query
db.summary.aggregate([
{ $project : { summary : { $split: ["$summary", " "] } } },
{ $unwind : "$summary" },
{ $group : { _id: "$summary" , total : { "$sum" : 1 } } },
{ $sort : { total : -1 } }
]);
Old question but since 4.2 this can be done with $regexFindAll now.
db.summaries.aggregate([
{$project: {
occurences: {
$regexFindAll: {
input: '$summary',
regex: /\b\w+\b/, // match words
}
}
}},
{$unwind: '$occurences'},
{$group: {
_id: '$occurences.match', // group by each word
totalOccurences: {
$sum: 1 // add up total occurences
}
}},
{$sort: {
totalOccurences: -1
}}
]);
This will output docs in the following format:
{
_id: "matchedwordstring",
totalOccurences: number
}