Imitate a COUNT... GROUP BY in Cloudant (CouchDB) - group-by

In CloudAnt, I want to get counts of features, grouped by two attributes: topics and councils. After a lot of fiddling, I was able to construct this:
// map
function (doc) {
if (doc.properties && doc.properties.openCouncilDataTopic) {
var ret = {};
ret [doc.properties.openCouncilDataTopic] = {};
ret [doc.properties.openCouncilDataTopic][doc.properties.sourceCouncilId] = 1;
emit(null, ret);
}
}
// reduce
function (keys, values, rereduce) {
var ret = values[0];
function zero(x) { return x ? x : 0 }
function add(i, topic, id) {
ret[topic][id] = zero(ret[topic][id]) + values[i][topic][id];
}
function countTopic(topic) {
if (!ret[topic])
ret[topic] = {};
Object.keys(values[i][topic]).forEach(add.bind(undefined, i, topic));
}
for (var i = 1; i < values.length; i++) {
Object.keys(values[i]).forEach(countTopic);
}
return ret;
}
Is there a better way?

Yes, there is a better way! Cloudant supports "aggregation over complex keys"
Instead of manually computing the counts, the map function can simply emit keys which are the attributes to group on:
function (doc) {
if (doc.properties && doc.properties.openCouncilDataTopic) {
emit([doc.properties.openCouncilDataTopic, doc.properties.sourceCouncilId], 1);
}
}
Then we can use the built-in _count reduce function.
To call the view, add group_by=2, where 2 is the number of attributes to group by. (The default, 0, aggregates everything down to a single number).
This is likely to be much faster than implementing the count in Javascript, too.
One downside is we have less control over the output, which comes out like this:
{
"rows": [
{
"key": [
"childcare-centres",
"https://data.gov.au/organization/cardinia-shire-council"
],
"value": 22
},
...
}

Related

mapreduce between consecutive documents

Setup:
I got a large collection with the following entries
Name - String
Begin - time stamp
End - time stamp
Problem:
I want to get the gaps between documents, Using the map-reduce paradigm.
Approach:
I'm trying to set a new collection of pairs mid, after that I can compute differences from it using $unwind and Pair[1].Begin - Pair[0].End
function map(){
emit(0, this)
}
function reduce(){
var i = 0;
var pairs = [];
while ( i < values.length -1){
pairs.push([values[i], values[i+1]]);
i = i + 1;
}
return {"pairs":pairs};
}
db.collection.mapReduce(map, reduce, sort:{begin:1}, out:{replace:"mid"})
This works with limited number of document because of the 16MB document cap. I'm not sure if I need to get the collection into memory and doing it there, How else can I approach this problem?
The mapReduce function of MongoDB has a different way of handling what you propose than the method you are using to solve it. The key factor here is "keeping" the "previous" document in order to make the comparison to the next.
The actual mechanism that supports this is the "scope" functionality, which allows a sort of "global" variable approach to use in the overall code. As you will see, what you are asking when that is considered takes no "reduction" at all as there is no "grouping", just emission of document "pair" data:
db.collection.mapReduce(
function() {
if ( last == null ) {
last = this;
} else {
emit(
{
"start_id": last._id,
"end_id": this._id
},
this.Begin - last.End
);
last = this;
}
},
function() {}, // no reduction required
{
"out": { "inline": 1 },
"scope": { "last": null }
}
)
Out with a collection as the output as required to your size.
But this way by using a "global" to keep the last document then the code is both simple and efficient.

Query key with value anywhere in object hierarchy in Mongo

In Mongo how can I find all documents that have a given key and value, regardless of where that key appears in the document's key/value hierarchy?
For example the input key roID and value 5 would match both:
{
roID: '5'
}
and
{
other: {
roID: '5'
}
}
There is no built in way to do this. You might have to scan each matched document recursively to try and locate that attribute. Not recommended. You might want to think about restructuring your data or perhaps manipulating it into a more unified format so that it will be easier (and faster) to query.
If your desired key appears in a fixed number of different locations, you could use the $or operator to scan all the possibilities.
Taking your sample documents as an example, your query would look something like this:
db.data.find( { "$or": [
{ "roID": 5 },
{ "other.roID": 5 },
{ "foo.bar.roID": 5 },
{ any other possbile locations of roID },
...
] } )
If the number of documents in collection is not so large, then it can be done by this:
db.system.js.save({_id:"keyValueExisted", value: function (key, value) {
function findme(obj) {
for (var x in obj) {
var v = obj[x];
if (x == key && v == value) {
return true;
} else if (v instanceof Object) {
if (findme(v)) return true;
}
}
return false;
}
return findme(this);
}});
var param = ['roID', '5'];
db.c.find({$where: "keyValueExisted.apply(this, " + tojsononeline(param) + ");"});

MongoDB getting size of cursor takes a long time if size is 0

I'm currently working on a project where I'm using keyword queries against a MongoDB. If I search for things that exists in the database everything works ok, but if I search for things that don't exist, or I have a typo in my query the appilcation basically crashes.
The query is as simple as this:
var query = Query.And(Query.Matches("text", searchText)
Where searchText is what's being written into the searchbox in the UI.
To check the size of the cursor I've tried implementing this:
if ( cursor.Size() == 0)
{
MessageBox.Show("Your search did not return a match. Please search for
something else.");
return database;
}
But the system takes 10-15 minutes to evaluate that the size is 0, compared to the 0.5 seconds if the size is 1 or more.
So do anyone have any suggestions? Either a better way of checking the size of the cursor or some kind of function that makes the method time out and tell the user that no match was found?
Thanks in advance.
Update:
As requested added the explain for something that should and something that shouldn't exist
db.docs.find( {text: "a"}).explain
function (verbose) {
/* verbose=true --> include allPlans, oldPlan fields */
var n = this.clone();
n._ensureSpecial();
n._query.$explain = true;
n._limit = Math.abs(n._limit) * -1;
var e = n.next();
function cleanup(obj){
if (typeof(obj) != 'object'){
return;
}
delete obj.allPlans;
delete obj.oldPlan;
if (typeof(obj.length) == 'number'){
for (var i=0; i < obj.length; i++){
cleanup(obj[i]);
}
}
if (obj.shards){
for (var key in obj.shards){
cleanup(obj.shards[key]);
}
}
if (obj.clauses){
cleanup(obj.clauses);
}
}
if (!verbose)
cleanup(e);
return e;
}
db.docs.find( {text: "fgrgfk"}).explain
function (verbose) {
/* verbose=true --> include allPlans, oldPlan fields */
var n = this.clone();
n._ensureSpecial();
n._query.$explain = true;
n._limit = Math.abs(n._limit) * -1;
var e = n.next();
function cleanup(obj){
if (typeof(obj) != 'object'){
return;
}
delete obj.allPlans;
delete obj.oldPlan;
if (typeof(obj.length) == 'number'){
for (var i=0; i < obj.length; i++){
cleanup(obj[i]);
}
}
if (obj.shards){
for (var key in obj.shards){
cleanup(obj.shards[key]);
}
}
if (obj.clauses){
cleanup(obj.clauses);
}
}
if (!verbose)
cleanup(e);
return e;
}
Update 2: Overview of indexes:
db.docs.getIndexes()
{
"v" : 1,
"key" : {
"_id" : 1
},
"ns" : "tweet_database.docs",
"name" : "_id_"
}

How to make nested group in MongoDB?

Assume there's a data in Mongo! There are a few projects, and there're managers, to who the problems are assigned. Each problem is assigned to different people as the project has different parts.
How can one compute the number, each assignee has been assigned within a project.
For example:
{
"Project A": {
"Ted":10,
"Matt":5
},
"Project B": {
"Mary":3,
"Hector":30
}
}
The following writes the sum but doesn't group it:
db.test.issues.group({
"key": {
"project": true
"assignee":true
},
"initial": {
"countassignee": 0
},
"reduce": function(obj, prev) {
prev.countassignee++;
}
});
It will return as:
{
{
"project": "Project A",
"countassignee":10,
"assignee":"Ted"
}
....
}
The data sample provide by you is pretty confusing. its Project A and project B supposed to be different document.
I believe the best use of group is to aggegrate within a collections, if you want to count within a doucment you may use javascript.
OK! It doesn't seem like the best answer, but it'll do!
first define a cursor sorting by projects
var cursor = db.test.find().sort({projects:1});
a list for the counter:
var asgn_nums = {};
and a variable holding project name;:
var project = '0';
Here counts for each project:
while (cursor.hasNext()) {
var issue = cursor.next();
var p = issue.project;
if(p != project)
{
if(project != '0'){
print(project+":");
printjson(asgn_nums);
}
project = p;
asgn_nums = {};
}
if(issue.assignee != null)
{
if (asgn_nums[issue.assignee] == null)
asgn_nums[issue.assignee] = 1;
else
asgn_nums[issue.assignee]+= 1;
}
if(!cursor.hasNext())
{
print(project+":");
printjson(asgn_nums);
}
}

how to calculate count and unique count over two fields in mongo reduce function

I have a link tracking table that has (amongst other fields) track_redirect and track_userid. I would like to output both the total count for a given link, and also the unique count - counting duplicates by the user id. So we can differentiate if someone has clicked the same link 5 times.
I've tried emitting this.track_userid in both the key and values parts but can't get to grips with how to correctly access them in the reduce function.
So if I roll back to when it actually worked, I have the very simple code below - just like it would be in a 'my first mapreduce function' example
map
function() {
if(this.track_redirect) {
emit(this.track_redirect,1);
}
}
reduce
function(k, vals) {
var sum = 0;
for (var i in vals) {
sum += vals[i];
}
return sum;
}
I'd like to know the correct way to emit the additional userid information and access it in the mapreduce please. or am i thinking about it in the wrong way?
in case it's not clear, I don't want to calculate the total clicks a userid has made, but to count the unique clicks of each url + userid - not counting any duplicate clicks a userid made on each link
can someone point me in the right direction please? thanks!
You can actually pass arbitrary object on the second parameter of the emit call. That means you can take advantage of this and store the userid in it. For example, your map function can look like this:
var mapFunc = function() {
if (this.track_redirect) {
var tempDoc = {};
tempDoc[this.track_userid] = 1;
emit(this.track_redirect, {
users_clicked: tempDoc,
total_clicks: 1
});
}
};
And your reduce function might look like this:
var reduceFunc = function(key, values) {
var summary = {
users_clicked: {},
total_clicks: 0
};
values.forEach(function (doc) {
summary.total_clicks += doc.total_clicks;
// Merge the properties of 2 objects together
// (and these are actually the userids)
Object.extend(summary.users_clicked, doc.users_clicked);
});
return summary;
};
The users_clicked property of the summary object basically stores the id of every user as a property (since you can't have duplicate properties, you can guarantee that it will store unique users). Also note that you have to be careful of the fact that some of the values passed to the reduce function can be result of a previous reduce and the sample code above takes that into account. You can find more about the said behavior in the docs here.
In order to get the unique count, you can pass in the finalizer function that gets called when the reduce phase is completed:
var finalFunc = function(key, value) {
// Counts the keys of an object. Taken from:
// http://stackoverflow.com/questions/18912/how-to-find-keys-of-a-hash
var countKeys = function(obj) {
var count = 0;
for(var i in obj) {
if (obj.hasOwnProperty(i))
{
count++;
}
}
return count;
};
return {
redirect: key,
total_clicks: value.total_clicks,
unique_clicks: countKeys(value.users_clicked)
};
};
Finally, you can execute the map reduce job like this (modify the out attribute to fit your needs):
db.users.mapReduce(mapFunc, reduceFunc, { finalize: finalFunc, out: { inline: 1 }});