MongoDb aggregation or mapreduce for invoicing statistics? - mongodb

I'm new to MongoDb and have a job for (I suppose) MapReduce or Aggregation.
I have an "invoices" collection with documents in this format:
{
date: 'some unix timestamp',
total: 12345,
paid: true
}
I need to display a table with months (jan-dec) as columns, a row for each year and the sum of total in the month (divided in paid and unpaid) in the cell. Like this:
| Jan | Feb | ...
2013 | 1,222 / 200 | 175 / 2,122 | ...
...
Can you help me get the mongo command right?
Maybe I'm better off writing some JS code to execute in mongo?

I've now found a solution using MapReduce. Here it is in use from PHP:
$map = new MongoCode('
function() {
var d = new Date(this.date*1000);
emit({y: d.getFullYear(), m: d.getMonth()}, {
total: this.total,
notPaid: this.paid ? 0 : this.total,
count: 1
});
};
');
$reduce = new MongoCode('
function(month, values) {
result = { total: 0, notPaid: 0, count: 0 };
for (var i = 0; i < values.length; i++) {
result.total += values[i].total;
result.notPaid += values[i].notPaid;
result.count += values[i].count;
}
return result;
};
');
$result = $db->command(array(
'mapreduce' => 'invoices',
'map' => $map,
'reduce' => $reduce,
'out' => 'temp'
));
echo $result['timeMillis'];
Now the results are in the "temp" collection, one document per month. Could it be optimized or enhanced?

You can do this with aggregation framework like this:
db.invoices.aggregate( [
{
"$project" : {
"yr" : {
"$year" : "$date"
},
"mo" : {
"$month" : "$date"
},
"total" : 1,
"unpaid" : {
"$cond" : [
"$paid",
0,
"$total"
]
}
}
},
{
"$group" : {
"_id" : {
"y" : "$yr",
"m" : "$mo"
},
"total" : {
"$sum" : "$total"
},
"unpaid" : {
"$sum" : "$unpaid"
}
}
}
] )
You can use another $project at the end to pretty-up the output, and a $sort to order it, but that's the basic functioning core of it.

Related

How to use aggregation function mongo db-query

I am new in MongoDB and I would like to use the aggregation function where I want to check type == topic and get the following output
Expected output
[
{
conceptName : 59d98cfd1c5edc24e4024d00
totalCount : 2
},
{
conceptName : 59d98cfd1c5edc24e4024d03
totalCount : 1
}
]
Sample input db.GroupContents
{
"_id" : "5a0948bb1c5edc7a5000521a",
"type" : "topic",
"groupID" : "5a0948bb1c5edc7a5000521a",
"pedagogyID" : "59d98cfa1c5edc24e40249a3",
}
Sample input db.PedagogyNodes
{
"_id" : "59d98cfa1c5edc24e40249a3",
"latestVersion" : "59d98cfa1c5edc24e402497f_1",
"createdAt" : "2017-10-08 04:27:06",
"updatedAt" : "2017-10-08 04:27:06"
}
Sample input db.PedagogyVersions
{
"_id" : "59d98cfa1c5edc24e402497f_1",
"type" : "topic",
"contentNodes" : {
"LearningNodes" : [
"59d98cfd1c5edc24e4024d00",
"59d98cfd1c5edc24e4024d03",
"59d98cfd1c5edc24e4024d00",
]
},
"createdAt" : "2017-10-08 04:27:06",
"updatedAt" : "2017-10-08 04:27:06"
}
What I have tried so far
var groupID = "5a0948bb1c5edc7a5000521a"; // Step 1
var records;
var pnDoc;
var pvDoc;
db.GroupContents.find({groupID : groupID}).forEach(function (doc){ // Step 2
var pedagogyID = doc.pedagogyID;
var records = db.getSiblingDB('PedagogyService');
records.PedagogyNodes.find({_id : pedagogyID}).forEach(function (pnDoc) { // Step 3
var latestVersion = pnDoc.latestVersion;
// addded aggregate function here
records.PedagogyVersions.aggregate([
{
$match:{_id:latestVersion} // Step 4
},
{
$unwind:"$contentNodes.LearningNodes"
},
{
$group:
{
_id:"$contentNodes.LearningNodes",
count:{$sum:1}
}
}
])
})
});
I am unable to write db query based on my expected answer, please help.
Understand my requirement
Step : 1 => I am passing `groupID = 5a0948bb1c5edc7a5000521a`
Step : 2 => we have to check from GroupContents where groupID = groupID then we have to take `pedagogyID`
Step : 3 => we have to check from PedagogyNodes where _id = pedagogyID then we have to take `latestVersion`
Step : 4 => we have to check from PedagogyVersions where _id = latestVersion then we have to take `contentNodes->LearningNodes`
Step : 5 => Finally we have to do the aggregation then we have display the result
Try to unwind the LearningNodes array and then count them by grouping them together
db.PedagogyNodes.aggregate([
{
$unwind:"$contentNodes.LearningNodes"
},
{
$group:
{
_id:"$contentNodes.LearningNodes",
count:{$sum:1}
}
}
])
In case you need to do any matches you can use the $match stage
db.PedagogyNodes.aggregate([
{
$match:{type:"topic"}
},
{
$unwind:"$contentNodes.LearningNodes"
},
{
$group:
{
_id:"$contentNodes.LearningNodes",
count:{$sum:1}
}
}
])
Answering the edited question =>
You were not able to view the output on the console since mongoshell does not print script output on the screen. To do this, do the following:
var result = records.PedagogyVersions.aggregate([......]);
result.forEach(function(resultDoc){
print(tojson(resultDoc))
})
To see the result of your aggregation you have to pass the callback to be executed as parameter.
records.PedagogyVersions.aggregate([
{
$match:{_id:latestVersion} // Step 4
},
{
$unwind:"$contentNodes.LearningNodes"
},
{
$group:
{
_id:"$contentNodes.LearningNodes",
count:{$sum:1}
}
}
], function(err, results) {
console.log(results);
});

Calculate average using mapreduce in MongoDb

I have a collection of 10 million records which resembles this.
{
"_id" : ObjectId("596dd10bbd1a6628ace1c14c"),
"X" : 13212,
"Z" : 173836,
"userID" : 9354785
}
User ID is unique. I have to calculate the average of X and sum of Z. I can calculate the sum of Z using the following mapReduce function
var mapFunction1 = function() {
emit(this.userID, this.Z);
};
var reduceFunction1 = function() {
return Array.sum(Z);
};
db.transaction.mapReduce(
mapfunction1,
reduceFunction1,
{out:"mapreduce"}
)
How do i calculate the average of X?
I tried Array.avg(Z) but it returns the same output as sum(Z).
It looks like the requirements can be expressed more simply using the Aggregation Pipeline with the $avg and $sum operators.
Input
> db.transactions.find()
{ "_id" : ObjectId("5970e59e26507421fa20bee9"), "X" : 13212, "Z" : 173836, "userID" : 9354785 }
{ "_id" : ObjectId("5970e5a426507421fa20beea"), "X" : 1234, "Z" : 5678, "userID" : 1 }
{ "_id" : ObjectId("5970e5a826507421fa20beeb"), "X" : 100, "Z" : 200, "userID" : 2 }
Aggregation Pipeline
> db.transactions.aggregate([
{
$group : {
_id: "aggregates",
avgX: {
$avg: "$X"
},
sumZ: {
$sum: "$Z"
}
}
}
])
Output
{ "_id" : "aggregates", "avgX" : 4848.666666666667, "sumZ" : 179714 }
You are not passing (key,value) pair parameter to reduceFunction1.
Try this:
var mapFunction1 = function() {
emit(this.userID, this.Z);
};
var reduceFunction1 = function(varKey,varZ) {
return Array.avg(varZ);
};
db.transaction.mapReduce(
mapfunction1,
reduceFunction1,
{out:"mapreduce"}
)

Is it possible to retrieve a 'time span' from a MongoDB query, using the timestamp within an ObjectId?

We have a basic enquiry management tool that we're using to track some website enquiries in our administration suite, and we're using the ObjectId of each document in our enquiries collection to sort the enquiries by the date they were added.
{
"_id" : ObjectId("53a007db144ff47be1000003"),
"comments" : "This is a test enquiry. Please ignore. We'll delete it shortly.",
"customer" : {
"name" : "Test Enquiry",
"email" : "test#test.com",
"telephone" : "07890123456",
"mobile" : "07890123456",
"quote" : false,
"valuation" : false
},
"site" : [],
"test" : true,
"updates" : [
{
"_id" : ObjectId("53a007db144ff47be1000001"),
"status" : "New",
"status_id" : ObjectId("537de7c3a5e6e668ffc2335c"),
"status_index" : 100,
"substatus" : "New Web Enquiry",
"substatus_id" : ObjectId("5396bb9fa5e6e668ffc23388"),
"notes" : "New enquiry received from website.",
},
{
"_id" : ObjectId("53a80c977d299cfe91bacf81"),
"status" : "New",
"status_id" : ObjectId("537de7c3a5e6e668ffc2335c"),
"status_index" : 100,
"substatus" : "Attempted Contact",
"substatus_id" : ObjectId("53a80e06a5e6e668ffc2339e"),
"notes" : "In this test, we pretend that we've not managed to get hold of the customer on the first attempt.",
},
{
"_id" : ObjectId("53a80e539b966b8da5c40c36"),
"status" : "Approved",
"status_id" : ObjectId("52e77a49d85e95f00ebf6c72"),
"status_index" : 200,
"substatus" : "Enquiry Confirmed",
"substatus_id" : ObjectId("53901f1ba5e6e668ffc23372"),
"notes" : "In this test, we pretend that we've got hold of the customer after failing to contact them on the first attempt.",
}
]
}
Within each enquiry is an updates array of objects which also have an ObjectId as their main identity field. We're using an $unwind and $group aggregation to pull the first and latest updates, as well as the count of updates, making sure we only take enquiries where there have been more than one update (as one is automatically inserted when the enquiry is made):
db.enquiries.aggregate([
{
$match: {
"test": true
}
},
{
$unwind: "$updates"
},
{
$group: {
"_id": "$_id",
"latest_update_id": {
$last: "$updates._id"
},
"first_update_id": {
$first: "$updates._id"
},
"update_count": {
$sum: 1
}
}
},
{
$match: {
"update_count": {
$gt: 1
}
}
}
])
This results in the following output:
{
"result" : [
{
"_id" : ObjectId("53a295ad122ea80200000005"),
"latest_update_id" : ObjectId("53a80bdc7d299cfe91bacf7e"),
"first_update_id" : ObjectId("53a295ad122ea80200000003"),
"update_count" : 2
},
{
"_id" : ObjectId("53a007db144ff47be1000003"),
"latest_update_id" : ObjectId("53a80e539b966b8da5c40c36"),
"first_update_id" : ObjectId("53a007db144ff47be1000001"),
"update_count" : 3
}
],
"ok" : 1
}
This is then passed through to our code (node.js, in this case) where we perform a few operations on it and then present some information on our dashboard.
Ideally, I'd like to add another $group pipeline aggregation to the query which would subtract the timestamp of first_update_id from the timestamp of latest_update_id to give us a timespan, which we could then use $avg on.
Can anyone tell me if this is possible? (Thank you!)
As Neil already pointed out, you can't get to the timestamp from the ObjectId in the aggregation framework.
You said that speed is not important, so using MapReduce you can get what you want:
var map = function() {
if (this.updates.length > 1) {
var first = this.updates[0];
var last = this.updates[this.updates.length - 1];
var diff = last._id.getTimestamp() - first._id.getTimestamp();
var val = {
latest_update_id : last._id,
first_update_id : first._id,
update_count : this.updates.length,
diff: diff
}
emit(this._id, val);
}
};
var reduce = function() { };
db.runCommand(
{
mapReduce: "enquiries",
map: map,
reduce: reduce,
out: "mrresults",
query: { test : true}
}
);
This are the results:
{
"_id" : ObjectId("53a007db144ff47be1000003"),
"value" : {
"latest_update_id" : ObjectId("53a80e539b966b8da5c40c36"),
"first_update_id" : ObjectId("53a007db144ff47be1000001"),
"update_count" : 3,
"diff" : 525944000
}
}
Edit:
If you want to get the average diff for all documents you can do it like this:
var map = function() {
if (this.updates.length > 1) {
var first = this.updates[0];
var last = this.updates[this.updates.length - 1];
var diff = last._id.getTimestamp() - first._id.getTimestamp();
emit("1", {diff : diff});
}
};
var reduce = function(key, values) {
var reducedVal = { count: 0, sum: 0 };
for (var idx = 0; idx < values.length; idx++) {
reducedVal.count += 1;
reducedVal.sum += values[idx].diff;
}
return reducedVal;
};
var finalize = function (key, reducedVal) {
reducedVal.avg = reducedVal.sum/reducedVal.count;
return reducedVal;
};
db.runCommand(
{
mapReduce: "y",
map: map,
reduce: reduce,
finalize : finalize,
out: "mrtest",
query: { test : true}
}
);
And the example output:
> db.mrtest.find().pretty()
{
"_id" : "1",
"value" : {
"count" : 2,
"sum" : 1051888000,
"avg" : 525944000
}
}

MongoDB count number of new documents per minute based on _id

I want to create a statistic on how many new documents are stored each minute.
Since the _id field with standard ObjectID contains already the timestamp of the document creation I think it should be possible to somehow use it.
On Stackoverflow i found the following map reduce code to get it done when there is a dedicated field for the creation data
Map-Reduce count number of documents in each minute MongoDB
map = function() {
var created_at_minute = new Date(this.created_at.getFullYear(),
this.created_at.getMonth(),
this.created_at.getDate(),
this.created_at.getHours(),
this.created_at.getMinutes());
emit(created_at_minute, {count: 1});
}
reduce = function(key, values) {
var total = 0;
for(var i = 0; i < values.length; i++) { total += values[i].count; }
return {count: total};
}
According to the Mongo DB Documentation (http://docs.mongodb.org/manual/reference/object-id/) it should be possible to get the timestamp from the _id by calling ObjectId("507f191e810c19729de860ea").getTimestamp().
Right now I have no idea if it is possible at all to use this getTimestamp() inside of the map function.
Has anybody an idea how to do it or is there a better way ?
I need it to be implementable in python or php
You can do this with M/R indeed. getTimestamp() works in M/R as it runs in JavaScript on the server, it doesn't matter whether your client language is PHP or Python:
map = function() {
var datetime = this._id.getTimestamp();
var created_at_minute = new Date(datetime.getFullYear(),
datetime.getMonth(),
datetime.getDate(),
datetime.getHours(),
datetime.getMinutes());
emit(created_at_minute, {count: 1});
}
reduce = function(key, values) {
var total = 0;
for(var i = 0; i < values.length; i++) { total += values[i].count; }
return {count: total};
}
db.so.mapReduce( map, reduce, { out: 'inline' } );
db.inline.find();
Which outputs something like:
{ "_id" : ISODate("2013-08-05T15:24:00Z"), "value" : { "count" : 9 } }
{ "_id" : ISODate("2013-08-05T15:26:00Z"), "value" : { "count" : 2 } }
However, I would suggest you don't use M/R but instead turn to the aggregation framework as it's much faster because can use indexes and run concurrently. Right now, the A/F does not have an operator to get the timestamp out of an ObjectID field yet though so you will have to store the time at the moment of insertion as well. F.e. with documents like this:
db.so.drop();
db.so.insert( { date: new ISODate( "2013-08-05T15:24:15" ) } );
db.so.insert( { date: new ISODate( "2013-08-05T15:24:19" ) } );
db.so.insert( { date: new ISODate( "2013-08-05T15:24:25" ) } );
db.so.insert( { date: new ISODate( "2013-08-05T15:24:32" ) } );
db.so.insert( { date: new ISODate( "2013-08-05T15:24:45" ) } );
db.so.insert( { date: new ISODate( "2013-08-05T15:25:15" ) } );
db.so.insert( { date: new ISODate( "2013-08-05T15:25:15" ) } );
db.so.aggregate( [
{ $group: {
_id: {
y: { '$year': '$date' },
m: { '$month': '$date' },
d: { '$dayOfMonth': '$date' },
h: { '$hour': '$date' },
i: { '$minute': '$date' },
},
count: { $sum : 1 }
} }
] );
Which outputs:
{
"result" : [
{
"_id" : {
"y" : 2013,
"m" : 8,
"d" : 5,
"h" : 15,
"i" : 25
},
"count" : 2
},
{
"_id" : {
"y" : 2013,
"m" : 8,
"d" : 5,
"h" : 15,
"i" : 24
},
"count" : 5
}
],
"ok" : 1
}

Mongo: count the number of word occurrences in a set of documents

I have a set of documents in Mongo. Say:
[
{ summary:"This is good" },
{ summary:"This is bad" },
{ summary:"Something that is neither good nor bad" }
]
I'd like to count the number of occurrences of each word (case insensitive), then sort in descending order. The result should be something like:
[
"is": 3,
"bad": 2,
"good": 2,
"this": 2,
"neither": 1,
"nor": 1,
"something": 1,
"that": 1
]
Any idea how to do this? Aggregation framework would be preferred, as I understand it to some degree already :)
MapReduce might be a good fit that can process the documents on the server without doing manipulation on the client (as there isn't a feature to split a string on the DB server (open issue).
Start with the map function. In the example below (which likely needs to be more robust), each document is passed to the map function (as this). The code looks for the summary field and if it's there, lowercases it, splits on a space, and then emits a 1 for each word found.
var map = function() {
var summary = this.summary;
if (summary) {
// quick lowercase to normalize per your requirements
summary = summary.toLowerCase().split(" ");
for (var i = summary.length - 1; i >= 0; i--) {
// might want to remove punctuation, etc. here
if (summary[i]) { // make sure there's something
emit(summary[i], 1); // store a 1 for each word
}
}
}
};
Then, in the reduce function, it sums all of the results found by the map function and returns a discrete total for each word that was emitted above.
var reduce = function( key, values ) {
var count = 0;
values.forEach(function(v) {
count +=v;
});
return count;
}
Finally, execute the mapReduce:
> db.so.mapReduce(map, reduce, {out: "word_count"})
The results with your sample data:
> db.word_count.find().sort({value:-1})
{ "_id" : "is", "value" : 3 }
{ "_id" : "bad", "value" : 2 }
{ "_id" : "good", "value" : 2 }
{ "_id" : "this", "value" : 2 }
{ "_id" : "neither", "value" : 1 }
{ "_id" : "or", "value" : 1 }
{ "_id" : "something", "value" : 1 }
{ "_id" : "that", "value" : 1 }
A basic MapReduce example
var m = function() {
var words = this.summary.split(" ");
if (words) {
for(var i=0; i<words.length; i++) {
emit(words[i].toLowerCase(), 1);
}
}
}
var r = function(k, v) {
return v.length;
};
db.collection.mapReduce(
m, r, { out: { merge: "words_count" } }
)
This will insert word counts into a collection name words_count which you can sort (and index)
Note that it doesn't use stemming, omit punctuation, handles stop words etc.
Also note you can optimize the map function by accumulating repeating word(s) occurrences and emitting the count, not just 1
You can use #split.
Try Below query
db.summary.aggregate([
{ $project : { summary : { $split: ["$summary", " "] } } },
{ $unwind : "$summary" },
{ $group : { _id: "$summary" , total : { "$sum" : 1 } } },
{ $sort : { total : -1 } }
]);
Old question but since 4.2 this can be done with $regexFindAll now.
db.summaries.aggregate([
{$project: {
occurences: {
$regexFindAll: {
input: '$summary',
regex: /\b\w+\b/, // match words
}
}
}},
{$unwind: '$occurences'},
{$group: {
_id: '$occurences.match', // group by each word
totalOccurences: {
$sum: 1 // add up total occurences
}
}},
{$sort: {
totalOccurences: -1
}}
]);
This will output docs in the following format:
{
_id: "matchedwordstring",
totalOccurences: number
}