I've used aggregation in Mongo a lot, and I know the performance benefits of grouped counts and the like. But does Mongo have any performance difference between these two ways of counting all documents in a collection:
collection.aggregate([
    { $match: {} },
    { $group: {
        _id: null,
        count: { $sum: 1 }
    } }
]);
and
collection.find({}).count()
Update: Second case:
Let's say we have this sample data:
{_id: 1, type: 'one', value: true}
{_id: 2, type: 'two', value: false}
{_id: 4, type: 'five', value: false}
With aggregate():
var _ids = ['id1', 'id2', 'id3'];
var counted = Collections.mail.aggregate([
    { $match: {
        _id: { $in: _ids },
        value: false
    } },
    { $group: {
        _id: "$type",
        count: { $sum: 1 }
    } }
]);
With count():
var counted = {};
var type = 'two';
for (var i = 0, len = _ids.length; i < len; i++) {
    counted[_ids[i]] = Collections.mail.find({
        _id: _ids[i], value: false, type: type
    }).count();
}
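(Aside: the count() loop above issues one query per _id. If only the combined total were needed, it could be fetched in a single round trip; a hedged sketch reusing the variables above:)

// One round trip instead of N, but it yields the combined total,
// not one count per _id.
var total = Collections.mail.find({
    _id: { $in: _ids },
    value: false,
    type: type
}).count();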
.count() is by far faster. You can see the implementation by calling
// Note the missing parentheses at the end
db.collection.count
which returns the length of the cursor of the default query (if count() is called without a query document), which in turn is implemented as returning the length of the _id index, iirc.
An aggregation, however, reads each and every document and processes it. It can only come within roughly the same order of magnitude as .count() when run over no more than some 100k documents (give or take, depending on your RAM).
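(On current servers the same trade-off shows up in the newer helpers; a minimal sketch, assuming a collection named mail:)

// estimatedDocumentCount() reads collection metadata, so it is effectively instant.
db.mail.estimatedDocumentCount();

// countDocuments() and the $count stage both run an aggregation and can apply
// a filter, at the cost of actually examining documents.
db.mail.countDocuments({ value: false });
db.mail.aggregate([ { $count: "count" } ]);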
The function below was applied to a collection with some 12M entries:
function checkSpeed(col, iterations) {
    // Get the collection under test
    var collectionUnderTest = db[col];
    // The collection we are writing our stats to
    var stats = db[col + 'STATS'];
    // Remove old stats
    stats.remove({});
    // Declare outside the loops to prevent allocation inside them
    var start = new Date().getTime();
    var duration = new Date().getTime();
    print("Counting with count()");
    for (var i = 1; i <= iterations; i++) {
        start = new Date().getTime();
        var result = collectionUnderTest.count();
        duration = new Date().getTime() - start;
        stats.insert({ "type": "count", "pass": i, "duration": duration, "count": result });
    }
    print("Counting with aggregation");
    for (var j = 1; j <= iterations; j++) {
        start = new Date().getTime();
        var doc = collectionUnderTest.aggregate([{ $group: { _id: null, count: { $sum: 1 } } }]);
        duration = new Date().getTime() - start;
        stats.insert({ "type": "aggregation", "pass": j, "duration": duration, "count": doc.count });
    }
    var averages = stats.aggregate([
        { $group: { _id: "$type", "average": { "$avg": "$duration" } } }
    ]);
    return averages;
}
And returned:
{ "_id" : "aggregation", "average" : 43828.8 }
{ "_id" : "count", "average" : 0.6 }
The unit is milliseconds.
hth
Related
I have time series data in the following format in a MongoDB collection:
{
    "Name" : "AKBNK",
    "Date" : ISODate("2009-01-02T00:00:00Z"),
    "Close" : 3.256746559
}
I want to calculate a simple moving average using MongoDB map-reduce. I tried the following sliding-window approach, but it is slow when the period is big.
var mapper = function() {
    var i = 0, j = counter;
    if (j < period) {
        j = period;
        i = period - counter;
    }
    for (; i < period && j <= limit; i++, j++) {
        emit(j, this.Close);
    }
    counter++;
}
var reducer = function(key, values) {
    return Array.sum(values);
}
var finalizer = function(key, reducedValue) {
    return reducedValue / period;
}
var period = 730;
db.data.mapReduce(mapper, reducer, {
    finalize: finalizer,
    out: "smaOut",
    query: { Name: "AKBNK" },
    sort: { Date: -1 },
    limit: period * 2 - 1,
    scope: { counter: 1, period: period, limit: period * 2 - 1 }
});
Any advice on how I can do this faster? How should I map the data?
You could try the aggregation pipeline below, which at a quick glance seems to produce the correct results, but at much higher speed:
db.data.aggregate([{
    $match: {
        "Name": "AKBNK" // this stage will use an index if you have one on the "Name" field
    }
}, {
    $sort: { "Date": -1 } // this stage will also use an index if you have one on "Date"
}, {
    $group: {
        "_id": null, // create one single document
        "allCloseValues": { $push: "$Close" } // that shall contain an array with all "Close" values
    }
}, {
    $addFields: {
        "copyOfAllCloseValues": "$allCloseValues" // duplicate the array
    }
}, {
    $unwind: {
        "path": "$copyOfAllCloseValues", // flatten the created single document
        "includeArrayIndex": "_id" // use the "_id" field to hold the array index
    }
}, {
    $project: {
        avg: {
            $avg: { // calculate the average of some part of the "Close" array
                $slice: [ "$allCloseValues", "$_id", 730 ] // which shall start at index "_id" and take up to 730 values
            }
        }
    }
}, {
    $out: "smaOut" // write the resulting documents out to the "smaOut" collection
}]);
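On newer servers (MongoDB 5.0+) the same moving average can be computed without pushing the whole series into a single document; a hedged sketch using $setWindowFields:

// Sketch under the assumption that a 730-value window, newest first,
// mirrors the $slice-based pipeline above.
db.data.aggregate([
    { $match: { Name: "AKBNK" } },
    { $setWindowFields: {
        sortBy: { Date: -1 },
        output: {
            avg: { $avg: "$Close",
                   window: { documents: [ "current", 729 ] } } // current value plus the 729 that follow
        }
    } },
    { $out: "smaOut" }
]);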
I'm trying to map-reduce a bunch of data in order to generate a daily graph; the catch is that the application has users from all over the world and they want the data in their own timezone.
The current map-reduce I have is pretty simple:
var map = function() {
    var userLogin = this;
    var d = this.StartTime;
    var start = d.getFullYear() + '-' + (d.getMonth() + 1) + '-' + d.getDate();
    var reduceValue = {
        SuccessSession: 0,
        FailSession: 0
    };
    if (userLogin.ExitReason.Severity < 2) {
        reduceValue.SuccessSession += 1;
    } else {
        reduceValue.FailSession += 1;
    }
    emit({ ClientId: this.ClientId, StartDate: start, IsAdmin: this.IsAdmin },
        { SuccessSession: reduceValue.SuccessSession, FailSession: reduceValue.FailSession });
}
and the Reduce
var reduce = function(key, value) {
    var reducedValue = {
        SuccessSession: 0,
        FailSession: 0
    };
    for (var idx = 0; idx < value.length; idx++) {
        reducedValue.SuccessSession += value[idx].SuccessSession;
        reducedValue.FailSession += value[idx].FailSession;
    }
    return reducedValue;
};
The problem with this approach is that it calculates from midnight UTC to midnight UTC; however, I'd prefer to be able to query the map-reduced output according to whichever timezone a user is in.
Is there any simple way to map-reduce and still keep the ability to use timezones?
No easy way. The map function is JavaScript, so you would need at least the IANA TZ database there, with something like https://momentjs.com/timezone/ available within the map function.
Would you consider the aggregation framework instead of map-reduce? It supports timezones and is much faster. The pipeline could be as simple as this:
db.collection.aggregate([
    { $group: {
        _id: {
            ClientId: "$ClientId",
            start: { $dateToString: {
                format: "%Y-%m-%d",
                date: "$StartTime",
                timezone: "Europe/Kiev"
            } },
            IsAdmin: "$IsAdmin"
        },
        "SuccessSession": { $sum: { $cond: {
            if: { $lt: [ "$ExitReason.Severity", 2 ] },
            then: 1,
            else: 0
        } } },
        "FailSession": { $sum: { $cond: {
            if: { $lt: [ "$ExitReason.Severity", 2 ] },
            then: 0,
            else: 1
        } } }
    } }
])
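Since the timezone is just a string argument, it can be supplied per request; a minimal sketch (the sessionsByDay helper is hypothetical, and the timezone option of $dateToString requires MongoDB 3.6+):

// Hypothetical helper: builds the pipeline above for a given IANA timezone.
function sessionsByDay(tz) {
    return [
        { $group: {
            _id: {
                ClientId: "$ClientId",
                start: { $dateToString: { format: "%Y-%m-%d", date: "$StartTime", timezone: tz } },
                IsAdmin: "$IsAdmin"
            },
            SuccessSession: { $sum: { $cond: [ { $lt: [ "$ExitReason.Severity", 2 ] }, 1, 0 ] } },
            FailSession:    { $sum: { $cond: [ { $lt: [ "$ExitReason.Severity", 2 ] }, 0, 1 ] } }
        } }
    ];
}

db.collection.aggregate(sessionsByDay("America/New_York"));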
I have the following schema in MongoDB, where timestamp is an hourly-level timestamp:
{
    "day_chan1" : 54.464,
    "day_chan2" : 44.141,
    "day_chan3" : 44.89,
    "gatewayId" : "443719005AA3",
    "timestamp" : ISODate("2016-02-15T23:00:00.000Z"),
    "total_curr_chan" : 5.408,
    "total_day_chan" : 143.495,
    "type" : 1
}
I want to be able to query the last timestamp of the day, for each of the last 7 days and the last 30 days. In order to do this, I am thinking of doing something like:
var d = new Date(); // today!
for (var i = 1; i <= 7; i++) {
    var n = i; // go back n days!
    d.setDate(d.getDate() - n);
    d.setHours(23, 0, 0);
    var query = {
        gatewayId: req.params.deviceId,
        timestamp: { $lt: new Date(d) }
    };
    db
        .find(query, function(resp) {
            // return the data here
        });
}
But this creates the problem of multiple callbacks, and I want to know if there is an easier way to do this using aggregation or some other method.
Use the $hour operator within a $project stage to extract the hour part of the timestamp, then use $match to filter out documents that do not satisfy the given hour criteria:
var pipeline = [
    {
        "$project": {
            "day_chan1": 1,
            "day_chan2": 1,
            "day_chan3": 1,
            "gatewayId": 1,
            "timestamp": 1,
            "total_curr_chan": 1,
            "total_day_chan": 1,
            "type": 1,
            "hour": { "$hour": "$timestamp" }
        }
    },
    { "$match": { "hour": 23 } }
];

collection.aggregate(pipeline, function(err, result) {
    // return the data here
    console.log(result);
});
For an arbitrary last hour it has to be a bit more complex:
db.collection.aggregate([
    { $match: {
        timestamp: { $type: "date" }
        // add date constraints here
    } },
    { $project: {
        _id: 1,
        date: { "y": { $year: "$timestamp" }, "d": { $dayOfYear: "$timestamp" } },
        doc: "$$CURRENT"
    } },
    { $group: {
        _id: "$date",
        maxtime: { $max: "$doc.timestamp" },
        doc: { $push: "$doc" }
    } },
    { $unwind: "$doc" },
    { $project: {
        latest: { $cmp: [ "$maxtime", "$doc.timestamp" ] },
        doc: "$doc"
    } },
    { $match: { "latest": 0 } }
])
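A simpler variant of the same idea, sketched under the assumption that keeping the whole latest document per day is enough: sort descending and take the first document in each day group ($$ROOT is available since 2.6):

db.collection.aggregate([
    { $match: { timestamp: { $type: "date" } } }, // add date constraints here
    { $sort: { timestamp: -1 } },
    { $group: {
        _id: { y: { $year: "$timestamp" }, d: { $dayOfYear: "$timestamp" } },
        doc: { $first: "$$ROOT" } // the latest document of each day
    } }
]);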
With map-reduce it should be simpler, but may be slower.
I have a collection A that has documents of the form:
{
    _id: 12345,
    title: "title"
}
and a collection B with documents of the form:
{
    _id: 12345,
    newAttribute: "newAttribute12345"
}
I want to update collection A to have documents like:
{
    _id: 12345,
    title: "title",
    newAttribute: "newAttribute12345"
}
At this time I do it with
update({ _id: doc._id }, { $set: { newAttribute: doc.newAttribute } })
but I need to run it 10,000 times in a loop over all my documents.
How can I update multiple documents like these (by _id) in one db call, or in the most efficient way? (This is basically a join / bulk update of attributes.)
I use MongoDB 2.6.
Consider the following scenario: two collections, named title and attribute.
The title collection contains the following documents:
[{
    _id: 12345,
    title: "title"
},
{
    _id: 12346,
    title: "title1"
}]
and the attribute collection contains the following documents:
[{
    _id: 12345,
    newAttribute: "newAttribute12345"
},
{
    _id: 12346,
    newAttribute: "newAttribute12346"
},
{
    _id: 12347,
    newAttribute: "newAttribute12347"
}]
You want to update the title collection using the criterion title._id = attribute._id. Use a Mongo bulk update with the following script:
var bulk = db.title.initializeOrderedBulkOp();
var counter = 0;
db.attribute.find().forEach(function(data) {
    var updoc = {
        "$set": {}
    };
    var updateKey = "newAttribute";
    updoc["$set"][updateKey] = data.newAttribute;
    bulk.find({
        "_id": data._id
    }).update(updoc);
    counter++;
    // Drain and re-initialize every 1000 update statements
    if (counter % 1000 == 0) {
        bulk.execute();
        bulk = db.title.initializeOrderedBulkOp();
    }
});
// Add the rest in the queue
if (counter % 1000 != 0) bulk.execute();
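On MongoDB 3.2+ the same pattern can be written more compactly with bulkWrite(), which batches internally; a hedged sketch (the asker's 2.6 server would still need the Bulk API above):

// Build one updateOne operation per attribute document,
// then send them all in a single batched call.
var ops = db.attribute.find().map(function(doc) {
    return {
        updateOne: {
            filter: { _id: doc._id },
            update: { $set: { newAttribute: doc.newAttribute } }
        }
    };
});
db.title.bulkWrite(ops);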
A possible (but problematic) answer is a hacky join in Mongo (maybe there is something better):
http://tebros.com/2011/07/using-mongodb-mapreduce-to-join-2-collections/
The problem with this is that I have to swap the collections later, and it requires me to know the properties of my collection:
var r = function(key, values) {
    var result = { prop1: null, prop2: null };
    values.forEach(function(value) {
        if (result.prop1 === null && value.prop1 !== null) {
            result.prop1 = value.prop1;
        }
        if (result.prop2 === null && value.prop2 !== null) {
            result.prop2 = value.prop2;
        }
    });
    return result;
};
var m = function() {
    emit(this._id, { prop1: this.prop1, prop2: this.prop2 });
};
db.A.mapReduce(m, r, { out: { reduce: 'C' } });
db.B.mapReduce(m, r, { out: { reduce: 'C' } });
You can use the cursor.forEach method:
db.collectionA.find().forEach(function(docA) {
    db.collectionB.find().forEach(function(docB) {
        if (docA._id === docB._id) {
            docA.newAttribute = docB.newAttribute;
            db.collectionA.save(docA);
        }
    });
});
> db.collectionA.find()
{ "_id" : 12345, "title" : "title", "newAttribute" : "newAttribute12345" }
Initially, I have a relationship where an order has many line items and each line item belongs to exactly one order, as usual.
Using MongoDB, I created this document to represent it:
{
    "_id" : ObjectId("511b7d1b3daee1b1446ecdfe"),
    "l_order" : {
        "_id" : ObjectId("511b7d133daee1b1446eb54d"),
        "o_orderkey" : NumberLong(1),
        "o_totalprice" : 173665.47,
        "o_orderdate" : ISODate("1996-01-02T03:00:00Z"),
        "o_orderpriority" : "5-LOW",
        "o_shippriority" : 0
    },
    "l_linenumber" : 1,
    "l_shipdate" : ISODate("1996-03-13T03:00:00Z"),
    "l_commitdate" : ISODate("1996-02-12T03:00:00Z"),
    "l_receiptdate" : ISODate("1996-03-22T03:00:00Z")
}
My intention is to translate this SQL query:
select
    o_orderpriority,
    count(*) as order_count
from
    orders
where
    o_orderdate >= date '1993-07-01'
    and o_orderdate < date '1993-07-01' + interval '3' month
    and exists (
        select
            *
        from
            lineitem
        where
            l_orderkey = o_orderkey
            and l_commitdate < l_receiptdate
    )
group by
    o_orderpriority
order by
    o_orderpriority;
For this I use two map-reduce functions:
First
db.runCommand({
    mapreduce: "lineitem",
    query: {
        "l_order.o_orderdate": { '$gte': new Date("July 01, 1993"), '$lt': new Date("Oct 01, 1993") }
    },
    map: function Map() {
        if (this.l_commitdate < this.l_receiptdate) {
            emit(this.l_order.o_orderkey, this.l_order.o_orderpriority);
        }
    },
    out: 'query004a'
});
Second
db.runCommand({
    mapreduce: "query004a",
    map: function Map() {
        /* Remember: the value here will be this.l_order.o_orderpriority
           from the previous map-reduce function */
        emit(this.value, 1);
    },
    reduce: function(key, values) {
        return Array.sum(values);
    },
    out: 'query004b'
});
In the first I picked out the documents that fell in the date range and satisfied the comparison, grouping them by order key to avoid duplicates. In the second I grouped by o_orderpriority and summed.
Well, to my surprise the result was bigger than I was expecting. But why, and where does this happen?
In your first map function you should use 'orderpriority' as the key and 'orderkey' as the value; this will reduce the set to the key you want in the second mapReduce. (You need to specify a reduce function, otherwise mapReduce will return an error.)
So, this could look like this:
OrderDateMin = new Date("1996-01-01");
OrderDateMax = new Date("1996-04-01");

// first "where" on orderdate
query = {
    "l_order.o_orderdate": { $gte: OrderDateMin, $lt: OrderDateMax }
};

map1 = function() {
    // second "where" on commitdate < receiptdate
    if (this.l_commitdate < this.l_receiptdate) {
        // emit orderpriority as key, orderkey as value
        emit(this.l_order.o_orderpriority, this.l_order.o_orderkey);
    }
};

reduce1 = function(key, values) {
    return 1;
};

db.runCommand({
    mapReduce: "xx",
    query: query,
    map: map1,
    reduce: reduce1,
    out: 'query004a'
});
map2 = function() {
    // _id is orderpriority
    emit(this._id, 1);
};

reduce2 = function(key, values) {
    // count entries per orderpriority
    var count = 0;
    values.forEach(function(value) { count += value; });
    return count;
};

db.runCommand({
    mapReduce: "query004a",
    map: map2,
    reduce: reduce2,
    out: 'query004b'
});
Now, the same can be achieved with a single aggregate command, which is faster (implemented in C++, not in JavaScript):
db.xx.aggregate([
    // first "where"; this will use an index, if defined
    { $match: {
        "l_order.o_orderdate": { $gte: OrderDateMin, $lt: OrderDateMax }
    } },
    // reduce to the needed fields, create a field deciding the second "where"
    { $project: {
        "key": "$l_order.o_orderkey",
        "pri": "$l_order.o_orderpriority",
        "okay": { $cond: [ { $lt: [ "$l_commitdate", "$l_receiptdate" ] }, 1, 0 ] }
    } },
    // keep only documents matching the second "where" condition
    { $match: { "okay": 1 } },
    // group by priority and key
    { $group: { _id: { "pri": "$pri", "key": "$key" } } },
    // group by priority - count entries
    { $group: { _id: "$_id.pri", "count": { $sum: 1 } } }
])
which would return something like:
{ "result" : [ { "_id" : "5-LOW", "count" : 1 } ], "ok" : 1 }
Last, but not least: a suggestion regarding design:
It would be simpler if your structure was the other way round: an "orders" collection with the order-items embedded as an array of items. This would avoid having duplicate order data throughout the collection.
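For illustration, a hedged sketch of that inverted structure:

// Hypothetical "orders" document with its line items embedded:
{
    "_id" : ObjectId("511b7d133daee1b1446eb54d"),
    "o_orderkey" : NumberLong(1),
    "o_orderpriority" : "5-LOW",
    "o_orderdate" : ISODate("1996-01-02T03:00:00Z"),
    "lineitems" : [
        {
            "l_linenumber" : 1,
            "l_commitdate" : ISODate("1996-02-12T03:00:00Z"),
            "l_receiptdate" : ISODate("1996-03-22T03:00:00Z")
        }
    ]
}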
Further info:
http://docs.mongodb.org/manual/reference/command/mapReduce/#mapReduce
http://docs.mongodb.org/manual/reference/aggregation
Does this help?
Cheers
Ronald