I have time series data in the following format in a MongoDB collection:
{
    "Name" : "AKBNK",
    "Date" : ISODate("2009-01-02T00:00:00Z"),
    "Close" : 3.256746559
}
I want to calculate a simple moving average using MongoDB map-reduce. I tried the following sliding-window approach, but it gets slow when the period is big.
var mapper = function() {
    var i = 0, j = counter;
    if (j < period) {
        j = period;
        i = period - counter;
    }
    for (; i < period && j <= limit; i++, j++) {
        emit(j, this.Close);
    }
    counter++;
}
var reducer = function(key, values) {
    return Array.sum(values);
}
var finalizer = function(key, reducedValue) {
    return reducedValue / period;
}
var period = 730;
db.data.mapReduce(mapper, reducer, {
    finalize: finalizer,
    out: "smaOut",
    query: { Name: "AKBNK" },
    sort: { Date: -1 },
    limit: period * 2 - 1,
    scope: { counter: 1, period: period, limit: period * 2 - 1 }
});
Any advice on how I can do this faster? How should I map the data?
You could try the aggregation pipeline below, which at a quick glance seems to produce the correct results, but at much higher speed:
db.data.aggregate({
$match: {
"Name": "AKBNK" // this stage will use and index if you have one on the "Name" field
}
}, {
$sort: { "Date": -1 }, // this stage will also use and index if you have one on "Date"
}, {
$group: {
"_id": null, // create one single document
"allCloseValues": { $push: "$Close" } // that shall contain an array with all "Close" values
}
}, {
$addFields: {
"copyOfAllCloseValues": "$allCloseValues" // duplicate the array
}
}, {
$unwind: {
"path": "$copyOfAllCloseValues", // flatten the created single document
"includeArrayIndex": "_id" // use the "_id" field to hold the array index
}
}, {
$project: {
avg: {
$avg: { // calculate the average of some part of the array "Close"
$slice: [ "$allCloseValues", "$_id", 730 ] // which shall start at index "_id" and take up to 730 values
}
}
}
}, {
$out: "smaOut" // write the resulting documents out to the "smaOut" collection
});
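If you are on MongoDB 5.0 or newer, you could also skip the group/duplicate/unwind trick and let the server compute the window directly with $setWindowFields. A minimal sketch, assuming the same collection and field names as above and the 730-document period from the question:
db.data.aggregate([
    { $match: { "Name": "AKBNK" } },
    {
        $setWindowFields: {
            sortBy: { "Date": 1 },
            output: {
                "sma": {
                    $avg: "$Close",
                    // the current document plus the 729 documents before it
                    window: { documents: [ -729, 0 ] }
                }
            }
        }
    },
    { $out: "smaOut" }
])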
I need to duplicate a document from my collection orders by filtering the field cd_order with value "7650614875".
Collection orders look like this:
{
...
"cd_order": "7650614875"
...
}
I will copy this document more than 50 times (I will change this value in the query), so I also need to change the value of the field cd_order in each copied document so it differs from the previous one. I thought of converting it to an int, using $inc to increment it by 1, and then converting it back to a string.
I tried the query below, but only the copy occurred; the rest didn't work:
var copy = db.orders.findOne({ cd_order: "7650614875" }, { _id: 0 });
for (var i = 0; i < 3; i++) {
    db.orders.insert(copy);
    { $convert: { input: $string, to: "int" } }
    { $inc: { "cd_order" : 1 } }
    { $convert: { input: $int, to: "string" } }
}
How can I duplicate this document, incrementing the field cd_order by 1 so it is not the same as the previous one, and also print all the new cd_order values at the end?
Print example:
cd_order: 7650614875, 7650614876, 7650614877, 7650614878, ...
You can use $range to generate an array of increments, $unwind the array, add each increment to cd_order, and $merge to insert/update into the collection:
// "collection" here stands for your actual collection name, e.g. "orders"
db.collection.aggregate([
    { "$match": { "cd_order": "7650614875" } },
    // generate the increments 0, 1, ..., 50
    { "$set": { "inc": { "$range": [ 0, 51, 1 ] } } },
    // one document per increment
    { "$unwind": "$inc" },
    {
        "$set": {
            // convert to a number, add the increment, convert back to a string
            "cd_order": {
                "$toString": { "$add": [ { "$toLong": "$cd_order" }, "$inc" ] }
            },
            "inc": "$$REMOVE",
            "_id": "$$REMOVE"
        }
    },
    {
        "$merge": {
            "into": "collection",
            "on": "cd_order",
            "whenMatched": "merge",
            "whenNotMatched": "insert"
        }
    }
])
Mongo Playground
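One caveat: when $merge matches on anything other than _id, the "on" field must be backed by a unique index, so you may need to create one first:
// $merge on "cd_order" requires a unique index on that field
db.orders.createIndex({ "cd_order": 1 }, { unique: true })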
I was able to convert the value and increment it before duplicating the document, and it worked well:
var copy = db.orders.findOne({ cd_order: "7650614877" }, { _id: 0 });
for (var i = 0; i < 100; i++) {
copy.cd_order = (parseInt(copy.cd_order) + 1).toString();
db.orders.insert(copy);
}
I was also able to print all values using this query:
var orders = db.orders.find({ "nm_ancestor": {$eq: "Luigi D'arpino"} }, { "cd_order": 1 });
var ordersString = "";
orders.forEach(function(order) {
if(ordersString.length>0) ordersString += ", ";
ordersString += order.cd_order;
});
print("cd_order: " + ordersString);
I know the method for converting dates at the first level:
db.collection.find().forEach(function(x) {
    x.DateField = ISODate(x.DateField);
    db.collection.save(x);
})
This works with first-level fields, but what should I do if I want to change fields of objects inside arrays, such as CreatedTime in the JSON sample below:
{
"Pharmacy": "b",
"Medicine": [
{
"MedName": "MedB",
"Quantity": 60,
"CreatedTime" : "2006-05-05T11:44:47.86Z"
},
{
"MedName": "MedC",
"Quantity": 34,
"CreatedTime" : "2006-11-23T12:28:44.86Z"
}
]
}
You can use the cursor from an aggregate query and bulk updates to update the documents in version 4.0.
Use $map with $toDate to keep the existing values and convert the string dates to the date type.
Here is a shell sample.
var bulk = db.collection.initializeUnorderedBulkOp();
var count = 0;
var batch = 50; // Change batch size as you need
db.collection.aggregate([
{"$project":{
"Medicine":{
"$map":{
"input":"$Medicine",
"in":{
"MedName":"$$this.MedName",
"Quantity":"$$this.Quantity",
"CreatedTime":{"$toDate":"$$this.CreatedTime"}
}
}
}
}}
]).forEach(function(doc){
bulk.find( {"_id" : doc._id}).updateOne(
{ "$set": {"Medicine" : doc.Medicine}}
);
count++;
if (count == batch) {
bulk.execute();
bulk = db.collection.initializeUnorderedBulkOp();
count = 0;
}
});
if (count > 0) {
bulk.execute();
}
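As an aside, on newer shells the same batching can be expressed with bulkWrite instead of the initializeUnorderedBulkOp helper; a hypothetical single-update sketch (someId and newMedicineArray are placeholders, not from the question):
db.collection.bulkWrite([
    {
        updateOne: {
            filter: { _id: someId },
            update: { $set: { Medicine: newMedicineArray } }
        }
    }
])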
Alternatively, you can iterate a find() cursor and convert each date in the shell:
db.getCollection('test').find({}).forEach(function(doc) {
    doc.Medicine.forEach(function(med) {
        // wrap the string value in a real Date object
        med.CreatedTime = new Date(med.CreatedTime);
    });
    db.test.save(doc);
});
Edit: a second approach, using $dateFromString:
db.logmessages.aggregate( [ {
$project: {
date: {
$dateFromString: {
dateString: '$date',
timezone: 'America/New_York'
}
}
}
} ] )
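Note that this $dateFromString pipeline only reshapes the query output; by itself it does not persist anything. On MongoDB 4.2+ you could instead push the conversion into the update itself with an aggregation-pipeline update, avoiding the round trip through the shell. A minimal sketch for the Medicine array, assuming the field names from the question:
db.collection.updateMany({}, [
    {
        $set: {
            Medicine: {
                $map: {
                    input: "$Medicine",
                    in: {
                        // keep every existing field, overwrite CreatedTime with a real date
                        $mergeObjects: [
                            "$$this",
                            { CreatedTime: { $toDate: "$$this.CreatedTime" } }
                        ]
                    }
                }
            }
        }
    }
])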
I have the following schema in MongoDB, where the timestamp is at an hourly level:
{
"day_chan1" : 54.464,
"day_chan2" : 44.141,
"day_chan3" : 44.89,
"gatewayId" : "443719005AA3",
"timestamp" : ISODate("2016-02-15T23:00:00.000Z"),
"total_curr_chan" : 5.408,
"total_day_chan" : 143.495,
"type" : 1
}
I want to be able to query the last timestamp of the day for the last 7 days and the last 30 days. To do this, I am thinking of something like this:
var d = new Date(); // today!
for(var i =1; i <= 7; i++) {
var n = i; // go back n days!
d.setDate(d.getDate() - n);
d.setHours(23,0,0);
var query = {
gatewayId: req.params.deviceId,
timestamp: { $lt: new Date(d) }
};
db
.find(query,function(resp) {
//return the data here
});
}
But this creates the problem of multiple callbacks, and I want to know if there is an easier way of doing this using aggregation or some other method.
Use the $hour operator within a $project stage to extract the hour part of the timestamp, then use $match to filter out the documents that do not satisfy the given hour criterion:
var pipeline = [
{
"$project": {
"day_chan1": 1,
"day_chan2": 1,
"day_chan3": 1,
"gatewayId": 1,
"timestamp": 1,
"total_curr_chan": 1,
"total_day_chan": 1,
"type": 1,
"hour": { "$hour": "$timestamp" }
}
},
{ "$match": { "hour": 23 } }
];
collection.aggregate(pipeline, function(err, result) {
//return the data here
console.log(result);
});
For an arbitrary last hour per day it has to be a bit more complex:
db.collection.aggregate([
{$match:{
timestamp:{$type: "date"}}
// add date constraints here
},
{$project:{
_id:1,
date:{"y":{$year:"$timestamp"}, "d":{$dayOfYear:"$timestamp"}},
doc:"$$CURRENT"}
},
{$group:{
_id:"$date",
maxtime: {$max:"$doc.timestamp"},
doc:{$push:"$doc"}}
},
{$unwind:"$doc"},
{$project:{
latest: {$cmp: ["$maxtime", "$doc.timestamp"]},
doc:"$doc"}
},
{$match:{"latest":0}}
])
With map-reduce it should be simpler, but may be slower.
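For reference, a map-reduce version of "latest document per day" could look like the sketch below (assuming the timestamp field from the question); as noted, it will usually be slower than the aggregation:
db.collection.mapReduce(
    function() {
        // group by calendar day (YYYY-MM-DD)
        emit(this.timestamp.toISOString().slice(0, 10), this);
    },
    function(key, values) {
        // keep only the document with the greatest timestamp
        var latest = values[0];
        values.forEach(function(v) {
            if (v.timestamp > latest.timestamp) latest = v;
        });
        return latest;
    },
    { out: { inline: 1 } }
)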
I've used aggregation in MongoDB a lot, and I know the performance benefits of grouped counts and the like. But does MongoDB show any difference in performance between these two ways of counting all documents in a collection?
collection.aggregate([
{
$match: {}
},{
$group: {
_id: null,
count: {$sum: 1}
}
}]);
and
collection.find({}).count()
Update: Second case:
Let's say we have this sample data:
{_id: 1, type: 'one', value: true}
{_id: 2, type: 'two', value: false}
{_id: 4, type: 'five', value: false}
With aggregate():
var _ids = ['id1', 'id2', 'id3'];
var counted = Collections.mail.aggregate([
{
'$match': {
_id: {
'$in': _ids
},
value: false
}
}, {
'$group': {
_id: "$type",
count: {
'$sum': 1
}
}
}
]);
With count():
var counted = {};
var type = 'two';
for (i = 0, len = _ids.length; i < len; i++) {
counted[_ids[i]] = Collections.mail.find({
_id: _ids[i], value: false, type: type
}).count();
}
.count() is by far faster. You can see the implementation by calling
// Note the missing parentheses at the end
db.collection.count
which returns the length of the cursor of the default query (when count() is called without a query document); that, in turn, is implemented as returning the length of the _id index, IIRC.
An aggregation, however, reads each and every document and processes it. It can only get within the same order of magnitude as .count() when run over only some 100k documents (give or take, depending on your RAM).
The function below was applied to a collection with some 12M entries:
function checkSpeed(col, iterations) {
    // Get the collection
    var collectionUnderTest = db[col];
    // The collection we are writing our stats to
    var stats = db[col + 'STATS'];
    // Remove old stats
    stats.remove({});
    // Prevent allocation in loop
    var start = new Date().getTime();
    var duration = new Date().getTime();
    print("Counting with count()");
    for (var i = 1; i <= iterations; i++) {
        start = new Date().getTime();
        var result = collectionUnderTest.count();
        duration = new Date().getTime() - start;
        stats.insert({ "type": "count", "pass": i, "duration": duration, "count": result });
    }
    print("Counting with aggregation");
    for (var j = 1; j <= iterations; j++) {
        start = new Date().getTime();
        // aggregate() returns a cursor; materialize it to get the result document
        var doc = collectionUnderTest.aggregate([{ $group: { _id: null, count: { $sum: 1 } } }]).toArray()[0];
        duration = new Date().getTime() - start;
        stats.insert({ "type": "aggregation", "pass": j, "duration": duration, "count": doc.count });
    }
    var averages = stats.aggregate([
        { $group: { _id: "$type", "average": { "$avg": "$duration" } } }
    ]);
    return averages;
}
And returned:
{ "_id" : "aggregation", "average" : 43828.8 }
{ "_id" : "count", "average" : 0.6 }
The unit is milliseconds.
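As a side note, modern shells (MongoDB 4.0+) also offer two dedicated helpers, so you can choose explicitly between a fast metadata-based count and an accurate filtered count:
// Fast: reads collection metadata, may be inaccurate after an unclean shutdown
db.collection.estimatedDocumentCount()
// Accurate: actually matches documents against the filter
db.collection.countDocuments({ value: false })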
hth
Initially, I have a relationship where an order has many line items and each line item belongs to exactly one order, as usual.
Using MongoDB, I created this document to represent it:
{
    "_id" : ObjectId("511b7d1b3daee1b1446ecdfe"),
    "l_order" : {
        "_id" : ObjectId("511b7d133daee1b1446eb54d"),
        "o_orderkey" : NumberLong(1),
        "o_totalprice" : 173665.47,
        "o_orderdate" : ISODate("1996-01-02T03:00:00Z"),
        "o_orderpriority" : "5-LOW",
        "o_shippriority" : 0
    },
    "l_linenumber" : 1,
    "l_shipdate" : ISODate("1996-03-13T03:00:00Z"),
    "l_commitdate" : ISODate("1996-02-12T03:00:00Z"),
    "l_receiptdate" : ISODate("1996-03-22T03:00:00Z")
}
My intention is to translate this SQL query:
select
o_orderpriority,
count(*) as order_count
from
orders
where
o_orderdate >= date '1993-07-01'
and o_orderdate < date '1993-07-01' + interval '3' month
and exists (
select
*
from
lineitem
where
l_orderkey = o_orderkey
and l_commitdate < l_receiptdate
)
group by
o_orderpriority
order by
o_orderpriority;
For this I use two map-reduce functions:
First
db.runCommand({
mapreduce: "lineitem",
query: {
"l_order.o_orderdate": {'$gte': new Date("July 01, 1993"), '$lt': new Date("Oct 01, 1993")}
},
map: function Map() {
if(this.l_commitdate < this.l_receiptdate){
emit( this.l_order.o_orderkey, this.l_order.o_orderpriority );
}
},
out: 'query004a'
});
Second
db.runCommand({
mapreduce: "query004a",
map: function Map() {
/* Remembering: the value here will be this.l_order.o_orderpriority from the previous map-reduce function */
emit( this.value, 1 );
},
reduce: function(key, values) {
return Array.sum(values);
},
out: 'query004b'
});
In the first, I select the document pieces that are in the date range and satisfy the comparison, grouping them by order key to avoid duplicates. In the second, I group by o_orderpriority and sum.
Well, to my surprise, the answer was bigger than I was expecting. But why, and where does this occur?
In your first map function you should use 'orderpriority' as the key and 'orderkey' as the value; this will reduce the set to the key you want in the second mapReduce. (You also need to specify a reduce function, otherwise mapReduce will return an error.)
So, it could look like this:
OrderDateMin = new Date("1996-01-01");
OrderDateMax = new Date("1996-04-01");
// first "where" on orderdate
query = {
"l_order.o_orderdate": {$gte: OrderDateMin, $lt: OrderDateMax}
}
map1 = function() {
//second "where" on commitdate < receiptdate
if ( this.l_commitdate < this.l_receiptdate ) {
// emit orderpriority as key, "1" as counter
emit( this.l_order.o_orderpriority, this.l_order.o_orderkey );
}
};
reduce1 = function(key, values) {
return 1;
}
db.runCommand({
mapReduce: "xx",
query: query,
map: map1,
reduce: reduce1,
out: 'query004a',
})
map2 = function() {
//_id is ordepriority
emit( this._id, 1 );
};
reduce2 = function(key, values) {
// count entries per orderpriority
count = 0;
values.forEach( function(value) { count += value; } );
return count;
}
db.runCommand({
mapReduce: "query004a",
map: map2,
reduce: reduce2,
out: 'query004b',
})
Now, the same can be achieved with one aggregate command, which is faster (implemented in C, not in JavaScript):
db.xx.aggregate([
// first "where", this will use an index, if defined
{ $match: {
"l_order.o_orderdate": { $gte: OrderDateMin, $lt: OrderDateMax }
}},
// reduce to needed fields, create a field for decision of second "where"
{ $project: {
"key": "$l_order.o_orderkey",
"pri": "$l_order.o_orderpriority",
okay: { $cond: [ { $lt: [ "$l_commitdate", "$l_receiptdate" ] }, 1, 0 ] } // note the "$" prefixes, which make these field references
}},
// select second where condition matched
{ $match: { "okay": 1 } },
// group by priority and key
{ $group: { _id: { "pri": "$pri", "key": "$key" } } },
// group by priority - count entries
{ $group: { _id: "$_id.pri", "count": { $sum: 1 } } },
])
which would return something like:
{ "result" : [ { "_id" : "5-LOW", "count" : 1 } ], "ok" : 1 }
Last, but not least, a suggestion regarding design:
It would be simpler if your structure were the other way around: an "orders" collection with the order items embedded as an array of items. This would avoid having duplicate order data throughout the collection.
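A sketch of what such an order document might look like, reusing the fields from your example (the embedded array name "lineitems" is just an illustration):
{
    "_id" : ObjectId("511b7d133daee1b1446eb54d"),
    "o_orderkey" : NumberLong(1),
    "o_totalprice" : 173665.47,
    "o_orderdate" : ISODate("1996-01-02T03:00:00Z"),
    "o_orderpriority" : "5-LOW",
    "o_shippriority" : 0,
    "lineitems" : [
        {
            "l_linenumber" : 1,
            "l_shipdate" : ISODate("1996-03-13T03:00:00Z"),
            "l_commitdate" : ISODate("1996-02-12T03:00:00Z"),
            "l_receiptdate" : ISODate("1996-03-22T03:00:00Z")
        }
    ]
}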
Further info:
http://docs.mongodb.org/manual/reference/command/mapReduce/#mapReduce
http://docs.mongodb.org/manual/reference/aggregation
Does this help?
Cheers
Ronald