Remove redundant data from sensors by using date and value - MongoDB

I'm developing an application that collects data from sensors, and I need to reduce the amount of data stored in a MongoDB database by using a value (temperature) and a date (timestamp).
The documents have the following format:
{
    temperature: 10,
    timestamp: ISODate("2016-04-29T14:37:50.370Z"),
    sensorCode: "SENSOR_A1"
}
The problem is that the sensors send data too frequently, so there are too many documents with redundant data in a short period of time (let's say 10 minutes). I mean it is not useful to have multiple equal values within a very short period.
Example: here is data from a sensor reporting a temperature of 10:
// collection: datasensors
[
    {
        temperature: 10,
        timestamp: ISODate("2016-04-29T14:37:50.370Z"),
        sensorCode: "SENSOR_A1"
    },
    {
        temperature: 10,
        timestamp: ISODate("2016-04-29T14:38:50.555Z"),
        sensorCode: "SENSOR_A1"
    },
    {
        temperature: 10,
        timestamp: ISODate("2016-04-29T14:38:51.654Z"),
        sensorCode: "SENSOR_A1"
    },
    {
        temperature: 10,
        timestamp: ISODate("2016-04-29T14:50:20.335Z"),
        sensorCode: "SENSOR_A1"
    }
]
Because minute precision is not required, I would like to remove all documents from 2016-04-29T14:37:50.370Z to 2016-04-29T14:38:51.32Z except one. So the result should be this:
[
    {
        temperature: 10,
        timestamp: ISODate("2016-04-29T14:38:51.654Z"),
        sensorCode: "SENSOR_A1"
    },
    {
        temperature: 10,
        timestamp: ISODate("2016-04-29T14:50:20.335Z"),
        sensorCode: "SENSOR_A1"
    }
]
The remove operation I want to perform should "reduce" equal temperatures within time ranges of less than 10 minutes to a single value.
Is there any technique to achieve this?

I simplified my solution and decided to keep one document per unique measurement received in a 10-minute time window.
MongoDB 3.2 is required for this (it uses the $filter operator).
Adding a time mark separates measurements into 10-minute groups.
Then we preserve the first record in each group and store all ids for further processing.
Then we remove the id of the document we want to keep from the array of all ids (leaving, let's say, the documents to delete).
Finally, in a forEach loop we delete the unneeded ids - this line is commented out :-)
Copy the code below into the mongo console, execute it and verify the ids to delete, then un-comment and GO!
var addTimeMark = {
    $project : {
        _id : 1,
        temperature : 1,
        timestamp : 1,
        sensorCode : 1,
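        // note: despite its name, this field holds the first 11 characters
        // of "YYYYMMDDHHMM"; dropping the last digit of the minute buckets
        // timestamps into 10-minute groups (e.g. 14:30-14:39 share "...143")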
        yearMonthDay : {
            $substr : [{
                    $dateToString : {
                        format : "%Y%m%d%H%M",
                        date : "$timestamp"
                    }
                }, 0, 11]
        }
    }
}
var getFirstRecordInGroup = {
    // take only the first record from each group
    $group : {
        _id : {
            timeMark : "$yearMonthDay",
            sensorCode : "$sensorCode",
            temperature : "$temperature"
        },
        id : {
            $first : "$_id"
        },
        allIds : {
            $push : "$_id"
        },
        timestamp : {
            $first : "$timestamp"
        },
        totalEntries : {
            $sum : 1
        }
    }
}
var removeFirstIdFromAllIds = {
    $project : {
        _id : 1,
        id : 1,
        timestamp : 1,
        totalEntries : 1,
        allIds : {
            $filter : {
                input : "$allIds",
                as : "item",
                cond : {
                    $ne : ["$$item", "$id"]
                }
            }
        }
    }
}
db.sensor.aggregate([
    addTimeMark,
    getFirstRecordInGroup,
    removeFirstIdFromAllIds
]).forEach(function (entry) {
    printjson(entry.allIds);
    // db.sensor.deleteMany({_id: {$in: entry.allIds}})
})
Below is how a document looks after each step.
1:
{
    "_id" : ObjectId("574b5d8e0ac96f88db507209"),
    "temperature" : 10,
    "timestamp" : ISODate("2016-04-29T14:37:50.370Z"),
    "sensorCode" : "SENSOR_A1",
    "yearMonthDay" : "20160429143"
}
2:
{
    "_id" : {
        "timeMark" : "20160429143",
        "sensorCode" : "SENSOR_A1",
        "temperature" : 10
    },
    "id" : ObjectId("574b5d8e0ac96f88db507209"),
    "allIds" : [
        ObjectId("574b5d8e0ac96f88db507209"),
        ObjectId("574b5d8e0ac96f88db50720a"),
        ObjectId("574b5d8e0ac96f88db50720b")
    ],
    "timestamp" : ISODate("2016-04-29T14:37:50.370Z"),
    "totalEntries" : 3
}
and last:
{
    "_id" : {
        "timeMark" : "20160429143",
        "sensorCode" : "SENSOR_A1",
        "temperature" : 10
    },
    "id" : ObjectId("574b5d8e0ac96f88db507209"),
    "allIds" : [
        ObjectId("574b5d8e0ac96f88db50720a"),
        ObjectId("574b5d8e0ac96f88db50720b")
    ],
    "timestamp" : ISODate("2016-04-29T14:37:50.370Z"),
    "totalEntries" : 3
}

Related

How to take N data samples from MongoDB to form a graph representation of the whole data set?

This is a sample document in our new app that stores time-series data in a MongoDB sub-document:
{
    "_id" : ObjectId("5dcb6cacfb315e66b551a1a0"),
    "youtubeId" : "bIWShN9rKQU",
    "views" : [
        { "count" : 17506, "at" : ISODate("2019-08-12T13:31:00.002Z") },
        { "count" : 29576, "at" : ISODate("2019-11-14T13:32:00.216Z") },
        { "count" : 29579, "at" : ISODate("2019-11-15T13:33:00.197Z") },
        { "count" : 29582, "at" : ISODate("2019-11-16T13:34:00.192Z") },
        { "count" : 29586, "at" : ISODate("2019-11-17T13:35:00.180Z") },
        { "count" : 29595, "at" : ISODate("2019-11-19T13:36:00.190Z") },
        { "count" : 29597, "at" : ISODate("2019-11-20T13:37:00.206Z") },
        { "count" : 29604, "at" : ISODate("2019-11-21T13:38:00.228Z") },
        { "count" : 29606, "at" : ISODate("2019-11-22T13:39:00.218Z") },
        { "count" : 29613, "at" : ISODate("2019-11-24T13:40:00.201Z") },
        { "count" : 29619, "at" : ISODate("2019-11-25T13:41:00.250Z") },
        { "count" : 29624, "at" : ISODate("2019-11-27T13:42:00.103Z") },
        { "count" : 29636, "at" : ISODate("2019-11-29T13:43:00.128Z") }
    ]
}
Now, I want to send this data in a web service consumed by a mobile application for plotting a graph, but I want to return only 10 objects from the views array, chosen so that they represent the entire data set with respect to time. It should be 10 items irrespective of the size of the array.
How can I take 10 such samples from the entire data set, using the at timestamp field, to create a representation of the whole data?
In the above example views is an object array with times from 2019-08-12T13:31:00.002Z to 2019-11-29T13:43:00.128Z (13 records at 1/minute), so 5 samples from that means approximately one record every 3 minutes.
var noOfSamples = 5

db.test.aggregate([
    {
        $addFields: {
            indexes: {
                $range: [
                    0,
                    { $size: "$views" },
                    { $ceil: { $divide: [ { $size: "$views" }, noOfSamples ] } }
                ]
            }
        }
    },
    {
        $project: {
            sample: {
                $map: {
                    input: "$indexes",
                    as: "ix",
                    in: { $arrayElemAt: [ "$views", "$$ix" ] }
                }
            }
        }
    }
])
NOTES:
The number of samples you want is 5 (noOfSamples). The size of the views array is 13.
The number of elements in views is divided by noOfSamples; in this case you get 2.6. $ceil rounds it up to the next integer, which is 3 (let's call it the "step"). The $range operator then gives you an array of numbers between 0 and 12 with a step of 3 (0 is the views array's first index, 12 is its highest index). So the aggregation's first stage produces an array called indexes: [ 0, 3, 6, 9, 12 ].
In the second stage of the aggregation, you pick the views array elements by their indexes, using the indexes generated in the previous stage. The $map aggregation array operator maps each generated index from indexes to the corresponding views element - so you get the five elements from the views array as sample.
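Note that for the 10 samples the question actually asks for, the same pipeline with noOfSamples = 10 gives a step of ceil(13/10) = 2 and indexes [ 0, 2, 4, 6, 8, 10, 12 ] - i.e. only 7 samples. Because the step is rounded up, you can get fewer samples than requested when the array is not much larger than noOfSamples.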

Query to count number of occurrences in array grouped by day

I have the following document structure:
(trackerEventsCollection) =
{
    "_id" : ObjectId("5b26c4fb7c696201040c8ed1"),
    "trackerId" : ObjectId("598fc51324h51901043d76de"),
    "trackingEvents" : [
        {
            "type" : "checkin",
            "eventSource" : "app",
            "timestamp" : ISODate("2017-08-25T06:34:58.964Z")
        },
        {
            "type" : "power",
            "eventSource" : "app",
            "timestamp" : ISODate("2017-08-25T06:51:23.795Z")
        },
        {
            "type" : "position",
            "eventSource" : "app",
            "timestamp" : ISODate("2017-08-25T06:51:23.985Z")
        }
    ]
}
I would like to write a query that counts the number of trackingEvents with "type" : "power", grouped by day. This seems quite tricky to me because the parent document does not have a date, and I have to rely on the timestamp field that belongs to the trackingEvents array members.
I'm not a really experienced mongodb user and haven't been able to work out how this can be achieved so far.
Would really appreciate any help, thanks
To process your nested array as separate documents you need to use $unwind. In the next stage you can use $match to filter by type. Then you can group by single days, counting occurrences. The point is that you have to build a grouping key containing year, month and day, as in the following code:
db.trackerEvents.aggregate([
    { $unwind: "$trackingEvents" },
    { $match: { "trackingEvents.type": "power" } },
    {
        $group: {
            _id: {
                year: { $year: "$trackingEvents.timestamp" },
                month: { $month: "$trackingEvents.timestamp" },
                day: { $dayOfMonth: "$trackingEvents.timestamp" }
            },
            count: { $sum: 1 }
        }
    }
])
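For the sample document above (which contains exactly one event with "type" : "power", on 2017-08-25), this pipeline would return a single group:
{ "_id" : { "year" : 2017, "month" : 8, "day" : 25 }, "count" : 1 }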

MongoDB - aggregate $group by every seven days from a date

Suppose you have any number of documents in a collection with the following structure:
{
    "_id" : "1",
    "totalUsers" : NumberInt(10000),
    "iosUsers" : NumberInt(5000),
    "androidUsers" : NumberInt(5000),
    "creationTime" : ISODate("2017-12-04T06:14:21.529+0000")
},
{
    "_id" : "2",
    "totalUsers" : NumberInt(12000),
    "iosUsers" : NumberInt(6000),
    "androidUsers" : NumberInt(6000),
    "creationTime" : ISODate("2017-12-04T06:14:21.529+0000")
},
{
    "_id" : "3",
    "totalUsers" : NumberInt(14000),
    "iosUsers" : NumberInt(7000),
    "androidUsers" : NumberInt(7000),
    "creationTime" : ISODate("2017-12-04T06:14:21.529+0000")
}
And you want to write a query that returns results between two given dates (i.e. startDate and endDate) and then groups the results every seven days:
db.collection.aggregate(
    { $match: { $gte: startDate, $lte: endDate } },
    { $group: { _id: { --- every seven days from endDate --- } } }
)
How can I do this?
First, build the boundaries:
var boundaries = [];
var sd = ISODate("2017-10-18T20:41:33.602+0000"),
    ed = ISODate("2017-11-22T12:41:36.348+0000");
var i = sd;
while (i <= ed) {
    boundaries.push(i);                                   // push the current 7-day boundary
    i = new Date(i.getTime() + 7 * 24 * 60 * 60 * 1000);  // advance by 7 days
}
boundaries.push(i); // one boundary past ed, because the upper bound is exclusive
// then use $bucket aggregation (all matched documents fall inside the
// boundaries, so no default bucket is needed)
db.collection.aggregate([
    { $match: { creationTime: { $gte: sd, $lte: ed } } },
    {
        $bucket: {
            groupBy: "$creationTime",
            boundaries: boundaries
        }
    }
])
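With no output specification, $bucket emits one document per interval, using the interval's lower boundary as its _id and a default count of the documents that fell into it, i.e. results of the shape { "_id" : ISODate("..."), "count" : <n> }.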

Find all documents within last n days

My daily collection has documents like:
..
{ "date" : ISODate("2013-01-03T00:00:00Z"), "vid" : "ED", "san" : 7046.25, "izm" : 1243.96 }
{ "date" : ISODate("2013-01-03T00:00:00Z"), "vid" : "UA", "san" : 0, "izm" : 0 }
{ "date" : ISODate("2013-01-03T00:00:00Z"), "vid" : "PAL", "san" : 0, "izm" : 169.9 }
{ "date" : ISODate("2013-01-03T00:00:00Z"), "vid" : "PAL", "san" : 0, "izm" : 0 }
{ "date" : ISODate("2013-01-03T00:00:00Z"), "vid" : "CTA_TR", "san" : 0, "izm" : 0 }
{ "date" : ISODate("2013-01-04T00:00:00Z"), "vid" : "CAD", "san" : 0, "izm" : 169.9 }
{ "date" : ISODate("2013-01-04T00:00:00Z"), "vid" : "INT", "san" : 0, "izm" : 169.9 }
...
I left off the _id field to save space here.
My task is to "fetch all documents within the last 15 days". As you can see, I somehow need to:
Get 15 unique dates. The newest one should be taken from the newest document in the collection (I mean it isn't necessarily today's date, it's just the latest one in the collection based on the date field), and the oldest... well, maybe it's not necessary to strictly define the oldest day in the query; what I need is some kind of top 15 starting from the newest day, if you know what I mean. Like 15 unique days.
db.daily.find() all documents that have a date field within that range of 15 days.
In the result, I should see all documents within 15 days starting from the newest in the collection.
I just tested the following query against your data sample and it worked perfectly:
db.datecol.find({
    "date": {
        $gte: new Date(new Date().getTime() - (15 * 24 * 60 * 60 * 1000))
    }
}).sort({ "date": -1 })
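Note that this query measures 15 days back from the current time. Since the question points out that the newest document is not necessarily from today, a variation (a sketch, reusing the datecol collection name from above) could anchor the window to the newest date in the collection instead:
// find the newest date in the collection, then query relative to it
var newest = db.datecol.find().sort({ "date": -1 }).limit(1).next().date;
var cutoff = new Date(newest.getTime() - (15 * 24 * 60 * 60 * 1000));
db.datecol.find({ "date": { $gte: cutoff } }).sort({ "date": -1 })
This returns everything within 15 calendar days of the newest record, which is close to, but not exactly, "15 unique dates" - see the distinct-based approach below for the strict interpretation.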
Starting in Mongo 5, it's a nice use case for the $dateSubtract operator:
// { date: ISODate("2021-12-05") }
// { date: ISODate("2021-12-02") }
// { date: ISODate("2021-12-02") }
// { date: ISODate("2021-11-28") } <= older than 5 days
db.collection.aggregate([
    { $match: {
        $expr: {
            $gt: [
                "$date",
                { $dateSubtract: { startDate: "$$NOW", unit: "day", amount: 5 } }
            ]
        }
    }}
])
// { date: ISODate("2021-12-05") }
// { date: ISODate("2021-12-02") }
// { date: ISODate("2021-12-02") }
With $dateSubtract, we create the oldest date after which we keep documents, by subtracting 5 (amount) "days" (unit) from the current date $$NOW (startDate).
And you can obviously add a $sort stage to sort documents by date.
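For instance, the full pipeline with the sort appended:
db.collection.aggregate([
    { $match: {
        $expr: {
            $gt: [
                "$date",
                { $dateSubtract: { startDate: "$$NOW", unit: "day", amount: 5 } }
            ]
        }
    }},
    { $sort: { date: -1 } } // newest documents first
])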
You need to run the distinct command to get all the unique dates. Below is an example. The "values" array has all the unique dates of the collection, from which you need to pick the most recent 15 days on the client side.
db.runCommand ( { distinct: 'datecol', key: 'date' } )
{
    "values" : [
        ISODate("2013-01-03T00:00:00Z"),
        ISODate("2013-01-04T00:00:00Z")
    ],
    "stats" : {
        "n" : 2,
        "nscanned" : 2,
        "nscannedObjects" : 2,
        "timems" : 0,
        "cursor" : "BasicCursor"
    },
    "ok" : 1
}
You then use the $in operator with the most recent 15 dates from step 1. Below is an example that finds all documents that belong to one of the mentioned two dates.
db.datecol.find({
    "date": {
        "$in": [
            new ISODate("2013-01-03T00:00:00Z"),
            new ISODate("2013-01-04T00:00:00Z")
        ]
    }
})

Upsert with pymongo and a custom _id field

I'm attempting to store pre-aggregated performance metrics in a sharded mongodb according to this document.
I'm trying to update the minute sub-documents in a record that may or may not exist with an upsert like so (self.collection is a pymongo collection instance):
self.collection.update(query, data, upsert=True)
query:
{ '_id': u'12345CHA-2RU020130304',
'metadata': { 'adaptor_id': 'CHA-2RU',
'array_serial': 12345,
'date': datetime.datetime(2013, 3, 4, 0, 0, tzinfo=<UTC>),
'processor_id': 0}
}
data:
{ 'minute': { '16': { '45': 1.6693091}}}
The problem is that in this case the 'minute' subdocument only ever holds the last { hour: { minute: metric } } entry; it does not accumulate entries for other hours - each update overwrites the single existing entry.
I've also tried this with a $set style data entry:
{ '$set': { 'minute': { '16': { '45': 1.6693091}}}}
but it ends up being the same.
What am I doing wrong?
In both of the examples listed you are simply setting a field ('minute') to a particular value; the only reason it is an addition the first time you update is that the field itself does not exist and so must be created.
It's hard to determine exactly what you are shooting for here, but I think what you could do is alter your schema a little so that 'minute' is an array. Then you could use $push to add values regardless of whether they are already present, or $addToSet if you don't want duplicates.
I had to alter your document a little to make it valid in the shell, so my _id (and some other fields) are slightly different from yours, but it should still be close enough to be illustrative:
db.foo.find({'_id': 'u12345CHA-2RU020130304'}).pretty()
{
    "_id" : "u12345CHA-2RU020130304",
    "metadata" : {
        "adaptor_id" : "CHA-2RU",
        "array_serial" : 12345,
        "date" : ISODate("2013-03-18T23:28:50.660Z"),
        "processor_id" : 0
    }
}
Now let's add a minute field with an array of documents instead of a single document:
db.foo.update({'_id': 'u12345CHA-2RU020130304'}, { $addToSet : {'minute': { '16': {'45': 1.6693091}}}})
db.foo.find({'_id': 'u12345CHA-2RU020130304'}).pretty()
{
    "_id" : "u12345CHA-2RU020130304",
    "metadata" : {
        "adaptor_id" : "CHA-2RU",
        "array_serial" : 12345,
        "date" : ISODate("2013-03-18T23:28:50.660Z"),
        "processor_id" : 0
    },
    "minute" : [
        {
            "16" : {
                "45" : 1.6693091
            }
        }
    ]
}
Then, to illustrate the addition, add a slightly different entry (since I am using $addToSet, this is required for a new element to be added):
db.foo.update({'_id': 'u12345CHA-2RU020130304'}, { $addToSet : {'minute': { '17': {'48': 1.6693391}}}})
db.foo.find({'_id': 'u12345CHA-2RU020130304'}).pretty()
{
    "_id" : "u12345CHA-2RU020130304",
    "metadata" : {
        "adaptor_id" : "CHA-2RU",
        "array_serial" : 12345,
        "date" : ISODate("2013-03-18T23:28:50.660Z"),
        "processor_id" : 0
    },
    "minute" : [
        {
            "16" : {
                "45" : 1.6693091
            }
        },
        {
            "17" : {
                "48" : 1.6693391
            }
        }
    ]
}
I ended up setting the fields like this:
query:
{ '_id': u'12345CHA-2RU020130304',
'metadata': { 'adaptor_id': 'CHA-2RU',
'array_serial': 12345,
'date': datetime.datetime(2013, 3, 4, 0, 0, tzinfo=<UTC>),
'processor_id': 0}
}
I'm setting the metrics like this:
data = {"$set": {}}
for metric in csv:
    date_utc = metric['date'].astimezone(pytz.utc)
    data["$set"]["minute.%d.%d" % (date_utc.hour,
                                   date_utc.minute)] = float(metric['metric'])
which creates data like this:
{"$set": {'minute.16.45': 1.6693091,
'minute.16.46': 1.566343,
'minute.16.47': 1.22322}}
So that when self.collection.update(query, data, upsert=True) is run it updates those fields.
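For reference, the equivalent dotted-path upsert can be tried directly in the mongo shell (a minimal sketch reusing the sample _id and values from above):
db.foo.update(
    { "_id": "u12345CHA-2RU020130304" },
    { $set: { "minute.16.45": 1.6693091, "minute.16.46": 1.566343 } },
    { upsert: true }
)
// each dotted path targets its own nested field, so entries for other
// hours/minutes are preserved instead of the whole 'minute' subdocument
// being overwritten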