MongoDB aggregation: average sales per hour - mongodb

I have a collection with sales. Now I need to get the average number of sales per hour within a date range.
Up to now I have a query like this:
db.getCollection('sales').aggregate({
"$match": {
$and: [
{ "createdAt": { $gte: ISODate("2018-05-01T00:00:00.000Z") } },
{ "createdAt": { $lt: ISODate("2018-10-30T23:59:00.000Z") } },
]
}
},{
"$project": {
"h":{"$hour":"$createdAt"},
}
},{
"$group":{
"_id": "$h",
"salesPerHour": { $sum: 1 },
},
},{
"$sort": { "salesPerHour": -1 }
});
The result looks like this: {"_id" : 15, "salesPerHour" : 681.0}
How can I get the average value of salesPerHour instead the sum?
Update 1 => Example document.
{
"_id" : "pX6jj7j4274J9xpSA",
"idFiscalSale" : "48",
"documentYear" : "2018",
"paymentType" : "cash",
"cashReceived" : 54,
"items" : [...],
"customer" : null,
"subTotal" : 23.89,
"taxTotal" : 3.7139,
"total" : 23.89,
"rewardPointsValue" : 0,
"rewardPointsEarned" : 24,
"discountValue" : 0,
"createdAt" : ISODate("2018-04-24T00:00:00.201Z")
}

You can use below aggregation query.
db.sales.aggregate([
{"$match":{
"createdAt":{
"$gte":ISODate("2018-05-01T00:00:00.000Z"),
"$lt":ISODate("2018-10-30T23:59:00.000Z")
}
}},
{"$group":{
"_id":{"$hour":"$createdAt"},
"salesPerHour":{"$sum":1}
}},
{"$group":{
"_id":null,
"salesPerHour":{"$avg":"$salesPerHour"}
}}
])

You can try below aggregation
You have to use $avg aggregation operator with the salesPerHour field
db.collection.aggregate([
{ "$match": {
"$and": [
{ "createdAt": { "$gte": ISODate("2018-05-01T00:00:00.000Z") }},
{ "createdAt": { "$lt": ISODate("2018-10-30T23:59:00.000Z") }}
]
}},
{ "$group": {
"_id": { "$hour": "$createdAt" },
"salesPerHour": {
"$avg": "$salesPerHour"
}
}}
])

Related

Count the objects inside of an array on each document MongoDB

My documents are organized this way:
{
"_id" : ObjectId("5ea899d7e7da54cabbc022e7"),
"date" : ISODate("2018-01-27T00:00:00Z"),
"vehicleid" : 32028,
"points" : [
{
"direction" : 225,
"location" : {
"type" : "Point",
"coordinates" : [
-3.801898,
-38.501078
]
},
"odometer" : 134746396,
"routecode" : 0,
"speed" : 0,
"deviceid" : 148590,
"metrictimestamp" : ISODate("2018-01-27T23:32:03Z")
}
Where points is an array of objects. I need to group this documents and return the amount of elements inside each array. I guess that is something like:
pipe = [
{
'$project':{
"_id":0
}
},
{
'$group':{
"_id":{
"vehicleid":"$vehicleid",
"date":"$date"
},'count':{'$size':'points'}
}
}
]
Detail: I need to run this on pymongo.
You have to use $sum to sum the size of each array like this
{
"$group": {
"_id": {
"vehicleid": "$vehicleid",
"date": "$date"
},
"count": { "$sum": { "$size": "$points" } }
}
}
You can use any of the following aggregation pipelines. You will get the size of the points array field. Each pipeline uses different approach, and the output details differ, but the size info will be same.
The code runs with PyMongo:
pipeline = [
{
"$unwind": "$points"
},
{
"$group": {
"_id": { "vehicleid": "$vehicleid", "date": "$date" },
"count": { "$sum": 1 }
}
}
]
pipeline = [
{
"$addFields": { "count": { "$size": "$points" } }
}
]
You can follow this code
$group : {
_id : {
"vehicleid":"$vehicleid",
"date":"$date"
count: { $sum: 1 }
}
}

Mongodb - Get sales per hours

Good people! I am in need of your help.
I am trying to create a line graph using apexcharts with data imported from Mongodb.
I am trying to graph hourly sales, so I need the number of sales for each hour of the day.
Example Mongodb document.
{
"_id" : ObjectId("5dbee4eed6f04aaf191abc59"),
"seller_id" : "5aa1c2c35ef7a4e97b5e995a",
"temp" : "4.3",
"sale_type" : "coins",
"createdAt" : ISODate("2020-05-10T00:10:00.000Z"),
"updatedAt" : ISODate("2019-11-10T14:32:14.650Z")
}
Up to now I have a query like this:
db.getCollection('sales').aggregate([
{ "$facet": {
"00:00": [
{ "$match" : {createdAt: {$gte: ISODate("2020-05-10T00:00:00.000Z"),$lt: ISODate("2020-05-10T00:59:00.001Z")},seller_id: "5aa1c2c35ef7a4e97b5e995a",
}},
{ "$count": "sales" },
],
"01:00": [
{ "$match" : {createdAt: {$gte: ISODate("2020-05-10T01:00:00.000Z"),$lt: ISODate("2020-05-10T01:59:00.001Z")},seller_id: "5aa1c2c35ef7a4e97b5e995a",
}},
{ "$count": "sales" },
],
"02:00": [
{ "$match" : {createdAt: {$gte: ISODate("2020-05-10T02:00:00.000Z"),$lt: ISODate("2020-05-10T02:59:00.001Z")},seller_id: "5aa1c2c35ef7a4e97b5e995a",
}},
{ "$count": "sales" },
],
"03:00": [
{ "$match" : {createdAt: {$gte: ISODate("2020-05-10T03:00:00.000Z"),$lt: ISODate("2020-05-10T03:59:00.001Z")},seller_id: "5aa1c2c35ef7a4e97b5e995a",
}},
{ "$count": "sales" },
],
}},
{ "$project": {
"ventas0": { "$arrayElemAt": ["$01:00.sales", 0] },
"ventas1": { "$arrayElemAt": ["$02:00.sales", 0] },
"ventas3": { "$arrayElemAt": ["$03:00.sales", 0] },
}}
])
But I am sure there is a more efficient way to do this.
My expected output looks like this:
[countsale(00:00),countsale(01:00),countsale(02:00),countsale(03:00), etc to 24 hs]
You are correct, there is a more efficient way to do this. We can use Date expression operators and specifically by grouping with $hour.
db.getCollection('sales').aggregate([
{
$match: {
createdAt: {$gte: ISODate("2020-05-10T00:00:00.000Z"), $lt: ISODate("2020-05-11T00:00:00.001Z")}
}
},
{
$group: {
_id: {$hour: "$createdAt"},
count: {$sum: 1}
}
},
{
$sort: {
_id: 1
}
}
]);
This will give you this result:
[
{
_id: 0,
count: x
},
{
_id: 1,
count: y
},
...
{
_id: 23,
count: z
}
]
From here you can restructure the data easily as you wish.
A problem I forsee happening are hours without any matches (i.e count=0) will not exists in the result set. you'll have to fill in those gaps manually.

How do I aggregate and avg the time between two dates?

Assuming the below is my element structure. How can I SHELL query the mongodb and get the avg difference (average length) each trip took for every trip in the db? I am guessing subtracting dates? But then how to subtract and then avg?
"_id": {
"$oid": "5445ab058767000062"
},
"comment": null,
"scheduled_request": false,
"status": "blah",
"timestamp_started": {
"$date": "2014-10-21T00:38:28.990Z"
},
"timestamp_transaction_complete": {
"$date": "2014-10-21T00:49:12.990Z"
},
"user_id": "5445a9000057"
UDPATE ========
Here is my query
db.ambulance_requests.aggregate([
{ "$group": {
"_id": null,
"avg_time": {
"$avg": {
"$subtract": [
"$timestamp_transaction_complete",
"$timestamp_started"
]
}
}
}}
])
AND MY RESULT (from a Mac Terminal Shell):
{ "_id" : null, "avg_time" : 0 }
You $subtract and $avg by applying them in a $group pipeline stage. For "everything", use null for the grouping key:
db.trips.aggregate([
{ "$group": {
"_id": null,
"avg_time": {
"$avg": {
"$subtract": [
{ "$ifNull": [ "$timestamp_completed", 0 ] },
{ "$ifNull": [ "$timestamp_started", 0 ] }
]
}
}
}}
])
When you $subtract on BSON Date object from another, the difference is returned as the milliseconds interval between them. This is also a generally handy technique for extracting the milliseconds value for other purposes.
Your single document as supplied:
{
"comment" : null,
"scheduled_request" : false,
"status" : "blah",
"timestamp_started" : ISODate("2014-10-21T00:38:28.990Z"),
"timestamp_completed" : ISODate("2014-10-21T00:49:12.990Z"),
"user_id" : "5445a9000057"
}
The result from your single document in the question:
/* 1 */
{
"_id" : null,
"avg_time" : 644000.0
}
https://mongoplayground.net/p/nFO54i5GIXU
if finishedAt dose not exist in a doc then skip that document from avg calculation
db.collection.aggregate([
{
"$match": {
"finishedAt": {
"$exists": true
}
}
},
{
"$unwind": "$tags"
},
{
"$match": {
"$or": [
{
"tags.name": "Canada"
},
{
"tags.name": "ABC"
},
]
}
},
{
"$group": {
"_id": null,
"avg_time": {
"$avg": {
"$subtract": [
"$finishedAt",
"$createdAt"
]
}
}
}
}
])

MongoDB aggregate - average on specific values in array of documents

I'm currently working on a database with the following structure:
{"_id" : ObjectId("1abc2"),
"startdatetime" : ISODate("2016-09-11T18:00:37Z"),
"diveValues" : [
{
"temp" : 15.269,
"depth" : 0.0,
},
{
"temp" : 14.779257384,
"depth" : 1.0,
},
{
"temp" : 14.3940253165,
"depth" : 2.0,
},
{
"temp" : 13.9225795455,
"depth" : 3.0,
},
{
"temp" : 13.8214431818,
"depth" : 4.0,
},
{
"temp" : 13.6899553571,
"depth" : 5.0,
}
]}
The database has information about depth n metres in water, and the temperature on given depth. This is stored in the "diveValues" array. I have been successful on averaging on all depths between to dates, both monthly average and daily average. What I'm having a serious issue with is to get the average between to depths, say between 1 and 4 metres, for every month the last 6 months.
Here is an example of average temperature for each month from January to June, for all depths:
db.collection.aggregate(
[
{$unwind:"$diveValues"},
{$match:
{'startdatetime':
{$gt:new ISODate("2016-01-10T06:00:29Z"),
$lt:new ISODate("2016-06-10T06:00:29Z")}
}
},
{$group:
{_id:
{ year: { $year: "$startdatetime" },
month: { $month: "$startdatetime" }},
avgTemp: { $avg: "$diveValues.temp" }}
},
{$sort:{_id:1}}
]
)
Resulting in:
{ "_id" : { "year" : 2016, "month" : 1 }, "avgTemp" : 7.575706502958313 }
{ "_id" : { "year" : 2016, "month" : 3 }, "avgTemp" : 6.85037457740135 }
{ "_id" : { "year" : 2016, "month" : 4 }, "avgTemp" : 7.215702831902588 }
{ "_id" : { "year" : 2016, "month" : 5 }, "avgTemp" : 9.153453683614638 }
{ "_id" : { "year" : 2016, "month" : 6 }, "avgTemp" : 11.497953009390237 }
Now, I can not seem to figure out how to get average temperature between 1 and 4 metres for the same period.
I have been trying to group the values by wanted depths, but have not managed it - more often than not ending up with bad syntax. Also, if I'm not wrong, the $match pipeline would return all depths as long as the dive has values for 1 and 4 metres, so that will not work.
With the find() tool I am using $slice to return the values I intend from the array - but have not been successful along with the aggregate() function.
Is there a way to solve this? Thanks in advance, much appreciated!
You'd need to place your $match pipeline before $unwind to optimize your aggregation operation as doing an $unwind operation on the whole collection could potentially cause some performance issues since it produces a copy of each document per array entry and that uses more memory (possible memory cap on aggregation pipelines of 10% total memory) thus takes "time" to produce the flattened arrays as well as "time" to process it. Hence it's better to limit the number of documents getting into the pipeline to be flattened.
db.collection.aggregate([
{
"$match": {
"startdatetime": {
"$gt": new ISODate("2016-01-10T06:00:29Z"),
"$lt": new ISODate("2016-06-10T06:00:29Z")
},
"diveValues.depth": { "$gte": 1, "$lte": 4 }
}
},
{ "$unwind": "$diveValues" },
{ "$match": { "diveValues.depth": { "$gte": 1, "$lte": 4 } } },
{
"$group": {
"_id": {
"year": { "$year": "$startdatetime" },
"month": { "$month": "$startdatetime" }
},
"avgTemp": { "$avg": "$diveValues.temp" }
}
}
])
If you want results to contain the average temps for all depths and for the 1-4 depth range, then you would need to run this pipeline which would use the $cond tenary operator to feed the $avg operator the accumulated temperatures within a group based on the depth range:
db.collection.aggregate([
{
"$match": {
"startdatetime": {
"$gt": new ISODate("2016-01-10T06:00:29Z"),
"$lt": new ISODate("2016-06-10T06:00:29Z")
}
}
},
{ "$unwind": "$diveValues" },
{
"$group": {
"_id": {
"year": { "$year": "$startdatetime" },
"month": { "$month": "$startdatetime" }
},
"avgTemp": { "$avg": "$diveValues.temp" },
"avgTempDepth1-4": {
"$avg": {
"$cond": [
{
"$and": [
{ "$gte": [ "$diveValues.depth", 1 ] },
{ "$lte": [ "$diveValues.depth", 4 ] }
]
},
"$diveValues.temp",
null
]
}
}
}
}
])
First of all, the date $match operator should be used at the beginning of the pipeline so that indexes can be used.
Now, to the question, you just need to filter the depth interval like you did with the dates:
db.col.aggregate([
{"$match": {
'startdatetime': {
"$gt": new ISODate("2016-01-10T06:00:29Z"),
"$lt": new ISODate("2016-11-10T06:00:29Z")
}
}},
{"$unwind": "$diveValues"},
{"$match": {
"diveValues.depth": {
"$gte": 1.0,
"$lt": 4.0
}
}},
{"$group": {
"_id": {
"year": {"$year": "$startdatetime" },
"month": {"$month": "$startdatetime" }
},
"avgTemp": { "$avg": "$diveValues.temp" }}
}
])
This will give you the average only for the chosen depth interval.

MongoDB dateDiff between multiple documents

I have collection in my mongoDB which stores service given to customer along with their email address something like below
{
"_id" : ObjectId("56a84627f8fd4a136c0e944a"),
"Vehicle" : "Honda",
"ServiceSelected" : "FULL SERVICE",
"FullName" : "xyz",
"Email" : "xyz#xyz.com",
"BookingTime" : ISODate("2015-12-27T06:00:00.000Z")
},
{
"_id" : ObjectId("56a84627f8fd4a136c0e944b"),
"Vehicle" : "AUDI",
"ServiceSelected" : "FLAT TYRE",
"FullName" : "abc",
"Email" : "abc#abc.com",
"BookingTime" : ISODate("2015-12-26T06:00:00.000Z")
},
{
"_id" : ObjectId("56a84627f8fd4a136c0e944c"),
"Vehicle" : "BMW",
"ServiceSelected" : "OTHERS",
"FullName" : "def",
"Email" : "def#def.com",
"BookingTime" : ISODate("2015-12-25T06:00:00.000Z")
},
{
"_id" : ObjectId("56a84627f8fd4a136c0e944d"),
"Vehicle" : "BMW",
"ServiceSelected" : "OTHERS",
"FullName" : "def",
"Email" : "def#def.com",
"BookingTime" : ISODate("2015-12-30T06:00:00.000Z")
},
{
"_id" : ObjectId("56a84627f8fd4a136c0e944a"),
"Vehicle" : "Honda",
"ServiceSelected" : "FULL SERVICE",
"FullName" : "xyz",
"Email" : "xyz#xyz.com",
"BookingTime" : ISODate("2016-01-27T06:00:00.000Z")
}
From the above collection I want to fetch all the documents that have taken our service with a gap of at-least 30 days i.e. from the above collection "Email" : "xyz#xyz.com" should be returned but not "Email" : "def#def.com" as the second service was taken with in 5 days.
I know there is flaw in the design and an additional flag can be set while inserting the record from the application but I need to fetch the data for the existing records.
You need to use the $min and $max operators which respectively return the minimum and maximum value for "BookingTime" in your $group stage. The last stage in the pipeline is the $redact stage where you use a simple "date" math using the $divide and $subtract arithmetic operators.to return those documents where the number of days between first "service" and last "service" is greater than 30
db.collection.aggregate( [
{ "$group": {
"_id": "$Email",
"date1": { "$min": "$BookingTime" },
"date2": { "$max": "$BookingTime" }
}},
{ "$redact": {
"$cond": [
{ "$gte": [
{ "$divide": [
{ "$subtract": [ "$date2", "$date1" ] },
1000 * 60 * 60 * 24
]},
30
]},
"$$KEEP",
"$$PRUNE"
]
}}
])
Which returns:
{
"_id" : "xyz#xyz.com",
"date1" : ISODate("2015-12-27T06:00:00Z"),
"date2" : ISODate("2016-01-27T06:00:00Z")
}
Another way to do this is by using the $cond operator in a $project stage to avoid a collection scan.
db.collection.aggregate( [
{ "$group": {
"_id": "$Email",
"date1": { "$min": "$BookingTime" },
"date2": { "$max": "$BookingTime" },
"count": { "$sum": 1 }
}},
{ "$match": { "count": { "$gte": 2 } } },
{ "$project": {
"emails": {
"$cond": [
{ "$gte": [
{ "$divide": [
{ "$subtract": [ "$date2", "$date1" ] },
1000 * 60 * 60 * 24
]},
30
] },
"$_id",
false
]
}
}},
{ "$match": { "emails": { "$ne": false } } }
])
You can get first sales date and last sales date by $min and $max:
db.services.aggregate({
$group: {
"_id" :"$Email",
lastSalesDate: { $max: "$BookingTime" },
firstSalesDate: { $min: "$BookingTime" }
}
}
)
After that you can add filter based on lastSalesDate. You can calculate ISO date which 30 days before. ex. ISODate("2015-12-28T00:00:00.000Z"). By $lt , you will get customers of 30 days before.
db.services.aggregate(
{
$group: {
"_id" :"$Email",
lastSalesDate: { $max: "$BookingTime" },
firstSalesDate: { $min: "$BookingTime" }
}
},
{
$match : {
"lastSalesDate" : { $lt: ISODate("2015-12-28T00:00:00.000Z") }
}
}
)
Results like:
{
"_id" : "abc#abc.com",
"lastSalesDate" : ISODate("2015-12-26T06:00:00.000+0000"),
"firstSalesDate" : ISODate("2015-12-26T06:00:00.000+0000")
}
This is what I used finally
db.services.aggregate(
{$group: {
"_id" :"$Email",
count:{$sum:1},
lastSalesDate: { $max: "$BookingTime" },
firstSalesDate: { $min: "$BookingTime" }
},
{$project:{
_id:1,count:1,dateDifference: { $divide:[ {$subtract: [ "$lastSalesDate", "$firstSalesDate" ]},86400000] }
}
},
{$match:{
count:{$gt:1},dateDifference:{$gt:20}
}
}
}
)
Count > 1 helped to filter the records which never repeated and datedifferentce > 20 is for days as I already converted milliseconds to days using division operation.