MongoDB - get datewise/houlty aggregate count of column - mongodb

I have set of documents in my mongoDB collection. I am looking to get datewise aggregate count of document if date range is more than a day and hourly aggregate count for same column if date query is for single day. The data may have documents with same conversationId, hence it is necessary to group with conversationId as well.Below is sample of data for reference
[
{
"_id":"c438a671-2391-4b85-815c-ecfcb3d2bb54",
"status":"INTERNAL_UPDATE",
"conversationId":"ac44781d-caab-4410-a708-9d6db8480fc3",
"messageIds":[],
"messageId":"4dc02026-ac06-4eb1-aa59-e385fcce4a36",
"responseId":"0c00c83d-61c5-4937-846c-2e6a46aae857",
"conversation":{},
"message":{},
"params":{},
"timestamp":"2021-05-04T11:40:06.552Z",
"source":{}
},
{
"_id":"98370ddf-9ff8-4347-bab7-1f7777ab9e9d",
"status":"NEW",
"conversationId":"b5dc39d2-56a1-4eb6-a728-cdbe33dca580",
"messageIds":[],
"messageId":"ba94b839-f795-44f2-aea0-173d26006f14",
"responseId":"a2b75364-447b-4345-8008-2beccd6cbb34",
"conversation":{},
"message":{},
"params":{},
"timestamp":"2021-05-05T11:40:30.897Z",
"source":{}
},
{
"_id":"db1eae2b-62d9-455c-ab46-dbfc5baf8b67",
"status":"INTERNAL_UPDATE",
"conversationId":"b5dc39d2-56a1-4eb6-a728-cdbe33dcb584",
"messageIds":[],
"messageId":"b83c743b-d36e-4fdd-9c03-21988af47263",
"responseId":"97198c09-0130-48dc-a225-6d0faeff3116",
"conversation":{},
"message":{},
"params":{},
"timestamp":"2021-05-05T11:40:31.418Z",
"source":{}
},
{
"_id":"12a21495-f857-4f18-a06e-f8ba0b951ade",
"status":"NEW",
"conversationId":"8e37c704-add8-4f9f-8e70-d630c24f653b",
"messageIds":[],
"messageId":"51a48362-545c-4f9f-930b-42e4841fc974",
"responseId":"4691468b-a43b-41d1-83df-1349fb554bfa",
"conversation":{},
"message":{},
"params":{},
"timestamp":"2021-05-06T11:43:58.174Z",
"source":{}
},
{
"_id":"4afaa735-4618-40cf-8b4f-00ee83b2c3c5",
"status":"INTERNAL_UPDATE",
"conversationId":"8e37c704-add8-4f9f-8e70-d630c24f653b",
"messageIds":[],
"messageId":"7c860126-bf1e-41b2-a7d3-6bcec3e8d5fb",
"responseId":"09cec9a1-2621-481d-b527-d98b007ef5be",
"conversation":{},
"message":{},
"params":{},
"timestamp":"2021-05-06T11:43:58.736Z",
"source":{}
},
{
"_id":"cf8deeca-2cfd-497e-b92b-03204c84217a",
"status":"NEW",
"conversationId":"3c6870b5-88d6-4e21-8629-28137dea3fee",
"messageIds":[],
"messageId":"da84e414-2269-4812-8ddd-e2cd6c9be4fd",
"responseId":"ae1014b2-0cc1-41f0-9990-cf724ed67ab7",
"conversation":{},
"message":{},
"params":{},
"timestamp":"2021-05-06T13:37:55.060Z",
"source":{}
}
]
Presently I am able to group by conversationId, but unable to get data aggregated datewise or on hourly basis if date range is on single date.
Below is the query for same
db.documentName.aggregate([
{
'$match': {
'$and': [
{
timestamp: {
'$gte': ISODate('2021-05-01T00:00:00.000Z'),
'$lte': ISODate('2021-05-10T23:59:59.999Z')
}
},
{ 'source.author': { '$regex': 'user', '$options': 'i' } },
{},
{}
]
}
},
{ '$group': {
_id: {'conversationId': '$conversationId'} },
{ '$count': 'document_count' }
])
I have tried adding something like, $hour: '$timestamp' with comma separation beside conversationId in $group, but its of no use and is giving error.
The desired result I am trying to get for above data is, something like this
[{"date": "2021-05-04", "doc_count": 1},
{"date": "2021-05-05", "doc_count": 2},
{"date": "2021-05-06", "doc_count": 2}]
As for 2021-05-05 there are 2 docs with different conversationId, and for 2021-05-06 there are 3 docs in total but 2 documents have same conversationId hence aggregate count for 2021-05-06 is also 2. Hope this clarifies my quesiton.

The question is not entirely clear to me, but it sounds like you want something like this:
The groupId is a field to rebuild the date including the hour, or not, according to your condition:
EDIT:
db.collection.aggregate([
{$match: {
timestamp: {
$gte: ISODate("2021-05-01T00:00:00.000Z"),
$lte: ISODate("2021-05-07T23:59:59.999Z")
}
}
},
{$project: {
conversationId: 1,
groupId: {
$dateFromParts: {
year: {$year: "$timestamp"},
month: {$month: "$timestamp"},
day: {$dayOfMonth: "$timestamp"},
hour: {$cond: [
{$gte: [
{$dateDiff: {
startDate: ISODate("2021-05-01T00:00:00.000Z"),
endDate: ISODate("2021-05-07T23:59:59.999Z"),
unit: "day"}}, 1]},
0,
{$hour: "$timestamp"}]}
}
}
}
},
{$group: {_id: {conversationId: "$conversationId", groupId: "$groupId"}}},
{$group: {_id: "$_id.groupId", doc_count: {$sum: 1}}},
{$project: {date: {$toString: "$_id"}, doc_count: 1, _id: 0}}
])
See how it works on the playground example

As suggested by #nimrodserok, for mongo version 4.2.9 the query would be
db.collection.aggregate([
{
$match: {
timestamp: {
$gte: ISODate("2021-05-01T00:00:00.000Z"),
$lte: ISODate("2021-05-07T23:59:59.999Z")
}
}
},
{
$project: {
conversationId: 1,
groupId: {
$dateFromParts: {
year: {
$year: "$timestamp"
},
month: {
$month: "$timestamp"
},
day: {
$dayOfMonth: "$timestamp"
},
hour: {
$cond: [
{
$gte: [
{
$subtract: [
{
$toLong: ISODate("2021-05-07T23:59:59.999Z")
},
{
$toLong: ISODate("2021-05-01T00:00:00.000Z")
}
]
},
86400000
]
},
0,
{
$hour: "$timestamp"
}
]
}
}
}
}
},
{
$group: {
_id: {
conversationId: "$conversationId",
groupId: "$groupId"
}
}
},
{
$group: {
_id: "$_id.groupId",
doc_count: {
$sum: 1
}
}
},
{
$project: {
date: {
$toString: "$_id"
},
doc_count: 1,
_id: 0
}
}
])

Related

Fill data with NULL value if it is not present in the timeperiod using mongodb aggregation pipeline

I have to write an aggreagtion pipeline in which I will pass:
Timestamps of start date and end data for a day
I have to divide the data into 30min buckets and find data in between that buckets like:
2023-01-16T00:30:00.000+00:00 , 2023-01-16T01:00:00.000+00:00, 2023-01-16T01:30:00.000+00:00 and so on.
If data is not present in any particular bucket fill the values of that bucketa with zero but give the timestamp like:
2023-01-16T01:00:00.000+00:00 ther is no data give {timestamp:2023-01-16T01:00:00.000+00:00,a:0,b:0,c:0}
I have done the following:
[{
$match: {
$and: [
{
timestamp: {
$gte: ISODate('2023-01-16T00:00:00.000Z'),
$lt: ISODate('2023-01-16T23:59:59.000Z')
}
}
]
}
}, {
$group: {
_id: {
$toDate: {
$subtract: [
{
$toLong: '$timestamp'
},
{
$mod: [
{
$toLong: '$timestamp'
},
1800000
]
}
]
}
},
in: {
$sum: '$a'
},
out: {
$sum: '$b'
},
Count: {
$sum: 1
}
}
}, {
$addFields: {
totalIn: {
$add: [
'$in',
'$out'
]
},{
$sort: {
_id: 1
}
}]
Result is:
[{
"_id": {
"2023-01-16T12:00:00.000+00:00"
}
},
"totalIn": 397,
"count":22
},
{
"_id": {
"2023-01-16T01:30:00.000+00:00"
}
},
"totalIn": 222,
"count":2
}
...]
expected result:
[{
"_id": {
"2023-01-16T12:00:00.000+00:00"
}
},
"totalIn": 397,
"count":22
},
{
"_id": {
"2023-01-16T12:30:00.000+00:00"
}
},
"totalIn": 0,
"count":0
},
{
"_id": {
"2023-01-16T01:00:00.000+00:00"
}
},
"totalIn": 0,
"count":0
},
{
"_id": {
"2023-01-16T12:00:00.000+00:00"
}
},
"totalIn": 222,
"count":2
}
...]
One option is to use $range with $dateAdd:
db.collection.aggregate([
{$match: {timestamp: {
$gte: startDate,
$lt: endDate
}}},
{$group: {
_id: {$dateTrunc: {date: "$timestamp", unit: "minute", binSize: 30}},
in: {$sum: "$a"},
out: {$sum: "$b"},
count: {$sum: 1}
}},
{$group: {
_id: 0,
data: {$push: {
timestamp: "$_id",
totalIn: {$add: ["$in", "$out"]},
count: "$count"
}}
}},
{$project: {
_id: 0, data: 1,
bins: {$map: {
input: {$range: [
0,
{$multiply: [
{$dateDiff: {
startDate: startDate,
endDate: endDate,
unit: "hour"
}},
2
]}
]},
in: {$dateAdd: {
startDate: startDate,
unit: "minute",
amount: {$multiply: ["$$this", 30]}
}}
}}
}},
{$unwind: "$bins"},
{$set: {data: {$filter: {
input: "$data",
cond: {$eq: ["$bins", "$$this.timestamp"]}
}}}},
{$project: {
_id: "$bins",
count: {$ifNull: [{$first: "$data.count"}, 0]},
totalIn: {$ifNull: [{$first: "$data.totalIn"}, 0]}
}}
])
See how it works on the playground example

Mongodb aggregate group by array elements

I have a mongodb document that contains customer id, status (active, deactivate) and date.
[
{
id:1,
date:ISODate('2022-12-01'),
status:'activate'
},
{
id:2,
date:ISODate('2022-12-01'),
status:'activate'
},
{
id:1,
date:ISODate('2022-12-02'),
status:'deactivate'
},
{
id:2,
date:ISODate('2022-12-21'),
status:'deactivate'
}
]
I need to get daywise customer status count.
I came up with below aggregation.
db.collection.aggregate([
{
$addFields: {
"day": {
"$dateToString": {
"format": "%Y-%m-%d",
"date": "$date"
}
}
}
},
{
$group: {
_id: "$day",
type: {
$push: "$status"
}
}
}
])
this way I can get status in a array. like below.
[
{
_id:"2022-12-01",
type:[
0:"activate",
1:"activate"
]
},
{
_id:"2022-12-02",
type:[
0:"deactivate"
]
},
{
_id:"2022-12-21",
type:[
0:"deactivate"
]
}
]
now it's working as intended. but I need the output like below.
[
{
_id:"2022-12-01",
type:{
"activate":2,
}
},
{
_id:"2022-12-02",
type:{
"deactivate":1
}
},
{
_id:"2022-12-21",
type:{
"deactivate":1
}
}
]
this table has around 100,000 documents and doing this programmatically will take about 10 seconds. that's why I'm searching a way to do this as a aggregation
One option is to group twice and then use $arrayToObject:
db.collection.aggregate([
{$group: {
_id: {day: "$date", status: "$status"},
count: {$sum: 1}
}},
{$group: {
_id: {$dateToString: {format: "%Y-%m-%d", date: "$_id.day"}},
data: {$push: {k: "$_id.status", v: "$count"}}
}},
{$project: {type: {$arrayToObject: "$data"}}}
])
See how it works on the playground example

MongoDB aggregate $group $sum that matches date inside array of objects

I'll explain my problem here and i'll put a tldr at the bottom summarizing the question.
We have a collection called apple_receipt, since we have some apple purchases in our application. That document has some fields that we will be using on this aggregation. Those are: price, currency, startedAt and history. Price, currency and startedAt are self-explanatory. History is a field that is an array of objects containing a price and startedAt. So, what we are trying to accomplish is a query that gets every document between a date of our choice, for example: 06-06-2020 through 10-10-2022 and get the total price combined of all those receipts that have a startedAt between that. We have a document like this:
{
price: 12.9,
currency: 'BRL',
startedAt: 2022-08-10T16:23:42.000+00:00
history: [
{
price: 12.9,
startedAt: 2022-05-10T16:23:42.000+00:00
},
{
price: 12.9,
startedAt: 2022-06-10T16:23:42.000+00:00
},
{
price: 12.9,
startedAt: 2022-07-10T16:23:42.000+00:00
}
]
}
If we query between dates 06-06-2022 to 10-10-2022, we would have a return like this: totalPrice: 38,7.
-total price of the 3 objects that have matched the date inside that value range-
I have tried this so far:
AppleReceipt.aggregate([
{
$project: {
price: 1,
startedAt: 1,
currency: 1,
history: 1,
}
},
{
$unwind: {
path: "$history",
preserveNullAndEmptyArrays: true,
}
},
{
$match: {
$or: [
{ startedAt: {$gte: new Date(filters.begin), $lt: new Date(filters.end)} },
]
}
},
{
$group: {
_id: "$_id",
data: { $push: '$$ROOT' },
totalAmountHelper: { $sum: '$history.price' }
}
},
{
$unwind: "$data"
},
{
$addFields: {
totalAmount: { $add: ['$totalAmountHelper', '$data.price'] }
}
}
])
It does bring me the total value but I couldn't know how to take into consideration the date to make the match stage to only get the sum of the documents that are between that date.
tl;dr: Want to make a query that gets the total sum of the prices of all documents that have startedAt between the dates we choose. Needs to match the ones inside history field - which is an array of objects, and also the startedAt outside of the history field.
https://mongoplayground.net/p/lOvRbX24QI9
db.collection.aggregate([
{
$set: {
"history_total": {
"$reduce": {
"input": "$history",
"initialValue": 0,
"in": {
$sum: [
{
"$cond": {
"if": {
$and: [
{
$gte: [
new Date("2022-06-06"),
{
$dateFromString: {
dateString: "$$this.startedAt"
}
}
]
},
{
$lt: [
{
$dateFromString: {
dateString: "$$this.startedAt"
}
},
new Date("2022-10-10")
]
},
]
},
"then": "$$this.price",
"else": 0
}
},
"$$value",
]
}
}
}
}
},
{
$set: {
"history_total": {
"$sum": [
"$price",
"$history_total"
]
}
}
}
])
Result:
[
{
"_id": ObjectId("5a934e000102030405000000"),
"currency": "BRL",
"history": [
{
"price": 12.9,
"startedAt": "2022-05-10T16:23:42.000+00:00"
},
{
"price": 12.9,
"startedAt": "2022-06-10T16:23:42.000+00:00"
},
{
"price": 12.9,
"startedAt": "2022-07-10T16:23:42.000+00:00"
}
],
"history_total": 325.79999999999995,
"price": 312.9,
"startedAt": "2022-08-10T16:23:42.000+00:00"
}
]
Kudos goes to #user20042973

Mongo DB aggregation query to get an attribute based on another array of objects

I am trying to
get all the ids for which the period.startDate > sysdate
get all the ids for which the period.endDate < sysdate
from the JSON.
[
{
"id": 1,
"period":
[
{
"startDate": "2020-05-05",
"endDate": "2020-05-06"
},
{
"startDate": "2020-06-05",
"endDate": "2020-06-06"
}
]
},
{
"id": 2,
"period":
[
{
"startDate": "2024-07-05",
"endDate": "2024-07-06"
},
{
"startDate": "2024-08-05",
"endDate": "2024-08-06"
}
]
}
]
I have tried to go far as below aggregation:
[{
$project: {
_id: 0,
sId: '$id',
period: 1
} }, {
$unwind: {
path: '$period',
includeArrayIndex: 'index'
} }, {
$group: {
_id: '$sId',
minDate: {
$min: '$periods.startDate'
}
} }, {
$project: {
storeId: '$_id',
_id: 0,
minDated: {
$dateFromString: {
dateString: '$minDate'
}
},
today: ISODate('2022-08-03T11:37:03.954Z')
} }]
One option is using $reduce and $group:
db.collection.aggregate([
{$project: {
_id: 0,
id: 1,
minDate: {
$dateFromString: {
dateString: {
$reduce: {
input: "$period",
initialValue: {$first: "$period.startDate"},
in: {$min: ["$$value", "$$this.startDate"]}
}
}
}
},
maxDate: {
$dateFromString: {
dateString: {
$reduce: {
input: "$period",
initialValue: {$first: "$period.endDate"},
in: {$max: ["$$value", "$$this.startDate"]}
}
}
}
}
}
},
{$group: {
_id: 0,
startDateLargerIds: {
$push: {
$cond: [{$gt: ["$minDate", ISODate("2022-08-03T11:37:03.954Z")]},
"$id", "$$REMOVE"]}
},
endDateSmallerIds: {
$push: {
$cond: [{$lt: ["$maxDate", ISODate("2022-08-03T11:37:03.954Z")]},
"$id", "$$REMOVE"]}
}
}
},
{$unset: "_id"}
])
See how it works on the playground example

Mongodb aggregate $group and count for date ranges

I have documents like these:
{
"_id" : ObjectId("5cc80389c723e046f504b5a9"),
"adddress" : "string",
"checkIn" : "2019-04-30T08:12:57.909Z"
},
{
"_id" : ObjectId("5cc995f5a6f3eb7c483b019f"),
"adddress" : "string",
"checkIn" : "2019-05-01T12:49:57.561Z"
}
I have tried aggrgation like this:
var start = new Date("2019-04-30T08:12:57.909Z");
var end = new Date("2019-05-01T12:49:57.561Z");
var pipeline = [
{
$match: {
checkIn: {
$gte: start,
$lte: end
}
}
},
{
$group: {
_id: {
year: {
$year: "$checkIn"
},
month: {
$month: "$checkIn"
},
day: {
$dayOfYear: "$checkIn"
}
},
count: {
$sum: 1
}
}
}];
db.collections.aggregate(pipeline).toArray()
Is it possible to count them by checkIn date and get result like this:
"_id": [{
"checkIn": "2019-03-15T00:00:00Z",
"count": 4
}, {
"checkIn": "2019-04-30T00:00:00Z",
"count": 1
}, {
"checkIn": "2019-05-10T00:00:00Z",
"count": 1
}],
The result is shown the total number of the day.
{$project: {
checkIn: { $dateToString: { format: '%Y-%m-%d', date: '$checkIn' } }
}},
{$group: {
_id: '$checkIn',
checkIn: {$first: '$checkIn'},
count: {$sum: 1}
}},
{$sort: {checkIn: 1}}
Try this: I have tested this query and its working.
db.sample.aggregate([{
$addFields: {
date: {
$dateFromString: {
dateString: "$checkIn"
}
}
}
},{
$match: {
date: {
$gte: start,
$lte: end
}
}
},
{
$addFields: {
dateString: {
$dateToString: {
format: "%Y-%m-%d",
date: "$date"
}
}
}
},
{
$group: {
_id: "$dateString",
count: {
$sum: 1
}
}
}
]);