Fill data with NULL value if it is not present in the timeperiod using mongodb aggregation pipeline - mongodb

I have to write an aggreagtion pipeline in which I will pass:
Timestamps of start date and end data for a day
I have to divide the data into 30min buckets and find data in between that buckets like:
2023-01-16T00:30:00.000+00:00 , 2023-01-16T01:00:00.000+00:00, 2023-01-16T01:30:00.000+00:00 and so on.
If data is not present in any particular bucket fill the values of that bucketa with zero but give the timestamp like:
2023-01-16T01:00:00.000+00:00 ther is no data give {timestamp:2023-01-16T01:00:00.000+00:00,a:0,b:0,c:0}
I have done the following:
[{
$match: {
$and: [
{
timestamp: {
$gte: ISODate('2023-01-16T00:00:00.000Z'),
$lt: ISODate('2023-01-16T23:59:59.000Z')
}
}
]
}
}, {
$group: {
_id: {
$toDate: {
$subtract: [
{
$toLong: '$timestamp'
},
{
$mod: [
{
$toLong: '$timestamp'
},
1800000
]
}
]
}
},
in: {
$sum: '$a'
},
out: {
$sum: '$b'
},
Count: {
$sum: 1
}
}
}, {
$addFields: {
totalIn: {
$add: [
'$in',
'$out'
]
},{
$sort: {
_id: 1
}
}]
Result is:
[{
"_id": {
"2023-01-16T12:00:00.000+00:00"
}
},
"totalIn": 397,
"count":22
},
{
"_id": {
"2023-01-16T01:30:00.000+00:00"
}
},
"totalIn": 222,
"count":2
}
...]
expected result:
[{
"_id": {
"2023-01-16T12:00:00.000+00:00"
}
},
"totalIn": 397,
"count":22
},
{
"_id": {
"2023-01-16T12:30:00.000+00:00"
}
},
"totalIn": 0,
"count":0
},
{
"_id": {
"2023-01-16T01:00:00.000+00:00"
}
},
"totalIn": 0,
"count":0
},
{
"_id": {
"2023-01-16T12:00:00.000+00:00"
}
},
"totalIn": 222,
"count":2
}
...]

One option is to use $range with $dateAdd:
db.collection.aggregate([
{$match: {timestamp: {
$gte: startDate,
$lt: endDate
}}},
{$group: {
_id: {$dateTrunc: {date: "$timestamp", unit: "minute", binSize: 30}},
in: {$sum: "$a"},
out: {$sum: "$b"},
count: {$sum: 1}
}},
{$group: {
_id: 0,
data: {$push: {
timestamp: "$_id",
totalIn: {$add: ["$in", "$out"]},
count: "$count"
}}
}},
{$project: {
_id: 0, data: 1,
bins: {$map: {
input: {$range: [
0,
{$multiply: [
{$dateDiff: {
startDate: startDate,
endDate: endDate,
unit: "hour"
}},
2
]}
]},
in: {$dateAdd: {
startDate: startDate,
unit: "minute",
amount: {$multiply: ["$$this", 30]}
}}
}}
}},
{$unwind: "$bins"},
{$set: {data: {$filter: {
input: "$data",
cond: {$eq: ["$bins", "$$this.timestamp"]}
}}}},
{$project: {
_id: "$bins",
count: {$ifNull: [{$first: "$data.count"}, 0]},
totalIn: {$ifNull: [{$first: "$data.totalIn"}, 0]}
}}
])
See how it works on the playground example

Related

Mongodb array values subtract then sum

I have a collection contains 2 statuses of orders "Shipped" and "Delivered". I want to calculate the average in hours
Formula
(Delivered 1 - Shipped 1) + (Delivered 2 - Shipped 2) + (Delivered N - Shipped N)/N
here is my collection
{
trackingHistory: [
{
status: 'Shipped',
time: ISODate("2022-11-22T06:30:49.000Z")
},
{
status: 'Delivered',
time: ISODate("2022-11-25T15:30:00.000Z")
}
]
},
{
trackingHistory: [
{
status: 'Shipped',
time: ISODate("2022-11-22T09:29:45.000Z")
},
{
status: 'Delivered',
time: ISODate("2022-11-23T19:26:00.000Z")
}
]
}
here is my code
db.client_order_news.aggregate([
{ $match : {
receiverCity : 'New York',
created_at:{$gte:ISODate("2022-11-01T00:00:00.398Z"),$lt:ISODate("2022-11-30T23:59:59.398Z")},
"trackingHistory. status":"Shipped",
"trackingHistory.status":"Delivered"
} },
{ $project : { _id : 0, trackingHistory : {$filter: {
input: '$trackingHistory',
as: 'tracking',
cond: {$or: [{ $eq: ['$$tracking.status', "Shipped"] }, { $eq: ['$$tracking.status',"Delivered"] }]}
}}, } },
{$project: { "$sum": ["$price", { "$subtract": ["$deposits.amount"] } ] }}
]).pretty()
If we can assume Delivered has always newer timestamp than Shipped, one option is to use a simple $dateDiff with a $group step:
db.collection.aggregate([
{$project: {trackingHistory: "$trackingHistory.time", _id: 0}},
{$group: {
_id: 0,
timeDiff: {
$push: {
$abs: {
$dateDiff: {
startDate: {$first: "$trackingHistory"},
endDate: {$last: "$trackingHistory"},
unit: "hour"
}
}
}
}
}
},
{$project: {averageHour: {$avg: "$timeDiff"}, _id: 0}}
])
See how it works on the playground example

how to use $match after $group in mongodb aggregation

I have 4 products. I want to know the count of product-4 for users who has product-1 or product-2
Sample data:
[
{
"user_id": 1,
"product_type": "product-1"
},
{
"user_id": 1,
"product_type": "product-4"
},
{
"user_id": 1,
"product_type": "product-4"
},
{
"user_id": 2,
"product_type": "product-1"
}
]
user-1 has two product-4 and one product-1 (that counts 2)
user-2 has only product-1, but no product-4 (hence that does not count)
This is how I tried
db.collection.aggregate([
{
$match: {
product_type: {
$in: [
"product-1​",
"product-2",
],
},
},
},
{
$group: {
_id: "$user_id",
},
},
{
$match: {
user_id: { $in: "$_id"}, // I want to use $group's result in here
product_type: "product-4",
},
}
]);
Expected results are:
[
{
"_id": 1,
"count": 2
},
{
"_id": 2,
"count": 0
}
]
Note:
I dont have a backend, I have to this using mongodb only.
Does this answer your question?
db.collection.aggregate([
{$group: {_id: "$user_id", data: {$push: "$product_type"}}},
{$match: {$expr: {$or: [
{$in: ["product-1", "$data"]},
{$in: ["product-2", "$data"]}
]}}},
{$project: {
count: {
$size: {
$filter: {
input: "$data",
cond: {$eq: ["$$this", "product-4"]}
}
}
}
}}
])
See how it works on the playground example

MongoDB - get datewise/houlty aggregate count of column

I have set of documents in my mongoDB collection. I am looking to get datewise aggregate count of document if date range is more than a day and hourly aggregate count for same column if date query is for single day. The data may have documents with same conversationId, hence it is necessary to group with conversationId as well.Below is sample of data for reference
[
{
"_id":"c438a671-2391-4b85-815c-ecfcb3d2bb54",
"status":"INTERNAL_UPDATE",
"conversationId":"ac44781d-caab-4410-a708-9d6db8480fc3",
"messageIds":[],
"messageId":"4dc02026-ac06-4eb1-aa59-e385fcce4a36",
"responseId":"0c00c83d-61c5-4937-846c-2e6a46aae857",
"conversation":{},
"message":{},
"params":{},
"timestamp":"2021-05-04T11:40:06.552Z",
"source":{}
},
{
"_id":"98370ddf-9ff8-4347-bab7-1f7777ab9e9d",
"status":"NEW",
"conversationId":"b5dc39d2-56a1-4eb6-a728-cdbe33dca580",
"messageIds":[],
"messageId":"ba94b839-f795-44f2-aea0-173d26006f14",
"responseId":"a2b75364-447b-4345-8008-2beccd6cbb34",
"conversation":{},
"message":{},
"params":{},
"timestamp":"2021-05-05T11:40:30.897Z",
"source":{}
},
{
"_id":"db1eae2b-62d9-455c-ab46-dbfc5baf8b67",
"status":"INTERNAL_UPDATE",
"conversationId":"b5dc39d2-56a1-4eb6-a728-cdbe33dcb584",
"messageIds":[],
"messageId":"b83c743b-d36e-4fdd-9c03-21988af47263",
"responseId":"97198c09-0130-48dc-a225-6d0faeff3116",
"conversation":{},
"message":{},
"params":{},
"timestamp":"2021-05-05T11:40:31.418Z",
"source":{}
},
{
"_id":"12a21495-f857-4f18-a06e-f8ba0b951ade",
"status":"NEW",
"conversationId":"8e37c704-add8-4f9f-8e70-d630c24f653b",
"messageIds":[],
"messageId":"51a48362-545c-4f9f-930b-42e4841fc974",
"responseId":"4691468b-a43b-41d1-83df-1349fb554bfa",
"conversation":{},
"message":{},
"params":{},
"timestamp":"2021-05-06T11:43:58.174Z",
"source":{}
},
{
"_id":"4afaa735-4618-40cf-8b4f-00ee83b2c3c5",
"status":"INTERNAL_UPDATE",
"conversationId":"8e37c704-add8-4f9f-8e70-d630c24f653b",
"messageIds":[],
"messageId":"7c860126-bf1e-41b2-a7d3-6bcec3e8d5fb",
"responseId":"09cec9a1-2621-481d-b527-d98b007ef5be",
"conversation":{},
"message":{},
"params":{},
"timestamp":"2021-05-06T11:43:58.736Z",
"source":{}
},
{
"_id":"cf8deeca-2cfd-497e-b92b-03204c84217a",
"status":"NEW",
"conversationId":"3c6870b5-88d6-4e21-8629-28137dea3fee",
"messageIds":[],
"messageId":"da84e414-2269-4812-8ddd-e2cd6c9be4fd",
"responseId":"ae1014b2-0cc1-41f0-9990-cf724ed67ab7",
"conversation":{},
"message":{},
"params":{},
"timestamp":"2021-05-06T13:37:55.060Z",
"source":{}
}
]
Presently I am able to group by conversationId, but unable to get data aggregated datewise or on hourly basis if date range is on single date.
Below is the query for same
db.documentName.aggregate([
{
'$match': {
'$and': [
{
timestamp: {
'$gte': ISODate('2021-05-01T00:00:00.000Z'),
'$lte': ISODate('2021-05-10T23:59:59.999Z')
}
},
{ 'source.author': { '$regex': 'user', '$options': 'i' } },
{},
{}
]
}
},
{ '$group': {
_id: {'conversationId': '$conversationId'} },
{ '$count': 'document_count' }
])
I have tried adding something like, $hour: '$timestamp' with comma separation beside conversationId in $group, but its of no use and is giving error.
The desired result I am trying to get for above data is, something like this
[{"date": "2021-05-04", "doc_count": 1},
{"date": "2021-05-05", "doc_count": 2},
{"date": "2021-05-06", "doc_count": 2}]
As for 2021-05-05 there are 2 docs with different conversationId, and for 2021-05-06 there are 3 docs in total but 2 documents have same conversationId hence aggregate count for 2021-05-06 is also 2. Hope this clarifies my quesiton.
The question is not entirely clear to me, but it sounds like you want something like this:
The groupId is a field to rebuild the date including the hour, or not, according to your condition:
EDIT:
db.collection.aggregate([
{$match: {
timestamp: {
$gte: ISODate("2021-05-01T00:00:00.000Z"),
$lte: ISODate("2021-05-07T23:59:59.999Z")
}
}
},
{$project: {
conversationId: 1,
groupId: {
$dateFromParts: {
year: {$year: "$timestamp"},
month: {$month: "$timestamp"},
day: {$dayOfMonth: "$timestamp"},
hour: {$cond: [
{$gte: [
{$dateDiff: {
startDate: ISODate("2021-05-01T00:00:00.000Z"),
endDate: ISODate("2021-05-07T23:59:59.999Z"),
unit: "day"}}, 1]},
0,
{$hour: "$timestamp"}]}
}
}
}
},
{$group: {_id: {conversationId: "$conversationId", groupId: "$groupId"}}},
{$group: {_id: "$_id.groupId", doc_count: {$sum: 1}}},
{$project: {date: {$toString: "$_id"}, doc_count: 1, _id: 0}}
])
See how it works on the playground example
As suggested by #nimrodserok, for mongo version 4.2.9 the query would be
db.collection.aggregate([
{
$match: {
timestamp: {
$gte: ISODate("2021-05-01T00:00:00.000Z"),
$lte: ISODate("2021-05-07T23:59:59.999Z")
}
}
},
{
$project: {
conversationId: 1,
groupId: {
$dateFromParts: {
year: {
$year: "$timestamp"
},
month: {
$month: "$timestamp"
},
day: {
$dayOfMonth: "$timestamp"
},
hour: {
$cond: [
{
$gte: [
{
$subtract: [
{
$toLong: ISODate("2021-05-07T23:59:59.999Z")
},
{
$toLong: ISODate("2021-05-01T00:00:00.000Z")
}
]
},
86400000
]
},
0,
{
$hour: "$timestamp"
}
]
}
}
}
}
},
{
$group: {
_id: {
conversationId: "$conversationId",
groupId: "$groupId"
}
}
},
{
$group: {
_id: "$_id.groupId",
doc_count: {
$sum: 1
}
}
},
{
$project: {
date: {
$toString: "$_id"
},
doc_count: 1,
_id: 0
}
}
])

Mongo DB aggregation query to get an attribute based on another array of objects

I am trying to
get all the ids for which the period.startDate > sysdate
get all the ids for which the period.endDate < sysdate
from the JSON.
[
{
"id": 1,
"period":
[
{
"startDate": "2020-05-05",
"endDate": "2020-05-06"
},
{
"startDate": "2020-06-05",
"endDate": "2020-06-06"
}
]
},
{
"id": 2,
"period":
[
{
"startDate": "2024-07-05",
"endDate": "2024-07-06"
},
{
"startDate": "2024-08-05",
"endDate": "2024-08-06"
}
]
}
]
I have tried to go far as below aggregation:
[{
$project: {
_id: 0,
sId: '$id',
period: 1
} }, {
$unwind: {
path: '$period',
includeArrayIndex: 'index'
} }, {
$group: {
_id: '$sId',
minDate: {
$min: '$periods.startDate'
}
} }, {
$project: {
storeId: '$_id',
_id: 0,
minDated: {
$dateFromString: {
dateString: '$minDate'
}
},
today: ISODate('2022-08-03T11:37:03.954Z')
} }]
One option is using $reduce and $group:
db.collection.aggregate([
{$project: {
_id: 0,
id: 1,
minDate: {
$dateFromString: {
dateString: {
$reduce: {
input: "$period",
initialValue: {$first: "$period.startDate"},
in: {$min: ["$$value", "$$this.startDate"]}
}
}
}
},
maxDate: {
$dateFromString: {
dateString: {
$reduce: {
input: "$period",
initialValue: {$first: "$period.endDate"},
in: {$max: ["$$value", "$$this.startDate"]}
}
}
}
}
}
},
{$group: {
_id: 0,
startDateLargerIds: {
$push: {
$cond: [{$gt: ["$minDate", ISODate("2022-08-03T11:37:03.954Z")]},
"$id", "$$REMOVE"]}
},
endDateSmallerIds: {
$push: {
$cond: [{$lt: ["$maxDate", ISODate("2022-08-03T11:37:03.954Z")]},
"$id", "$$REMOVE"]}
}
}
},
{$unset: "_id"}
])
See how it works on the playground example

Get current state from snapshot documents - mongoDB

I'm trying to get a list of current holders at specific times from a collection. My collection looks like this:
[
{
"time": 1,
"holdings": [
{ "owner": "A", "tokens": 2 },
{ "owner": "B", "tokens": 1 }
]
},
{
"time": 2,
"holdings": [
{ "owner": "B", "tokens": 2 }
]
},
{
"time": 3,
"holdings": [
{ "owner": "A", "tokens": 3 },
{ "owner": "B", "tokens": 1 },
{ "owner": "C", "tokens": 1 }
]
},
{
"time": 4,
"holdings": [
{ "owner": "C", "tokens": 0 }
]
}
]
tokens show the current holdings of an owner if the holdings have changed to the last document. I would like to change the collection so that holdings always includes the full current holdings for any point in time.
At time: 1, the holdings are: A: 2, B: 1.
At time: 2, the holdings are: A: 2, B: 2. The collections does not include A's holdings however, because they haven't changed. So what I'd like to get is:
[
{
"time": 1,
"holdings": [
{ "owner": "A", "tokens": 2 },
{ "owner": "B", "tokens": 1 }
]
},
{
"time": 2,
"holdings": [
{ "owner": "A", "tokens": 2 }, // merged from prev doc.
{ "owner": "B", "tokens": 2 }
]
},
{
"time": 3,
"holdings": [
{ "owner": "A", "tokens": 3 },
{ "owner": "B", "tokens": 1 },
{ "owner": "C", "tokens": 1 }
]
},
{
"time": 4,
"holdings": [
{ "owner": "A", "tokens": 3 }, // merged from prev
{ "owner": "B", "tokens": 1 }, // merged from prev
{ "owner": "C", "tokens": 0 }
]
}
]
From what I understand $mergeObjects does that, but I don't understand how I can merge all previous docs in order up to the current doc for each doc. So I'm looking for a way to combine setWindowFields with mergeObjects I think.
This is a nice challenge.
So far, I got this complicated solution:
Get all of our timestamps in all of our documents. This is the purpose of the first 4 steps. $setWindowFields is used to accumulate this data.
$group by owner and calculate the empty timestamps as wantedTimes- next 5 steps.
$set empty timestamps with tokens: null to be filled with actual data and $unwind to separate - next 3 steps
Use $setWindowFields to find the last known token for each owner at each timestamp.
Fill this last known state for documents with unknown token - 2 steps
$group and format answer:
db.collection.aggregate([
{
$setWindowFields: {
sortBy: {time: 1},
output: {
allTimes: {$addToSet: "$time", window: {documents: ["unbounded", "current"]}
}
}
}
},
{
$setWindowFields: {
sortBy: {time: -1},
output: {
allTimes: {$addToSet: "$allTimes", window: {documents: ["unbounded", "current"]}
}
}
}
},
{
$set: {
allTimes: {
$reduce: {
input: "$allTimes",
initialValue: [],
in: {"$concatArrays": ["$$value", "$$this"]}
}
}
}
},
{$set: {allTimes: {$setIntersection: "$allTimes"}}},
{$unwind: "$holdings"},
{$sort: {time: 1}},
{$group: { _id: "$holdings.owner",
tokens: {$push: {tokens: "$holdings.tokens", time: "$time"}},
times: {$push: "$time"}, firstTime: {$first: "$time"},
allTimes: {$first: "$allTimes"}}
},
{
$addFields: {
wantedTimes: {
$filter: {
input: "$allTimes",
as: "item",
cond: {$gte: ["$$item", "$firstTime"]}
}
}
}
},
{
$project: {
tokens: 1,
wantedTimes: {$setDifference: ["$wantedTimes", "$times"]}
}
},
{
$set: {
data: {
$map: {
input: "$wantedTimes",
as: "item",
in: {time: "$$item", tokens: null}
}
}
}
},
{$project: {tokens: {"$concatArrays": ["$tokens", "$data"]}}},
{$unwind: "$tokens"},
{
$setWindowFields: {
partitionBy: "$_id",
sortBy: {"tokens.time": 1},
output: {
lastTokens: {
$push: "$tokens.tokens",
window: {documents: ["unbounded", "current"]}
}
}
}
},
{
$set: {
lastTokens: {
$filter: {
input: "$lastTokens",
as: "item",
cond: {$ne: ["$$item", null]}
}
}
}
},
{
$set: {
"tokens.tokens": {$ifNull: ["$tokens.tokens", {$last: "$lastTokens"}]}
}
},
{
$group: {
_id: "$tokens.time",
holdings: {$push: {owner: "$_id", tokens: "$tokens.tokens" }}
}
},
{$project: {time: "$_id", holdings: 1, _id: 0}},
{$sort: {time: 1}}
])
Playground example
From a performance perspective I recommend you split it into 2 calls, the first will be a quick findOne just to get the maximum time value in the collection.
Once you have that value the pipeline can be much leaner:
const maxItem = await db.collection.findOne({}).sort({ time: -1 });
db.collection.aggregate([
{
$unwind: "$holdings"
},
{
$group: {
_id: "$holdings.owner",
times: {
$push: {
time: "$time",
tokens: "$holdings.tokens"
}
},
minTime: {
$min: "$time"
}
}
},
{
$addFields: {
times: {
$reduce: {
input: {
$range: [
"$minTime",
maxItem.time + 1 // this is max time
]
},
initialValue: {
values: [],
lastIndex: 0
},
in: {
values: {
"$concatArrays": [
"$$value.values",
[
{
$cond: [
{
$in: [
"$$this",
"$times.time"
]
},
{
"$arrayElemAt": [
"$times",
"$$value.lastIndex"
]
},
{
"$mergeObjects": [
{
tokens: 0
},
{
"$arrayElemAt": [
"$times",
{
$subtract: [
"$$value.lastIndex",
1
]
}
]
},
{
time: "$$this"
}
]
}
]
}
]
]
},
lastIndex: {
$cond: [
{
$in: [
"$$this",
"$times.time"
]
},
{
$sum: [
"$$value.lastIndex",
1
]
},
"$$value.lastIndex"
]
}
}
}
}
}
},
{
$unwind: "$times.values"
},
{
$group: {
_id: "$times.values.time",
holdings: {
$push: {
owner: "$_id",
tokens: "$times.values.tokens"
}
}
}
},
{
$project: {
_id: 0,
time: "$_id",
holdings: 1
}
},
{
$sort: {
time: 1
}
}
])
This is still quite a heavy query as it requires to $unwind and $group the entire collection, however there is no workaround this due to the requirements. if the collection is too big for this approach I recommend iteration owner by owner, or time by time and doing separate updates accordingly.
Mongo Playground
If you don't care about performance at all and want it in a single query you can still use the same pipeline, you will have to first extract the max time in the collection, this will require you to add an initial $group stage, like so:
db.collection.aggregate([
{
$group: {
_id: null,
maxTime: {
$max: "$time"
},
roots: {
$push: "$$ROOT"
}
}
},
{
$unwind: "$roots"
},
{
$replaceRoot: {
newRoot: {
"$mergeObjects": [
"$roots",
{
maxTime: "$maxTime"
}
]
}
}
},
... same pipeline ...
])