Creating measures in a mongodb aggregation pipeline - mongodb

I have a report that has been developed in PowerBI. It runs over a collection of jobs, and for a given month and year counts the number of jobs that were created, due or completed in that month using measures.
I am attempting to reproduce this report using a mongoDB aggregation pipeline. At first, I thought I could just use the $group stage to do this, but quickly realised that grouping by a specific date would exclude jobs.
Some sample documents are below (most fields excluded as they are not relevant):
{
"_id": <UUID>,
"createdOn": ISODate("2022-07-01T00:00"),
"dueOn": ISODate("2022-08-01T00:00"),
"completedOn": ISODate("2022-07-29T00:00")
},
{
"_id": <UUID>,
"createdOn": ISODate("2022-06-01T00:00"),
"dueOn": ISODate("2022-08-01T00:00"),
"completedOn": ISODate("2022-07-24T00:00")
}
For example, if I group by created date, the record for July 2022 would show 1 created job and only 1 completed job, but it should show 2.
How can I go about recreating this report? One idea was that I needed to determine the minimum and maximum of all the possible dates across those 3 date fields in my collection, but I don't know where to go from there

I ended up solving this by using a facet. I followed this process:
Each facet field grouped by a different date field from the source document, and then aggregated the relevant field (e.g. counts, or sums as required). I ensured each of these fields in the facet had a unique name.
I then did a project stage where I took each of the facet stage fields (arrays), and concat them into a single array
I unwound the array, and then replaced the root to make it simpler to work with
I then grouped again by the _id field which was set to the relevant date during the facet field, and then grabbed the relevant fields.
The relevant parts of the pipeline are below:
db.getCollection("jobs").aggregate(
// Pipeline
[
// Stage 3
{
$facet: {
//Facet 1, group by created date, count number of jobs created
//facet 2, group by completed date, count number of jobs completed
//facet 3, group by due date, count number of jobs due
"created" : [
{
$addFields : {
"monthStarting" : {
"$dateFromString" : {
"dateString" : {
"$dateToString" : {
"date" : {
"$dateTrunc" : {
"date" : "$createdAt",
"unit" : "month",
"binSize" : 1.0,
"timezone" : "$timezone",
"startOfWeek" : "mon"
}
},
"timezone" : "$timezone"
}
}
}
},
"yearStarting" : {
"$dateFromString" : {
"dateString" : {
"$dateToString" : {
"date" : {
"$dateTrunc" : {
"date" : "$createdAt",
"unit" : "year",
"binSize" : 1.0,
"timezone" : "$timezone"
}
},
"timezone" : "$timezone"
}
}
}
}
}
},
{
$group : {
"_id" : {
"year" : "$yearStarting",
"month" : "$monthStarting"
},
"monthStarting" : {
"$first" : "$monthStarting"
},
"yearStarting" : {
"$first" : "$yearStarting"
},
"createdCount": {$sum: 1}
}
}
],
"completed" : [
{
$addFields : {
"monthStarting" : {
"$dateFromString" : {
"dateString" : {
"$dateToString" : {
"date" : {
"$dateTrunc" : {
"date" : "$completedDate",
"unit" : "month",
"binSize" : 1.0,
"timezone" : "$timezone",
"startOfWeek" : "mon"
}
},
"timezone" : "$timezone"
}
}
}
},
"yearStarting" : {
"$dateFromString" : {
"dateString" : {
"$dateToString" : {
"date" : {
"$dateTrunc" : {
"date" : "$completedDate",
"unit" : "year",
"binSize" : 1.0,
"timezone" : "$timezone"
}
},
"timezone" : "$timezone"
}
}
}
}
}
},
{
$group : {
"_id" : {
"year" : "$yearStarting",
"month" : "$monthStarting"
},
"monthStarting" : {
"$first" : "$monthStarting"
},
"yearStarting" : {
"$first" : "$yearStarting"
},
"completedCount": {$sum: 1}
}
}
],
"due": [
{
$match: {
"dueDate": {$ne: null}
}
},
{
$addFields : {
"monthStarting" : {
"$dateFromString" : {
"dateString" : {
"$dateToString" : {
"date" : {
"$dateTrunc" : {
"date" : "$dueDate",
"unit" : "month",
"binSize" : 1.0,
"timezone" : "$timezone",
"startOfWeek" : "mon"
}
},
"timezone" : "$timezone"
}
}
}
},
"yearStarting" : {
"$dateFromString" : {
"dateString" : {
"$dateToString" : {
"date" : {
"$dateTrunc" : {
"date" : "$dueDate",
"unit" : "year",
"binSize" : 1.0,
"timezone" : "$timezone"
}
},
"timezone" : "$timezone"
}
}
}
}
}
},
{
$group : {
"_id" : {
"year" : "$yearStarting",
"month" : "$monthStarting"
},
"monthStarting" : {
"$first" : "$monthStarting"
},
"yearStarting" : {
"$first" : "$yearStarting"
},
"dueCount": {$sum: 1},
"salesRevenue": {$sum: "$totalSellPrice"},
"costGenerated": {$sum: "$totalBuyPrice"},
"profit": {$sum: "$profit"},
"avgValue": {$avg: "$totalSellPrice"},
"finalisedRevenue": {$sum: {
$cond: {
"if": {$in: ["$status",["Finalised","Closed"]]},
"then": "$totalSellPrice",
"else": 0
}
}}
}
}
]
}
},
// Stage 4
{
$project: {
"docs": {$concatArrays: ["$created","$completed","$due"]}
}
},
// Stage 5
{
$unwind: {
path: "$docs",
}
},
// Stage 6
{
$replaceRoot: {
// specifications
"newRoot": "$docs"
}
},
// Stage 7
{
$group: {
_id: "$_id",
"monthStarting" : {
"$first" : "$monthStarting"
},
"yearStarting" : {
"$first" : "$yearStarting"
},
"monthStarting" : {
"$first" : "$monthStarting"
},
"createdCountSum" : {
"$sum" : "$createdCount"
},
"completedCountSum" : {
"$sum" : "$completedCount"
},
"dueCountSum" : {
"$sum" : "$dueCount"
},
"salesRevenue" : {
"$sum" : "$salesRevenue"
},
"costGenerated" : {
"$sum" : "$costGenerated"
},
"profit" : {
"$sum" : "$profit"
},
"finalisedRevenue" : {
"$sum" : "$finalisedRevenue"
},
"avgJobValue": {
$sum: "$avgValue"
}
}
},
],
);

Related

Counting distinct number of users from beginning

I have a MongoDB aggregation pipeline that has been frustrating me for a while now, because it never seems to be accurate or correct to my needs. The aim is to count the number of new unique users each day per chatbot, starting from the very beginning.
Here's what my pipeline looks like right now.
[
{
"$project" : {
"_id" : 0,
"bot_id" : 1,
"customer_id" : 1,
"timestamp" : {
"$ifNull" : [
'$incoming_log.created_at', '$outcome_log.created_at'
]
}
}
},
{
"$project" : {
"customer_id" : 1,
"bot_id" : 1,
"timestamp" : {
"$dateFromString" : {
"dateString" : {
"$substr" : [
"$timestamp", 0, 10
]
}
}
}
}
},
{
"$group" : {
"_id" : "$customer_id",
"timestamp" : {
"$first" : "$timestamp"
},
"bot_id" : {
"$addToSet" : "$bot_id"
}
}
},
{
"$unwind" : "$bot_id"
},
{
"$group" : {
"_id" : {
"bot_id" : "$bot_id",
"customer_id" : "$_id"
},
"timestamp" : {
"$first" : "$timestamp"
}
}
},
{
"$project" : {
"_id" : 0,
"timestamp" : 1,
"customer_id" : "$_id.customer_id",
"bot_id" : "$_id.bot_id"
}
},
{
"$group" : {
"_id": {
"timestamp" : "$timestamp",
"bot_id" : "$bot_id"
},
"new_users" : {
"$sum" : 1
}
}
},
{
"$project" : {
"_id" : 0,
"timestamp" : "$_id.timestamp",
"bot_id" : "$_id.bot_id",
"new_users" : 1
}
}
]
Some sample data for an idea of what the data looks like...
{
"mid" : "...",
"bot_id" : "...",
"bot_name" : "JOBBY",
"customer_id" : "U122...",
"incoming_log" : {
"created_at" : ISODate("2020-12-08T09:14:16.237Z"),
"event_payload" : "",
"event_type" : "text"
},
"outcome_log" : {
"created_at" : ISODate("2020-12-08T09:14:18.145Z"),
"distance" : 0.25,
"incoming_msg" : "đŸ„ș"
}
}
My expected outcome is something along the lines of:
{
"new_users" : 1187.0,
"timestamp" : ISODate("2021-01-27T00:00:00.000Z"),
"bot_id" : "5ffd......."
},
{
"new_users" : 1359.0,
"timestamp" : ISODate("2021-01-27T00:00:00.000Z"),
"bot_id" : "6def......."
}
Have I overcomplicated my pipeline somewhere? I seem to get a reasonable number of new users per bot each day, but for some reason my colleague tells me that the number is too high. I need some tips, please!
I have really no idea what you are looking for.
"The aim is to count the number of new unique users each day per chatbot, starting from the very beginning."
What is "new unique users"? What do you mean by "starting from the very beginning"? You ask for count per day but you use {"$group": {"_id": "$customer_id", "timestamp": { "$first": "$timestamp" } } }
For me your grouping does not make any sense. With only one single sample document, it is almost impossible to guess what you like to count.
Regarding group per day: I prefer to work always with Date values, rather than strings. It is less error prone. Maybe you have to consider time zones, because UTC midnight is not your local midnight. When you work with Dates then you have better control over it.
The $project stages are useless when you do $group afterwards. Typically you have only one $project stage at the end.
So, put something to start.
db.collection.aggregate([
{
$set: {
day: {
$dateToParts: {
date: { $ifNull: ["$incoming_log.created_at", "$outcome_log.created_at"] }
}
}
}
},
{
$group: {
_id: "$customer_id",
timestamp: {$min: { $dateFromParts: { year: "$day.year", month: "$day.month", day: "$day.day" } }}
}
}
]);

Partition data around a match query during aggregation

What I have been trying to get my head around is to perform some kind of partitioning(split by predicate) in a mongo query. My current query looks like:
db.posts.aggregate([
{"$match": { $and:[ {$or:[{"toggled":false},{"toggled":true, "status":"INACTIVE"}]} , {"updatedAt":{$gte:1549786260000}} ] }},
{"$unwind" :"$interests"},
{"$group" : {"_id": {"iid": "$interests", "pid":"$publisher"}, "count": {"$sum" : 1}}},
{"$project":{ _id: 0, "iid": "$_id.iid", "pid": "$_id.pid", "count": 1 }}
])
This results in the following output:
{
"count" : 3.0,
"iid" : "INT456",
"pid" : "P789"
}
{
"count" : 2.0,
"iid" : "INT789",
"pid" : "P789"
}
{
"count" : 1.0,
"iid" : "INT123",
"pid" : "P789"
}
{
"count" : 1.0,
"iid" : "INT123",
"pid" : "P123"
}
All good so far, but then I had realized that for the documents that match the specific filter {"toggled":true, "status":"INACTIVE"}, I would rather decrement the count (-1). (considering the eventual value can be negative as well.)
Is there a way to somehow partition the data after match to make sure different grouping operations are performed for both the collection of documents?
Something that sounds similar to what I am looking for is
$mergeObjects, or maybe $reduce, but not much that I can relate from the documentation examples.
Note: I can sense, one straightforward way to deal with this would be to perform two queries, but I am looking for a single query to perform the operation.
Sample documents for the above output would be:
/* 1 */
{
"_id" : ObjectId("5d1f7******"),
"id" : "CON123",
"title" : "Game",
"content" : {},
"status" : "ACTIVE",
"toggle":false,
"publisher" : "P789",
"interests" : [
"INT456"
],
"updatedAt" : NumberLong(1582078628264)
}
/* 2 */
{
"_id" : ObjectId("5d1f8******"),
"id" : "CON456",
"title" : "Home",
"content" : {},
"status" : "INACTIVE",
"toggle":true,
"publisher" : "P789",
"interests" : [
"INT456",
"INT789"
],
"updatedAt" : NumberLong(1582078628264)
}
/* 3 */
{
"_id" : ObjectId("5d0e9******"),
"id" : "CON654",
"title" : "School",
"content" : {},
"status" : "ACTIVE",
"toggle":false,
"publisher" : "P789",
"interests" : [
"INT123",
"INT456",
"INT789"
],
"updatedAt" : NumberLong(1582078628264)
}
/* 4 */
{
"_id" : ObjectId("5d207*******"),
"id" : "CON789",
"title":"Stack",
"content" : { },
"status" : "ACTIVE",
"toggle":false,
"publisher" : "P123",
"interests" : [
"INT123"
],
"updatedAt" : NumberLong(1582078628264)
}
What I am looking forward to as a result though is
{
"count" : 1.0, (2-1)
"iid" : "INT456",
"pid" : "P789"
}
{
"count" : 0.0, (1-1)
"iid" : "INT789",
"pid" : "P789"
}
{
"count" : 1.0,
"iid" : "INT123",
"pid" : "P789"
}
{
"count" : 1.0,
"iid" : "INT123",
"pid" : "P123"
}
This aggregation gives the desired result.
db.posts.aggregate( [
{ $match: { updatedAt: { $gte: 1549786260000 } } },
{ $facet: {
FALSE: [
{ $match: { toggle: false } },
{ $unwind : "$interests" },
{ $group : { _id : { iid: "$interests", pid: "$publisher" }, count: { $sum : 1 } } },
],
TRUE: [
{ $match: { toggle: true, status: "INACTIVE" } },
{ $unwind : "$interests" },
{ $group : { _id : { iid: "$interests", pid: "$publisher" }, count: { $sum : -1 } } },
]
} },
{ $project: { result: { $concatArrays: [ "$FALSE", "$TRUE" ] } } },
{ $unwind: "$result" },
{ $replaceRoot: { newRoot: "$result" } },
{ $group : { _id : "$_id", count: { $sum : "$count" } } },
{ $project:{ _id: 0, iid: "$_id.iid", pid: "$_id.pid", count: 1 } }
] )
[ EDIT ADD ]
The output from the query using the input data from the question post:
{ "count" : 1, "iid" : "INT123", "pid" : "P789" }
{ "count" : 1, "iid" : "INT123", "pid" : "P123" }
{ "count" : 0, "iid" : "INT789", "pid" : "P789" }
{ "count" : 1, "iid" : "INT456", "pid" : "P789" }
[ EDIT ADD 2 ]
This query gets the same result with different approach (code):
db.posts.aggregate( [
{
$match: { updatedAt: { $gte: 1549786260000 } }
},
{
$unwind : "$interests"
},
{
$group : {
_id : {
iid: "$interests",
pid: "$publisher"
},
count: {
$sum: {
$switch: {
branches: [
{ case: { $eq: [ "$toggle", false ] },
then: 1 },
{ case: { $and: [ { $eq: [ "$toggle", true] }, { $eq: [ "$status", "INACTIVE" ] } ] },
then: -1 }
]
}
}
}
}
},
{
$project:{
_id: 0,
iid: "$_id.iid",
pid: "$_id.pid",
count: 1
}
}
] )
[ EDIT ADD 3 ]
NOTE:
The facet query runs the two facets (TRUE and FALSE) on the same set of documents; it is like two queries running in parallel. But, there is some duplication of code as well as additional stages for shaping the documents down the pipeline to get the desired output.
The second query avoids the code duplication, and there are much lesser stages in the aggregation pipeline. This will make difference when the input dataset has a large number of documents to process - in terms of performance. In general, lesser stages means lesser iterations of the documents (as a stage has to scan the documents which are output from the previous stage).

Get average per year and label?

I can use this query to get the average sqmPrice for a myArea
db.getCollection('sold').aggregate([
{$match:{}},
{$group: {_id: "$myArea", "sqmPrice": {$avg: "$sqmPrice"} }}
])
Output:
[
{
"_id" : "Yttre Aspudden",
"sqmPrice" : 48845.7777777778
},
{
"_id" : "HÀgerstensÄsen",
"sqmPrice" : 120
}
]
I would like to group this by year, ideally an object that looks like this:
{
"Yttre Aspudden": {
2008: 1232,
2009: 1244
...
}
...
}
but the formatting is not the most important.
Here is a sample object, I would like to use soldDate:
{
"_id" : ObjectId("5beca41c78f21248ab47f4a6"),
"location" : {
"address" : {
"streetAddress" : "Ljusstöparbacken 26C"
},
"position" : {
"latitude" : 59.31427884,
"longitude" : 18.00892421
},
"namedAreas" : [
"HĂ€gersten-Liljeholmen"
],
"region" : {
"municipalityName" : "Stockholm",
"countyName" : "Stockholms lÀn"
},
"distance" : {
"ocean" : 3777
}
},
"listPrice" : 1895000,
"rent" : 1959,
"floor" : 1,
"livingArea" : 38.5,
"source" : {
"name" : "FastighetsbyrÄn",
"id" : 1573,
"type" : "Broker",
"url" : "http://www.fastighetsbyran.se/"
},
"rooms" : 1.5,
"published" : ISODate("2018-11-02T20:55:19.000Z"),
"constructionYear" : 1959,
"objectType" : "LĂ€genhet",
"booliId" : 3278478,
"soldDate" : ISODate("2018-11-14T00:00:00.000Z"),
"soldPrice" : 2620000,
"soldPriceSource" : "bid",
"url" : "https://www.booli.se/annons/3278478",
"publishedDays" : 1735,
"soldDays" : 1747,
"daysUp" : 160,
"street" : "Ljusstöparbacken",
"streetYear" : "Ljusstöparbacken HÀgersten-Liljeholmen 1959",
"yearDay" : 318,
"yearWeek" : 46,
"roughSize" : 40,
"sqmPrice" : 49221,
"myArea" : "Gröndal",
"hotlist" : true
}
You need to generate your keys dynamically so you have to use $arrayToObject. To build an object which aggregates the data you need three $group stages and to create new root of your document you can use $replaceRoot, try:
db.sold.aggregate([
{ $group: {_id: { area: "$myArea", year: { $year: "$soldDate" } }, "sqmPrice": {$avg: "$sqmPrice"} }},
{ $group: { _id: "$_id.area", avgs: { $push: { k: { $toString: "$_id.year" }, v: "$sqmPrice" } } } },
{ $group: { _id: null, areas: { $push: { k: "$_id", v: { $arrayToObject: "$avgs" } } } } },
{ $replaceRoot: { newRoot: { $arrayToObject: "$areas" } } }
])

Mongodb aggregation json format by month result

Hello I am working with the reporting api which will going to use in highcharts and I am new to mongodb.
Below is my aggregation query (suggest me modification) :
db.product_sold.aggregate({
$group: {
_id: { year: { $year: "$solddate" }, month: { $month: "$solddate" }, productid: "$productid" },
totalQty: { $sum: "$qty" }
}
})
Output:
{
"_id" : {
"year" : NumberInt(2019),
"month" : NumberInt(2),
"productid" : "11"
},
"totalQty" : 6.0
}
// ----------------------------------------------
{
"_id" : {
"year" : NumberInt(2019),
"month" : NumberInt(2),
"productid" : "14"
},
"totalQty" : 7.0
}
// ----------------------------------------------
{
"_id" : {
"year" : NumberInt(2019),
"month" : NumberInt(1),
"productid" : "13"
},
"totalQty" : 3.0
}
// ----------------------------------------------
{
"_id" : {
"year" : NumberInt(2019),
"month" : NumberInt(2),
"productid" : "10"
},
"totalQty" : 6.0
}
// ----------------------------------------------
{
"_id" : {
"year" : NumberInt(2018),
"month" : NumberInt(2),
"productid" : "12"
},
"totalQty" : 5.0
}
// ----------------------------------------------
{
"_id" : {
"year" : NumberInt(2019),
"month" : NumberInt(2),
"productid" : "15"
},
"totalQty" : 8.0
}
// ----------------------------------------------
{
"_id" : {
"year" : NumberInt(2019),
"month" : NumberInt(1),
"productid" : "11"
},
"totalQty" : 2.0
}
// ----------------------------------------------
What I want in output is something like :
status: 200,
msg: "SUCCESS"
data: [{
1:[
{
"productid": 11,
"totalQty": 3
},
{
"productid": 12,
"totalQty": 27
}
],
2:[
{
"productid": 11,
"totalQty": 64
},
{
"productid": 12,
"totalQty": 10
}
]
}]
For reference attaching the image of the collection
Is there any way to achieve it using aggregation or anything else or I will have to do it manually by code ?
You can append below aggreagation stages to your current pipeline:
db.product_sold.aggregate([
// your current $group stage
{
$group: {
_id: "$_id.month",
docs: { $push: { productid: "$_id.productid", totalQty: "$totalQty" } }
}
},
{
$project: {
_id: 0,
k: { $toString: "$_id" },
v: "$docs"
}
},
{
$group: {
_id: null,
data: { $push: "$$ROOT" }
}
},
{
$project: {
_id: 0,
data: { $arrayToObject: "$data" }
}
}
])
The idea here is that you can use $group with _id set to null to get all the data into single document and then use $arrayToObject to get month number as key and all the aggregates for that month as value.

Get lowest per date from multiple arrays in mongodb

I've the following structure of docs:
{
"_id" : ObjectId("5786458371d24d924d8b4575"),
"uniqueNumber" : "3899822714",
"lastUpdatedAt" : ISODate("2016-07-13T20:11:11.000Z"),
"new" : [
{
"price" : 8.4,
"created" : ISODate("2016-07-13T13:11:28.000Z")
},
{
"price" : 10.0,
"created" : ISODate("2016-07-13T14:50:56.000Z")
}
],
"used" : [
{
"price" : 10.99,
"created" : ISODate("2016-07-08T13:46:31.000Z")
},
{
"price" : 8.59,
"created" : ISODate("2016-07-13T13:11:28.000Z")
}
]
}
Now I need to get a list that gives me the lowest price of each array per date.
So, as example:
{
"uniqueNumber" : 1234,
"prices" : {
"created" : 2016-07-08,
"minNew" : 123,
"minUsed" : 22
}
}
By now I've built the following query
db.getCollection('col').aggregate([
{
$match : {
"uniqueNumber" : "3899822714"
}
},
{
$unwind : "$used"
},
{
$project : {
"uniqueNumber" : "$uniqueNumber",
"price" : "$used.price",
"ts" : "$used.created"
}
},
{
$sort : { "ts" : 1 }
},
{
$group : {_id: "$uniqueNumber", priceOfMaxTS : { $min: "$price" }, ts : { $last: "$ts" }}
}
]);
But this one will only give me the lowest price for the highest date. I couldn't really find anything that pushes me to the right direction to get the desired result.
UPDATE
I've found a way to get the lowest price of the used array grouped by day with this query:
db.getCollection('col').aggregate([
{
$match : {
"uniqueNumber" : "3899822714"
}
},
{
$unwind : "$used"
},
{
$project : {
"asin" : "$uniqueNumber",
"price" : "$used.price",
"ts" : "$used.created",
"y" : { "$year" : "$used.created" },
"m" : { "$month" : "$used.created" },
"d" : { "$dayOfMonth" : "$used.created" }
}
},
{
$group : { _id : { "year" : "$y", "month" : "$m", "day" : "$d" }, minPriceOfDay : { $min: "$price" }}
}
]);
No I only need to find a way to do this also to the new array in the same query.