MongoDB optimizing big aggregation queries - mongodb

I have a collection of documents in MongoDB representing some entity. For every entity, some statistics are gathered on a daily basis. The statistics are stored as separate documents in different collections.
Entity collection schema:
{
_id: ObjectId,
filterField1: String, //indexed
filterField2: String, //indexed
}
Example schema of statistics collection:
{
_id: ObjectId,
entityId: ObjectId, //indexed
statisticsValue: Int32,
date: Date //indexed
}
There is a dashboard that needs to display some aggregated statistics based on the gathered data over some time period, e.g. average value, sum, count, etc. The dashboard enables filtering entities in/out and applying different date ranges, which makes precalculating those aggregated statistics impossible.
So far, I've been using an aggregation pipeline to:
apply the filters on the entities collection (using a $match stage)
perform the necessary $lookup stages to acquire statistics for aggregation
do the grouping and aggregation (avg, sum, count, etc.)
Here is the pipeline:
db.getCollection('entities').aggregate([
{ $match: { $expr: { $and: [
// ENTITIES FILTERS based on filterField1 and filterField2 fields
] } } },
{ $lookup: {
from: 'statistics',
let: { entityId: '$_id' },
pipeline: [{ $match: { $expr: { $and: [
{ $eq: ["$entityId", "$$entityId"] },
{ $gte: [ "$date", new ISODate("2022-06-01T00:00:00Z") ] },
{ $lte: [ "$date", new ISODate("2022-06-01T23:59:59Z") ] },
] } } },
as: 'stats_start_date_range',
} },
{ $lookup: {
from: 'statistics',
let: { entityId: '$_id' },
pipeline: [{ $match: { $expr: { $and: [
{ $eq: ["$entityId", "$$entityId"] },
{ $gte: [ "$date", new ISODate("2022-06-31T00:00:00Z") ] },
{ $lte: [ "$date", new ISODate("2022-06-31T23:59:59Z") ] },
] } } },
as: 'stats_end_date_range',
} },
{ $addFields:
{
start_stats: { $first: "$stats_start_date_range" },
end_stats: { $first: "$stats_end_date_range" }
}
},
{
$group: {
_id: null,
avg_start: { $avg: "$start_stats.statisticsValue" },
avg_end: { $avg: "$end_stats.statisticsValue" }
}
}
])
For this query, the expected result is the average value of the statisticsValue field on the start and end dates, across every entity matching the filters.
I applied an index on the field used to left-join the collections in the $lookup stage, as well as on the date field used for getting statistics for a specific date.
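Roughly, those indexes look like this (a sketch; the exact definitions may differ):
// single-field indexes on the statistics collection, as described above
db.statistics.createIndex({ entityId: 1 })
db.statistics.createIndex({ date: 1 })
// possible alternative: one compound index covering both $lookup predicates
// db.statistics.createIndex({ entityId: 1, date: 1 })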
The problem is that the query takes about 1 second for the maximum number of documents after the match stage (about 1000 documents), and I need to perform 4 such queries. The statistics collection contains 800k documents, and the number is growing every day.
I was wondering if I can do anything to make the query execution faster. I considered:
a time series collection (see the sketch below)
reorganizing the collection structure (I don't know how)
merging those 4 separate queries into 1 using a $facet stage
But I'm not sure whether MongoDB is a suitable data source for such operations, and maybe I should consider another one if I want to perform such queries.
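For reference, a minimal sketch of what a time series collection for the daily statistics might look like (requires MongoDB 5.0+; the collection name and granularity here are assumptions):
// hypothetical time series collection for the statistics documents
db.createCollection("statistics_ts", {
  timeseries: {
    timeField: "date",      // the BSON date of the measurement
    metaField: "entityId",  // groups measurements per entity
    granularity: "hours"
  }
})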

It's hard to guess exactly what you would like to get. An approach could be this one:
const entities = db.getCollection('entities').aggregate([
{ $match: { filterField1: "a" } }
]).toArray().map(x => x._id)
db.getCollection('statistics').aggregate([
{
$match: {
entityId: { $in: entities },
date: {
$gte: ISODate("2022-06-01T00:00:00Z"),
$lte: ISODate("2022-06-31T23:59:59Z")
}
}
},
{
$facet: {
stats_start_date_range: [
{
$match: {
date: {
$gte: ISODate("2022-06-01T00:00:00Z"),
$lte: ISODate("2022-06-01T23:59:59Z")
}
}
}
],
stats_end_date_range: [
{
$match: {
date: {
$gte: ISODate("2022-06-31T00:00:00Z"),
$lte: ISODate("2022-06-31T23:59:59Z")
}
}
}
]
}
},
{
$project: {
// after $facet there is exactly one document holding both arrays,
// so average the arrays of statisticsValue values directly
avg_start: { $avg: "$stats_start_date_range.statisticsValue" },
avg_end: { $avg: "$stats_end_date_range.statisticsValue" }
}
}
]);
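Since the question mentions running 4 such queries, additional $facet branches can compute the remaining aggregates in the same pass over the matched statistics. A hypothetical extra branch (names assumed) could look like this:
{
  $facet: {
    stats_start_date_range: [ /* as above */ ],
    stats_end_date_range: [ /* as above */ ],
    // hypothetical extra branch: sum and count over the whole matched range
    totals_whole_range: [
      { $group: { _id: null, total: { $sum: "$statisticsValue" }, count: { $sum: 1 } } }
    ]
  }
}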

Related

MongoDB aggregate performance

I have two collections: one is bids and the other is auctions. I am able to get a customer-wise count of bids. The bids collection has almost one million records and the auctions collection has 500k records. I need to get the same result faster in MongoDB.
The response currently takes almost 29 seconds, but I need a quicker response time.
db.getCollection('bids').aggregate([
{ $match: { customer: '00000000823026' } },
{ $group: {
_id: '$auctioncode',
data: {
$last: '$$ROOT'
}
}
},
{
$lookup: {
from: 'auctions',
let: { auctioncode: '$_id' },
pipeline: [
{
$match: {
$expr: {
$and: [{ $eq: ['$_id', '$$auctioncode'] }],
},
},
},
],
as: 'auction',
},
},
{ $match: { auction: { $exists: true, $not: { $size: 0 } } } },
{
$addFields: {
_id: '$data._id',
auctioncode: '$data.auctioncode',
amount: '$data.amount',
customer: '$data.customer',
customerName: '$data.customerName',
maxBid: '$data.maxBid',
stockcode: '$data.stockcode',
watchlistHidden: '$data.watchlistHidden',
winner: '$data.winner'
}
},
{
$match: {
$and: [
{
// to get only RUNNING auctions in bid history
'auction.status': { $ne: 'RUNNING'},
// to filter auctions based on status
// tslint:disable-next-line:max-line-length
},
],
},
},
{ $sort: { 'auction.enddate': -1 } },
{ $count: 'totalCount' }
])
The current result is totalCount: 2640.
How can I optimize this query and improve its performance in MongoDB?
If all that you require is the count of results, the code below is more optimized.
Index the customer key for even better execution time.
Note: You can make use of the pipeline form of $lookup if you are using MongoDB version >= 5.0, as it can make use of indexes, unlike lower versions.
db.collection.aggregate([
{
$match: {
customer: '00000000823026'
}
},
{
$group: {
_id: '$auctioncode',
data: {
$last: '$$ROOT'
}
}
},
{
$lookup: {
from: 'auctions',
localField: '_id',
foreignField: '_id',
as: 'auction',
},
},
{
$match: {
// auction: { $ne: [] },
// to filter auctions based on status
'auction.status': { $ne: 'RUNNING' }
}
},
// The $addFields and $sort stages from the original pipeline are not
// required when only the count is needed.
{ $count: 'totalCount' }
], {
allowDiskUse: true
})
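As a sketch of the index suggested above (using the bids collection name from the question):
// lets the initial { $match: { customer: ... } } stage use an index scan
// instead of a collection scan
db.bids.createIndex({ customer: 1 })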

How to find duplicate records based on an id and a datetime field in MongoDB?

I have a MongoDB collection with millions of records. Sample records are shown below:
[
{
_id: ObjectId("609977b0e8e1c615cb551bf5"),
activityId: "123456789",
updateDateTime: "2021-03-24T20:12:02Z"
},
{
_id: ObjectId("739177b0e8e1c615cb551bf5"),
activityId: "123456789",
updateDateTime: "2021-03-24T20:15:02Z"
},
{
_id: ObjectId("805577b0e8e1c615cb551bf5"),
activityId: "123456789",
updateDateTime: "2021-03-24T20:18:02Z"
}
]
Multiple records can have the same activityId; in that case I want just the record that has the largest updateDateTime.
I have tried doing this and it works fine on a smaller collection but times out on a large collection.
[
{
$lookup: {
from: "MY_TABLE",
let: {
existing_date: "$updateDateTime",
existing_sensorActivityId: "$activityId"
},
pipeline: [
{
$match: {
$expr: {
$and: [
{ $eq: ["$activityId", "$$existing_sensorActivityId"] },
{ $gt: ["$updateDateTime", "$$existing_date"] }
]
}
}
}
],
as: "matched_records"
}
},
{ $match: { "matched_records.0": { $exists: true } } },
{ $project: { _id: 1 } }
]
This gives me the _ids of all the records which have the same activityId but a smaller updateDateTime.
The slowness occurs at this step: "matched_records.0": { $exists: true }
Is there a way to speed up this step, or is there any other approach to this problem?
Instead of finding the duplicate documents and deleting them, you can find the unique documents and write the result to a new collection using $out.
How to find unique documents?
$sort by updateDateTime in descending order
$group by activityId and get first root record
$replaceRoot to replace record in root
$out to write query result in new collection
[
{ $sort: { updateDateTime: -1 } },
{
$group: {
_id: "$activityId",
record: { $first: "$$ROOT" }
}
},
{ $replaceRoot: { newRoot: "$record" } },
{ $out: "newCollectionName" } // set new collection name
]
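If the pipeline is still slow, an index supporting the leading $sort may help (a sketch; MY_TABLE is the collection name used in the question):
// lets the initial { $sort: { updateDateTime: -1 } } use an index
// instead of an in-memory sort over millions of documents
db.MY_TABLE.createIndex({ updateDateTime: -1 })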

Optimization and Indexing on Mongo Query

Please help me figure out what kind of indexes need to be created and which fields should be indexed.
I have tested multiple indexes, but it is still taking a long time to execute.
db.collection_1.aggregate([
{ $match: { $and: [
{ date: { $gte: new Date(1593561600000), $lt: new Date(1604966400000) } },
{ type: 0 },
{ $or: [ { partysr: 0 }, {} ] },
{ $or: [ { code: "******" }, { _id: { $type: -1 } } ] }
] } },
{ $sort: { date: -1 } },
{ $skip: 0 },
{ $limit: 100 },
{ $lookup: { from: "collection_2", localField: "code", foreignField: "code", as: "j" } },
{ $group: { _id: "$codeTsr" } }
]).explain("executionStats")
You gave us very little information here.
Have you tried testing this query with and without the $lookup stage?
How does the query behave without lookup speed-wise?
My first guess here is that your collection_2 collection does not have a proper index, and that slows the query down. If your query is much faster without the lookup stage, I would create an index on collection_2 for the "code" property.
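For example, a minimal sketch (assuming the collection name from the question):
// index on the foreign field used by the $lookup stage
db.collection_2.createIndex({ code: 1 })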
Also, one more performance optimization might be to do the $group stage first and run the $lookup stage only after it, as in the sketch below.
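A rough sketch of that reordering (this assumes one representative code value per codeTsr group is enough for the lookup; adjust to your needs):
db.collection_1.aggregate([
  { $match: { $and: [
    { date: { $gte: new Date(1593561600000), $lt: new Date(1604966400000) } },
    { type: 0 },
    { $or: [ { partysr: 0 }, {} ] },
    { $or: [ { code: "******" }, { _id: { $type: -1 } } ] }
  ] } },
  { $sort: { date: -1 } },
  { $skip: 0 },
  { $limit: 100 },
  // group first, so the lookup runs once per group instead of once per document
  { $group: { _id: "$codeTsr", code: { $first: "$code" } } },
  { $lookup: { from: "collection_2", localField: "code", foreignField: "code", as: "j" } }
])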

MongoDB aggregates over row collections of predefined size

Is it possible in MongoDB to perform aggregations over groups of a predefined number of rows, rather than grouping by a column? For example, I want to calculate the average for every 1000 rows instead of grouping by a certain column.
A smaller example: given the input below, I would like to calculate the average Rating of every 4 consecutive rows, so that each result covers a range of items together with its average rating.
Below is the input data in JSON:
[{"ItemName":"Item1","Rating":4},
{"ItemName":"Item2","Rating":4},
{"ItemName":"Item2","Rating":4},
{"ItemName":"Item3","Rating":2},
{"ItemName":"Item4","Rating":5},
{"ItemName":"Item5","Rating":4},
{"ItemName":"Item6","Rating":2},
{"ItemName":"Item7","Rating":4},
{"ItemName":"Item8","Rating":1},
{"ItemName":"Item9","Rating":4},
{"ItemName":"Item10","Rating":3},
{"ItemName":"Item11","Rating":2},
{"ItemName":"Item12","Rating":2}]
There is no easy way. You will need to group the entire collection into an array, which may require allowDiskUse for large datasets and has a huge performance impact.
db.collection.aggregate([
// count all documents
{ $group: {
_id: null,
cnt: { $sum: 1},
docs: { $push: "$$ROOT" }
} },
// add _batch field to group documents by
{ $project: {
_id: 0,
docs: { $map: {
// add a sequential number to each
input: { $zip: {
inputs: [ "$docs", { $range: [ 0, "$cnt" ] } ]
} },
as: "doc",
in: { $mergeObjects: [
{ $arrayElemAt: [ "$$doc", 0 ] },
// split it into batches of 4 based on the sequential number
// (floor(index / 4) + 1 puts indexes 0-3 in batch 1, 4-7 in batch 2, ...)
{ _batch: { $add: [
{ $floor: { $divide: [ { $arrayElemAt: [ "$$doc", 1 ] }, 4 ] } },
1
] } }
] }
} }
} },
{ $unwind: "$docs" },
{ $replaceRoot: { newRoot: "$docs" } },
// ensure original order, only if you need ItemRange as a string
{ $sort: { _id: 1 } },
// calculate averages per batch
{ $group: {
_id: "$_batch",
start: { $first: "$ItemName" }, // only if you need ItemRange as a string
end: { $last: "$ItemName" }, // only if you need ItemRange as a string
RatingAvg: {$avg: "$Rating"}
} },
// only if you need them in order
{ $sort: { _id: 1 } },
// calculate ItemRange, only if you need ItemRange as a string
{ $project: {
_id: 0,
ItemRange: { $concat: [ "$start", "-", "$end" ] },
RatingAvg: 1
} },
])
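With the sample input above, the pipeline should produce something along these lines (each batch covers 4 consecutive documents, with the remainder in the last batch):
[
  { "ItemRange": "Item1-Item3", "RatingAvg": 3.5 },    // Item1, Item2, Item2, Item3
  { "ItemRange": "Item4-Item7", "RatingAvg": 3.75 },   // Item4..Item7
  { "ItemRange": "Item8-Item11", "RatingAvg": 2.5 },   // Item8..Item11
  { "ItemRange": "Item12-Item12", "RatingAvg": 2 }     // the single remaining document
]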
I'm not sure about the practical use case, as all the averages will change when you remove e.g. the first document.
Anyway, if you don't need ItemRange in the "FirstItem-LastItem" format and can live with the batch number instead, you can skip the two last in-memory sorts, which should improve performance.

Retrieving a count that matches specified criteria in a $group aggregation

So I am looking to group documents in my collection on a specific field, and for the output results of each group, I am looking to include the following:
A count of all documents in the group that match a specific query (i.e. a count of documents that satisfy some expression { "$Property": "Value" })
The total number of documents in the group
(Bonus, as I suspect that this is not easily accomplished) Properties of a document that correspond to a $min/$max accumulator
I am very new to the query syntax used in MongoDB and don't quite understand how it all works, but after some research I've managed to get it down to the following query (please note, I am currently using MongoDB 3.0.12, but I believe we will upgrade in a couple of months' time):
db.getCollection('myCollection').aggregate(
[
{
$group: {
_id: {
GroupID: "$GroupID",
Status: "$Status"
},
total: { $sum: 1 },
GroupName: { $first: "$GroupName" },
EarliestCreatedDate: { $min: "$DateCreated" },
LastModifiedDate: { $max: "$LastModifiedDate" }
}
},
{
$group: {
_id: "$_id.GroupID",
Statuses: {
$push: {
Status: "$_id.Status",
Count: "$total"
}
},
TotalCount: { $sum: "$total" },
GroupName: { $first: "$GroupName" },
EarliestCreatedDate: { $min: "$EarliestCreatedDate" },
LastModifiedDate: { $max: "$LastModifiedDate" }
}
}
]
)
Essentially what I am looking to retrieve is the Count for specific Status values, and project them into one final result document that looks like the following:
{
GroupName,
EarliestCreatedDate,
EarliestCreatedBy,
LastModifiedDate,
LastModifiedBy,
TotalCount,
PendingCount,
ClosedCount
}
Where PendingCount and ClosedCount are the total number of documents in each group that have a status Pending/Closed. I suspect I need to use $project with some other expression to extract this value, but I don't really understand the aggregation pipeline well enough to figure this out.
Also the EarliestCreatedBy and LastModifiedBy are the users who created/modified the document(s) corresponding to the EarliestCreatedDate and LastModifiedDate respectively. As I mentioned, I think retrieving these values will add another layer of complexity, so if there is no practical solution, I am willing to forgo this requirement.
Any suggestions/tips would be very much appreciated.
You can try the aggregation stages below.
$group
Calculate all the necessary counts TotalCount, PendingCount and ClosedCount for each GroupID.
Calculate the $min and $max for EarliestCreatedDate and LastModifiedDate respectively, and push all the relevant fields into CreatedByLastModifiedBy so they can be compared later to fetch EarliestCreatedBy and LastModifiedBy for each GroupID.
$project
Project all the fields for the response.
Use $filter to match the EarliestCreatedDate value against the data in CreatedByLastModifiedBy, $map the matching CreatedBy into EarliestCreatedBy, and $arrayElemAt to turn the resulting array into a single value.
Similar steps are used for calculating LastModifiedBy.
db.getCollection('myCollection').aggregate(
[{
$group: {
_id: "$GroupID",
TotalCount: {
$sum: 1
},
PendingCount: {
$sum: {
$cond: {
if: {
$eq: ["Status", "Pending"]
},
then: 1,
else: 0
}
}
},
ClosedCount: {
$sum: {
$cond: {
if: {
$eq: ["Status", "Closed "]
},
then: 1,
else: 0
}
}
},
GroupName: {
$first: "$GroupName"
},
EarliestCreatedDate: {
$min: "$DateCreated"
},
LastModifiedDate: {
$max: "$LastModifiedDate"
},
CreatedByLastModifiedBy: {
$push: {
CreatedBy: "$CreatedBy",
LastModifiedBy: "$LastModifiedBy",
DateCreated: "$DateCreated",
LastModifiedDate: "$LastModifiedDate"
}
}
}
}, {
$project: {
_id: 0,
GroupName: 1,
EarliestCreatedDate: 1,
EarliestCreatedBy: {
$arrayElemAt: [{
$map: {
input: {
$filter: {
input: "$CreatedByLastModifiedBy",
as: "CrBy",
cond: {
"$eq": ["$EarliestCreatedDate", "$$CrBy.DateCreated"]
}
}
},
as: "EaCrBy",
in: "$$EaCrBy.CreatedBy"
}
}, 0]
},
LastModifiedDate: 1,
LastModifiedBy: {
$arrayElemAt: [{
$map: {
input: {
$filter: {
input: "$CreatedByLastModifiedBy",
as: "MoBy",
cond: {
"$eq": ["$LastModifiedDate", "$$MoBy.LastModifiedDate"]
}
}
},
as: "LaMoBy",
in: "$$LaMoBy.LastModifiedBy"
}
}, 0]
},
TotalCount: 1,
PendingCount: 1,
ClosedCount: 1
}
}]
)
Update for Version < 3.2
$filter is also not available in your version. Below is the equivalent.
The comparison logic is the same: it creates an array that contains false for every non-matching entry and the LastModifiedBy value otherwise.
The next step is to use $setDifference to compare the previous array with the array [false], which returns only the elements that exist in the first set.
LastModifiedBy: {
$setDifference: [{
$map: {
input: "$CreatedByLastModifiedBy",
as: "MoBy",
in: {
$cond: [{
$eq: ["$LastModifiedDate", "$$MoBy.LastModifiedDate"]
},
"$$MoBy.LastModifiedBy",
false
]
}
}
},
[false]
]
}
Add a $unwind stage after the $project stage to unwrap the single-element array:
{$unwind:"$LastModifiedBy"}
Similar steps apply for calculating EarliestCreatedBy; a sketch is below.
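For completeness, the analogous < 3.2 expression for EarliestCreatedBy might look like this (mirroring the LastModifiedBy version above), again followed by {$unwind:"$EarliestCreatedBy"}:
EarliestCreatedBy: {
$setDifference: [{
$map: {
input: "$CreatedByLastModifiedBy",
as: "CrBy",
in: {
$cond: [{
$eq: ["$EarliestCreatedDate", "$$CrBy.DateCreated"]
},
"$$CrBy.CreatedBy",
false
]
}
}
},
[false]
]
}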