MongoDB aggregates over row collections of predefined size - mongodb

is it possible in MongoDB to perform aggregations over a collection of predefined number of rows, rather than grouping by. For example I want to calculate the average for every 1000 rows, instead of grouping by a certain column.
A smaller example would be the table below, I would like to calculate the Average rating of every 4 consecutive rows:
So my restult should like somthing like this:
Below is the input data in JSON:
[{"ItemName":"Item1","Rating":4},
{"ItemName":"Item2","Rating":4},
{"ItemName":"Item2","Rating":4},
{"ItemName":"Item3","Rating":2},
{"ItemName":"Item4","Rating":5},
{"ItemName":"Item5","Rating":4},
{"ItemName":"Item6","Rating":2},
{"ItemName":"Item7","Rating":4},
{"ItemName":"Item8","Rating":1},
{"ItemName":"Item9","Rating":4},
{"ItemName":"Item10","Rating":3},
{"ItemName":"Item11","Rating":2},
{"ItemName":"Item12","Rating":2}]

There is no easy way. You will need to group entire collection into array which may require allowDiskUse for large datasets with a huge performance impact.
db.collection.aggregate([
// count all documents
{ $group: {
_id: null,
cnt: { $sum: 1},
docs: { $push: "$$ROOT" }
} },
// add _batch field to group documents by
{ $project: {
_id: 0,
docs: { $map: {
// add a sequential number to each
input: { $zip: {
inputs: [ "$docs", { $range: [ 0, "$cnt" ] } ]
} },
as: "doc",
in: { $mergeObjects: [
{ $arrayElemAt: [ "$$doc", 0 ] },
// split it in batches by 4 based on the sequential number
{ _batch: { $cond: [
{ $eq: [ { $arrayElemAt: [ "$$doc", 1 ] }, 0 ] },
1,
{ $ceil: { $divide: [ { $arrayElemAt: [ "$$doc", 1 ] }, 4 ] } }
] } }
] }
} }
} },
{ $unwind: "$docs" },
{ $replaceRoot: { newRoot: "$docs" } },
// ensure original order, only if you need ItemRange as a string
{ $sort: { _id: 1 } },
// calculate averages per batch
{ $group: {
_id: "$_batch",
start: { $first: "$ItemName" }, // only if you need ItemRange as a string
end: { $last: "$ItemName" }, // only if you need ItemRange as a string
RatingAvg: {$avg: "$Rating"}
} },
// only if you need them in order
{ $sort: { _id: 1 } },
// calculate ItemRange, only if you need ItemRange as a string
{ $project: {
_id: 0,
ItemRange: { $concat: [ "$start", "-", "$end" ] },
RatingAvg: 1
} },
])
Not sure about practical usecase as all averages will change when you remove e.g. the first document.
Anyway if you don't need ItemRange in format "FirstName-LastName" and can live with batch number instead, you can skip 2 lasts in-memory sorts which should improve performance.

Related

MongoDB conditional $sum after using $addFields

I am using an aggregation pipeline to aggregate stats for my game. My pipeline consists of first filtering all the games in the collection by the player's ObjectID, then summing their stats and analytics. The filtering is done by checking each array in a game's players array. The players array is an array of objects, and I check the uuid field on each object, to see if it corresponds with my target ObjectID.
Aggregating the stats works fine for simple $sum, but I am now attempting to do a more advanced sum. I want to get the average opponent rating. Each player has a team field of either 1 or 2, representing the possible teams. If the player's team is 1, I need to fetch team 2's rating, if their team is 2, I need to get team 1's rating. I designate team 1 as blue team, and team 2 and red team in my schema for simplicity. Here is an example game
{
"type": "Regular",
"map": "Classic",
"winningTeam": 1,
"gameStats": {
"duration": 7,
"redScore": 1,
"blueScore": 0,
"redRating": 1000,
"blueRating": 1000,
},
"players": [
{
"uuid": "ObjectId",
...
"stats": {
"timePlayed": 7,
"goals": 0,
"ownGoals": 0,
"goalsFor": 1,
"goalsAgainst": 0,
},
}
And here is my pipeline
[
{
$addFields: {
players: {
$filter: {
input: "$players",
as: "player",
cond: {
$eq: [
"$$player.uuid",
playerObjectId
],
},
},
},
},
},
{
$group: {
_id: playerObjectId,
oppRating: {
$avg: {
$avg: {
$switch: {
branches: [
{
case: {
$eq: [
"$players.team",
1
]
},
then: "$gameStats.blueRating"
},
{
case: {
$eq: [
"$players.team",
2
]
},
then: "$gameStats.redRating"
},
]
}
}
}
},
timePlayed: {
$sum: {
$sum: "$players.stats.timePlayed",
},
},
},
goals: {
$sum: {
$sum: "$players.stats.goals",
},
...
]
Now my $switch doesn't work, and I've identified the problem to be the fact that I cant access the $players field for some reason. For example when I set the condition to
$eq: [
1,
1
],
It will work, and correctly get the average. I see my issue is being able to access the $players variable that I set up in my addfields, why cant I access this variable in the $switch statement, but I can access it in all my other fields, like the $sum for timeplayed. Do I need to rethink my filter query? I understand that I could simply add a field to every playerObject that reads "opponentRating", but I would like to see if there is simply an aggregation way to do this first.
players must be an object to considered inside the $switch block. Just need to add $unwind after the addFields, since $filter will return an array.
db.game.aggregate([
{
$addFields: {
players: {
$filter: {
input: "$players",
as: "player",
cond: {
$eq: [
"$$player.uuid",
playerObjectId
],
},
},
},
},
},
{
$unwind: '$players'
},
{
$group: {
_id: playerObjectId,
oppRating: {
$avg: {
$avg: {
$switch: {
branches: [
{
case: {
$eq: [
"$players.team",
1
]
},
then: "$gameStats.blueRating"
},
{
case: {
$eq: [
"$players.team",
2
]
},
then: "$gameStats.redRating"
},
]
}
}
}
},
timePlayed: {
$sum: {
$sum: "$players.stats.timePlayed",
},
},
goals: {
$sum: {
$sum: "$players.stats.goals",
}
}
}
}
])
Also, I thought some performance optimisations can be done & redundant functions could be removed on the pipeline such as
Instead of $filter for players, we can use $match, $unwind & $match
one $avg will suffice for oppRating
And, one $sum will suffice for timePlayed & goals
You can try the below pipeline
db.game.aggregate([
{
$match: {
'players.uuid': playerObjectId,
}
},
{
$unwind: '$players'
},
{
$match: {
'players.uuid': playerObjectId,
}
},
{
$group: {
_id: playerObjectId,
oppRating: {
$avg: {
$switch: {
branches: [
{
case: {
$eq: [
"$players.team",
1
]
},
then: "$gameStats.blueRating"
},
{
case: {
$eq: [
"$players.team",
2
]
},
then: "$gameStats.redRating"
},
]
}
}
},
timePlayed: {
$sum: "$players.stats.timePlayed",
},
goals: {
$sum: "$players.stats.goals",
}
}
}
])

How to query an array and retrieve it from MongoDB

Updated:
I have a document on the database that looks like this:
My question is the following:
How can I retrieve the first 10 elements from the friendsArray from database and sort it descending or ascending based on the lastTimestamp value.
I don't want to download all values to my API and then sort them in Python because that is wasting my resources.
I have tried it using this code (Python):
listOfUsers = db.user_relations.find_one({'userId': '123'}, {'friendsArray' : {'$orderBy': {'lastTimestamp': 1}}}).limit(10)
but it just gives me this error pymongo.errors.OperationFailure: Unknown expression $orderBy
Any answer at this point would be really helpful! Thank You!
use aggregate
first unwind
then sort according timestap
group by _id to create sorted array
use addfields and filter for getting first 10 item of array
db.collection.aggregate([
{ $match:{userId:"123"}},
{
"$unwind": "$friendsArray"
},
{
$sort: {
"friendsArray.lastTimeStamp": 1
}
},
{
$group: {
_id: "$_id",
friendsArray: {
$push: "$friendsArray"
}
},
},
{
$addFields: {
friendsArray: {
$filter: {
input: "$friendsArray",
as: "z",
cond: {
$lt: [
{
$indexOfArray: [
"$friendsArray",
"$$z"
]
},
10
]
}// 10 is n first item
}
}
},
}
])
https://mongoplayground.net/p/2Usk5sRY2L2
and for pagination use this
db.collection.aggregate([
{ $match:{userId:"123"}},
{
"$unwind": "$friendsArray"
},
{
$sort: {
"friendsArray.lastTimeStamp": 1
}
},
{
$group: {
_id: "$_id",
friendsArray: {
$push: "$friendsArray"
}
},
},
{
$addFields: {
friendsArray: {
$filter: {
input: "$friendsArray",
as: "z",
cond: {
$and: [
{
$gt: [
{
$indexOfArray: [
"$friendsArray",
"$$z"
]
},
10
]
},
{
$lt: [
{
$indexOfArray: [
"$friendsArray",
"$$z"
]
},
20
]
},
]
}// 10 is n first item
}
}
},
}
])
The translation of your find to aggregation(we need unwind that why aggregation is used) would be like the bellow query.
Test code here
Query (for descending replace 1 with -1)
db.collection.aggregate([
{
"$match": {
"userId": "123"
}
},
{
"$unwind": {
"path": "$friendsArray"
}
},
{
"$sort": {
"friendsArray.lastTimeStamp": 1
}
},
{
"$limit": 10
},
{
"$replaceRoot": {
"newRoot": "$friendsArray"
}
}
])
If you want to skip some before limit add one stage also
{
"$skip" : 10
}
To take the 10-20 messages for example.

Sum time series with different time stamps MongoDB

I have a mongoDB database with multiple time series data and each time stamp is a separate document with some additional meta data from sensors. I want to sum the two time series in an aggregation but I am heavily struggling with that and can't find any examples.
Assume we have sensor A and B and the time stamps from the different sensors don't align. See an example of the data below. Next I want to sum the "volume" metric of the two time series. So for the example below sensor A has two time stamps en sensor B 3. So the sum of A and B should have 5 time stamps such that the sum reflects all the changes in the total volume (see also the schematic example below).
Anyone knows how to solve this in a mongoDB aggregation query? I can only use the mongoDB query language and not use NodeJS.
Sensor A
{
"_id":5d67d9ee074e99274eef30d5
"sensor": A
"volume":12.4
"temperatue": 20
"timestamp":2019-08-29 15:58:06.093
"__v":0
}
{
"_id":5d67da66074e99274eef30ea
"sensor": A
"volume":12.3
"temperatue": 21
"timestamp":2019-08-29 16:48:06.078
"__v":0
}
..etc
Sensor B
{
"_id":5d67d9ee074e99274eef30d5
"sensor": B
"volume":32.4
"temperatue": 20
"timestamp":2019-08-29 15:55:06.093
"__v":0
}
{
"_id":5d67da66074e99274eef30ea
"sensor": B
"volume":21.2
"temperatue": 21
"timestamp":2019-08-29 16:49:06.178
"__v":0
}
{
"_id":5d67da66074e99274eef30ea
"sensor": B
"volume":22.3
"temperatue": 22
"timestamp":2019-08-29 17:04:06.078
"__v":0
}
..etc
Here also a sketch of the result I would like to have.
Try this one:
db.collection.aggregate([
// Determine start and end-time
{ $sort: { timestamp: -1 } },
{ $group: { _id: "$sensor", data: { $push: "$$ROOT" } } },
{
$set: {
data: {
$reduce: {
input: "$data",
initialValue: [],
in: {
$concatArrays: [
"$$value",
[
{
$mergeObjects: [
"$$this",
{
timestamp_end: {
$ifNull: [ { $last: "$$value.timestamp" }, "$$NOW" ]
}
}
]
}
]
]
}
}
}
}
},
{ $unwind: "$data" },
// find data per interval
{ $sort: { "data.timestamp": 1 } },
{
$group: {
_id: null,
data: { $push: "$data" },
timestamp: { $addToSet: "$data.timestamp" }
}
},
{
$set: {
sum_data: {
$map: {
input: "$timestamp",
as: "t",
in: {
$filter: {
input: "$data",
cond: {
$and: [
{ $lte: [ "$$this.timestamp", "$$t" ] },
{ $gt: [ "$$this.timestamp_end", "$$t" ] }
]
}
}
}
}
}
}
},
// sum up temperatures
{
$set: {
volume: {
$map: {
input: "$sum_data",
in: { $sum: "$$this.volume" }
}
},
result: { $range: [ 0, { $size: "$timestamp" } ] }
}
},
// Join arrays to final result
{
$project: {
result: {
$map: {
input: "$result",
as: "i",
in: {
timestamp: { $arrayElemAt: [ "$timestamp", "$$i" ] },
volume: { $arrayElemAt: [ "$volume", "$$i" ] }
}
}
}
}
}
])
Mongo Playground

Multiple sums with different calculations mongodb

maybe someone can help me. I have the following table in mongodb and I need to perform the following calculation:
Odds:
High
Average
Low
For each probability, a multiplier must be applied
Example:
High probability: Value * 0.87
Average probability: Value * 0.5
Low Probability: Value * 0.06
I made the following query in the db mongo, but I can apply only one multiplier. I was unable to differentiate each probability to multiply by the above values.
db.teste.aggregate(
{
$match: {
$and: [
{
"converted_fields.Probabilidade de fechamento": {
$ne: null
},
"current.value": {
$ne: 0
},
"current.add_time": {
$gte: ISODate("2020-07-01")
},
}
]
}
},
{
$project: {
"_id": "$_id",
"___group": {
"probabilidade": "$converted_fields.Probabilidade de fechamento"
},
"current___value": "$current.value"
}
},
{
$group: {
"_id": "$___group",
"count": {
$sum: "$current___value"
}
}
},
{
$project: {
"_id": 0,
"probabilidade": "$_id.probabilidade",
"valor": {
$multiply: ["$count", 0.5]
}
}
}
)
Result:
{
Alta - 379,5
Média - 1647,9
Baixa - 3763,32
}
how do I separate a different multiplier for each probability?
The aggregation might look something like this:
db.teste.aggregate([
{
$match: {
$and: [
{
"converted_fields.Probabilidade de fechamento": {
$ne: null
},
"current.value": {
$ne: 0
},
"current.add_time": {
$gte: ISODate("2020-07-01")
},
}
]
}
},
{
$group: {
_id: "$converted_fields.Probabilidade de fechamento",
count: { $sum: "$current.value"}
}
},
{
$project:
{
_id: 1,
valor:
{
$switch:
{
branches: [
{
case: { $eq: [ "$_id", "Alta"] },
then: { $multiply: ["$count", 0.87] }
},
{
case: { $eq: [ "$_id", "Médica"] },
then: { $multiply: ["$count", 0.5] }
},
{
case: { $eq: [ "$_id", "Baixa"] },
then: { $multiply: ["$count", 0.06] }
}
],
default: 0
}
}
}
},
{
$group: {
_id: null,
probabilidades: {
$push: {
k: "$_id",
v: "$valor"
}
}
}
},
{
$replaceRoot: {
newRoot: {
$arrayToObject: "$probabilidades"
}
}
}
])
The first $match stage is still as you had it. In my solution the first $group stage will return documents of this form:
{
_id: 'Alta',
count: 100
}
In the following $project stage, I use the $switch operator in order to determine what to multiply count by in order to get the correct valor. Using the sample document I showed before, this stage will return documents that look like this:
{
_id: 'Alta',
valor: 87
}
Next is another $group stage, where I group all of the probability documents together, and push them into an array. The document from this stage might look like this:
{
_id: null,
probabilidades: [
{ 'k': 'Alta', 'v': 87 },
{ 'k': 'Baixa', 'v': 6 }
]
}
In the final stage, $replaceRoot, I use $arrayToObject to turn the probabilidades array into your desired output.

Retrieving a count that matches specified criteria in a $group aggregation

So I am looking to group documents in my collection on a specific field, and for the output results of each group, I am looking to include the following:
A count of all documents in the group that match a specific query (i.e. a count of documents that satisfy some expression { "$Property": "Value" })
The total number of documents in the group
(Bonus, as I suspect that this is not easily accomplished) Properties of a document that correspond to a $min/$max accumulator
I am very new to the syntax used to query in mongo and don't quite understand how it all works, but after some research, I've managed to get it down to the following query (please note, I am currently using version 3.0.12 for my mongo db, but I believe we will upgrade in a couple of months time):
db.getCollection('myCollection').aggregate(
[
{
$group: {
_id: {
GroupID: "$GroupID",
Status: "$Status"
},
total: { $sum: 1 },
GroupName: { $first: "$GroupName" },
EarliestCreatedDate: { $min: "$DateCreated" },
LastModifiedDate: { $max: "$LastModifiedDate" }
}
},
{
$group: {
_id: "$_id.GroupID",
Statuses: {
$push: {
Status: "$_id.Status",
Count: "$total"
}
},
TotalCount: { $sum: "$total" },
GroupName: { $first: "$GroupName" },
EarliestCreatedDate: { $min: "$EarliestCreatedDate" },
LastModifiedDate: { $max: "$LastModifiedDate" }
}
}
]
)
Essentially what I am looking to retrieve is the Count for specific Status values, and project them into one final result document that looks like the following:
{
GroupName,
EarliestCreatedDate,
EarliestCreatedBy,
LastModifiedDate,
LastModifiedBy,
TotalCount,
PendingCount,
ClosedCount
}
Where PendingCount and ClosedCount are the total number of documents in each group that have a status Pending/Closed. I suspect I need to use $project with some other expression to extract this value, but I don't really understand the aggregation pipeline well enough to figure this out.
Also the EarliestCreatedBy and LastModifiedBy are the users who created/modified the document(s) corresponding to the EarliestCreatedDate and LastModifiedDate respectively. As I mentioned, I think retrieving these values will add another layer of complexity, so if there is no practical solution, I am willing to forgo this requirement.
Any suggestions/tips would be very much appreciated.
You can try below aggregation stages.
$group
Calculate all the necessary counts TotalCount, PendingCount and ClosedCount for each GroupID
Calculate $min and $max for EarliestCreatedDate and LastModifiedDate respectively and push all the fields to CreatedByLastModifiedBy to be compared later for fetching EarliestCreatedBy and LastModifiedBy for each GroupID
$project
Project all the fields for response
$filter the EarliestCreatedDate value against the data in the CreatedByLastModifiedBy and $map the matching CreatedBy to the EarliestCreatedBy and $arrayElemAt to convert the array to object.
Similar steps for calculating LastModifiedBy
db.getCollection('myCollection').aggregate(
[{
$group: {
_id: "$GroupID",
TotalCount: {
$sum: 1
},
PendingCount: {
$sum: {
$cond: {
if: {
$eq: ["Status", "Pending"]
},
then: 1,
else: 0
}
}
},
ClosedCount: {
$sum: {
$cond: {
if: {
$eq: ["Status", "Closed "]
},
then: 1,
else: 0
}
}
},
GroupName: {
$first: "$GroupName"
},
EarliestCreatedDate: {
$min: "$DateCreated"
},
LastModifiedDate: {
$max: "$LastModifiedDate"
},
CreatedByLastModifiedBy: {
$push: {
CreatedBy: "$CreatedBy",
LastModifiedBy: "$LastModifiedBy",
DateCreated: "$DateCreated",
LastModifiedDate: "$LastModifiedDate"
}
}
}
}, {
$project: {
_id: 0,
GroupName: 1,
EarliestCreatedDate: 1,
EarliestCreatedBy: {
$arrayElemAt: [{
$map: {
input: {
$filter: {
input: "$CreatedByLastModifiedBy",
as: "CrBy",
cond: {
"$eq": ["$EarliestCreatedDate", "$$CrBy.DateCreated"]
}
}
},
as: "EaCrBy",
in: {
"$$EaCrBy.CreatedBy"
}
}
}, 0]
},
LastModifiedDate: 1,
LastModifiedBy: {
$arrayElemAt: [{
$map: {
input: {
$filter: {
input: "$CreatedByLastModifiedBy",
as: "MoBy",
cond: {
"$eq": ["$LastModifiedDate", "$$MoBy.LastModifiedDate"]
}
}
},
as: "LaMoBy",
in: {
"$$LaMoBy.LastModifiedBy"
}
}
}, 0]
},
TotalCount: 1,
PendingCount: 1,
ClosedCount: 1
}
}]
)
Update for Version < 3.2
$filter is also not available in your version. Below is the equivalent.
The comparison logic is the same and creates an array with for every non matching entry the value of false or LastModifiedBy otherwise.
Next step is to use $setDifference to compare the previous array values with array [false] which returns the elements that only exist in the first set.
LastModifiedBy: {
$setDifference: [{
$map: {
input: "$CreatedByLastModifiedBy",
as: "MoBy",
in: {
$cond: [{
$eq: ["$LastModifiedDate", "$$MoBy.LastModifiedDate"]
},
"$$MoBy.LastModifiedBy",
false
]
}
}
},
[false]
]
}
Add $unwind stage after $project stage to change to object
{$unwind:"$LastModifiedBy"}
Similar steps for calculating EarliestCreatedBy