How to efficiently recalculate the values of large amounts of data? - mongodb

I have several collections in MongoDB:
payment
{
"_id":{
"$oid":"6060ded06aa032495d640536"
},
"type":"",
"amount":10,
"createdAt":{
"date":{
"$date":"2021-03-03T16:01:14.137Z"
},
"timestamp":1614787274.137138
},
"finishedAt":{
"date":{
"$date":"2021-03-03T16:13:15.678Z"
},
"timestamp":1614787995.678263
},
"status":true,
"state":"finished",
"destination":{
"identificator":"1234"
}
}
account
{
"_id":{
"$oid":"60677a2c88b356160e415a1e"
},
"name":"",
"providerAccount":{
"identificator":"1234"
},
"targetAmount":0,
"currentAmount":0,
"status":false,
"state":false,
"priority":false,
"createdAt":{
"date":{
"$date":"2021-03-29T00:00:00.000Z"
},
"timestamp":1616976000
}
}
I need to check each payment if it matches the account in the system. If destination.identificator == providerAccount.identificator I need to change the payment type to "internal" and add the payment amount to the currentAmount in the account.
At the moment, I have a python script that does all this by iterating over each payment, but the problem is that there are more than a million such payments, and such a process can take a very long time.
Is there a more efficient way to do this?

You can write two different Aggregation queries which will perform their own lookup operations on the alternating collections and update the values based on logics and conditions.
Note: The execution order is very important for this to work
Note: Both aggregation queries use the $merge stage. $merge itself was added in MongoDB 4.2, but writing the output into the same collection that is being aggregated (as done here) requires MongoDB version >= 4.4.
If you are using an earlier version of MongoDB, omit the final $merge stage from each pipeline, loop through the records of the aggregation results, and update the documents manually using PyMongo instead.
The first query has to be performed on payment collection, which will check if the link exists in the account collection or not.
// Mark every payment whose destination matches an existing account as "internal".
db.payment.aggregate([
  // Join each payment to the accounts sharing its destination identificator.
  {
    $lookup: {
      from: "account",
      let: { destinationId: "$destination.identificator" },
      pipeline: [
        { $match: { $expr: { $eq: ["$providerAccount.identificator", "$$destinationId"] } } },
        // Only the existence of a match matters, so keep nothing but _id.
        { $project: { _id: 1 } }
      ],
      as: "matchedAcc"
    }
  },
  // Keep only the payments that found at least one account.
  { $match: { matchedAcc: { $ne: [] } } },
  // Reduce each document to _id (kept implicitly) plus the new type value.
  { $project: { type: { $literal: "internal" } } },
  // Merge the new type into the existing payment documents; never insert new ones.
  {
    $merge: {
      into: "payment",
      on: "_id",
      whenMatched: "merge",
      whenNotMatched: "discard"
    }
  }
])
MongoDB Playground sample Execution
Next the Aggregation query for account collection based on "type": "internal" condition added by the previous query.
Note: If the payment collection already contains documents with "type": "internal", write the marker to a different, unique key name in the $project stage of the previous query, match on that key in the query below, and finally unset the key once the whole process is done.
// Add the total of all "internal" payments to the currentAmount of the matching account.
// Running this pipeline again adds the same total a second time, so run it once per batch.
db.account.aggregate([
  // For each account, sum the amounts of the internal payments addressed to it.
  {
    $lookup: {
      from: "payment",
      let: { accountId: "$providerAccount.identificator" },
      pipeline: [
        {
          $match: {
            $expr: { $eq: ["$destination.identificator", "$$accountId"] },
            type: "internal"
          }
        },
        {
          $group: {
            // Documents in this sub-pipeline are payments, which have no providerAccount
            // field, so the group key must come from the payment's own destination.
            _id: "$destination.identificator",
            totalAmount: { $sum: "$amount" }
          }
        }
      ],
      as: "matchedPayment"
    }
  },
  // Skip accounts without a positive payment total (including those with no payments at all).
  {
    $match: {
      $expr: { $gt: [{ $arrayElemAt: ["$matchedPayment.totalAmount", 0] }, 0] }
    }
  },
  // Emit only _id (kept implicitly) and the increased currentAmount.
  {
    $project: {
      currentAmount: {
        $add: ["$currentAmount", { $arrayElemAt: ["$matchedPayment.totalAmount", 0] }]
      }
    }
  },
  // Merge the new currentAmount into the existing account documents; never insert new ones.
  {
    $merge: {
      into: "account",
      on: "_id",
      whenMatched: "merge",
      whenNotMatched: "discard"
    }
  }
])
Mongo Playground Sample Execution
Additionally, you can pass the allowDiskUse: true option to the aggregation commands, and consider creating indexes on the providerAccount.identificator and destination.identificator keys to speed this up if required; you can drop those indexes afterwards.
Let me know if you want an explanation of all the stages and operators in the aggregation pipeline.

Related

How to group same record into multiple groups using mongodb aggregate pipeline

I have a two collections.
OrgStructure (visualise this as a tree structure)
Example Document:
{
"id": "org1",
"nodes": [
{
"nodeId": "root",
"childNodes": ["child1"]
},
{
"nodeId": "child1",
"childNodes": ["child2"]
},
{
"nodeId": "child2",
"childNodes": []
}
]
}
Activity
Example Document:
[
{
"id":"A1",
"orgUnit": "root"
},
{
"id":"A2",
"orgUnit": "child1"
},
{
"id":"A3",
"orgUnit": "child2"
}
]
Now I want to group activities by orgUnit in such a way that the activities of the child nodes are included as well.
I don't want to do a lookup here; instead I want to take one OrgStructure document as input and construct conditions from it, so that the query returns the result below.
Expected result
[
{
"_id": "root",
"activities": ["A1","A2","A3"]
},
{
"_id": "child1",
"activities": ["A2","A3"]
},
{
"_id": "child2",
"activities": ["A3"]
}
]
So I am expecting an aggregate query something like this:
{
"$group": {
"_id": {
"$switch": {
"branches": [
{
"case": {"$in": ["$orgUnit",["root","child1","child2"]]},
"then": "root"
},
{
"case": {"$in": ["$orgUnit",["child1","child2"]]},
"then": "child1"
},
{
"case": {"$in": ["$orgUnit",["child2"]]},
"then": "child2"
}
],
"default": null
}
}
}
}
Thanks in advance!
You will need 2 steps:
Create another collection, nodes, for the recursive lookup. It is hard to perform $graphLookup directly on the original OrgStructure, because its nodes are nested inside an array of a single document.
// Flatten OrgStructure.nodes into a standalone "nodes" collection: one document per node.
db.OrgStructure.aggregate([
  { $unwind: "$nodes" },
  // Promote each node sub-document to be the whole document.
  { $replaceRoot: { newRoot: "$nodes" } },
  { $out: "nodes" }
])
Perform $graphLookup on nodes collection to get all child nodes. Perform $lookup to Activity and do some wrangling.
// For every node, list the ids of the activities attached to it or to any node below it.
db.nodes.aggregate([
  // Walk childNodes -> nodeId recursively, starting from the node itself.
  {
    $graphLookup: {
      from: "nodes",
      startWith: "$nodeId",
      connectFromField: "childNodes",
      connectToField: "nodeId",
      as: "nodesLookup"
    }
  },
  // Fetch the activities whose orgUnit is one of the node ids found above.
  {
    $lookup: {
      from: "Activity",
      let: { subtreeIds: "$nodesLookup.nodeId" },
      pipeline: [
        { $match: { $expr: { $in: ["$orgUnit", "$$subtreeIds"] } } },
        // Keep one entry per activity id.
        { $group: { _id: "$id" } }
      ],
      as: "activity"
    }
  },
  // Final shape: { _id: <nodeId>, activities: [<activity id>, ...] }.
  { $project: { _id: "$nodeId", activities: "$activity._id" } }
])
Here is the Mongo playground for your reference.

Mongodb aggregation lookup to add field in each array with condition

I have 3 collections.
User:
{
"_id":ObjectId("60a495cdd4ba8b122899d415"),
"email":"br9#gmail.com",
"username":"borhan"
}
Panel:
{
"_id": ObjectId("60a495cdd4ba8b122899d417"),
"name": "borhan",
"users": [
{
"role": "admin",
"joined": "2021-05-19T04:35:47.474Z",
"status": "active",
"_id": ObjectId("60a495cdd4ba8b122899d418"),
"user": ObjectId("60a495cdd4ba8b122899d415")
},
{
"role": "member",
"joined": "2021-05-19T04:35:47.474Z",
"status": "active",
"_id": ObjectId("60a49600d4ba8b122899d41a"),
"user": ObjectId("60a34e167958972d7ce6f966")
}
],
}
Team:
{
"_id":ObjectId("60a495e0d4ba8b122899d419"),
"title":"New Teams",
"users":[
ObjectId("60a495cdd4ba8b122899d415")
],
"panel":ObjectId("60a495cdd4ba8b122899d417")
}
I want to receive an output from querying the Panel collection like this:
{
"_id": ObjectId("60a495cdd4ba8b122899d417"),
"name": "borhan",
"users": [
{
"role": "admin",
"joined": "2021-05-19T04:35:47.474Z",
"status": "active",
"_id": ObjectId("60a495cdd4ba8b122899d418"),
"user": ObjectId("60a495cdd4ba8b122899d415"),
"teams":[
{
"_id":ObjectId("60a495e0d4ba8b122899d419"),
"title":"New Teams",
"users":[
ObjectId("60a495cdd4ba8b122899d415")
],
"panel":ObjectId("60a495cdd4ba8b122899d417")
}
]
},
{
"role": "member",
"joined": "2021-05-19T04:35:47.474Z",
"status": "active",
"_id": ObjectId("60a49600d4ba8b122899d41a"),
"user": ObjectId("60a34e167958972d7ce6f966")
}
],
}
I mean I want to add a teams field (an array of the teams that the user belongs to) to each user in the Panel collection.
Here is my match query in mongoose to select specific panel:
// Select the panels in which the given user appears with the "admin" role.
panel_model.aggregate([
  {
    $match: {
      users: {
        // Both conditions must hold on the same element of the users array.
        $elemMatch: { user: ObjectId("60a495cdd4ba8b122899d415"), role: "admin" }
      }
    }
  }
])
Is it possible to get my output with $lookup or $addFields aggregations?
You need to join all three collections,
$unwind to deconstruct the array
$lookup - there are two kinds of lookup which help to join collections. First I used a lookup with multiple join conditions (the let/pipeline form), and inside it I used a standard lookup to join the User and Team collections.
$match to match the user's id
$expr - needed inside the lookup's $match so that it can compare fields against the variables declared in let.
$set to add new fields
$group - we already deconstructed the array using $unwind, so now we need to rebuild it.
here is the code
// Attach to every user entry of a Panel the list of teams that user belongs to.
db.Panel.aggregate([
  // One document per entry of the users array.
  { $unwind: "$users" },
  // Look up the User document for this entry and, nested inside, the teams containing that user.
  {
    $lookup: {
      from: "User",
      let: { uId: "$users.user" },
      pipeline: [
        { $match: { $expr: { $eq: ["$_id", "$$uId"] } } },
        {
          $lookup: {
            from: "Team",
            localField: "_id",
            foreignField: "users",
            as: "teams"
          }
        }
      ],
      as: "users.join"
    }
  },
  // The join result is an array; take its first element.
  {
    $set: {
      "users.getFirstElem": { $arrayElemAt: ["$users.join", 0] }
    }
  },
  // Copy the teams onto the user entry and remove the two temporary fields.
  {
    $set: {
      "users.teams": "$users.getFirstElem.teams",
      "users.join": "$$REMOVE",
      "users.getFirstElem": "$$REMOVE"
    }
  },
  // Rebuild one document per panel.
  {
    $group: {
      _id: "$_id",
      // Must be the field path "$name": a bare "name" is a string literal and would
      // set every panel's name to the text "name".
      name: { $first: "$name" },
      users: { $push: "$users" }
    }
  }
])
Working Mongo playground
Note : Hope the panel and user collections are in 1-1 relationship. Otherwise let me know

Is there a better way to adding a field based on the result of aggregation in MongoDB

I have the following structure in my Mongo:
Merchants
- name
- _id
Transactions:
- _id
- status
- merchantId
Every merchant can have multiple transactions. I want to be able to add a field (to the result of aggregate to get all merchants) called hasHolds and set it to true if any of the transactions belonging to a merchant are in the status "capture_pending".
Here is how I achieved it:
Do a $lookup between merchants and transactions as merchantTransactions
addField "hasHolds" with a cond
The cond checks the size of merchantTransactions.status array for each merchant.
size runs a filter which only selects an element of the merchantTransactions.status array if its "capture_pending"
Thus:
collections:
db={
"merchants": [
{
"name": "m1",
"_id": "5b53494987eea4171d199dd3"
},
{
"name": "m2",
"_id": "5b53495087eea4171d199dd4"
},
{
"name": "m3",
"_id": "5b53495987eea4171d199dd5"
}
],
"transactions": [
{
"_id": "5b53494987eea4171d199dd4",
"merchantId": "5b53494987eea4171d199dd3",
"status": "capture_pending"
},
{
"_id": "5b53494987eea4171d199dd0",
"merchantId": "5b53494987eea4171d199dd3",
"status": "done"
},
{
"_id": "5b53494987eea4171d199dd8",
"merchantId": "5b53495987eea4171d199dd5",
"status": "capture_pending"
},
{
"_id": "5b53494987eea4171d199de8",
"merchantId": "5b53495987eea4171d199dd5",
"status": "capture_pending"
},
{
"_id": "5b53494987eea4171d199dd9",
"merchantId": "5b53495087eea4171d199dd4",
"status": "done"
},
{
"_id": "5b53494987eea4171d199de9",
"merchantId": "5b53495087eea4171d199dd4",
"status": "done"
}
]
}
Query:
// Add hasHolds = true to every merchant that has at least one "capture_pending" transaction.
db.merchants.aggregate([
  {
    $lookup: {
      from: "transactions",
      localField: "_id",
      foreignField: "merchantId",
      as: "merchantTransactions"
    }
  },
  {
    $addFields: {
      hasHolds: {
        // [condition, value if true, value if false]
        $cond: [
          {
            $gt: [
              {
                // Number of joined transactions whose status is "capture_pending".
                $size: {
                  $filter: {
                    input: "$merchantTransactions.status",
                    as: "st",
                    cond: { $eq: ["$$st", "capture_pending"] }
                  }
                }
              },
              0
            ]
          },
          true,
          false
        ]
      }
    }
  }
])
Run it here: https://mongoplayground.net/p/gJl-ot1TueB
Is there a better or easier way to achieve this?
You can just use $in operator in condition,
The field path merchantTransactions.status resolves to an array of statuses, and the $in condition checks directly whether your input status is in that array:
// hasHolds is true when any joined transaction has the status "capture_pending".
{
  $addFields: {
    hasHolds: { $in: ["capture_pending", "$merchantTransactions.status"] }
  }
}
Playground

Aggregate function on mongo running very slow when running query

I tried running an aggregation query on Mongo which currently takes 16 seconds, whereas the result I was hoping for was under a second.
{
"$lookup": {
"from": "session_attendances",
"let": { "id": "$_id" },
"pipeline": [
{
"$match": {
"$expr": {
"$eq": ["$student", "$$id"]
}
}
},
{
"$project": {
"attendance_code": "$attendance_code"
}
}
],
"as": "attendance"
}
},
{
// keep only matched students, can skip this and modifiy the next phase incase no such documents exists.
"$unwind": "$attendance"
},
{
"$lookup": {
"from": "attendance_codes",
"let": { "attendance_code": "$attendance.attendance_code" },
"pipeline": [
{
"$project": {
"type": 1
}
},
{
"$match": {
"$expr": {
"$eq": ["$_id", "$$attendance_code"]
}
}
}
],
"as": "attendance_code"
}
},
{
//again assuming we want to keep matched docs otherwise why lookup?
"$unwind": "$attendance_code"
},
{
"$group": {
"_id": { "a": "$attendance.attendance_code", "id": "$_id" },
"total": { "$sum": 1 },
"data": { "$first": "$$ROOT" } // if u want to keep document data
}
}
Hoping that some one can give me an answer to which part of my code is making the run time so slow.
It's not clear what your end goal is; if you clarify that, it would help me give an alternative to your current aggregation.
With that said the second lookup stage is "useless" as you group right after without using any of the data gained by it, removing it will still get you the exact same result and save some time.
Assuming the second lookup is needed for some reason i recommend not nesting it but rather use after the first one, like so:
{
$lookup: {
from: 'session_attendances',
let: { 'id': '$_id' },
pipeline: [
{
"$match": {
"$expr": {
"$eq": ["$student", "$$id"]
}
}
}
,{
$project: {
attendance_code: '$attendance_code'
}
}
],
as: 'attendance'
}
},
{// keep only matched students, can skip this and modifiy the next phase incase no such documents exists.
$unwind: "$attendance"
},
{
$lookup: {
from: 'attendance_codes',
let: { 'attendance_code': '$attendance.attendance_code' },
pipeline: [
{
$project: {
type: 1
}
},
{
"$match": {
"$expr": {
"$eq": ["$_id", "$$attendance_code"]
}
}],
as: 'attendance_code'
}
}
},
{ //again assuming we want to keep matched docs otherwise why lookup?
$unwind: "$attendance_code"
},
{
$group: {
_id: {a: "$attendance.attendance_code", id: "$_id"}
total: { $sum: 1 },
data: {$first: "$$ROOT"} // if u want to keep document data
}
}
This should give you better performance. I also recommend dropping the $project stages: unless the documents are very large, they usually do not help performance and can actually hurt it.

How to find match in documents in Mongo and Mongo aggregation?

I have following json structure in mongo collection-
{
"students":[
{
"name":"ABC",
"fee":1233
},
{
"name":"PQR",
"fee":345
}
],
"studentDept":[
{
"name":"ABC",
"dept":"A"
},
{
"name":"XYZ",
"dept":"X"
}
]
},
{
"students":[
{
"name":"XYZ",
"fee":133
},
{
"name":"LMN",
"fee":56
}
],
"studentDept":[
{
"name":"XYZ",
"dept":"X"
},
{
"name":"LMN",
"dept":"Y"
},
{
"name":"ABC",
"dept":"P"
}
]
}
Now I want to calculate following output.
if students.name = studentDept.name
so my result should be as below
{
"name":"ABC",
"fee":1233,
"dept":"A",
},
{
"name":"XYZ",
"fee":133,
"dept":"X"
}
{
"name":"LMN",
"fee":56,
"dept":"Y"
}
Do I need to use mongo aggregation or is it possible to get above given output without using aggregation???
What you are really asking here is how to make MongoDB return something that is actually quite different from the form in which you store it in your collection. The standard query operations do allow a "limited" form of "projection", but even as the title on the page shared in that link suggests, this is really only about "limiting" the fields to display in results based on what is present in your document already.
So any form of "alteration" requires some form of aggregation; both the aggregate and mapReduce operations allow you to "re-shape" the document results into a form that is different from the input. Perhaps the main thing people miss with the aggregation framework in particular is that it is not just all about "aggregating", and in fact the "re-shaping" concept is core to its implementation.
So in order to get results how you want, you can take an approach like this, which should be suitable for most cases:
// Pair every student with every studentDept entry, then keep, per student name,
// the dept of an entry with the same name.
db.collection.aggregate([
  { $unwind: "$students" },
  { $unwind: "$studentDept" },
  {
    $group: {
      _id: "$students.name",
      tfee: { $first: "$students.fee" },
      tdept: {
        // Each pair contributes its dept when the names agree and false otherwise.
        // In BSON comparison order a string sorts below a boolean, so $min returns
        // a matched dept whenever one exists and false only when none does.
        $min: {
          $cond: [
            { $eq: ["$students.name", "$studentDept.name"] },
            "$studentDept.dept",
            false
          ]
        }
      }
    }
  },
  // Drop students that never matched a dept entry.
  { $match: { tdept: { $ne: false } } },
  { $sort: { _id: 1 } },
  // Rename the working fields to the requested output names.
  {
    $project: {
      _id: 0,
      name: "$_id",
      fee: "$tfee",
      dept: "$tdept"
    }
  }
])
Or alternately just "filter out" the cases where the two "name" fields do not match and then just project the content with the fields you want, if crossing content between documents is not important to you:
// Pair every student with every studentDept entry of the same document and keep
// only the pairs whose names agree.
db.collection.aggregate([
  { $unwind: "$students" },
  { $unwind: "$studentDept" },
  // Shape the output fields and record whether the two names are equal.
  {
    $project: {
      _id: 0,
      name: "$students.name",
      fee: "$students.fee",
      dept: "$studentDept.dept",
      same: { $eq: ["$students.name", "$studentDept.name"] }
    }
  },
  { $match: { same: true } },
  // Drop the helper flag from the final result.
  { $project: { name: 1, fee: 1, dept: 1 } }
])
From MongoDB 2.6 and upwards you can even do the same thing "inline" to the document between the two arrays. You still want to reshape that array content in your final output though, but it is possibly done a little faster:
db.collection.aggregate([
  // Within each document, compare every students entry against every studentDept entry.
  {
    $project: {
      students: {
        $map: {
          input: "$students",
          as: "s",
          in: {
            // The inner $map yields a merged object for a name match and false otherwise;
            // $setDifference then removes the false placeholders.
            $setDifference: [
              {
                $map: {
                  input: "$studentDept",
                  as: "d",
                  in: {
                    $cond: [
                      { $eq: ["$$s.name", "$$d.name"] },
                      { name: "$$s.name", fee: "$$s.fee", dept: "$$d.dept" },
                      false
                    ]
                  }
                }
              },
              [false]
            ]
          }
        }
      }
    }
  },
  // students is now an array of arrays, so it has to be unwound twice.
  { $unwind: "$students" },
  { $unwind: "$students" },
  // Lift the merged fields to the top level and drop _id.
  {
    $project: {
      _id: 0,
      name: "$students.name",
      fee: "$students.fee",
      dept: "$students.dept"
    }
  }
])
So where you want to essentially "alter" the structure of the output then you need to use one of the aggregation tools to do. And you can, even if you are not really aggregating anything.