How to find almost similar records in mongodb? - mongodb

This is the search record:
A = {
field1: value1,
field2: value2,
...
fieldN: valueN
}
I have many such records in the database.
Other record (B) almost matches record A if even N-M fields in these records are equal. This is the example, M=2:
B = {
field1: OTHER_value1,
field2: OTHER_value2,
field3: value3,
...
fieldN: valueN
}
It can be any fields, not only the first.
P.S.: I have copied the same query for postgresql - How to find almost similar records in sql? and now I want to do this with mongodb.

My solution:
db.col.aggregate(
[
{
$addFields:
{
nonMatchCount: 0
}
},
{
$addFields: {
nonMatchCount:
{
$cond: [{$eq: ['$field1', 'OTHER_value1']}, '$nonMatchCount', {$sum: ['$nonMatchCount', 1]}]
}
}
},
{
$addFields: {
nonMatchCount:
{
$cond: [{$eq: ['$field2', 'OTHER_value2']}, '$nonMatchCount', {$sum: ['$nonMatchCount', 1]}]
}
}
},
{
$addFields: {
nonMatchCount:
{
$cond: [{$eq: ['$field3', 'value3']}, '$nonMatchCount', {$sum: ['$nonMatchCount', 1]}]
}
}
},
...
{
$addFields: {
nonMatchCount:
{
$cond: [{$eq: ['$fieldN', 'valueN']}, '$nonMatchCount', {$sum: ['$nonMatchCount', 1]}]
}
}
},
{$match: { nonMatchCount: {$lte: 2}}}
]
);

Related

Mongodb - Perform calculation with a just deleted value in aggregation pipeline

I have this document:
{
_id: ObjectId('asdu7329n'),
payments: [
{ _id: ObjectId('28sdf310'), paidAmount: 20 },
{ _id: ObjectId('2fsd410'), paidAmount: 15 },
{ _id: ObjectId('2fs32gd70'), paidAmount: 35 },
],
totalPaidAmount: 70
}
What I want is to re-calculate the totalPaidAmount field when a payment is removed, right now I'm deleting the payment in this way:
const query = { _id: ObjectId(saleId) };
const update = [
{ $set: { payments: { $filter: {
input: '$payments',
cond: {
$ne: [ "$$this._id", ObjectId(/* paymentId to delete */) ]
}
}}}}
]
await salesSchema.findOneAndUpdate(query, update);
I know that I have to use $subtract possibly in a second $set stage but how could I reference the paidAmount value from the object so that I can do something like this:
{
$set: {
totalPaidAmount: {
$subtract: [ '$totalPaidAmount', /* paidAmount value for the deleted payment */ ]
}
}
}
I know that I can just sum the paidAmount values for all the indexes of payments but what if there is like 1000 or more items? even if it doesn't hit the performance too much it seems to me more logical to take advantage of the totalPaidAmount field here.
If you want to subtract you can use the $filter:
db.collection.update(
{payments: {$elemMatch: {_id: ObjectId("63920f965d15e98e3d7c452c")}}},
[{$project: {
payments: {
$filter: {
input: "$payments",
cond: {$ne: ["$$this._id", ObjectId("63920f965d15e98e3d7c452c")]}
}
},
totalPaidAmount: {
$subtract: [
"$totalPaidAmount",
{$getField: {
input: {
$first: {
$filter: {
input: "$payments",
cond: {$eq: ["$$this._id", ObjectId("63920f965d15e98e3d7c452c")]}
}
}
},
field: "paidAmount"
}
}
]
}
}}
])
See how it works on the playground example
But I would go with the good old $sum:
db.collection.update(
{payments: {$elemMatch: {_id: ObjectId("63920f965d15e98e3d7c452c")}}},
[{$project: {
payments: {
$filter: {
input: "$payments",
cond: {$ne: ["$$this._id", ObjectId("63920f965d15e98e3d7c452c")]}
}
}
}},
{$set: {totalPaidAmount: {$sum: "$payments.paidAmount"}}}
])
See how it works on the playground example

Mongo db transaction query

I'm trying to create a mongodb query to see which invoices are paid or not. I would like to add few things to the outcome like:
virtualAmount : original amount - (sum of all creditnotes)
total paid amount : sum of all transactions where delete is false
paid (true/false) : if virtualAmount - total paid amount is 0
I have created mongo playground:
https://mongoplayground.net/p/0OyK_bOZu9X
Anyone know if this is possible?
How to still have the original object when using group?
Is it also possible to create this result with mongoose?
by $unwind array and $group it.
db.collection.aggregate(
[{
$match: {
_id: '62b46391be7c618aa5c9bf86'
}
}, {
$set: {
'transactions': {
$filter: {
'input': '$transactions',
'as': 'item',
'cond': { $eq: ['$$item.deleted', false] }
}
},
}
}, {
$unwind: {
path: '$transactions'
}
}, {
$group: {
_id: '_id',
'total-paid-amount': { $sum: '$transactions.amount.value'},
//keep to next stage
'creditnote': {$first: '$creditnote'},
'original-amount': {$first: '$amount.value'}
}
}, {
$unwind: {
path: '$creditnote'
}
}, {
$group: {
_id: '_id',
'sum-all-creditnotes': {$sum: '$creditnote.amount.value'},
//keep to next stage
'total-paid-amount': {$first: '$total-paid-amount'},
'original-amount': {$first: '$original-amount'}
}
}, {
$addFields: {
'virtual-amount': {$subtract: ['$original-amount','$sum-all-creditnotes']}
}
}, {
$addFields: {
paid: {$eq: [{$subtract: ['$virtual-amount','$total-paid-mount']},0]}
}
}]
result
{
"_id" : "_id",
"sum-all-creditnotes" : 1000,
"total-paid-amount" : 1000,
"original-amount" : 3370,
"virtual-amount" : 2370,
"paid" : false
}
EDIT
or easy way without group
db.collection.aggregate(
[{
$match: {
_id: '62b46391be7c618aa5c9bf86'
}
}, {
$set: {
'transactions': {
$filter: {
'input': '$transactions',
'as': 'item',
'cond': { $eq: ['$$item.deleted', false] }
}
},
}
}, {
$project:
{
'total-paid-amount': { $sum: '$transactions.amount.value' },
'sum-all-creditnotes': { $sum: '$creditnote.amount.value' },
'original-amount': '$amount.value'
}
}, {
$addFields: {
'virtual-amount': { $subtract: ['$original-amount', '$sum-all-creditnotes'] }
}
}, {
$addFields: {
'paid': { $eq: [{ $subtract: ['$virtual-amount', '$total-paid-mount'] }, 0] }
}
}])

Alternative solution to `$lookup` needed because the collection in the `from` field is sharded

Query with arbitrary number of filter conditions that come from querying the same collection
I am referring to the question above.
Here is an additional requirement:
The score table is sharded. Hence, it can no longer be in the $lookup stage.
Is there an alternative solution that also only makes one trip to the MongoDB API?
One way to do it without lookup is using $group, for example:
db.score.aggregate([
{
$group: {
_id: "$test_id",
highestScore: {$max: "$score"},
results: {
$push: {score: "$score", "tester_id": "$tester_id"}
},
ourTester: {
$push: {score: "$score", "tester_id": "$tester_id"}
}
}
},
{$match: {"ourTester.tester_id": userId}},
{
$project: {
ourTester: {
$filter: {
input: "$ourTester",
as: "item",
cond: {$eq: ["$$item.tester_id", userId]}
}
},
results: {
$filter: {
input: "$results",
as: "item",
cond: {$eq: ["$$item.score", "$highestScore"]}}
}
}
},
{
$project: {
ourTester: {"$arrayElemAt": ["$ourTester", 0]},
highest: {"$arrayElemAt": ["$results", 0]}
}
},
{
$match: {
$expr: {$gt: ["$highest.score", "$ourTester.score"]}
}
},
{
$project: {
score: "$highest.score",
tester_id: "$highest.tester_id",
test_id: "$res._id"
}
}
])
As you can see here

MongoDB Aggregate $unwind on sub-Documents

I'm struggling when I $unwind more than one field from a Sub-Document.
Here's what the data looks like:-
{
resp: {
field1: 'yes',
field2: ''
},
{
resp: {
field1: 'yes',
field2: ''
}
etc,etc...
If I process an Aggregation Pipeline for ONE field, it works OK, so this works...
{ $unwind: "$resp" },
{ $unwind: "$resp.field1" },
{ $project: { field1: "$resp.field1" } }
{ $group: {
_id: 1,
field1: { $sum: { $cond: [{ $eq: ["$field1","yes"] },1,0] } }
}
}
But if I now want to return field 2 in the same aggregation, using the following, it will return a count of Zero for both fields, whereas previously field1 had a count > Zero.
{ $unwind: "$resp" },
{ $unwind: "$resp.field1" },
{ $unwind: "$resp.field2" },
{
$project: {
field1: "$resp.field1",
field2: "$resp.field2"
},
{ $group: {
_id: 1,
field1: { $sum: { $cond: [{ $eq: ["$field1","yes"] },1,0] } },
field2: { $sum: { $cond: [{ $eq: ["$field2","yes"] },1,0] } }
}
}
Any suggestions would be much appreciated.
it seems the above is the correct way to do this, but I'd happily take alternative suggestions. The error was in may mapping of the fields in the $project stage. When typing the issue into SO I realised where the problem was !

MongoDB Aggregation based on userID and time period

I would like to achieve something like
{ _id: "A", count: 2 }
{ _id: "B", count: 1 }
from
{ userId: "A", timeStamp: "12:30PM" } <- start of 5 min interval A: 1
{ userId: "B", timeStamp: "12:30PM" } <- start of 5 min interval B: 1
{ userId: "B", timeStamp: "12:31PM" } <- ignored
{ userId: "A", timeStamp: "12:32PM" } <- ignored
{ userId: "B", timeStamp: "12:33PM" } <- ignored
{ userId: "A", timeStamp: "12:37PM" } <- start of next 5 min A : 2
where it groups based on userId and then after userId is group, the count is triggered every 5 mins.
For example: Within any 5 min period, starting at say midnight, an unlimited number of collections can have a timeStamp from 00:00 to 00:05 but would only be counted as 1 hit.
Hopefully I am explaining this clearly.
I'm able to group by userId and get the count in general but setting a condition of the count seems to be tricky.
You can try $bucket and $addToSet - the drawback is that you have to specify all the ranges manually:
db.col.aggregate([
{
$bucket: {
groupBy: "$timeStamp",
boundaries: [ "12:30PM", "12:35PM", "12:40PM", "12:45PM", "12:50PM", "12:55PM", "13:00PM" ],
output: {
"users" : { $addToSet: "$userId" }
}
}
},
{
$unwind: "$users"
},
{
$group: { _id: "$users", count: { $sum: 1 } }
}
])
Micki's solution is better if you have mongo 3.6.
If you have mongo 3.4 you can use $switch.
Obviously you would need to add all the cases in the day.
db.getCollection('user_timestamps').aggregate(
{
$group: {
_id: '$userId',
timeStamp: {$push: '$timeStamp'}
}
},
{
$project: {
timeStamps: {
$map: {
input: '$timeStamp',
as: 'timeStamp',
in: {
$switch: {
branches: [
{
case: {
$and: [
{$gte: ['$$timeStamp', '12:30PM']},
{$lt: ['$$timeStamp', '12:35PM']}
]
},
then: 1
},
{
case: {
$and: [
{$gte: ['$$timeStamp', '12:35PM']},
{$lt: ['$$timeStamp', '12:40PM']}
]
},
then: 2
}
],
default: 0
}
}
}
}
}
},
{
$unwind: '$timeStamps'
},
{
$group: {
_id: '$_id',
count: {
$addToSet: '$timeStamps'
}
}
},
{
$project: {
_id: true,
count: {$size: '$count'}
}
}
)
If you don't have mongo 3.4 you can replace the $switch with
cond: [
{
$and: [
{$gte: ['$$timeStamp', '12:30PM']},
{$lt: ['$$timeStamp', '12:35PM']}
]
},
1,
{
cond: [
{
$and: [
{$gte: ['$$timeStamp', '12:35PM']},
{$lt: ['$$timeStamp', '12:40PM']}
]
},
2,
0
]
}
]