Compare duplicates by Score By Not Max Field or Earlier Date - mongodb

I need to compare duplicated documents and get the duplicated ones with the Lowest Score.
If the Score between two duplicates is Equal, then get the one with earlier date.
{
"_id": UUID("c77c72de-edd8-4576-a72c-983cf93a0f31"),
"DocumentId": "05240423067",
"Name": "John Doe",
"CreationDate": ISODate("0001-01-01T00:00:00.000+00:00"),
"Score": 5,
},
{
"_id": UUID("b5a7d404-a341-45dd-b875-864cd1e6bda2"),
"DocumentId": "05240423067",
"Name": "John Doe",
"CreationDate": ISODate("2021-07-17T00:00:00.000+00:00"),
"Score": 2
},
{
"_id": UUID("9efddd23-4b6b-4e96-ab43-b24a080107db"),
"DocumentId": "05240423067",
"Name": "John Doe",
"CreationDate": ISODate("2021-07-10T00:00:00.000+00:00"),
"Score": 2
},
{
"_id": UUID("f1a063a5-f9dd-4998-b6aa-df2071dd8677"),
"DocumentId": "88313825863",
"Name": "Marcus Joseph",
"CreationDate": ISODate("2021-07-17T00:00:00.000+00:00"),
"Score": 2
},
{
"_id": UUID("e3262f8e-bd6a-49e8-abe5-c3c1a4e49900"),
"DocumentId": "88313825863",
"Name": "Marcus Joseph",
"CreationDate": ISODate("0001-01-01T00:00:00.000+00:00"),
"Score": 1
}
Later, the resulting documents will be deleted.
Expected Result:
{
"_id": UUID("b5a7d404-a341-45dd-b875-864cd1e6bda2"),
"DocumentId": "05240423067",
"Name": "John Doe",
"CreationDate": ISODate("2021-07-17T00:00:00.000+00:00"),
"Score": 2 // Return Documents with the **Lowest Score**
},
{
"_id": UUID("9efddd23-4b6b-4e96-ab43-b24a080107db"),
"DocumentId": "05240423067",
"Name": "John Doe",
"CreationDate": ISODate("2021-07-10T00:00:00.000+00:00"),
"Score": 2 // Return Documents with the **Lowest Score**
},
{
"_id": UUID("e3262f8e-bd6a-49e8-abe5-c3c1a4e49900"),
"DocumentId": "88313825863",
"Name": "Marcus Joseph",
"CreationDate": ISODate("0001-01-01T00:00:00.000+00:00"),
"Score": 2 // If both Scores Equal, Compare CreationDate earlier
}
Mongo Version 4.2.21

This would be easier with some of the newer "$group" accumulators introduced in more recent versions of MongoDB, but here's one way you could do it.
db.collection.aggregate([
{
"$group": {
"_id": "$DocumentId",
"count": {"$sum": 1},
"docs": {"$push": "$$ROOT"}
}
},
{ // if only 1, keep it
"$match": {
"$expr": {"$gt": ["$count", 1]}
}
},
{ // find the doc to keep
"$set": {
"keepDoc": {
"$reduce": {
"input": "$docs",
"initialValue": {
"Score": {"$minKey": 1}
},
"in": {
"$switch": {
"branches": [
{
"case": {"$gt": ["$$this.Score", "$$value.Score"]},
"then": "$$this"
},
{
"case": {"$eq": ["$$this.Score", "$$value.Score"]},
"then": {
"$cond": [
{"$gt": ["$$this.CreationDate", "$$value.CreationDate"]},
"$$this",
"$$value"
]
}
}
],
"default": "$$value"
}
}
}
}
}
},
{ // get docs other than keepDoc
"$project": {
"_id": 0,
"expiredDocs": {
"$filter": {
"input": "$docs",
"cond": {"$ne": ["$$this", "$keepDoc"]}
}
}
}
},
{"$unwind": "$expiredDocs"},
{"$replaceWith": "$expiredDocs"}
])
Try it on mongoplayground.net.
N.B.: On mongoplayground.net, there's no easy way that I know of to enter binary UUID values in the BSON configuration, so I just used strings. It should be inconsequential to the pipeline.

Related

Modifying the Aggregation Function to get a reformatted result

I have a dataset like this in mongodb
[
{
"task_id": "as4d2rds5",
"url": "https:example1.com",
"organization": "Avengers",
"val": "null"
},
{
"task_id": "rfre43fed",
"url": "https:example1.com",
"organization": "Avengers",
"val": "valid"
},
{
"task_id": "uyje3dsxs",
"url": "https:example2.com",
"organization": "Metro",
"val": "valid"
},
{
"task_id": "ghs563vt6",
"url": "https:example1.com",
"organization": "Avengers",
"val": "invalid"
},
{
"task_id": "erf6egy64",
"url": "https:example2.com",
"organization": "Metro",
"val": "null"
}
]
I am trying to create a MongoDB aggregation pipeline so that it will yield a result like
[
{
"Metro": {
"invalid": 0,
"null": 1,
"valid": 1,
"url": "https:example2.com"
},
},
{
"Avengers": {
"invalid": 1,
"null": 1,
"valid": 1,
"url": "https:example1.com"
}
}
]
I got a great deal of help from stackoverflow to reach here.
I need to reformat the data received from an aggregator so that it produces the above result. Present aggregation script is
db.collection.aggregate([ {"$group": {"_id": {"k": "$organization","v": "$val"},"cnt": {"$sum": 1},"url": {$first: "$url"}}},
{"$project": {"_id": 0, "url": 1, "k": "$_id.k", "o": {"k": "$_id.v", "v": "$cnt"}}},
{"$group": {"_id": "$k", "v": { "$push": "$o"}, "url": {"$first": "$url"}}},
{"$addFields": {"v": {"$mergeObjects": [{"null": 0,"valid": 0,"invalid": 0},{"$arrayToObject": "$v"}]}}},
{"$project": {"_id": 0, "url": 1, "new": [{"k": "$_id","v": "$v"}]}},
{"$addFields": {"new": {"$mergeObjects": [{"$arrayToObject": "$new"},{"url": "$url"}]}}},
{"$replaceRoot": {"newRoot": "$new"}} ])
You can try this query to avoid multiple $group (as a trade off you have three $filter but I think this is still better than a multiple $group):
This query groups by organization and then uses $project to output counts of how many "valid", "invalid" and "null" values exist.
Edit: Also you can add an extra step $replaceRoot to get exactly the same output as you want.
db.collection.aggregate([
{
"$group": {
"_id": "$organization",
"val": {
"$push": "$val"
},
"url": {
"$first": "$url"
}
}
},
{
"$project": {
"_id": 0,
"organization": [
{
"k": "$_id",
"v": {
"url": "$url",
"invalid": {
"$size": {
"$filter": {
"input": "$val",
"cond": {
"$eq": [
"$$this",
"invalid"
]
}
}
}
},
"valid": {
"$size": {
"$filter": {
"input": "$val",
"cond": {
"$eq": [
"$$this",
"valid"
]
}
}
}
},
"null": {
"$size": {
"$filter": {
"input": "$val",
"cond": {
"$eq": [
"$$this",
"null"
]
}
}
}
}
}
}
]
}
},
{
"$replaceRoot": {
"newRoot": {
"$arrayToObject": "$organization"
}
}
}
])
Example here

Mongo Aggregate Combine Two Documents

Once I've unwound a sub-document array, how do I put it back together with all the original root fields?
Consider the following Tasks data set:
[
{
"_id": "5e95bb1cf36c0ab3247036bd",
"name": "Task A",
"org": "5e95b9894a0aa0b30dfcbc0b",
"creator": "5e117e5cd90de7187b000d87"
},
{
"_id": "5e95bb30f36c0ab3247036be",
"name": "Task B1",
"org": "5e95b9894a0aa0b30dfcbc0b",
"creator": "5e117e5cd90de7187b000d87",
"parent": "5e95bb1cf36c0ab3247036bd"
},
{
"_id": "5e95bb35f36c0ab3247036bf",
"name": "Task B2",
"org": "5e95b9894a0aa0b30dfcbc0b",
"creator": "5e117e5cd90de7187b000d87",
"parent": "5e95bb1cf36c0ab3247036bd"
}
]
So, then I run $graphLookup to get the parent task and populate its children, then $unwind it and populate the creator field:
[
{
"$match": {
"parent": {
"$exists": false
}
}
},
{
"$graphLookup": {
"from": "tasks",
"startWith": "$_id",
"connectFromField": "_id",
"connectToField": "parent",
"as": "children"
}
},
{
"$unwind": {
"path": "$children"
}
},
{
"$lookup": {
"from": "users",
"localField": "children.creator",
"foreignField": "_id",
"as": "children.creator"
}
},
{
"$unwind": {
"path": "$children.creator"
}
}
]
Which returns the following documents:
[
{
"_id": "5e95bb1cf36c0ab3247036bd",
"name": "Task A",
"org": "5e95b9894a0aa0b30dfcbc0b",
"creator": "5e117e5cd90de7187b000d87",
"children": [
{
"_id": "5e95bb30f36c0ab3247036be",
"name": "Task B1",
"org": "5e95b9894a0aa0b30dfcbc0b",
"creator": {
"name": "Jack Frost"
},
"parent": "5e95bb1cf36c0ab3247036bd"
}
]
},
{
"_id": "5e95bb1cf36c0ab3247036bd",
"name": "Task A",
"org": "5e95b9894a0aa0b30dfcbc0b",
"creator": "5e117e5cd90de7187b000d87",
"children": [
{
"_id": "5e95bb35f36c0ab3247036bf",
"name": "Task B2",
"org": "5e95b9894a0aa0b30dfcbc0b",
"creator": {
"name": "Bill Nye"
},
"parent": "5e95bb1cf36c0ab3247036bd"
}
]
},
]
Lastly, I need to merge all of these duplicate documents back together and join the $children. This is the part I can't figure out. Below is some junk I'm trying but it seems messy to have to specifically list every property.
Is there a better way to combine multiple (mostly) matching docs?
[
...
{
"$group": {
"_id": "$_id",
"name": {
"$mergeObjects": "$properties"
},
"watchers": {
"$addToSet": "$watchers"
},
"assignees": {
"$addToSet": "$assignees"
},
"org": {
"$addToSet": "$$ROOT.org"
},
"children": {
"$push": "$children"
}
}
}
]
Answering my own question here, the best solution I can find is to specify each property but pass it the $first operator. This will ensure that the original value will be passed through.
{
$group: {
_id: '$_id',
name: {$first: '$name'},
org: {$first: '$org'},
creator: {$first: '$creator'},
children: {$push: '$children'}
}
}

Fetch data from 2 collections in mongodb in single query

I wanted to fetch data from 2 independent collections and sort the results based on date through a single query. Is that even possible in mongodb? I have collections:
OrderType1
{
"id": "1",
"name": "Hello1",
"date": "2016-09-23T15:07:38.000Z"
},
{
"id": "2",
"name": "Hello1",
"date": "2015-09-23T15:07:38.000Z"
}
OrderType2
{
"id": "3",
"name": "Hello3",
"date": "2012-09-23T15:07:38.000Z"
},
{
"id": "4",
"name": "Hello4",
"date": "2018-09-23T15:07:38.000Z"
}
Expected Result
[
{
"id": "3",
"name": "Hello3",
"date": "2012-09-23T15:07:38.000Z"
},
{
"id": "2",
"name": "Hello1",
"date": "2015-09-23T15:07:38.000Z"
},
{
"id": "1",
"name": "Hello1",
"date": "2016-09-23T15:07:38.000Z"
},
{
"id": "4",
"name": "Hello4",
"date": "2018-09-23T15:07:38.000Z"
}
]
Now, I want to fetch both types of orders in a single query sorted by date.
You can try the below aggregation with MongoDB 3.6 and above, but I think you should use two queries, because for a large data set the $lookup pipeline will breach the 16MB BSON limit. It also depends on your $match condition or $limit: if they are applied inside the $lookup pipeline, then your aggregation should work fine.
db.OrderType1.aggregate([
{ "$limit": 1 },
{ "$facet": {
"collection1": [
{ "$limit": 1 },
{ "$lookup": {
"from": "OrderType1",
"pipeline": [{ "$match": { } }],
"as": "collection1"
}}
],
"collection2": [
{ "$limit": 1 },
{ "$lookup": {
"from": "OrderType2",
"pipeline": [{ "$match": { } }],
"as": "collection2"
}}
]
}},
{ "$project": {
"data": {
"$concatArrays": [
{ "$arrayElemAt": ["$collection1.collection1", 0] },
{ "$arrayElemAt": ["$collection2.collection2", 0] },
]
}
}},
{ "$unwind": "$data" },
{ "$replaceRoot": { "newRoot": "$data" } }
])

Removing the largest variance from a group

If I have a set of objects each with the same description, but with different amounts.
{
{
"_id": "101",
"description": "DD from my employer1",
"amount": 1000.33
},
{
"_id": "102",
"description": "DD from my employer1",
"amount": 1000.34
},
{
"_id": "103",
"description": "DD from my employer1",
"amount": 999.35
},
{
"_id": "104",
"description": "DD from my employer1"",
"amount": 5000.00
},
{
"_id": "105",
"description": "DD from my employer2",
"amount": 2000.01
},
{
"_id": "106",
"description": "DD from my employer2",
"amount": 1999.33
},
{
"_id": "107",
"description": "DD from my employer2",
"amount": 1999.33
}
}
Below, I am able to group them using the following:
{
{
"$group": {
"_id": {
"$subtract": [
{
"$trunc": "$amount"
},
{
"$mod": [
{
"$trunc": "$amount"
},
10
]
}
]
},
"results": {
"$push": "$_id"
}
}
},
{
"$redact": {
"$cond": {
"if": {
"$gt": [
{
"$size": "$results"
},
1
]
},
"then": "$$KEEP",
"else": "$$PRUNE"
}
}
},
{
"$unwind": "$results"
},
{
"$group": {
"_id": null,
"results": {
"$push": "$results"
}
}
}
}
Is there a way to include all the amounts in the group (_ids: 101, 102, and 103 plus 105,106,107) even if they have a small difference, but exclude the bonus amount, which in the sample above as _id 104?
I am looking for a simple array output of just the _ids.
Looking for the following result:
{ "result": [ "101", "102", "103", "105", "106", "107" ] }
I think it's all a bit subjective to the actual data, but if it's simply about a significant "positive" difference to the "average" payment then that is the best algorithm to apply:
db.collection.aggregate([
{ "$group": {
"_id": "$description",
"avg": { "$avg": "$amount" },
"docs": { "$push": { "_id": "$_id", "amount": "$amount" } }
}},
{ "$addFields": {
"docs": {
"$filter": {
"input": "$docs",
"as": "doc",
"cond": {
"$gt": [ "$avg", { "$subtract": [ "$$doc.amount", "$avg" ] } ]
}
}
}
}},
{ "$unwind": "$docs" },
{ "$group": {
"_id": null,
"results": { "$push": "$docs._id" }
}}
])
From your given data this would exclude the "104" amount since the difference from the amount to the average amounts from "employer 1" is greater than the average itself. This will be the case for a large "upwards" variation.
As with all "grouping" methods that rely on creating an array within the grouped document, you need to be careful in real world scenarios to not break the BSON limit.

MongoDB projection. Operator $add field|expression array awareness or after $slice

I've got collection that looks like:
[{
"org": "A",
"type": "simple",
"payFor": 3,
"price": 100
},
{
"org": "A",
"type": "custom",
"payFor": 2,
"price": 115
},
{
"org": "B",
"type": "simple",
"payFor": 1,
"price": 110
},
{
"org": "B",
"type": "custom",
"payFor": 2,
"price": 200
},
{
"org": "B",
"type": "custom",
"payFor": 4,
"price": 220
}]
I need a query that groups by "org", where the payment sum per "type" includes only the first "payFor" prices.
I'm trying to use expression result by $slice operator in $add but this is not works.
pipeline:
[{
"$group": {
"_id": {
"org": "$org",
"type": "$type"
},
"payFor": {
"$max": "$payFor"
},
"count": {
"$sum": 1
},
"prices": {
"$push": "$price"
}
}
},
{
"$group": {
"_id": "$_id.org",
"payments": {
"$push": {
"type": "$_id.type",
"forFirst": "$payFor",
"sum": {
"$cond": [
{
"$gte": [
"$payFor",
"$count"
]
},
{
"$add": {
"$prices": {
"$slice": "$count"
}
}
},
{
"$add": "$prices"
}
]
}
}
}
}
}]
I know that it is possible to unwind the prices and pick only the first "payFor" of them, but the real collections are much richer than in the example above and that operation would produce unnecessary overhead.
Need some advice from community. Please. Thanks.