Check duplicates of certain field for documents array with inner array - mongodb

I have 2 objects,
{
_id: ObjectId("5cd9010310b80b3e38cd3f88")
subGroup: [
bookList: [
{
title: "A good book",
id: "abc123"
}
]
]
}
{
_id: ObjectId("5cd9010710b80b3e38cd3f89")
subGroup: [
bookList: [
{
title: "A good book",
id: "abc123"
}
]
These are 2 different objects. I would like to detect the occurrence of these 2 objects where the title is duplicated (i.e. the same).
I tried this query
db.scope.aggregate({"$unwind": "$subGroup.bookList"}, {"$group" : { "_id": "$title", "count": { "$sum": 1 } } }, {"$match": {"id" :{ "$ne" : null } , "count" : {"$gt": 1} } })
which I found by looking at other threads on Stack Overflow. However, it does not return anything. How can I solve this?

There are a few issues here:
$unwind should be run on subGroup and on subGroup.bookList separately
when specifying _id for $group stage you should use full path (subGroup.bookList.title)
in your $match stage you want to check if _id (not id) is $ne null
Try:
db.col.aggregate([
{"$unwind": "$subGroup"},
{"$unwind": "$subGroup.bookList"},
{"$group" : { "_id": "$subGroup.bookList.title", "count": { "$sum": 1 } } },
{"$match": { "_id" :{ "$ne" : null } , "count" : { "$gt": 1} } }
])
Mongo playground

Related

mongo - count returns no document found instead of 0

In SQL query
select count(*) from table where id=1
would return 0 as the result when there isn't any record with such an id.
I would like to get exactly the same behavior but in mongo. Unfortunately I can only use aggregate function.
I was trying something like this
db.collection.aggregate([
{
"$match": {
"key": 1
}
},
{
$count: "s"
}
])
It works, but only when there are records with key:1; when this key does not exist, I get "no document found" instead.
You can use this aggregation query, which uses $facet to create two possible paths: one for when the document exists and one for when it does not exist.
First, $facet creates the two paths.
In the notFound path the result will always be {count: 0}; in the found path there is the $match.
Then $replaceRoot merges the results to get the desired value.
db.collection.aggregate([
{
"$facet": {
"notFound": [
{
"$project": {
"_id": 0,
"count": {
"$const": 0
}
}
},
{
"$limit": 1
}
],
"found": [
{
"$match": {
"key": 1
}
},
{
"$count": "count"
}
]
}
},
{
"$replaceRoot": {
"newRoot": {
"$mergeObjects": [
{
"$arrayElemAt": [
"$notFound",
0
]
},
{
"$arrayElemAt": [
"$found",
0
]
}
]
}
}
}
])
Example here where the key exists and here where the key doesn't exist.
I've also tested this using $ifNull instead of $mergeObjects, and it seems to work OK too.
I think the right way to do it is in the driver code: if you get empty results, you produce the document {"count" : 0} yourself; I don't think you need to do anything in the database.
Another solution can be this (replace the 5 with the key value you want)
Test code here
creates 2 groups: the matched (count>0) and the not matched (count=0)
sorts by {"count" : -1}
takes the first; if there was a match, count will be that of the matched group,
otherwise it will be 0
aggregate(
[ {
"$group" : {
"_id" : {
"$cond" : [ {"$eq" : [ "$key", 5 ]}, "$key", "not_match" ]
},
"count" : {
"$sum" : {"$cond" : [ {"$eq" : [ "$key", 5 ]}, 1, 0 ]}
}
}
},
{"$sort" : {"count" : -1}},
{
"$group" : {
"_id" : null,
"count" : {"$first" : "$count"}
}
},
{"$project" : {"_id" : 0}}
])
I did it by using $facet and $project. When there were no documents to project, the result was undefined, so I used the $ifNull expression. I've used zero as the replacement expression value (see the $ifNull docs).
db.collection.aggregate([
{
"$facet": {
"keyFound": [
{
"$match": {
"key": 1
}
},
{
"$count": "count"
}
]
}
},
{
"$project": {
"keyFoundCount": {
"$ifNull": [
{
"$arrayElemAt": [
"$keyFound.count",
0
]
},
0
]
}
}
}
])
testCodeHere

MongoDB group and only show results whose count is greater than 1 [duplicate]

How would I find duplicate fields in a mongo collection.
I'd like to check if any of the "name" fields are duplicates.
{
"name" : "ksqn291",
"__v" : 0,
"_id" : ObjectId("540f346c3e7fc1054ffa7086"),
"channel" : "Sales"
}
Many thanks!
Use aggregation on name and get name with count > 1:
db.collection.aggregate([
{"$group" : { "_id": "$name", "count": { "$sum": 1 } } },
{"$match": {"_id" :{ "$ne" : null } , "count" : {"$gt": 1} } },
{"$project": {"name" : "$_id", "_id" : 0} }
]);
To sort the results by most to least duplicates:
db.collection.aggregate([
{"$group" : { "_id": "$name", "count": { "$sum": 1 } } },
{"$match": {"_id" :{ "$ne" : null } , "count" : {"$gt": 1} } },
{"$sort": {"count" : -1} },
{"$project": {"name" : "$_id", "_id" : 0} }
]);
To use with another column name than "name", change "$name" to "$column_name"
You can find the list of duplicate names using the following aggregate pipeline:
Group all the records having similar name.
Match those groups having records greater than 1.
Then group again to project all the duplicate names as an array.
The Code:
db.collection.aggregate([
{$group:{"_id":"$name","name":{$first:"$name"},"count":{$sum:1}}},
{$match:{"count":{$gt:1}}},
{$project:{"name":1,"_id":0}},
{$group:{"_id":null,"duplicateNames":{$push:"$name"}}},
{$project:{"_id":0,"duplicateNames":1}}
])
o/p:
{ "duplicateNames" : [ "ksqn291", "ksqn29123213Test" ] }
The answer anhic gave can be very inefficient if you have a large database and the attribute name is present only in some of the documents.
To improve efficiency you can add a $match to the aggregation.
db.collection.aggregate(
{"$match": {"name" :{ "$ne" : null } } },
{"$group" : {"_id": "$name", "count": { "$sum": 1 } } },
{"$match": {"count" : {"$gt": 1} } },
{"$project": {"name" : "$_id", "_id" : 0} }
)
Another option is to use $sortByCount stage.
db.collection.aggregate([
{ $sortByCount: '$name' }
])
This is the combination of $group & $sort.
The $sortByCount stage is equivalent to the following $group + $sort sequence:
{ $group: { _id: <expression>, count: { $sum: 1 } } },
{ $sort: { count: -1 } }
db.getCollection('orders').aggregate([
{$group: {
_id: {name: "$name"},
uniqueIds: {$addToSet: "$_id"},
count: {$sum: 1}
}
},
{$match: {
count: {"$gt": 1}
}
}
])
First, the $group stage groups the documents according to the fields.
Then we collect the unique ids and count them. If the count is greater than 1, the field is duplicated in the collection, and that is handled by the $match stage.
This is how we can achieve this in MongoDB Compass.
In case you need to see all duplicated rows:
db.collection.aggregate([
{"$group" : { "_id": "$name", "count": { "$sum": 1 },"data": { "$push": "$$ROOT" }}},
{"$unwind": "$data"},
{"$match": {"_id" :{ "$ne" : null } , "count" : {"$gt": 1} } },
]);
If somebody is looking for a query for duplicates with an extra "$and" where clause, like "and where someOtherField is true"
The trick is to start with that other $match, because after the grouping you don't have all the data available anymore
// Do a first match before the grouping
{ $match: { "someOtherField": true }},
{ $group: {
_id: { name: "$name" },
count: { $sum: 1 }
}},
{ $match: { count: { $gte: 2 } }},
I searched for a very long time to find this notation, hope I can help somebody with the same problem
Search for duplicates in Compass Mongo db using $sortByCount
[screenshot]: https://i.stack.imgur.com/L85QV.png
Sometimes you want to find duplicates regardless of case, for instance when you want to create a case-insensitive index. In this case you can use this aggregation pipeline:
db.collection.aggregate([
{'$group': {'_id': {'$toLower': '$name'}, 'count': { '$sum': 1 }, 'duplicates': { '$push': '$$ROOT' } } },
{'$match': { 'count': { '$gt': 1 } } }
]);
Explanation:
group by name but first change the case to lower case and push the docs to the duplicates array.
match those groups having records greater than 1 (the duplicates).

MongoDB - Cannot divide by zero error [duplicate]

Given the following record in my MongoDB table:
{
"_id" : ObjectId("5a00c1c71680084c55811ae2"),
"name" : "test",
"tenantId" : "paul",
"price" : 300,
"deposits" : [
{
"amount" : 100,
"date" : ISODate("2017-11-07T14:08:19.324Z"),
"_id" : ObjectId("5a01be55424b0f8922a5b472")
},
{
"amount" : 50,
"date" : ISODate("2017-11-87T14:08:19.324Z"),
"_id" : ObjectId("5a01be55424b0f8922a5b473")
}
],
"attention" : "",
"due" : ISODate("2017-10-26T22:00:00.000Z")
}
I would like to filter all the records with a specific tenantId, and then subtract the SUM of my amounts in the subdocument.
I found out how to Sum the Subdocument:
db.table.aggregate( [
{ $match : { tenantId: "paul" } },
{ $unwind:{ path: "$deposits", preserveNullAndEmptyArrays: true }},
{ $group: {
_id: '$_id',
deposits: { $sum: '$deposits.amount' },
} }
] );
but when I try to subtract the $sum from $price like
deposits: { $subtract: [ $price , $sum: '$deposits.amount' ] },
then I get an error saying
Error: Line 6: Unexpected token :
Actually you can simply do:
db.table.aggregate( [
{ "$match" : { "tenantId": "paul" } },
//{ $unwind:{ path: "$deposits", preserveNullAndEmptyArrays: true }},
{ "$project": {
"deposits": { "$subtract": ["$price", { "$sum": "$deposits.amount" } ] }
}}
])
Since MongoDB 3.2 you can actually $project with $sum and an array of arguments ( or an array ) and therefore do not need to $unwind at all.
Changed in version 3.2: $sum is available in the $group and $project stages. In previous versions of MongoDB, $sum is available in the $group stage only.
When used in the $project stage, $sum returns the sum of the specified expression or list of expressions for each document ...
The "long" way, which is the "old" way is to actually use $unwind, but you would then actually add a $project following the $group:
db.table.aggregate( [
{ "$match" : { "tenantId": "paul" } },
{ $unwind:{ path: "$deposits", preserveNullAndEmptyArrays: true }},
{ "$group": {
"_id": "$_id",
"price": { "$first": "$price" },
"deposits": { "$sum": "$deposits.amount" }
}},
{ "$project": {
"deposits": { "$subtract": [ "$price", "$deposits" ] }
}}
])
And of course you then need the $first accumulator in order to return the "price" field from the $group stage so it can be used in the following stage.
But if you can do preserveNullAndEmptyArrays, then you actually have MongoDB 3.2, and therefore are better off using the statement without the $unwind at all, since it's much faster to do it that way.

Find duplicate records in MongoDB

How would I find duplicate fields in a mongo collection.
I'd like to check if any of the "name" fields are duplicates.
{
"name" : "ksqn291",
"__v" : 0,
"_id" : ObjectId("540f346c3e7fc1054ffa7086"),
"channel" : "Sales"
}
Many thanks!
Use aggregation on name and get name with count > 1:
db.collection.aggregate([
{"$group" : { "_id": "$name", "count": { "$sum": 1 } } },
{"$match": {"_id" :{ "$ne" : null } , "count" : {"$gt": 1} } },
{"$project": {"name" : "$_id", "_id" : 0} }
]);
To sort the results by most to least duplicates:
db.collection.aggregate([
{"$group" : { "_id": "$name", "count": { "$sum": 1 } } },
{"$match": {"_id" :{ "$ne" : null } , "count" : {"$gt": 1} } },
{"$sort": {"count" : -1} },
{"$project": {"name" : "$_id", "_id" : 0} }
]);
To use with another column name than "name", change "$name" to "$column_name"
You can find the list of duplicate names using the following aggregate pipeline:
Group all the records having similar name.
Match those groups having records greater than 1.
Then group again to project all the duplicate names as an array.
The Code:
db.collection.aggregate([
{$group:{"_id":"$name","name":{$first:"$name"},"count":{$sum:1}}},
{$match:{"count":{$gt:1}}},
{$project:{"name":1,"_id":0}},
{$group:{"_id":null,"duplicateNames":{$push:"$name"}}},
{$project:{"_id":0,"duplicateNames":1}}
])
o/p:
{ "duplicateNames" : [ "ksqn291", "ksqn29123213Test" ] }
The answer anhic gave can be very inefficient if you have a large database and the attribute name is present only in some of the documents.
To improve efficiency you can add a $match to the aggregation.
db.collection.aggregate(
{"$match": {"name" :{ "$ne" : null } } },
{"$group" : {"_id": "$name", "count": { "$sum": 1 } } },
{"$match": {"count" : {"$gt": 1} } },
{"$project": {"name" : "$_id", "_id" : 0} }
)
Another option is to use $sortByCount stage.
db.collection.aggregate([
{ $sortByCount: '$name' }
])
This is the combination of $group & $sort.
The $sortByCount stage is equivalent to the following $group + $sort sequence:
{ $group: { _id: <expression>, count: { $sum: 1 } } },
{ $sort: { count: -1 } }
db.getCollection('orders').aggregate([
{$group: {
_id: {name: "$name"},
uniqueIds: {$addToSet: "$_id"},
count: {$sum: 1}
}
},
{$match: {
count: {"$gt": 1}
}
}
])
First, the $group stage groups the documents according to the fields.
Then we collect the unique ids and count them. If the count is greater than 1, the field is duplicated in the collection, and that is handled by the $match stage.
This is how we can achieve this in MongoDB Compass.
In case you need to see all duplicated rows:
db.collection.aggregate([
{"$group" : { "_id": "$name", "count": { "$sum": 1 },"data": { "$push": "$$ROOT" }}},
{"$unwind": "$data"},
{"$match": {"_id" :{ "$ne" : null } , "count" : {"$gt": 1} } },
]);
If somebody is looking for a query for duplicates with an extra "$and" where clause, like "and where someOtherField is true"
The trick is to start with that other $match, because after the grouping you don't have all the data available anymore
// Do a first match before the grouping
{ $match: { "someOtherField": true }},
{ $group: {
_id: { name: "$name" },
count: { $sum: 1 }
}},
{ $match: { count: { $gte: 2 } }},
I searched for a very long time to find this notation, hope I can help somebody with the same problem
Search for duplicates in Compass Mongo db using $sortByCount
[screenshot]: https://i.stack.imgur.com/L85QV.png
Sometimes you want to find duplicates regardless of case, for instance when you want to create a case-insensitive index. In this case you can use this aggregation pipeline:
db.collection.aggregate([
{'$group': {'_id': {'$toLower': '$name'}, 'count': { '$sum': 1 }, 'duplicates': { '$push': '$$ROOT' } } },
{'$match': { 'count': { '$gt': 1 } } }
]);
Explanation:
group by name but first change the case to lower case and push the docs to the duplicates array.
match those groups having records greater than 1 (the duplicates).

Selecting Distinct values from Array in MongoDB

I have a collection named Alpha_Num with the following structure. I am trying to find out which Alphabet-Numerals pair appears the maximum number of times.
If we just go with the data below, the pair abcd-123 appears twice, and so does the pair efgh-10001, but the second one is not a valid case for me, as it appears within the same document.
{
"_id" : 12345,
"Alphabet" : "abcd",
"Numerals" : [
"123",
"456",
"2345"
]
}
{
"_id" : 123456,
"Alphabet" : "efgh",
"Numerals" : [
"10001",
"10001",
"1002"
]
}
{
"_id" : 123456567,
"Alphabet" : "abcd",
"Numerals" : [
"123"
]
}
I tried to use the aggregation framework, with something like the query below:
db.Alpha_Num.aggregate([
{"$unwind":"$Numerals"},
{"$group":
{"_id":{"Alpha":"$Alphabet","Num":"$Numerals"},
"count":{$sum:1}}
},
{"$sort":{"count":-1}}
])
The problem with this query is that it counts the pair efgh-10001 twice.
Question: How do I select distinct values from the array "Numerals" in the above situation?
Problem solved.
db.Alpha_Num.aggregate([{
"$unwind": "$Numerals"
}, {
"$group": {
_id: {
"_id": "$_id",
"Alpha": "$Alphabet"
},
Num: {
$addToSet: "$Numerals"
}
}
}, {
"$unwind": "$Num"
}, {
"$group": {
_id: {
"Alplha": "$_id.Alpha",
"Num": "$Num"
},
count: {
"$sum": 1
}
}
}])
Grouping using $addToSet and unwinding again did the trick. I got the answer from one of the 10gen online courses.