How to remove duplicated documents in MongoDB 4.0

Let's say I have the following documents in the collection sample:
{_id: 1, comp_index1: "one", comp_index2: "AAA", field: "lots of text" }
{_id: 2, comp_index1: "two", comp_index2: "BBB", field: "mucho texto" }
{_id: 3, comp_index1: "one", comp_index2: "CCC", field: "more text" }
{_id: 4, comp_index1: "two", comp_index2: "AAA", field: "más texto" }
{_id: 5, comp_index1: "one", comp_index2: "AAA", field: "lots of text" }
I want to make comp_index1 and comp_index2 an actual unique compound index.
If I run db.sample.createIndex( { comp_index1: 1, comp_index2: 1 }, { unique: true } ) it throws E11000 duplicate key error collection, so I decided to remove the duplicates first (since the dropDups option has been removed).
Right now I have this brute force algorithm that does the job:
db.sample.aggregate([
  {
    $group: {
      _id: {
        comp_index1: "$comp_index1",
        comp_index2: "$comp_index2"
      },
      count: { $sum: 1 }
    }
  },
  {
    $match: { count: { $gt: 1 } }
  }
], { allowDiskUse: true }).forEach(function (doc) {
  var i;
  for (i = 1; i < doc.count; i++) {
    db.sample.remove(
      {
        comp_index1: doc._id.comp_index1,
        comp_index2: doc._id.comp_index2
      },
      { justOne: true }
    );
  }
  print("Removed " + (i - 1) + " dups of <" + doc._id.comp_index1 + " " + doc._id.comp_index2 + ">");
});
The problem is that I have over 1.4M documents and almost 200,000 duplicates, so this takes forever to finish. I was wondering whether there is a faster, better approach.

After several hours I finally managed to come up with a solution that is roughly 1000 times faster.
var ids = [];
db.sample.aggregate([
  {
    $group: {
      _id: {
        comp_index1: "$comp_index1",
        comp_index2: "$comp_index2"
      },
      unique_ids: { $addToSet: "$_id" },
      count: { $sum: 1 }
    }
  },
  {
    $match: { count: { $gt: 1 } }
  }
], { allowDiskUse: true }).forEach(function (doc) {
  var i = 0;
  doc.unique_ids.forEach(function (id) {
    if (i++ > 0) ids.push(id);
  });
});
db.sample.remove({ "_id": { $in: ids } });
Despite being the same overall approach, collecting all the ids to remove in RAM and then performing a single remove with the $in operator is much faster. This one took only a few seconds to execute.
If you come up with another solution that does not require holding everything in RAM, please share.
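One possible compromise, sketched here and untested: stream the same aggregation and delete in fixed-size batches with deleteMany, so only one batch of ids is held in RAM at a time (the batch size of 1000 is an arbitrary assumption):
var batch = [];
db.sample.aggregate([
  {
    $group: {
      _id: { comp_index1: "$comp_index1", comp_index2: "$comp_index2" },
      unique_ids: { $addToSet: "$_id" },
      count: { $sum: 1 }
    }
  },
  { $match: { count: { $gt: 1 } } }
], { allowDiskUse: true }).forEach(function (doc) {
  // keep the first id of each group, queue the rest for deletion
  batch = batch.concat(doc.unique_ids.slice(1));
  if (batch.length >= 1000) {
    db.sample.deleteMany({ _id: { $in: batch } });
    batch = [];
  }
});
if (batch.length > 0) {
  db.sample.deleteMany({ _id: { $in: batch } });
}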

I recently created some code to delete duplicated documents from MongoDB; this should work:
const query = [
  {
    $group: {
      _id: {
        comp_index1: "$comp_index1",
        comp_index2: "$comp_index2"
      },
      dups: { $addToSet: "$_id" },
      count: { $sum: 1 }
    }
  },
  {
    $match: {
      count: { $gt: 1 }
    }
  }
];
const cursor = collection.aggregate(query).cursor({ batchSize: 10 }).exec();
cursor.eachAsync(async (doc) => {
  doc.dups.shift(); // keep the first element, delete the rest
  await Promise.all(
    doc.dups.map((dupId) => collection.findByIdAndDelete(dupId))
  );
});
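A possible micro-optimization, which is my assumption and not part of the original answer: instead of issuing one findByIdAndDelete per duplicate, remove each group's duplicates with a single deleteMany:
cursor.eachAsync(async (doc) => {
  doc.dups.shift(); // keep one document per group
  await collection.deleteMany({ _id: { $in: doc.dups } });
});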

Related

MongoDB: How to speed up my data reorganisation query/operation?

I'm trying to analyse some data, and I thought my queries would ultimately be faster if I stored a relationship between my collections instead. So I wrote something to do the data normalisation, which is as follows:
var count = 0;
db.Interest.find({ 'PersonID': { $exists: false }, 'Data.DateOfBirth': { $ne: null } })
  .toArray()
  .forEach(function (x) {
    if (null != x.Data.DateOfBirth) {
      var peep = { 'Name': x.Data.Name, 'BirthMonth': x.Data.DateOfBirth.Month, 'BirthYear': x.Data.DateOfBirth.Year };
      var person = db.People.findOne(peep);
      if (null == person) {
        peep._id = db.People.insertOne(peep).insertedId;
        //print(peep._id);
      } else {
        peep._id = person._id; // reuse the existing person's id
      }
      db.Interest.updateOne({ '_id': x._id }, { $set: { 'PersonID': peep._id } });
      ++count;
      if ((count % 1000) == 0) {
        print(count + ' updated');
      }
    }
  })
This script is just passed to mongo.exe.
Basically, I attempt to find an existing person; if they don't exist, I create them. In either case, I link the originating record with the individual person.
However, this is very slow! There are about 10 million documents, and at the current rate it will take about 5 days to complete.
Can I speed this up simply? I know I can multithread it to cut it down, but have I missed something?
In order to insert new persons into the People collection, use this pipeline:
db.Interest.aggregate([
  {
    $project: {
      Name: "$Data.Name",
      BirthMonth: "$Data.DateOfBirth.Month",
      BirthYear: "$Data.DateOfBirth.Year",
      _id: 0
    }
  },
  {
    $merge: {
      into: "People",
      // requires a unique index on { Name: 1, BirthMonth: 1, BirthYear: 1 }
      on: ["Name", "BirthMonth", "BirthYear"]
    }
  }
])
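For completeness, the unique index mentioned in the comment could be created beforehand like this (a sketch; field names as used in the pipeline above):
db.People.createIndex(
  { Name: 1, BirthMonth: 1, BirthYear: 1 },
  { unique: true }
)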
For updating PersonID in Interest collection use this pipeline:
db.Interest.aggregate([
  {
    $lookup: {
      from: "People",
      let: {
        name: "$Data.Name",
        month: "$Data.DateOfBirth.Month",
        year: "$Data.DateOfBirth.Year"
      },
      pipeline: [
        {
          $match: {
            $expr: {
              $and: [
                { $eq: ["$Name", "$$name"] },
                { $eq: ["$BirthMonth", "$$month"] },
                { $eq: ["$BirthYear", "$$year"] }
              ]
            }
          }
        },
        { $project: { _id: 1 } }
      ],
      as: "interests"
    }
  },
  {
    $set: {
      PersonID: { $first: "$interests._id" },
      interests: "$$REMOVE"
    }
  },
  { $merge: { into: "Interest" } }
])

Mongo: Upsert to copy all static fields to new document atomically

I'm using the Bucket Pattern to limit documents' array size to maxBucketSize elements. Once a document's array is full (bucketSize = maxBucketSize), the next update will create a new document with a new array to hold more elements, using upsert.
How would you copy the static fields (see recordType and recordDesc below) from the last full bucket with a single call?
Sample document:
// records collection
{
  recordId: 12345, // non-unique index
  recordType: "someType",
  recordDesc: "Some record description.",
  elements: [ { a: 1, b: 2 }, { a: 3, b: 4 } ],
  bucketSize: 2
}
Bucket implementation (copies only queried fields):
const maxBucketSize = 2;
db.collection('records').updateOne(
  {
    recordId: 12345,
    bucketSize: { $lt: maxBucketSize } // false
  },
  {
    $push: { elements: { a: 5, b: 6 } },
    $inc: { bucketSize: 1 },
    $setOnInsert: {
      // Should be executed because the bucketSize condition is false,
      // but the fields `recordType` and `recordDesc` are inaccessible:
      // no documents matched and they were not included in the query
    }
  },
  { upsert: true }
)
Possible solution
To make this work, I can always make two calls: findOne() to get the static values, and then updateOne() where I set those fields with $setOnInsert. But that's inefficient.
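For concreteness, a sketch of that two-call workaround (assuming the Node.js driver and the sample document above; error handling omitted):
// call 1: read the static fields from the last bucket for this recordId
const last = await db.collection('records').findOne(
  { recordId: 12345 },
  { sort: { $natural: -1 } }
);
// call 2: the original upsert, now able to fill $setOnInsert
await db.collection('records').updateOne(
  { recordId: 12345, bucketSize: { $lt: maxBucketSize } },
  {
    $push: { elements: { a: 5, b: 6 } },
    $inc: { bucketSize: 1 },
    $setOnInsert: {
      recordType: last.recordType,
      recordDesc: last.recordDesc
    }
  },
  { upsert: true }
);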
How can I do this in one call with an aggregation: examine the last document matching recordId (indexed), evaluate whether its array is full, and add a new document if so?
Attempt:
// Evaluate the last document added
db.collection('records').findOneAndUpdate(
  { recordId: 12345 },
  {
    $push: {
      elements: {
        $cond: {
          if: { $lt: ['$bucketSize', maxBucketSize] },
          then: { a: 5, b: 6 }, else: null
        }
      }
    },
    $inc: {
      bucketSize: {
        $cond: {
          if: { $lt: ['$bucketSize', maxBucketSize] },
          then: 1, else: 0
        }
      }
    },
    $setOnInsert: {
      recordType: '$recordType',
      recordDesc: '$recordDesc'
    }
  },
  {
    sort: { $natural: -1 }, // last document
    upsert: true // Bucket overflow
  }
)
This comes back with:
MongoError: Cannot increment with non-numeric argument: { bucketSize: { $cond: { if: { $lt: [ "$bucketSize", 2 ] }, then: 1, else: 0 } }}
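The error occurs because classic update operators such as $inc and $push do not evaluate aggregation expressions like $cond. As a hedged sketch (assuming MongoDB 4.2+), the conditional logic would have to move into a pipeline-style update, where $set does accept expressions; note that this form alone does not solve the upsert-and-copy-static-fields part of the question:
db.collection('records').updateOne(
  { recordId: 12345 },
  [
    {
      $set: {
        // append the element only while the bucket is not full
        elements: {
          $cond: [
            { $lt: ["$bucketSize", maxBucketSize] },
            { $concatArrays: ["$elements", [{ a: 5, b: 6 }]] },
            "$elements"
          ]
        },
        bucketSize: {
          $cond: [
            { $lt: ["$bucketSize", maxBucketSize] },
            { $add: ["$bucketSize", 1] },
            "$bucketSize"
          ]
        }
      }
    }
  ]
)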

MongoDB aggregate match and group documents, with additional check

So I have 2 models: Question and Answer.
Answer has: questionId, userId, answer (String).
I need an aggregation pipeline that will:
match all answers by questionId
see if the current user already voted (is his id in matched documents)
group answers and count them
I implemented 1 and 3 like this:
const q = ObjectId('5d6e52a68558b63fb9302efd');
const user = ObjectId('5d0b3f7daceeb50c477b49e0');
Answer.aggregate([
  { $match: { questionId: q } },
  { $group: { _id: '$answer', count: { $sum: 1 } } },
])
I am missing a step between those two aggregation stages, where I would iterate through the matched documents and check whether userId matches user.
I would like to get some object like this:
{
didIVote: true,
result: [ { _id: 'YES', count: 5 }, { _id: 'NO', count: 2 } ]
}
Or maybe even like this:
[
{ _id: 'YES', count: 5, didIVote: true },
{ _id: 'NO', count: 2, didIVote: false },
]
In the $group stage, create an array with the users that voted for each answer. Then add an additional $project stage to check if the user is in the array.
const q = ObjectId('5d6e52a68558b63fb9302efd');
const user = ObjectId('5d0b3f7daceeb50c477b49e0');
Answer.aggregate([
  { $match: { questionId: q } },
  {
    $group: {
      _id: '$answer',
      count: { $sum: 1 },
      voted: { $addToSet: "$userId" }
    }
  },
  {
    $project: {
      count: 1,
      didIVote: { $in: [user, "$voted"] }
    }
  }
]);
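If you prefer the first output shape (a single object with didIVote at the top level), one more $group stage appended to the pipeline above could collapse the result; a sketch, relying on false sorting before true so that $max works on booleans:
{
  $group: {
    _id: null,
    didIVote: { $max: "$didIVote" }, // true if the user voted for any answer
    result: { $push: { _id: "$_id", count: "$count" } }
  }
}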

Count words of nested subschema field of a MongoDB collection

I am working on data analysis of CV data in a large MongoDB collection. I am trying to count the absolute frequency of words in the job title (the jobs.jobTitle field in the schema below).
The documents are structured like this:
{
  firstName: String,
  lastName: String,
  jobs: [
    { jobTitle: 'software architect', company: String, ... },
    { jobTitle: 'full stack software engineer', company: String, ... },
    { jobTitle: 'javascript developer', company: String, ... }
  ],
  ...
}
I would like to iterate over the entire collection and get an outcome like this:
[{word: 'manager', count: 3245},{word: 'engineer', count: 3102}, {word: 'software', count: 3021}, ..]
I tried the following aggregation:
db.cvs.aggregate([
  {
    $project: {
      words: { $split: ["$jobs.jobTitle", " "] }
    }
  },
  {
    $unwind: { path: "$words" }
  },
  {
    $group: {
      _id: "$words",
      count: { $sum: 1 }
    }
  },
  { $sort: { "count": -1 } }
])
Which results in the following error message:
$split requires an expression that evaluates to a string as a first argument, found: array
Can I first concatenate the string values of jobs.jobTitle into one string using an aggregation? Or is there any other way to achieve the expected result?
Thanks for the quick comment, @NeilLunn.
I would like to share the corrected query with everyone:
db.cvs.aggregate([
  { "$unwind": "$jobs" },
  {
    $project: {
      words: { $split: ["$jobs.jobTitle", " "] }
    }
  },
  {
    $unwind: { path: "$words" }
  },
  {
    $group: {
      _id: "$words",
      count: { $sum: 1 }
    }
  },
  { $sort: { "count": -1 } }
])
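One possible refinement, which is my assumption and not part of the original answer: lower-case the titles before splitting, so that 'Software' and 'software' are counted as the same word:
{
  $project: {
    words: { $split: [{ $toLower: "$jobs.jobTitle" }, " "] }
  }
}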

MongoDB Update array element (document with a key) if exists, else push

I have such a schema:
doc:
{
  // some fields
  visits: [
    {
      userID: Int32,
      time: Int64
    }
  ]
}
I want to first check whether a specific userID exists: if not, push a document with that userID and the system time; otherwise, just update the time value. I know that neither $push nor $addToSet can do that. Using $ with upsert: true doesn't work either, because the official documentation advises that the DB will use $ as a field name instead of an operator when upserting.
Please guide me on this. Thanks
You can use $addToSet to add an item to the array and $set to update an existing item in this array.
The following will add a new item to the array if the userID is not found in the array :
db.doc.update({
  visits: {
    "$not": {
      "$elemMatch": {
        "userID": 4
      }
    }
  }
}, {
  $addToSet: {
    visits: {
      "userID": 4,
      "time": 1482607614
    }
  }
}, { multi: true });
The following will update the subdocument array item if it matches the userID:
db.doc.update({ "visits.userID": 2 }, {
$set: {
"visits.$.time": 1482607614
}
}, { multi: true });
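If you want to send both updates in a single round trip, a hedged sketch using bulkWrite (shell syntax, reusing the filters and updates above with userID 4 in both branches; at most one of the two operations will match):
db.doc.bulkWrite([
  {
    updateOne: {
      filter: { visits: { $not: { $elemMatch: { userID: 4 } } } },
      update: { $addToSet: { visits: { userID: 4, time: 1482607614 } } }
    }
  },
  {
    updateOne: {
      filter: { "visits.userID": 4 },
      update: { $set: { "visits.$.time": 1482607614 } }
    }
  }
])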
const p = await Transaction.findOneAndUpdate(
  {
    _id: data.id,
    'products.id': { $nin: [product.id] },
  },
  {
    $inc: {
      actualCost: product.mrp,
    },
    $push: {
      products: { id: product.id },
    },
  },
  { new: true }
);
or
db.collection.aggregate([
  { "$match": { "_id": 1 } },
  { "$match": { "sizes.id": { "$nin": [7] } } },
  { "$set": { "price": 20 } }
])
https://mongoplayground.net/p/BguFa6E9Tra
I know it's very late, but it may help others. Starting from MongoDB 4.4, we can use $function to implement our own logic in a custom function. Alternatively, we can use a bulk operation to achieve this output.
Assuming the existing data is as below:
{
  "_id": ObjectId("62de4e31daa9b8acd56656ba"),
  "entrance": "Entrance1",
  "visits": [
    { "userId": 1, "time": 1658736074 },
    { "userId": 2, "time": 1658736671 }
  ]
}
Solution 1: Using a custom function
db.visitors.updateMany(
  { _id: ObjectId('62de4e31daa9b8acd56656ba') },
  [
    {
      $set: {
        visits: {
          $function: {
            lang: "js",
            args: ["$visits"],
            body: function(visits) {
              // drop any existing entry for this user, then append the new visit
              let v = []
              let input = { userId: 3, time: Math.floor(Date.now() / 1000) };
              if (Array.isArray(visits)) {
                v = visits.filter(x => x.userId != input.userId)
              }
              v.push(input)
              return v;
            }
          }
        }
      }
    }
  ]
)
In Node.js, the function body should be passed as a string enclosed in backtick (`) characters:
...
lang: 'js',
args: ["$visits"],
body: `function(visits) {
  let v = []
  let input = { userId: 3, time: Math.floor(Date.now() / 1000) };
  if (Array.isArray(visits)) {
    v = visits.filter(x => x.userId != input.userId)
  }
  v.push(input)
  return v;
}`
...
Solution 2: Using bulk operation:
Please note that the time here will be stored as an ISODate.
var bulkOp = db.visitors.initializeOrderedBulkOp()
bulkOp.find({ _id: ObjectId('62de4e31daa9b8acd56656ba') }).updateOne({$pull: { visits: { userId: 2 }} });
bulkOp.find({ _id: ObjectId('62de4e31daa9b8acd56656ba') }).updateOne({$push: {visits: {userId: 2, time: new Date()}}})
bulkOp.execute()