I have approximately 1.7M documents in mongodb (in future 10m+). Some of them represent duplicate entry which I do not want. Structure of document is something like this:
{
_id: 14124412,
nodes: [
12345,
54321
],
name: "Some beauty"
}
Document is duplicate if it has at least one node same as another document with same name. What is the fastest way to remove duplicates?
dropDups: true option is not available in 3.0.
I have solution with aggregation framework for collecting duplicates and then removing in one go.
It might be somewhat slower than system level "index" changes. But it is good by considering way you want to remove duplicate documents.
a. Remove all documents in one go
var duplicates = [];
db.collectionName.aggregate([
{ $match: {
name: { "$ne": '' } // discard selection criteria
}},
{ $group: {
_id: { name: "$name"}, // can be grouped on multiple properties
dups: { "$addToSet": "$_id" },
count: { "$sum": 1 }
}},
{ $match: {
count: { "$gt": 1 } // Duplicates considered as count greater than one
}}
],
{allowDiskUse: true} // For faster processing if set is larger
) // You can display result until this and check duplicates
.forEach(function(doc) {
doc.dups.shift(); // First element skipped for deleting
doc.dups.forEach( function(dupId){
duplicates.push(dupId); // Getting all duplicate ids
}
)
})
// If you want to Check all "_id" which you are deleting else print statement not needed
printjson(duplicates);
// Remove all duplicates in one go
db.collectionName.remove({_id:{$in:duplicates}})
b. You can delete documents one by one.
db.collectionName.aggregate([
// discard selection criteria, You can remove "$match" section if you want
{ $match: {
source_references.key: { "$ne": '' }
}},
{ $group: {
_id: { source_references.key: "$source_references.key"}, // can be grouped on multiple properties
dups: { "$addToSet": "$_id" },
count: { "$sum": 1 }
}},
{ $match: {
count: { "$gt": 1 } // Duplicates considered as count greater than one
}}
],
{allowDiskUse: true} // For faster processing if set is larger
) // You can display result until this and check duplicates
.forEach(function(doc) {
doc.dups.shift(); // First element skipped for deleting
db.collectionName.remove({_id : {$in: doc.dups }}); // Delete remaining duplicates
})
Assuming you want to permanently delete docs that contain a duplicate name + nodes entry from the collection, you can add a unique index with the dropDups: true option:
db.test.ensureIndex({name: 1, nodes: 1}, {unique: true, dropDups: true})
As the docs say, use extreme caution with this as it will delete data from your database. Back up your database first in case it doesn't do exactly as you're expecting.
UPDATE
This solution is only valid through MongoDB 2.x as the dropDups option is no longer available in 3.0 (docs).
Create collection dump with mongodump
Clear collection
Add unique index
Restore collection with mongorestore
I found this solution that works with MongoDB 3.4:
I'll assume the field with duplicates is called fieldX
db.collection.aggregate([
{
// only match documents that have this field
// you can omit this stage if you don't have missing fieldX
$match: {"fieldX": {$nin:[null]}}
},
{
$group: { "_id": "$fieldX", "doc" : {"$first": "$$ROOT"}}
},
{
$replaceRoot: { "newRoot": "$doc"}
}
],
{allowDiskUse:true})
Being new to mongoDB, I spent a lot of time and used other lengthy solutions to find and delete duplicates. However, I think this solution is neat and easy to understand.
It works by first matching documents that contain fieldX (I had some documents without this field, and I got one extra empty result).
The next stage groups documents by fieldX, and only inserts the $first document in each group using $$ROOT. Finally, it replaces the whole aggregated group by the document found using $first and $$ROOT.
I had to add allowDiskUse because my collection is large.
You can add this after any number of pipelines, and although the documentation for $first mentions a sort stage prior to using $first, it worked for me without it. " couldnt post a link here, my reputation is less than 10 :( "
You can save the results to a new collection by adding an $out stage...
Alternatively, if one is only interested in a few fields e.g. field1, field2, and not the whole document, in the group stage without replaceRoot:
db.collection.aggregate([
{
// only match documents that have this field
$match: {"fieldX": {$nin:[null]}}
},
{
$group: { "_id": "$fieldX", "field1": {"$first": "$$ROOT.field1"}, "field2": { "$first": "$field2" }}
}
],
{allowDiskUse:true})
The following Mongo aggregation pipeline does the deduplication and outputs it back to the same or different collection.
collection.aggregate([
{ $group: {
_id: '$field_to_dedup',
doc: { $first: '$$ROOT' }
} },
{ $replaceRoot: {
newRoot: '$doc'
} },
{ $out: 'collection' }
], { allowDiskUse: true })
My DB had millions of duplicate records. #somnath's answer did not work as is so writing the solution that worked for me for people looking to delete millions of duplicate records.
/** Create a array to store all duplicate records ids*/
var duplicates = [];
/** Start Aggregation pipeline*/
db.collection.aggregate([
{
$match: { /** Add any filter here. Add index for filter keys*/
filterKey: {
$exists: false
}
}
},
{
$sort: { /** Sort it in such a way that you want to retain first element*/
createdAt: -1
}
},
{
$group: {
_id: {
key1: "$key1", key2:"$key2" /** These are the keys which define the duplicate. Here document with same value for key1 and key2 will be considered duplicate*/
},
dups: {
$push: {
_id: "$_id"
}
},
count: {
$sum: 1
}
}
},
{
$match: {
count: {
"$gt": 1
}
}
}
],
{
allowDiskUse: true
}).forEach(function(doc){
doc.dups.shift();
doc.dups.forEach(function(dupId){
duplicates.push(dupId._id);
})
})
/** Delete the duplicates*/
var i,j,temparray,chunk = 100000;
for (i=0,j=duplicates.length; i<j; i+=chunk) {
temparray = duplicates.slice(i,i+chunk);
db.collection.bulkWrite([{deleteMany:{"filter":{"_id":{"$in":temparray}}}}])
}
Here is a slightly more 'manual' way of doing it:
Essentially, first, get a list of all the unique keys you are interested.
Then perform a search using each of those keys and delete if that search returns bigger than one.
db.collection.distinct("key").forEach((num)=>{
var i = 0;
db.collection.find({key: num}).forEach((doc)=>{
if (i) db.collection.remove({key: num}, { justOne: true })
i++
})
});
tips to speed up, when only small portion of your documents are duplicated:
you need an index on the field to detect duplicates.
$group does not use the index, but it can take advantage of $sort and $sort use the index. so you should put a $sort step at the beginning
do inplace delete_many() instead of $out to new collection, this will save lots of IO time and disk space.
if you use pymongo you can do:
index_uuid = IndexModel(
[
('uuid', pymongo.ASCENDING)
],
)
col.create_indexes([index_uuid])
pipeline = [
{"$sort": {"uuid":1}},
{
"$group": {
"_id": "$uuid",
"dups": {"$addToSet": "$_id"},
"count": {"$sum": 1}
}
},
{
"$match": {"count": {"$gt": 1}}
},
]
it_cursor = col.aggregate(
pipeline, allowDiskUse=True
)
# skip 1st dup of each dups group
dups = list(itertools.chain.from_iterable(map(lambda x: x["dups"][1:], it_cursor)))
col.delete_many({"_id":{"$in": dups}})
performance
I test it on a database contain 30M documents and 1TB large.
Without index/sort it takes more than an hour to get the cursor (I do not even have the patient to wait for it).
with index/sort but use $out to output to a new collection. This is safer if your filesystem does not support snapshot. But it requires lots of disk space and takes more than 40mins to finish despite the fact that we are using SSDs. It will be much slower if you are on HDD RAID.
with index/sort and inplace delete_many, it takes around 5mins in total.
The following method merges documents with the same name while only keeping the unique nodes without duplicating them.
I found using the $out operator to be a simple way. I unwind the array and then group it by adding to set. The $out operator allows the aggregation result to persist [docs].
If you put the name of the collection itself it will replace the collection with the new data. If the name does not exist it will create a new collection.
Hope this helps.
allowDiskUse may have to be added to the pipeline.
db.collectionName.aggregate([
{
$unwind:{path:"$nodes"},
},
{
$group:{
_id:"$name",
nodes:{
$addToSet:"$nodes"
}
},
{
$project:{
_id:0,
name:"$_id.name",
nodes:1
}
},
{
$out:"collectionNameWithoutDuplicates"
}
])
Using pymongo this should work.
Add the fields that need to be unique for the collection in unique_field
unique_field = {"field1":"$field1","field2":"$field2"}
cursor = DB.COL.aggregate([{"$group":{"_id":unique_field, "dups":{"$push":"$uuid"}, "count": {"$sum": 1}}},{"$match":{"count": {"$gt": 1}}},{"$group":"_id":None,"dups":{"$addToSet":{"$arrayElemAt":["$dups",1]}}}}],allowDiskUse=True)
slice the dups array depending on the duplications count(here i had only one extra duplicate for all)
items = list(cursor)
removeIds = items[0]['dups']
hold.remove({"uuid":{"$in":removeIds}})
I don't know whether is it going to answer main question, but for others it'll be usefull.
1.Query the duplicate row using findOne() method and store it as an object.
const User = db.User.findOne({_id:"duplicateid"});
2.Execute deleteMany() method to remove all the rows with the id "duplicateid"
db.User.deleteMany({_id:"duplicateid"});
3.Insert the values stored in User object.
db.User.insertOne(User);
Easy and fast!!!!
First, you can find all the duplicates and remove those duplicates in the DB. Here we take the id column to check and remove duplicates.
db.collection.aggregate([
{ "$group": { "_id": "$id", "count": { "$sum": 1 } } },
{ "$match": { "_id": { "$ne": null }, "count": { "$gt": 1 } } },
{ "$sort": { "count": -1 } },
{ "$project": { "name": "$_id", "_id": 0 } }
]).then(data => {
var dr = data.map(d => d.name);
console.log("duplicate Recods:: ", dr);
db.collection.remove({ id: { $in: dr } }).then(removedD => {
console.log("Removed duplicate Data:: ", removedD);
})
})
General idea is to use findOne https://docs.mongodb.com/manual/reference/method/db.collection.findOne/
to retrieve one random id from the duplicate records in the collection.
Delete all the records in the collection other than the random-id that we retrieved from findOne option.
You can do something like this if you are trying to do it in pymongo.
def _run_query():
try:
for record in (aggregate_based_on_field(collection)):
if not record:
continue
_logger.info("Working on Record %s", record)
try:
retain = db.collection.find_one(find_one({'fie1d1': 'x', 'field2':'y'}, {'_id': 1}))
_logger.info("_id to retain from duplicates %s", retain['_id'])
db.collection.remove({'fie1d1': 'x', 'field2':'y', '_id': {'$ne': retain['_id']}})
except Exception as ex:
_logger.error(" Error when retaining the record :%s Exception: %s", x, str(ex))
except Exception as e:
_logger.error("Mongo error when deleting duplicates %s", str(e))
def aggregate_based_on_field(collection):
return collection.aggregate([{'$group' : {'_id': "$fieldX"}}])
From the shell:
Replace find_one to findOne
Same remove command should work.
Related
I have an array of documents that looks like this:
patient: {
conditions: [
{
columnToSortBy: "value",
type: "PRIMARY"
},
{
columnToSortBy: "anotherValue",
type: "SECONDARY"
},
]
}
I need to be able to $sort by columnToSortBy, but using the item in the array where type is equal to PRIMARY. PRIMARY is not guaranteed to be the first item in the array every time.
How do I set my $sort up to accommodate this? Is there something akin to:
// I know this is invalid. It's for illustration purposes
$sort: "columnToSortBy", {$where: {type: "PRIMARY"}}
Is it possible to sort a field, but only when another field matches a query? I do not want the secondary conditions to affect the sort in any way. I am sorting on that one specific element alone.
You need to use aggregation framework
db.collection.aggregate([
{
$unwind: "$patient.conditions" //reshape the data
},
{
"$sort": {
"patient.conditions.columnToSortBy": -1 //sort it
}
},
{
$group: {
"_id": "$_id",
"conditions": { //re group it
"$push": "$patient.conditions"
}
}
},
{
"$project": { //project it
"_id": 1,
"patient.conditions": "$conditions"
}
}
])
Playground
I have a large collection of documents with datetime fields in them, and I need to retrieve the most recent document for any given queried list.
Sample data:
[
{"_id": "42.abc",
"ts_utc": "2019-05-27T23:43:16.963Z"},
{"_id": "42.def",
"ts_utc": "2019-05-27T23:43:17.055Z"},
{"_id": "69.abc",
"ts_utc": "2019-05-27T23:43:17.147Z"},
{"_id": "69.def",
"ts_utc": "2019-05-27T23:44:02.427Z"}
]
Essentially, I need to get the most recent record for the "42" group as well as the most recent record for the "69" group. Using the sample data above, the desired result for the "42" group would be document "42.def".
My current solution is to query each group one at a time (looping with PyMongo), sort by the ts_utc field, and limit it to one, but this is really slow.
// Requires official MongoShell 3.6+
db = db.getSiblingDB("someDB");
db.getCollection("collectionName").find(
{
"_id" : /^42\..*/
}
).sort(
{
"ts_utc" : -1.0
}
).limit(1);
Is there a faster way to get the results I'm after?
Assuming all your documents have the format displayed above, you can split the id into two parts (using the dot character) and use aggregation to find the max element per each first array (numeric) element.
That way you can do it in a one shot, instead of iterating per each group.
db.foo.aggregate([
{ $project: { id_parts : { $split: ["$_id", "."] }, ts_utc : 1 }},
{ $group: {"_id" : { $arrayElemAt: [ "$id_parts", 0 ] }, max : {$max: "$ts_utc"}}}
])
As #danh mentioned in the comment, the best way you can do is probably adding an auxiliary field to indicate the grouping. You may further index the auxiliary field to boost the performance.
Here is an ad-hoc way to derive the field and get the latest result per grouping:
db.collection.aggregate([
{
"$addFields": {
"group": {
"$arrayElemAt": [
{
"$split": [
"$_id",
"."
]
},
0
]
}
}
},
{
$sort: {
ts_utc: -1
}
},
{
"$group": {
"_id": "$group",
"doc": {
"$first": "$$ROOT"
}
}
},
{
"$replaceRoot": {
"newRoot": "$doc"
}
}
])
Here is the Mongo playground for your reference.
I am trying to write a script that uses 2 aggregates and saves the results as an array to be used for an updateMany.
The first aggregate finds any documents that has a firstTrackingId and a secondTrackingId on it. I save this into an array. This aggregate is working correctly when tested alone.
The second aggregate will use the first aggregate's result array, pulling all documents that have a firstTrackingId from the first aggregate's results. This one will pull any documents that do NOT have a secondTrackingId on it, and save the unique mongo _id/ObjectId to an array.
The updateMany will use all of the results from the second aggregation to update all relevant documents with a status of void.
All these functions are working when I give them hard-coded data, but I can't figure out how to pull the data from the arrays. I am not even sure if I'm "saving" it correctly, or if there is something else I should be doing aside from just initializing the aggregation as an array.
var ids = db.getCollection('Test').aggregate([
{
$match: {
"firstTrackingId": { "$ne": "" },
"secondTrackingId": { "$exists": true }
}
},
{
$group: {
_id: "$firstTrackingId",
}
},
])
var secondIds = db.getCollection('Test').aggregate([
{
$match: {
"firstTrackingId": { $in: ids },
"secondTrackingId": { $exists: false }
}
},
{
$group: {
"_id": "$_id",
}
},
])
db.getCollection('Test').updateMany({
"_id": {
"$in": secondIds
},
}, { $set: {
"status": "VOID"
} })
I tried printing the first aggregation's results out... can't really figure out how... so for the first one if I do:
print(ids.next(ids._id))
I get:
[object BSON]
Which leads me to believe I need to somehow perform an $objectToArray. If anyone has any insight, that'd be awesome. Thank you!
If you are using MongoDB 4.4+, you can do that with a single aggregation pipeline:
match documents with both first and second tracking ID
lookup an array of all documents with the same first tracking ID
unwind the array
consider the array elements as the root document
match to eliminate any that have a second tracking ID
set the desired status field
merge the results with the original collection
{$match: {
firstTrackingId: { $ne: "" },
secondTrackingId: { $exists: true }
}},
{$lookup:{
from: "Test",
localField:"firstTrackingId",
foreignField:"firstTrackingId",
as:"matched"
}},
{$unwind:"$matched"},
{$replaceRoot:{newRoot:"$matched"}},
{$match:{secondTrackingId:{"$exists":false}}},
{$addFields:{status:"VOID"}},
{$merge: {into: "Test"}}
my document structure is something like :
{
_id: ...,
key1: ....
key2: ....
....
min_value: //should be the minimum of all the values in options
options: [
{
source: 'a',
value: 12,
},
{
source: 'b',
value: 10,
},
...
]
},
{
_id: ...,
key1: ....
key2: ....
....
min_value: //should be the minimum of all the values in options
options: [
{
source: 'a',
value: 24,
},
{
source: 'b',
value: 36,
},
...
]
}
the value of various sources in options will keep getting updated on a frequent basis(evey few mins or hours),
assume the size of options array doesnt change, i.e. no extra elements are added to the list
my queries are of the following type:
-find all documents where the min_value of all the options falls between some limit.
I could first do an unwind on options(and then take min) and then run comparison queries, but I am new to mongo and not sure how performance
is affected by unwind operation. The number of documents of this type would be about a few million.
Or does anyone has any suggestions around changing the document structure which could help me simplify this query? ( apart from creating separate documents per source - it would involves lot of data duplication )
Thanks!
Using $unwind is indeed quite expensive, most notably so with larger arrays, but there is a cost in all cases of usage. There are a couple of way to approach not needing $unwind here without real structural changes.
Pure Aggregation
In the basic case, as of MongoDB 3.2.x release series the $min operator can work directly on an array of values in a "projection" sense in addition to it's standard grouping accumulator role. This means that with the help of the related $map operator for processing elements of an array, you can then get the minimal value without using $unwind:
db.collection.aggregate([
// Still makes sense to use an index to select only possible documents
{ "$match": {
"options": {
"$elemMatch": {
"value": { "$gte": minValue, "$lt": maxValue }
}
}
}},
// Provides a logical filter to remove non-matching documents
{ "$redact": {
"$cond": {
"if": {
"$let": {
"vars": {
"min_value": {
"$min": {
"$map": {
"input": "$options",
"as": "option",
"in": "$$option.value"
}
}
}
},
"in": { "$and": [
{ "$gte": [ "$$min_value", minValue ] },
{ "$lt": [ "$$min_value", maxValue ] }
]}
}
},
"then": "$$KEEP",
"else": "$$PRUNE"
}
}},
// Optionally return the min_value as a field
{ "$project": {
"min_value": {
"$min": {
"$map": {
"input": "$options",
"as": "option",
"in": "$$option.value"
}
}
}
}}
])
The basic case is to get the "minimum" value from the array ( done inside of $let since we want to use the result "twice" in logical conditions. Helps us not repeat ourselves ) is to first extract the "value" data from the "options" array. This is done using $map.
The output of $map is an array with just those values, so this is supplied as the argument to $min, which then returns the minimum value for that array.
Using $redact is sort of like a $match pipeline stage with the difference that rather than needing a field to be "present" in the document being examined, you instead just form a logical condition with calculations.
In this case the condition is $and where "both" the logical forms of $gte and $lt return true against the calculated value ( from $let as "$$min_value" ).
The $redact stage then has the special arguments to apply to $$KEEP the document when the condition is true or $$PRUNE the document from results when it is false.
It's all very much like doing $project and then $match to actually project the value into the document before filtering in another stage, but all done in one stage. Of course you might actually want to $project the resulting field in what you return, but it generally cuts the workload if you remove non-matched documents "first" using $redact instead.
Updating Documents
Of course I think the best option is to actually keep the "min_value" field in the document rather than work it out at run-time. So this is a very simple thing to do when adding to or altering array items during update.
For this there is the $min "update" operator. Use it when appending with $push:
db.collection.update({
{ "_id": id },
{
"$push": { "options": { "source": "a", "value": 9 } },
"$min": { "min_value": 9 }
}
})
Or when updating a value of an element:
db.collection.update({
{ "_id": id, "options.source": "a" },
{
"$set": { "options.$.value": 9 },
"$min": { "min_value": 9 }
}
})
If the current "min_value" in the document is greater than the argument in $min or the key does not yet exist then the value given will be written. If it is greater than, the existing value stays in place since it is already the smaller value.
You can even set all your existing data with a simple "bulk" operations update:
var ops = [];
db.collection.find({ "min_value": { "$exists": false } }).forEach(function(doc) {
// Queue operations
ops.push({
"updateOne": {
"filter": { "_id": doc._id },
"update": {
"$min": {
"min_value": Math.min.apply(
null,
doc.options.map(function(option) {
return option.value
})
)
}
}
}
});
// Write once in 1000 documents
if ( ops.length == 1000 ) {
db.collection.bulkWrite(ops);
ops = [];
}
});
// Clear any remaining operations
if ( ops.length > 0 )
db.collection.bulkWrite(ops);
Then with a field in place, it is just a simple range selection:
db.collection.find({
"min_value": {
"$gte": minValue, "$lt": maxValue
}
})
So it really should be in your best interests to keep a field ( or fields if you regularly need different conditions ) in the document since that provides the most efficient query.
Of course, the new functions of aggregation $min along with $map also make this viable to use without a field, if you prefer more dynamic conditions.
I would like to clear all duplicated of a specific field in a collection. leaving only the earliest entry of the duplicates.
Here is my aggregate query which works great for finding the duplicates:
db.History.aggregate([
{ $group: {
_id: { name: "$sessionId" },
uniqueIds: { $addToSet: "$_id" },
count: { $sum: 1 }
} },
{ $match: {
count: { $gte: 2 }
} },
{ $sort : { count : -1} }
],{ allowDiskUse:true,
cursor:{}});
Only problem is that i need to execute a remove query as well and keep for each of the duplicates the youngest entry (determined by the field 'timeCreated':
"timeCreated" : ISODate("2016-03-07T10:48:43.251+02:00")
How exactly do i do that?
Personally I would take advantage of the fact that the ObjectId values themselves are "monotonic" or therefore "ever increasing in value" which means that the "youngest" or "most recent" would come at the end of a naturally sorted list.
So rather than force the aggregation pipeline to do the sorting, the most logical and efficient thing to do is simply sort the list of unique _id values returned per document as you process each response.
So basically working with the listing that you must have found:
Remove Duplicates from MongoDB
And is actually my answer ( and your the second person to reference this week, and yet no votes received for useful! Hmm! ), where it's just a simple .sort() applied within the cursor iteration for the returned array:
Using the _id Value
var bulk = db.History.initializeOrderedBulkOp(),
count = 0;
// List "all" fields that make a document "unique" in the `_id`
// I am only listing some for example purposes to follow
db.History.aggregate([
{ "$group": {
"_id": "$sessionId",
"ids": { "$push": "$_id" }, // _id values are already unique, so $addToSet adds nothing
"count": { "$sum": 1 }
}},
{ "$match": { "count": { "$gt": 1 } } }
],{ "allowDiskUse": true}).forEach(function(doc) {
doc.ids.sort().reverse(); // <-- this is the only real change
doc.ids.shift(); // remove first match, which is now youngest
bulk.find({ "_id": { "$in": doc.ids } }).remove(); // removes all $in list
count++;
// Execute 1 in 1000 and re-init
if ( count % 1000 == 0 ) {
bulk.execute();
bulk = db.History.initializeOrderedBulkOp();
}
});
if ( count % 1000 != 0 )
bulk.execute();
Using a specific field
If you "really" are set on adding another date value on which to determine which is youngest then just add to the array in $push first, then apply the client side sort function. Again just a really simple change:
var bulk = db.History.initializeOrderedBulkOp(),
count = 0;
// List "all" fields that make a document "unique" in the `_id`
// I am only listing some for example purposes to follow
db.History.aggregate([
{ "$group": {
"_id": "$sessionId",
"ids": { "$push": {
"_id": "$_id",
"created": "$timeCreated"
}},
"count": { "$sum": 1 }
}},
{ "$match": { "count": { "$gt": 1 } } }
],{ "allowDiskUse": true}).forEach(function(doc) {
doc.ids = doc.ids.sort(function(a,b) { // sort dates and just return _id
return a.created.valueOf() < a.created.valueOf()
}).map(function(el) { return el._id });
doc.ids.shift(); // remove first match, which is now youngest
bulk.find({ "_id": { "$in": doc.ids } }).remove(); // removes all $in list
count++;
// Execute 1 in 1000 and re-init
if ( count % 1000 == 0 ) {
bulk.execute();
bulk = db.History.initializeOrderedBulkOp();
}
});
if ( count % 1000 != 0 )
bulk.execute();
So it's a really simple process with no "real" alteration to the original process used to identify the duplicates and then remove all but one of them.
Always the best approach here to just let the server do the job of finding the duplicates, then client side when iterating the cursor you can then work out from the returned array which document is going to be kept and which ones you are going to remove.