Remove duplicates based on a key and referenced Objects in Mongodb? - mongodb

I have MongoDB models of Actor and Movies. The Mongoose schema of both the models is as following :
var ActorsSchema = new Schema({
id : {
type : Number
},
known_for:[{
type: Schema.Types.ObjectId,
ref: 'Movie'
}]
})
var MovieSchema = new Schema({
genres: [{
type: Schema.Types.ObjectId,
ref: 'Genre'
}],
id: {
type: Number
}
});
known_for attribute in the actor model contains the reference to a list of movies in which that actor has starred.
I want to delete duplicate Actor records, where duplicates are determined by the id field (not the _id). I also want the movies referenced in a deleted actor's known_for field to be deleted. I want to do this from the Mongo interface, because the number of records in these collections is very large and performing this operation programmatically would be time inefficient.
I have looked into a related question, but it does not apply to models that reference other models in their fields.

Consider using the aggregation framework to identify the duplicate documents, get a list of the duplicate _ids for the actors collection alongside the arrays of movie ids and issue remove and update commands with the ids array as the query.
For testing purposes, suppose you have the following data in your collections (with minimum test cases, for demonstration purposes of course):
db.movies.insert([
{
"_id" : ObjectId("5543e79e42063d2be5d2ea84"),
"id" : 1,
"genres" : []
},
{
"_id" : ObjectId("5543e79e42063d2be5d2ea85"),
"id" : 2,
"genres" : []
},
{
"_id" : ObjectId("5543e79e42063d2be5d2ea86"),
"id" : 3,
"genres" : []
}
]);
db.actors.insert([
{ id: 1, known_for: [ObjectId("5543e79e42063d2be5d2ea84")] },
{ id: 1, known_for: [ObjectId("5543e79e42063d2be5d2ea84")] },
{ id: 2, known_for: [ObjectId("5543e79e42063d2be5d2ea84"), ObjectId("5543e79e42063d2be5d2ea85")] },
{ id: 3, known_for: [ObjectId("5543e79e42063d2be5d2ea85"), ObjectId("5543e79e42063d2be5d2ea86")] }
]);
Now for the magical part. The aggregation pipeline groups the actors documents by id, calculates the grouped count, creates two array fields which hold the actor _id duplicates and the movies object ids. The pipeline outputs the results to a collection dupes that will be used later on to remove the duplicates:
// Group the actors by their application-level `id` and write every group that
// occurs more than once to the `dupes` collection for the clean-up step.
db.actors.aggregate([
{
"$group": {
"_id": "$id",
// _id of every actor document that shares this `id`
"duplicates": { "$addToSet": "$_id" },
// each distinct known_for array is added as one element, so `movies`
// ends up as an array of arrays and must be flattened before use
"movies": { "$addToSet": "$known_for"},
"count": { "$sum": 1 }
}
},
{
// keep only the ids that appear on more than one actor document
"$match": {
"count": { "$gt": 1 }
}
},
{
// store the result in the `dupes` collection
"$out": "dupes"
}
])
Querying the dupes collection will give the result:
/* 1 */
{
"_id" : 1.0000000000000000,
"duplicates" : [
ObjectId("5543fc8e42063d2be5d2eaa2"),
ObjectId("5543fc8e42063d2be5d2eaa1")
],
"movies" : [
[
ObjectId("5543e79e42063d2be5d2ea84")
]
],
"count" : 2
}
Now for the fun part. Use the dupes collection to then remove the dupes from the actors collection. As you have noticed from the dupes collection, the movies field is an array of arrays so you will need to flatten it and use the flattened array to then remove the movies and pull the orphaned movie references from the actors collection:
// For every duplicated actor id recorded in `dupes`: remove the actor documents
// listed as duplicates, remove the movies they referenced, and pull those movie
// ids out of the known_for arrays of the actors that remain.
// (The original chained `.find({})` twice; a cursor has no `find` method, so the
// script failed before removing anything.)
db.dupes.find({}).forEach(function (doc) {
    // `movies` is an array of known_for arrays, so flatten it one level.
    var movie_dupes = [];
    doc.movies.forEach(function (arr) {
        arr.forEach(function (id) {
            movie_dupes.push(id);
        });
    });

    db.actors.remove({ "_id": { "$in": doc.duplicates } });
    db.movies.remove({ "_id": { "$in": movie_dupes } });

    // Remaining actors may still reference the movies that were just removed.
    db.actors.update(
        { "known_for": { "$in": movie_dupes } },
        { "$pull": { "known_for": { "$in": movie_dupes } } },
        { "multi": true }
    );
});
Logs to console:
Removed 2 record(s) in 38ms
Removed 1 record(s) in 2ms
Updated 1 existing record(s) in 1ms
Now to verify whether our duplicates have been obliterated:
db.actors.find()
/* 1 */
{
"_id" : ObjectId("5543fc8e42063d2be5d2eaa3"),
"id" : 2,
"known_for" : [
ObjectId("5543e79e42063d2be5d2ea85")
]
}
/* 2 */
{
"_id" : ObjectId("5543fc8e42063d2be5d2eaa4"),
"id" : 3,
"known_for" : [
ObjectId("5543e79e42063d2be5d2ea85"),
ObjectId("5543e79e42063d2be5d2ea86")
]
}
Actor with id 1 (which was a duplicate) was indeed removed.
db.movies.find()
/* 1 */
{
"_id" : ObjectId("5543e79e42063d2be5d2ea85"),
"id" : 2,
"genres" : []
}
/* 2 */
{
"_id" : ObjectId("5543e79e42063d2be5d2ea86"),
"id" : 3,
"genres" : []
}
Movie with ObjectId("5543e79e42063d2be5d2ea84") was removed.

Related

Project values of different columns into one field

{
"_id" : ObjectId("5ae84dd87f5b72618ba7a669"),
"main_sub" : "MATHS",
"reporting" : [
{
"teacher" : "ABC"
}
],
"subs" : [
{
"sub" : "GEOMETRIC",
"teacher" : "XYZ",
}
]
}
{
"_id" : ObjectId("5ae84dd87f5b72618ba7a669"),
"main_sub" : "SOCIAL SCIENCE",
"reporting" : [
{
"teacher" : "XYZ"
}
],
"subs" : [
{
"sub" : "CIVIL",
"teacher" : "ABC",
}
]
}
I have simplified the structure of the documents that i have.
The basic structure is that I have a parent subject with an array of reporting teachers and an array of sub-subjects(each having a teacher)
I now want to extract all the subject(parent/sub-subjects) along with the condition if they are sub-subjects or not which are taught by a particular teacher.
For eg:
for teacher ABC i want the following structure:
[{'subject':'MATHS', 'is_parent':'True'}, {'subject':'CIVIL', 'is_parent':'FALSE'}]
-- What is the most efficient query possible ..? I have tried $project with $cond and $switch but in both the cases I have had to repeat the conditional statement for 'subject' and 'is_parent'
-- Is it advised to do the computation in a query or should I get the data dump and then modify the structure in the server code? AS in, I could $unwind and get a mapping of the parent subjects with each sub-subject and then do a for loop.
I have tried
db.collection.aggregate(
{$unwind:'$reporting'},
{$project:{
'result':{$cond:[
{$eq:['ABC', '$reporting.teacher']},
"$main_sub",
"$subs.sub"]}
}}
)
then I realised that even if i transform the else part into another query for the sub-subjects I will have to write the exact same thing for the property of is_parent
You have 2 arrays, so you need to unwind both - the reporting and the subs.
After that stage each document will have at most 1 parent teacher-subj and at most 1 sub teacher-subj pairs.
You need to unwind them again to have a single teacher-subj per document, and it's where you define whether it is parent or not.
Then you can group by teacher. No need for $conds, $filters, or $facets. E.g.:
// Build, for every teacher, the list of subjects they teach together with a
// flag telling whether each subject is a parent subject or a sub-subject.
db.collection.aggregate([
// one document per (reporting teacher, sub-subject) combination
{ $unwind: "$reporting" },
{ $unwind: "$subs" },
{ $project: {
// pair each teacher with their subject; is_parent is fixed by which array the pair came from
teachers: [
{ teacher: "$reporting.teacher", sub: "$main_sub", is_parent: true },
{ teacher: "$subs.teacher", sub: "$subs.sub", is_parent: false }
]
} },
// one teacher/subject pair per document
{ $unwind: "$teachers" },
// collect the pairs per teacher
{ $group: {
_id: "$teachers.teacher",
subs: { $push: {
subject: "$teachers.sub",
is_parent: "$teachers.is_parent"
} }
} }
])

Add field (boolean) to returned objects, when a specified value is in array, without including the array itself

I have a mongoose Schema that looks likes this :
var AnswerSchema = new Schema({
author: {type: Schema.Types.ObjectId, ref: 'User'},
likes: [{type: Schema.Types.ObjectId, ref: 'User'}]
text: String,
....
});
and I have an API endpoint that allow to get answers posted by a specific user (which exclude the likes array). What I want to do is add a field (with "true/false" value for example) to the answer(s) returned by the mongoose query, when a specific user_id is (or is not) in the likes array of an answer. This way, I can display to the user requesting the answers if he already liked an answer or not.
How could I achieve this in an optimised way ? I would like to avoid fetching the likes array, then look into it myself in my Javascript code to check if specified userId is present in it, then remove it before sending it back to the client... because it sounds wrong to fetch all this data from mongoDB to my node app for nothing. I'm sure there is a better way by using aggregation but I never used it and am a bit confused on how to do it right.
The database might grow very large so it must be quick and optimised.
One approach you could take is via the aggregation framework which allows you to add/modify fields via the $project pipeline, applying a host of logical operators that work in cohort to achieve the desired end result. For instance, in your above case this would translate to:
// Project each answer's author plus a boolean `matched` flag that is true when
// userId is present in the answer's likes array. The likes array itself is not
// part of the projection.
Answer.aggregate()
.project({
"author": 1,
"matched": {
"$eq": [
{
"$size": {
// fall back to [] when the intersection evaluates to null
// (e.g. likes is null or the field is missing)
"$ifNull": [
{ "$setIntersection": [ "$likes", [userId] ] },
[]
]
}
},
1
]
}
})
.exec(function (err, docs){
// NOTE(review): err is not checked before docs is logged
console.log(docs);
})
As an example to test in mongo shell, let's insert some few test documents to the test collection:
db.test.insert([
{
"likes": [1, 2, 3]
},
{
"likes": [3, 2]
},
{
"likes": null
},
{
"another": "foo"
}
])
Running the above aggregation pipeline on the test collection to get the boolean field for userId = 2:
var userId = 2;
db.test.aggregate([
{
"$project": {
"matched": {
"$eq": [
{
"$size": {
"$ifNull": [
{ "$setIntersection": [ "$likes", [userId] ] },
[]
]
}
},
1
]
}
}
}
])
gives the following output:
{
"result" : [
{
"_id" : ObjectId("564f487c7d3c273d063cd21e"),
"matched" : true
},
{
"_id" : ObjectId("564f487c7d3c273d063cd21f"),
"matched" : true
},
{
"_id" : ObjectId("564f487c7d3c273d063cd220"),
"matched" : false
},
{
"_id" : ObjectId("564f487c7d3c273d063cd221"),
"matched" : false
}
],
"ok" : 1
}

MongoDB, how to query a document but limit an array in that document

I have the following document:
{
_id: asdfasdf,
title: "ParentA",
children: [
{
_id: abcd <-- using new ObjectId() to generate these on creation
title: "ChildA",
},
{
_id: efgh,
title: "ChildB"
}
]
}
What I want to do is use findOne but I only want the returned document to contain a single child in its array.
Pseudo logic
Categories.findOne({ _id: "asdfasdf" }, { children: _Id: "abcd" });
I want the returned document to look like this:
{
_id: asdfasdf,
title: "ParentA",
children: [
{
_id: abcd <-- using new ObjectId() to generate these on creation
title: "ChildA",
}
]
}
The purpose of this is so I can pass the information into an edit form, and then update that single child object in the array on save.
I'm getting confused as to how to limit the result set.
Thank you very much!
---- Edit ----
After attempting to use the suggested duplicate question as a reference, I'm getting undefined in my results. I really want to use findOne instead of find() as well. On the client, a collection object, even though it contains one item, is treated differently than a single (findOne) object that is returned.
Here is what I've tried.
db.Category.findOne({
"_id": parentid,
"children._id": childid
},{
"_id": childid,
"children": {
"$elemMatch": {
"_id": childid
}
}
});
db.Category.findOne({
"_id": parentid
},{
"children": {
"$elemMatch": {
"_id": childid
}
}
});
I've tried several more variations like the above.
---- Edit 2 ----
Based on a comment, here is the output of the following query:
db.category.findOne({ "_id" : "9dYgKFczgiRcNouij"});
{
"title" : "Appliances",
"active" : true,
"children" : [
{
"_id" : ObjectId("680d55c6995ef6f0748278c2"),
"title" : "Laundry",
"active" : true
},
{
"_id" : ObjectId("2b4469c1a4c8e086942a1233"),
"title" : "Kitchen"
"active" : true
},
{
"_id" : ObjectId("4f5562ef7668839704c851d6"),
"title" : "Other"
"active" : true
}
],
"_id" : "9dYgKFczgiRcNouij"
}
So I think perhaps my problem is how I created the children._id in the array. I used new ObjectId() to generate the _id.
--- Edit 3 ---
db.category.findOne({
"_id": "9dYgKFczgiRcNouij"
},{
"children": {
"$elemMatch": {
"_id": ObjectId("4f5562ef7668839704c851d6")
}
}
});
This returns ObjectID is not defined.

Search on multiple collections in MongoDB

I know the theory of MongoDB and the fact that it doesn't support joins, and that I should use embedded documents or denormalize as much as possible, but here goes:
I have multiple documents, such as:
Users, which embed Suburbs, but also has: first name, last name
Suburbs, which embed States
Child, which embeds School, belongs to a User, but also has: first name, last name
Example:
Users:
{ _id: 1, first_name: 'Bill', last_name: 'Gates', suburb: 1 }
{ _id: 2, first_name: 'Steve', last_name: 'Jobs', suburb: 3 }
Suburb:
{ _id: 1, name: 'Suburb A', state: 1 }
{ _id: 2, name: 'Suburb B', state: 1 }
{ _id: 3, name: 'Suburb C', state: 3 }
State:
{ _id: 1, name: 'LA' }
{ _id: 3, name: 'NY' }
Child:
{ _id: 1, _user_id: 1, first_name: 'Little Billy', last_name: 'Gates' }
{ _id: 2, _user_id: 2, first_name: 'Little Stevie', last_name: 'Jobs' }
The search I need to implement is on:
first name, last name of Users and Child
State from Users
I know that I have to do multiple queries to get it done, but how can that be achieved? With mapReduce or aggregate?
Can you point out a solution please?
I've tried to use mapReduce but that didn't get me to have documents from Users which contained a state_id, so that's why I brought it up here.
This answer is outdated. Since version 3.2, MongoDB has limited support for left outer joins with the $lookup aggregation operator
MongoDB does not do queries which span multiple collections - period. When you need to join data from multiple collections, you have to do it on the application level by doing multiple queries.
Query collection A
Get the secondary keys from the result and put them into an array
Query collection B passing that array as the value of the $in-operator
Join the results of both queries programmatically on the application layer
Having to do this should be rather the exception than the norm. When you frequently need to emulate JOINs like that, it either means that you are still thinking too relational when you design your database schema or that your data is simply not suited for the document-based storage concept of MongoDB.
Joins are now possible in MongoDB. You can achieve this with the $lookup and $facet aggregation stages, which is probably the best way to search across multiple collections:
// Search several collections in one aggregation: each facet runs a $lookup with
// its own sub-pipeline, then the facet outputs are concatenated and flattened.
db.collection.aggregate([
// pass a single input document on to the facets below
{ "$limit": 1 },
{ "$facet": {
"c1": [
{ "$lookup": {
"from": Users.collection.name,
"pipeline": [
{ "$match": { "first_name": "your_search_data" } }
],
"as": "collection1"
}}
],
"c2": [
{ "$lookup": {
"from": State.collection.name,
"pipeline": [
{ "$match": { "name": "your_search_data" } }
],
"as": "collection2"
}}
],
// NOTE(review): c3 uses the same collection and filter as c2 — presumably a
// third collection was intended here; confirm before relying on it
"c3": [
{ "$lookup": {
"from": State.collection.name,
"pipeline": [
{ "$match": { "name": "your_search_data" } }
],
"as": "collection3"
}}
]
}},
// merge the three facet result arrays into a single `data` array
{ "$project": {
"data": {
"$concatArrays": [ "$c1", "$c2", "$c3" ]
}
}},
// one output document per element of `data`, promoted to the root
{ "$unwind": "$data" },
{ "$replaceRoot": { "newRoot": "$data" } }
])
You'll find MongoDB easier to understand if you take a denormalized approach to schema design. That is, you want to structure your documents the way the requesting client application understands them. Essentially, you are modeling your documents as domain objects with which the application deals. Joins become less important when you model your data this way. Consider how I've denormalized your data into a single collection:
{
_id: 1,
first_name: 'Bill',
last_name: 'Gates',
suburb: 'Suburb A',
state: 'LA',
child : [ 3 ]
}
{
_id: 2,
first_name: 'Steve',
last_name: 'Jobs',
suburb: 'Suburb C',
state 'NY',
child: [ 4 ]
}
{
_id: 3,
first_name: 'Little Billy',
last_name: 'Gates',
suburb: 'Suburb A',
state: 'LA',
parent : [ 1 ]
}
{
_id: 4,
first_name: 'Little Stevie',
last_name: 'Jobs'
suburb: 'Suburb C',
state 'NY',
parent: [ 2 ]
}
The first advantage is that this schema is far easier to query. Plus, updates to address fields are now consistent with the individual Person entity since the fields are embedded in a single document. Notice also the bidirectional relationship between parent and children? This makes this collection more than just a collection of individual people. The parent-child relationships mean this collection is also a social graph. Here are some resources which may be helpful to you when thinking about schema design in MongoDB.
Here's a JavaScript function that will return an array of all records matching specified criteria, searching across all collections in the current database:
/**
 * Run the same query against every collection in the current database and
 * gather all matching records into one array.
 *
 * @param {Object} query  - filter passed to each collection's find()
 * @param {Object} fields - projection passed to each collection's find()
 * @param {Object} sort   - sort specification applied to each collection's cursor
 * @returns {Array} matching records, in collection order
 */
function searchAll(query, fields, sort) {
    var results = [];
    // forEach visits only the array elements; the original for...in also
    // walked any other enumerable property found on the names array.
    db.getCollectionNames().forEach(function (coll) {
        // system.indexes is skipped, as in the original
        if (coll === "system.indexes") return;
        db[coll].find(query, fields).sort(sort).forEach(function (rec) {
            results.push(rec);
        });
    });
    return results;
}
From the Mongo shell, you can copy/paste the function in, then call it like so:
> var recs = searchAll( {filename: {$regex:'.pdf$'} }, {moddate:1,filename:1,_id:0}, {filename:1} )
> recs
Based on #brian-moquin and others, I made a set of functions to search entire collections with entire keys(fields) by simple keyword.
It's in my gist; https://gist.github.com/fkiller/005dc8a07eaa3321110b3e5753dda71b
For more detail, I first made a function to gather all keys.
/**
 * Collect the distinct top-level field names used by the documents of a
 * collection, using a map-reduce job that emits every key of every document.
 *
 * Note: the job's output collection name is the same ('my_collection_keys')
 * for every input collection.
 *
 * @param {string} collectionName - name of the collection to inspect
 * @returns {Array} distinct _id values of the map-reduce output, i.e. the emitted keys
 */
function keys(collectionName) {
    // Declared locally: the original assigned to an undeclared `mr`, which
    // leaks a global and throws a ReferenceError in strict mode.
    var mr = db.runCommand({
        'mapreduce': collectionName,
        'map': function () {
            // `this` is the document being mapped
            for (var key in this) { emit(key, null); }
        },
        'reduce': function (key, stuff) { return null; },
        'out': 'my_collection' + '_keys'
    });
    return db[mr.result].distinct('_id');
}
Then one more to generate $or query from keys array.
/**
 * Build an $or query that matches `keyword` as a substring regex against each
 * of the given fields.
 *
 * @param {Array} fieldNames - field names to search
 * @param {string} keyword   - text inserted between '.*' wildcards, unescaped
 * @returns {Object|boolean} { $or: [...] }, or false when there are no fields
 */
function createOR(fieldNames, keyword) {
    var pattern = '.*' + keyword + '.*';
    var clauses = fieldNames.map(function (fieldName) {
        var clause = {};
        clause[fieldName] = { $regex: pattern };
        return clause;
    });
    // an empty $or is not a usable query, so signal it with false
    return clauses.length === 0 ? false : { $or: clauses };
}
Below is a function to search a single collection.
/**
 * Find the first document in a collection in which any field matches `keyword`.
 *
 * @param {Object} collection - collection object exposing getName() and findOne()
 * @param {string} keyword    - text to search for
 * @returns {Object|boolean|null} the result of findOne, or false when the
 *          collection yields no field names to search
 */
function findany(collection, keyword) {
    // The keyword belongs in the $or query. The original omitted it here and
    // passed it to findOne instead, where the second argument is a projection.
    var query = createOR(keys(collection.getName()), keyword);
    if (!query) {
        return false;
    }
    return collection.findOne(query);
}
And, finally a search function for every collections.
/**
 * Search every collection of the current database for `keyword`, printing each
 * collection name as it is visited.
 *
 * @param {string} keyword - text handed to findany for each collection
 * @returns {Array} one findany result per collection that resolves on `db`
 */
function searchAll(keyword) {
    return db.getCollectionNames().reduce(function (found, name) {
        print(name);
        var collection = db[name];
        // skip names that do not resolve to a collection object
        if (collection) {
            found.push(findany(collection, keyword));
        }
        return found;
    }, []);
}
You can simply load all functions in Mongo console, and execute searchAll('any keyword')
You can achieve this using $mergeObjects by MongoDB Driver
Example
Create a collection orders with the following documents:
db.orders.insert([
{ "_id" : 1, "item" : "abc", "price" : 12, "ordered" : 2 },
{ "_id" : 2, "item" : "jkl", "price" : 20, "ordered" : 1 }
])
Create another collection items with the following documents:
db.items.insert([
{ "_id" : 1, "item" : "abc", description: "product 1", "instock" : 120 },
{ "_id" : 2, "item" : "def", description: "product 2", "instock" : 80 },
{ "_id" : 3, "item" : "jkl", description: "product 3", "instock" : 60 }
])
The following operation first uses the $lookup stage to join the two collections by the item fields and then uses $mergeObjects in the $replaceRoot to merge the joined documents from items and orders:
db.orders.aggregate([
{
$lookup: {
from: "items",
localField: "item", // field in the orders collection
foreignField: "item", // field in the items collection
as: "fromItems"
}
},
{
$replaceRoot: { newRoot: { $mergeObjects: [ { $arrayElemAt: [ "$fromItems", 0 ] }, "$$ROOT" ] } }
},
{ $project: { fromItems: 0 } }
])
The operation returns the following documents:
{ "_id" : 1, "item" : "abc", "description" : "product 1", "instock" : 120, "price" : 12, "ordered" : 2 }
{ "_id" : 2, "item" : "jkl", "description" : "product 3", "instock" : 60, "price" : 20, "ordered" : 1 }
This technique merges the joined objects and returns the result.
Minime solution worked except that it required a fix:
var query = createOR(keys(collection.getName()));
need to add keyword as 2nd parameter to createOR call here.

way to update multiple documents with different values

I have the following documents:
[{
"_id":1,
"name":"john",
"position":1
},
{"_id":2,
"name":"bob",
"position":2
},
{"_id":3,
"name":"tom",
"position":3
}]
In the UI a user can change position of items(eg moving Bob to first position, john gets position 2, tom - position 3).
Is there any way to update all positions in all documents at once?
You can not update two documents at once with a MongoDB query. You will always have to do that in two queries. You can of course set a value of a field to the same value, or increment with the same number, but you can not do two distinct updates in MongoDB with the same query.
You can use db.collection.bulkWrite() to perform multiple operations in bulk. It has been available since 3.2.
It is possible to perform operations out of order to increase performance.
Since MongoDB 4.2 you can use an aggregation pipeline in an update, with the $set operator.
There are now many possible ways to do this thanks to the many aggregation pipeline operators; here is one of them:
exports.updateDisplayOrder = async keyValPairArr => {
try {
let data = await ContestModel.collection.update(
{ _id: { $in: keyValPairArr.map(o => o.id) } },
[{
$set: {
displayOrder: {
$let: {
vars: { obj: { $arrayElemAt: [{ $filter: { input: keyValPairArr, as: "kvpa", cond: { $eq: ["$$kvpa.id", "$_id"] } } }, 0] } },
in:"$$obj.displayOrder"
}
}
}
}],
{ runValidators: true, multi: true }
)
return data;
} catch (error) {
throw error;
}
}
example key val pair is: [{"id":"5e7643d436963c21f14582ee","displayOrder":9}, {"id":"5e7643e736963c21f14582ef","displayOrder":4}]
Since MongoDB 4.2 update can accept aggregation pipeline as second argument, allowing modification of multiple documents based on their data.
See https://docs.mongodb.com/manual/reference/method/db.collection.update/#modify-a-field-using-the-values-of-the-other-fields-in-the-document
Excerpt from documentation:
Modify a Field Using the Values of the Other Fields in the Document
Create a members collection with the following documents:
db.members.insertMany([
{ "_id" : 1, "member" : "abc123", "status" : "A", "points" : 2, "misc1" : "note to self: confirm status", "misc2" : "Need to activate", "lastUpdate" : ISODate("2019-01-01T00:00:00Z") },
{ "_id" : 2, "member" : "xyz123", "status" : "A", "points" : 60, "misc1" : "reminder: ping me at 100pts", "misc2" : "Some random comment", "lastUpdate" : ISODate("2019-01-01T00:00:00Z") }
])
Assume that instead of separate misc1 and misc2 fields, you want to gather these into a new comments field. The following update operation uses an aggregation pipeline to:
add the new comments field and set the lastUpdate field.
remove the misc1 and misc2 fields for all documents in the collection.
db.members.update(
{ },
[
{ $set: { status: "Modified", comments: [ "$misc1", "$misc2" ], lastUpdate: "$$NOW" } },
{ $unset: [ "misc1", "misc2" ] }
],
{ multi: true }
)
Suppose that, after the positions have been changed, your array looks like this:
// Desired final state of each document, turned into one updateOne operation per
// document so that all positions are written in a single bulkWrite call.
const objectToUpdate = [
  { "_id": 1, "name": "john", "position": 2 },
  { "_id": 2, "name": "bob", "position": 1 },
  { "_id": 3, "name": "tom", "position": 3 },
].map((eachObj) => ({
  updateOne: {
    filter: { _id: eachObj._id },
    // explicit $set so the operation is a field update rather than an
    // operator-less update document
    update: { $set: { name: eachObj.name, position: eachObj.position } },
  },
}));

// ordered: false lets the remaining operations proceed when one of them fails.
YourModelName.bulkWrite(objectToUpdate, { ordered: false })
  .then((result) => {
    console.log(result);
  })
  .catch((err) => {
    // Only some failures carry nested write errors; the original dereferenced
    // them unconditionally, so any other error threw inside this handler.
    const writeErrors =
      err && err.result && err.result.result && err.result.result.writeErrors;
    if (Array.isArray(writeErrors) && writeErrors.length > 0) {
      console.log(writeErrors[0].err.op.q);
    } else {
      console.log(err);
    }
  });
It will update all position with different value.
Note : I have used here ordered : false for better performance.