Merge duplicates and remove the oldest - mongodb

I have a collection where there are some duplicate documents. In example:
First document:
{
"_id" : ObjectId("56f3d7cc1de31cb20c08ae6b"),
"AddedDate" : ISODate("2016-05-01T00:00:00.000Z"),
"Place": "THISPLACE",
"PresentInDB" : [
{
"InDB" : ISODate("2016-05-01T00:00:00.000Z")
}
],
"Checked" : [],
"Link": "http://www.mylink.com/first/84358"
}
Second document:
{
"_id" : ObjectId("577740526c1e542904725238"),
"AddedDate" : ISODate("2016-05-02T00:00:00.000Z"),
"Place": "THISPLACE",
"PresentInDB" : [
{
"InDB" : ISODate("2016-05-02T00:00:00.000Z")
},
{
"InDB" : ISODate("2016-05-03T00:00:00.000Z")
}
],
"Checked" : [
{
"Done" : ISODate("2016-05-02T00:00:00.000Z")
},
],
"Link": "http://www.mylink.com/second/84358"
}
The Link field contains the same sequence of numbers, 84358, in both documents.
So I would like to achieve those steps:
Loop over each document in the collection.
Match the number sequence in the Link field of each document (i.e. 84358 above). If several documents in the
collection have that sequence in the Link field, and the Place field also matches in those documents:
Merge PresentInDB and Checked fields - > merge PresentInDB and Checked fields by adding array values from the newest document (by date in AddedDate
field) to the oldest document.
Remove the newest document.
How could I achieve such a query?

MongoDB 3.3.6 introduced the $split operator for dealing with strings in the aggregation framework (Jira). Before this release you could only solve this with a map/reduce solution.
After MongoDB 3.3.6 release: Aggregation framework solution
// Collapse duplicates (same trailing Link segment and same Place) into one
// document per group and write the result to duplicatedCollection.
db.duplicatedCollection.aggregate([
    // Keep the original fields and derive the grouping key: the last
    // "/"-separated segment of Link.
    {
        "$project": {
            "_id": 1,
            "AddedDate": 1,
            "Place": 1,
            "PresentInDB": 1,
            "Checked": 1,
            "Link": 1,
            "sequenceNumber": { "$arrayElemAt": [{ "$split": ["$Link", "/"] }, -1] }
        }
    },
    // Ascending AddedDate, so every $first below reads from the oldest document.
    { "$sort": { "AddedDate": 1 } },
    // One group per (sequenceNumber, Place). $push collects whole arrays, so
    // PresentInDB and Checked are arrays of arrays after this stage.
    {
        "$group": {
            "_id": { "sequenceNumber": "$sequenceNumber", "Place": "$Place" },
            "id": { "$first": "$_id" },
            "AddedDate": { "$first": "$AddedDate" },
            "Place": { "$first": "$Place" },
            "PresentInDB": { "$push": "$PresentInDB" },
            "Checked": { "$push": "$Checked" },
            "Link": { "$first": "$Link" }
        }
    },
    // Two unwinds per field flatten the array of arrays: the first opens the
    // outer array, the second opens each inner array while keeping rows whose
    // inner array is empty.
    { "$unwind": "$PresentInDB" },
    { "$unwind": { "path": "$PresentInDB", "preserveNullAndEmptyArrays": true } },
    { "$unwind": "$Checked" },
    { "$unwind": { "path": "$Checked", "preserveNullAndEmptyArrays": true } },
    // Regroup under the _id captured from the oldest document; $addToSet
    // collects each distinct entry once.
    {
        "$group": {
            "_id": "$id",
            "AddedDate": { "$first": "$AddedDate" },
            "Place": { "$first": "$Place" },
            "PresentInDB": { "$addToSet": "$PresentInDB" },
            "Checked": { "$addToSet": "$Checked" },
            "Link": { "$first": "$Link" }
        }
    },
    // The output target is the same collection that is being aggregated.
    { "$out": "duplicatedCollection" }
]);
Before MongoDB 3.3.6 release: Map/Reduce solution
Map Function:
// Map step: emits the current document under a key made of its Place and the
// last "/"-separated segment of its Link.
var mapFunction = function() {
    var segments = this.Link.split("/");
    emit(
        {
            place: this.Place,
            sequenceNumber: segments[segments.length - 1]
        },
        this
    );
};
Reduce Function:
/**
 * Reduce step: merges every document emitted for one (place, sequenceNumber)
 * key into a single document.
 *
 * The merged document takes _id, AddedDate and Link from the OLDEST input
 * document (smallest AddedDate), which is the document the duplicates are
 * merged into. Place is taken from the key so it is not lost. PresentInDB and
 * Checked are the union of all input entries, de-duplicated by the timestamp
 * of their InDB / Done date.
 *
 * The result has the same shape as the emitted values, so it can be fed back
 * into this function if the reduce is invoked again for the same key.
 *
 * @param {{place: *, sequenceNumber: string}} key - key produced by the map step
 * @param {Array<Object>} values - documents (or earlier reduce results) for that key
 * @returns {Object} the merged document
 */
var reduceFunction = function(key, values) {
    var reducedDoc = {
        _id: values[0]._id,
        AddedDate: values[0].AddedDate,
        Place: key.place,
        Link: values[0].Link,
        PresentInDB: [],
        Checked: []
    };
    // Timestamps already merged, used as object keys for constant-time lookup.
    var seenInDb = {};
    var seenDone = {};

    // Appends to `target` every entry of `source` whose `dateField` timestamp
    // has not been recorded in `seen` yet. A missing source array is treated
    // as empty.
    var mergeUnique = function(source, dateField, seen, target) {
        (source || []).forEach(function(entry) {
            var millis = entry[dateField].getTime();
            if (!Object.prototype.hasOwnProperty.call(seen, millis)) {
                seen[millis] = true;
                target.push(entry);
            }
        });
    };

    values.forEach(function(doc) {
        // Keep the identity of the oldest document.
        if (doc.AddedDate < reducedDoc.AddedDate) {
            reducedDoc._id = doc._id;
            reducedDoc.AddedDate = doc.AddedDate;
            reducedDoc.Link = doc.Link;
        }
        mergeUnique(doc.PresentInDB, "InDB", seenInDb, reducedDoc.PresentInDB);
        mergeUnique(doc.Checked, "Done", seenDone, reducedDoc.Checked);
    });

    return reducedDoc;
};
Map/Reduce:
// Run the map/reduce job and send its output to "duplicatedCollection".
db.duplicatedCollection.mapReduce(mapFunction, reduceFunction, { "out": "duplicatedCollection" });
Unwrap the value from the map/reduce returned documents:
// For every document that carries a "value" field, insert that value as a
// document of its own and then delete the wrapper it came from.
db.duplicatedCollection
    .find({ "value": { "$exists": true } })
    .forEach(function(wrapper) {
        db.duplicatedCollection.insert(wrapper.value);
        db.duplicatedCollection.remove({ "_id": wrapper._id });
    });

You can use a single aggregation query to do that :
// Merge the PresentInDB entries of every document whose Link contains 84358
// into one result document and store it in the "documents" collection.
db.device.aggregate([
    // One row per PresentInDB entry.
    { "$unwind": "$PresentInDB" },
    // Only rows whose Link contains the number sequence.
    { "$match": { "Link": /84358/ } },
    // Oldest first, so $first below reads from the oldest document.
    { "$sort": { "AddedDate": 1 } },
    // A constant _id puts every remaining row into a single group.
    {
        "$group": {
            "_id": 0,
            "PresentInDB": { "$addToSet": "$PresentInDB" },
            "AddedDate": { "$first": "$AddedDate" },
            "id": { "$first": "$_id" },
            "Link": { "$first": "$Link" }
        }
    },
    { "$out": "documents" }
])
$unwind your array to work on it
$match your id (here containing 84358)
$sort by ascending date
$group with :
a $addToSet to merge all your PresentInDB into one single array without duplicates
a $first for each field to keep. Keeping the first means you only want the older one since we previously sorted by ascending date
$out will save the results to a new collection called documents here

Related

MongoDB Aggregation to get count and Y sample entries

MongoDB version:4.2.17.
Trying out aggregation on data in a collection.
Example data:
{
"_id" : "244",
"pubName" : "p1",
"serviceIdRef" : "36e9c779-7865-4b74-a30b-e4d6a0cc5295",
"serviceName" : "my-service",
"subName" : "c1",
"pubState" : "INVITED"
}
I would like to:
Do a match by something (let’s say subName) and group by serviceIdRef and then limit to return X entries
Also return for each of the serviceIdRefs, the count of the documents in each of ACTIVE or INVITED states. And Y (for this example, say Y=3) documents that are in this state.
For example, the output would appear as (in brief):
[
{
serviceIdRef: "36e9c779-7865-4b74-a30b-e4d6a0cc5295",
serviceName:
state:[
{
pubState: "INVITED"
count: 200
sample: [ // Get those Y entries (here Y=3)
{
// sample1 like:
"_id" : "244",
"pubName" : "p1",
"serviceIdRef" : "36e9c779-7865-4b74-a30b-e4d6a0cc5295",
"serviceName" : "my-service",
"subName" : "c1",
"pubState" : "INVITED"
},
{
sample2
},
{
sample3
}
]
},
{
pubState: "ACTIVE", // For this state, repeat as we did for "INVITED" state above.
......
}
]
}
{
repeat for another service
}
]
So far I have written this but am not able to get those Y entries. Is there a (better) way?
This is what I have so far (not complete and not exactly outputs in the format above):
db.sub.aggregate(
[{
$match:
{
"subName": {
$in: ["c1", "c2"]
},
"$or": [
{
"pubState": "INVITED",
},
{
"pubState": "ACTIVE",
}
]
}
},
{
$group: {
_id: "$serviceIdRef",
subs: {
$push: "$$ROOT",
}
}
},
{
$sort: {
_id: -1,
}
},
{
$limit: 22
},
{
$facet:
{
facet1: [
{
$unwind: "$subs",
},
{
$group:
{
_id: {
"serviceName" : "$_id",
"pubState": "$subs.pubState",
"subState": "$subs.subsState"
},
count: {
$sum: 1
}
}
}
]
}
}
])
You have to do the second $group stage to manage nested structure,
$match your conditions
$sort by _id in descending order
$group by serviceIdRef and pubState, get first required fields and prepare the array for sample, and get count of documents
$group by only serviceIdRef and construct the state array
$slice for limit the document in sample
// Per serviceIdRef, report each pubState with its document count and a small
// sample of the matching documents.
db.collection.aggregate([
  {
    $match: {
      subName: { $in: ["c1", "c2"] },
      pubState: { $in: ["INVITED", "ACTIVE"] }
    }
  },
  { $sort: { _id: -1 } },
  // First level: one group per (serviceIdRef, pubState) with its count and
  // all of its documents.
  {
    $group: {
      _id: {
        serviceIdRef: "$serviceIdRef",
        pubState: "$pubState"
      },
      serviceName: { $first: "$serviceName" },
      sample: { $push: "$$ROOT" },
      count: { $sum: 1 }
    }
  },
  // Second level: fold the per-state groups into a "state" array per service.
  {
    $group: {
      _id: "$_id.serviceIdRef",
      serviceName: { $first: "$serviceName" },
      state: {
        $push: {
          pubState: "$_id.pubState",
          count: "$count",
          // Y = 3: keep only the first three sample documents for each state.
          sample: { $slice: ["$sample", 3] }
        }
      }
    }
  }
])
Playground

How to delete Duplicate objects inside array in multiple documents in mongodb?

I am trying to delete the duplicate object inside the array in multiple documents in Mongodb.
I try many ways but not able to fix
Document Structure:-
{
"_id" : ObjectId("5a544fe234602415114601d3"),
"GstDetails" : [
{
"_id" : ObjectId("5e4837374d62f4c95163908e"),
"StateId" : "1",
"GstIn" : "33ABFFM1655H1ZF",
"StateDesc" : "TAMIL NADU",
"CityDesc" : "CHENNAI"
},
{
"_id" : ObjectId("5e4837484d62f4c9516395e8"),
"StateId" : "1",
"GstIn" : "33ABFFM1655H1ZF",
"StateDesc" : "TAMIL NADU",
"CityDesc" : "CHENNAI"
}
]
}
Like that many more documents
I tried:-
db.Supplier.find({ "GstDetails": { $size: 2 } }).limit(1).forEach(function (doc) {
var stateId;
doc.GstDetails.forEach(function (data) {
if (data.StateId == stateId) {
pull doc.GstDetails[0];
} else {
stateId = data.StateId
}
print(JSON.stringify(doc));
});
db.Supplier.save(doc)
});
Check if aggregation below meets your requirements:
// Remove repeated GstDetails entries (same StateId) inside each supplier.
// NOTE: the result documents contain only _id and GstDetails; no other
// top-level field of the supplier is carried through this pipeline.
db.Supplier.aggregate([
  {
    $unwind: "$GstDetails"
  },
  // One group per (supplier, StateId). $first keeps a single entry for the
  // group, which drops every repeated entry with the same StateId.
  {
    $group: {
      _id: {
        _id: "$_id",
        StateId: "$GstDetails.StateId"
      },
      GstDetails: {
        $first: "$GstDetails"
      }
    }
  },
  // Rebuild the GstDetails array for each supplier from the kept entries.
  {
    $group: {
      _id: "$_id._id",
      GstDetails: {
        $push: "$GstDetails"
      }
    }
  }
])
MongoPlayground
Note: This is a read-only query. If the result is OK, add the operator below as the last stage (once you execute it, it will update your documents; no rollback is available):
{$out: "Supplier"}

How can i retrieve all nested array of object element which has been matched by using "elementMatch" mongodb

We have an array of objects, and every object contains an array of objects, "conversation". We need to get the count/elements where the field deliveryStatus is sent and userId exists.
[{
_id: "12312312"
conversation:[
{
"messageInformation" : {
"deliveryStatus" : "sent"
},
"userId" : 12
},
{
"messageInformation" : {
"deliveryStatus" : "sent",
}
}
]},
_id: "213123123123"
conversation:[
{
"messageInformation" : {
"deliveryStatus" : "sent"
},
"userId" : 33
},
{
"messageInformation" : {
"deliveryStatus" : "sent",
"userInfo" : [ ]
}
}
]}
]
I need the total count. I tried $elemMatch, but it only returns the first element.
// Find the documents having at least one conversation entry that has a userId
// and the "sent" delivery status, projecting the matched entry.
let counter = await ProjectConversation.find(
  {
    conversation: {
      $elemMatch: {
        userId: { $exists: true },
        "messageInformation.deliveryStatus": deliveryStatus.Sent
      }
    }
  },
  {
    _id: 0,
    // A key containing "." or "$" must be quoted to be valid JavaScript.
    "conversation.$": 1
  }
);
It will be fine if I get the object and do the count by using javascript
You can use aggregate $unwind to deconstruct the array and use $count to count the number of documents that matches the $match property, eg :
// Count the conversation entries that were sent and carry a userId.
db.collection.aggregate([
  // One row per conversation entry.
  { "$unwind": "$conversation" },
  // Keep the entries with status "sent" that have a userId field.
  {
    "$match": {
      "conversation.messageInformation.deliveryStatus": "sent",
      "conversation.userId": { "$exists": true }
    }
  },
  // Emit the number of remaining rows as NumberOfMsgSent.
  { "$count": "NumberOfMsgSent" }
])
On Mongo Playground

Mongo DB - Second Level Search - elemMatch

I am trying to fetch all records (and count of all records) for a structure like the following,
{
id: 1,
level1: {
level2:
[
{
field1:value1;
},
{
field1:value1;
},
]
}
},
{
id: 2,
level1: {
level2:
[
{
field1:null;
},
{
field1:value1;
},
]
}
}
My requirement is to fetch the number of records that have field1 populated (at least one in level2). That is, I need to fetch all the ids, or the number of such ids.
The query I am using is,
db.table.find({},
{
_id = id,
value: {
$elemMatch: {'level1.level2.field1':{$exists: true}}
}
}
})
Please suggest.
EDIT1:
This is the question I was trying to ask in the comment. I was unable to elucidate in the comment properly. Hence, editing the question.
{
id: 1,
level1: {
level2:
[
{
field1:value1;
},
{
field1:value1;
},
]
}
},
{
id: 2,
level1: {
level2:
[
{
field1:value2;
},
{
field1:value2;
},
{
field1:value2;
}
]
}
}
{
id: 3,
level1: {
level2:
[
{
field1:value1;
},
{
field1:value1;
},
]
}
}
The query we used results in
value1: 4
value2: 3
I want something like
value1: 2 // Once each for documents 1 & 3
value2: 1 // Once for document 2
You can do that with the following find query:
// Documents with at least one level1.level2 element that has a field1 field.
db.table.find({
  "level1.level2": {
    $elemMatch: { field1: { $exists: true } }
  }
})
This will return all documents that have a field1 in the "level1.level2" structure.
For your question in the comment, you can use the following aggregation to "I had to return a grouping (and the corresponding count) for the values in field1":
// Count the level2 elements per distinct field1 value.
db.table.aggregate(
  [
    // One row per level2 element.
    {
      $unwind: "$level1.level2"
    },
    // Keep the elements that have a field1 field.
    {
      $match: { "level1.level2.field1": { $exists: true } }
    },
    // Group by the field1 value and count the rows.
    {
      $group: {
        _id: "$level1.level2.field1",
        count: { $sum: 1 }
      }
    }
  ]
);
UPDATE: For your question "'value1 - 2` At level2, for a document, assume all values will be the same for field1.".
I hope I understand your question correctly: instead of grouping only on the value of field1, I added the document _id as an extra grouping key:
// Count the level2 elements per (document, field1 value) pair.
db.table.aggregate([
  // One row per level2 element.
  { "$unwind": "$level1.level2" },
  // Keep the elements that have a field1 field.
  { "$match": { "level1.level2.field1": { "$exists": true } } },
  // Group on the document _id together with the field1 value.
  {
    "$group": {
      "_id": { "id": "$_id", "field1": "$level1.level2.field1" },
      "count": { "$sum": 1 }
    }
  }
]);
UPDATE2:
I altered the aggregation and added a extra grouping, the aggregation below gives you the results you want.
// Count, per field1 value, the number of documents that contain it.
db.table.aggregate([
  // One row per level2 element.
  { "$unwind": "$level1.level2" },
  // Keep the elements that have a field1 field.
  { "$match": { "level1.level2.field1": { "$exists": true } } },
  // Collapse to one row per distinct (document, field1 value) pair.
  {
    "$group": {
      "_id": { "id": "$_id", "field1": "$level1.level2.field1" }
    }
  },
  // Count those rows per field1 value.
  {
    "$group": {
      "_id": { "id": "$_id.field1" },
      "count": { "$sum": 1 }
    }
  }
]);

MongoDB: Search minimum, maximum in nested object with dynamic field name

I have a query to get the minimum query and maximum query on below sample data set. In my case fields names are dynamic, like below product_1, product_2...
{
"_id" : NumberLong(540),
"product_1" : {
"orderCancelled" : 0,
"orderDelivered" : 6
},
"product_2" : {
"orderCancelled" : 3,
"orderDelivered" : 16
},
"product_3" : {
"orderCancelled" : 5,
"orderDelivered" : 11
}
}
I am not getting an idea how can i do this in Mongo where the field names are dynamic, means in future there may be other products also get created as product_4 and product_5 for the same id.
I need a query which gives me minimum value for orderDelivered and maximum value for orderCancelled, as for example in above document result will be orderDelivered:16 & orderCancelled:0 .
Thanks for any idea.
You should restructure your document so that all product documents are in an array:
{
"_id": NumberLong(540),
products: [
{
"name": "product_1",
"orderCancelled": 0,
"orderDelivered": 6
},
{
"name": "product_2",
"orderCancelled": 3,
"orderDelivered": 16
},
{
"name": "product_3",
"orderCancelled": 5,
"orderDelivered": 11
}
]
}
Then you'll be able to issue normal max/min queries like this:
// Smallest orderDelivered and largest orderCancelled among the products of
// document 540.
db.test.aggregate([
  { "$match": { "_id": NumberLong(540) } },
  // One row per product.
  { "$unwind": "$products" },
  {
    "$group": {
      "_id": "$_id",
      "minDelivered": { "$min": "$products.orderDelivered" },
      "maxCancelled": { "$max": "$products.orderCancelled" }
    }
  }
])
You need to change your document structure by updating the documents. You will need to loop through each document using the .forEach method, then $unset every field whose name starts with product_. From there you will need to add a new field, products, which is an array of products, using the $set update operator. That being said, you should use "bulk" operations to update your documents for maximum efficiency.
// Migrates every document from dynamic "product_N" fields to one "products"
// array, using bulk operations flushed in batches of 500.
//
// Each migrated document gets a single update that removes all of its
// product_N fields and sets "products" in the same operation. Documents with
// no product_N field are skipped, so running the script again does not
// overwrite an existing "products" array with an empty one.
var bulkOp = db.collection.initializeOrderedBulkOp();
// Number of operations queued in bulkOp and not executed yet.
var count = 0;
db.collection.find().forEach(function(doc) {
    var allproducts = [];
    var unsetFields = {};
    Object.keys(doc).forEach(function(key) {
        if (/^product_\d+/.test(key)) {
            allproducts.push({
                "name": key,
                "orderCancelled": doc[key]["orderCancelled"],
                "orderDelivered": doc[key]["orderDelivered"]
            });
            unsetFields[key] = "";
        }
    });
    // Nothing to migrate for this document.
    if (allproducts.length === 0) {
        return;
    }
    bulkOp.find({"_id": doc._id}).update({
        "$unset": unsetFields,
        "$set": { "products": allproducts }
    });
    count++;
    if (count === 500) {
        // Execute per 500 operations and re-init
        bulkOp.execute();
        bulkOp = db.collection.initializeOrderedBulkOp();
        count = 0;
    }
});
// Flush the remainder; execute() is only called when something is queued.
if (count > 0) {
    bulkOp.execute();
}
Your documents will now look like this:
{
"_id" : NumberLong(542),
"products" : [
{
"name" : "product_1",
"orderCancelled" : 0,
"orderDelivered" : 6
},
{
"name" : "product_2",
"orderCancelled" : 3,
"orderDelivered" : 16
},
{
"name" : "product_3",
"orderCancelled" : 5,
"orderDelivered" : 11
}
]
}
Then comes your aggregation query using the .aggregate() method:
// Largest orderCancelled and smallest orderDelivered among the products of
// document 542.
db.collection.aggregate([
    {
        "$match": { "_id": 542 }
    },
    // One row per product.
    {
        "$unwind": "$products"
    },
    {
        "$group": {
            "_id": "$_id",
            "maxOrderCancelled": { "$max": "$products.orderCancelled" },
            "minOrderDelivvered": { "$min": "$products.orderDelivered" }
        }
    }
])
Which returns:
{ "_id" : NumberLong(542), "maxOrderCancelled" : 5, "minOrderDelivvered" : 6 }
From version 3.2 you can use $max and $min in your $project stage, which is a much better way to do this because there is no need to $unwind your array first.
// Largest orderCancelled and smallest orderDelivered among the products of
// document 542, computed in $project without unwinding the array.
db.collection.aggregate([
    { "$match": { "_id": 542 } },
    { "$project": {
        // $map extracts one number per product; $max / $min then reduce the
        // resulting array to a single value.
        "maxOrderCancelled": {
            "$max": {
                "$map": {
                    "input": "$products",
                    // The name given in "as" must be the one used in "in".
                    "as": "orc",
                    "in": "$$orc.orderCancelled"
                }
            }
        },
        "minOrderDelivered": {
            "$min": {
                "$map": {
                    "input": "$products",
                    "as": "orc",
                    "in": "$$orc.orderDelivered"
                }
            }
        }
    }}
])
Which yields:
{ "_id" : NumberLong(542), "maxOrderCancelled" : 5, "minOrderDelivered" : 6 }