Below is an example of documents i have in a collection. I want to delete all the duplicate documents except for the one that has the latest revision number filtered by user and grouped by guid
[
{
_id: ObjectId("5e8e2d28ca6e660006f263e6"),
guid: 1,
revision: 1,
user:1
},
{
_id: ObjectId("5e8e2d28ca6e660006f263e7"),
guid: 1,
revision: 2,
user:1
},
{
_id: ObjectId("5e8e2d28ca6e660006f264d0"),
guid: 2,
revision: 5,
user:1
},
{
_id: ObjectId("5e8e2d28ca6e660006f264d1"),
guid: 1,
revision: 5,
user:1
},
{
_id: ObjectId("5e8e2d28ca6e660006f264d2"),
guid: 3,
revision: 5,
user:2
}
]
Expect output should delete following documents
For user : 1
-- _id: ObjectId("5e8e2d28ca6e660006f263e6"),
-- _id: ObjectId("5e8e2d28ca6e660006f263e7"),
-- _id: ObjectId("5e8e2d28ca6e660006f264d0"),
For user : 2
None
I have tried below query it works but fails in scenario when i have same guid and same revision number. In case both are same then anyone can be deleted
db.collection.find({guid:1,revision:1}).sort({revision:-1}).forEach(function(doc){
console.log(db.collection.deleteMany({revision:{$lt:doc.revision}, guid:doc.guid}));
})
Please help. Thanks in advance
Try this
Step 1
Get the documents that are lastly revised
let data = await db.collection.aggregate([
{
$sort: {
user: 1,
guid: 1,
revision: -1
}
},
{
$group: {
_id: {
guid: "$guid",
user: "$user"
},
id: {
$first: "$_id"
},
guid: {
$first: "$guid"
},
user: {
$first: "$user"
},
revision: {
$first: {
$max: "$revision"
}
},
doc: {
$last: "$$ROOT"
}
},
},
{
$project: {
_id: "$id",
guid: 1,
revision: 1,
user: 1
}
},
])
mongoplayground
Step 2
Get the _ids of the resultant data
x = data.map(item=>item._id);
Step 3
Delete documents other than x
db1.deleteMany({ "_id": { $nin: x } });
Option 2 for Single query to do the same task
let data = await db.collection.aggregate([
{
$sort: {
user: 1,
guid: 1,
revision: -1
}
},
{
$group: {
_id: {
guid: "$guid",
user: "$user"
},
id: {
$first: "$_id"
},
guid: {
$first: "$guid"
},
user: {
$first: "$user"
},
revision: {
$first: {
$max: "$revision"
}
},
doc: {
$last: "$$ROOT"
}
},
},
{
$project: {
_id: "$id",
guid: 1,
revision: 1,
user: 1
}
},
{ $out: "temp1" } // temp1 is the collection name
])
Related
I am trying to finish up a data aggregation pipeline and having issues getting the data into the correct format. I'm not even sure if this is possible to do in one pipeline.
The original data looks like this:
[
{
answers: {
'question1': 'a',
'question2': 'c',
'question3': ['a','b'],
'question4': 1
},
createdAt: 2022-03-04T07:30:40.517Z,
},
{
answers: {
'question1': 'b',
'question2': 'c',
'question3': ['a','c']
'question4': 2
},
createdAt: 2022-03-04T07:30:40.518Z,
}
]
I've got my pipeline so far with this:
{ $project: {
"answers": { $objectToArray: "$answers" },
"date": { $dateToString: { format: "%Y-%m-%d", date: "$createdAt" }}
}},
{ $unwind: "$answers" },
{ $unwind: "$answers.v" },
{
$group: {
_id: { answers : "$answers", date: "$date"},
c: { $sum: 1 }}
},
and the data now looks like this:
{
_id: {
answers: { k: 'q3', v: 'b' },
date: '2022-03-04'
},
count: 1
},
{
_id: {
answers: { k: 'q3', v: 'a' },
date: '2022-03-04'
},
count: 2
},
{
_id: {
answers: { k: 'q4', v: 1 },
date: '2022-03-04'
},
count: 1
},
{
_id: {
answers: { k: 'q1', v: 'b' },
date: '2022-03-04'
},
count: 1
},
{
_id: {
answers: { k: 'q4', v: 2 },
date: '2022-03-04'
},
count: 1
},
{
_id: {
answers: { k: 'q2', v: 'c' },
date: '2022-03-04'
},
count: 2
},
{
_id: {
answers: { k: 'q3', v: 'c' },
date: '2022-03-04'
},
count: 1
},
{
_id: {
answers: { k: 'q1', v: 'a' },
date: '2022-03-04'
},
count: 1
}
I would like to get a result that looks something like this:
{
'dates': [
{
'date': '2022-03-04',
'q1': { 'a': 1, 'b': 1 }
'q2': { 'c': 2 },
'q3': { 'a': 2, 'b': 1, 'c': 1 },
'q4': { '1': 1, '2': 1 }
}
]
'totals': { // this would be the totals across all the dates
'q1': { 'a': 1, 'b': 1 }
'q2': { 'c': 2 },
'q3': { 'a': 2, 'b': 1, 'c': 1 },
'q4': { '1': 1, '2': 1 }
}
}
any help would be greatly appreciated, even if I can't get both the totals and breakdown in 1 query.
here is the mongoplaygroud I've been working on
Not that simple. An important stage you have to use is $facet in order to get totals and dates
Maybe with $setWindowFields the aggregation pipeline could be a little simpler, but that a quick guess.
db.collection.aggregate([
{
$project: {
_id: 0,
answers: { $objectToArray: "$answers" },
date: { $dateToString: { format: "%Y-%m-%d", date: "$createdAt" } }
}
},
{ $unwind: "$answers" },
{ $unwind: "$answers.v" },
{
$group: {
_id: {
answer: "$answers.v",
question: "$answers.k",
date: "$date"
},
count: { $sum: 1 }
}
},
{
$facet: {
dates: [
{
$group: {
_id: { question: "$_id.question", date: "$_id.date" },
count: {
$push: {
k: { $toString: "$_id.answer" },
v: "$count"
}
}
}
},
{
$group: {
_id: "$_id.date",
count: {
$push: {
k: "$_id.question",
v: { $arrayToObject: "$count" }
}
}
}
},
{
$replaceWith: {
$mergeObjects: [
{ date: "$_id" },
{ $arrayToObject: "$count" }
]
}
}
],
totals: [
{
$group: {
_id: { answer: "$_id.answer", question: "$_id.question" },
v: { $push: "$count" }
}
},
{
$group: {
_id: "$_id.question",
count: {
$push: {
k: { $toString: "$_id.answer" },
v: { $sum: "$v" }
}
}
}
},
{
$project: {
_id: 0,
k: "$_id",
v: { $arrayToObject: "$count" }
}
}
]
}
},
{ $set: { totals: { $arrayToObject: "$totals" } } }
])
Mongo Playground
Would like to query the following to obtain all item documents such that the last sale (ordered by soldDate) has a status of 2.
db.items.insertMany([
{ item: 1,
sales: [
{ soldDate: ISODate("2021-10-04"), status: 1 },
{ soldDate: ISODate("2021-10-05"), status: 2 }
]
},
{ item: 2,
sales: [
{ soldDate: ISODate("2021-09-29"), status: 3 },
{ soldDate: ISODate("2021-09-24"), status: 1 }
]
},
{ item: 3,
sales: [
{ soldDate: ISODate("2021-06-01"), status: 3 },
{ soldDate: ISODate("2021-06-12"), status: 2 },
{ soldDate: ISODate("2021-06-07"), status: 1 }
]
}
]);
So in this example, the query would return the following two documents:
{ item: 1,
sales: [
{ soldDate: ISODate("2021-10-04"), status: 1 },
{ soldDate: ISODate("2021-10-05"), status: 2 } // triggered by this
]
},
{ item: 3,
sales: [
{ soldDate: ISODate("2021-06-01"), status: 3 },
{ soldDate: ISODate("2021-06-12"), status: 2 }, // triggered by this
{ soldDate: ISODate("2021-06-07"), status: 1 }
]
}
Thanks for any help.
You stated: ordered by soldDate which can actually mean two things. Perhaps you want the documents sorted by the array, or perhaps you mean the array is sorted. I assumed the later.
Solution (Array sorted)
db.items.aggregate([
{ $match: { "sales.status": 2} },
{ $unwind: "$sales" },
{ $sort: { "item": 1, "sales.soldDate": 1} },
{ $group: { "_id": "$_id", "item": { $first: "$item" }, "sales": { $push: "$sales" } } }
])
Results
Enterprise replSet [primary] barrydb> db.items.aggregate([
... { $match: { "sales.status": 2} },
... { $unwind: "$sales" },
... { $sort: { "item": 1, "sales.soldDate": 1} },
... { $group: { "_id": "$_id", "item": { $first: "$item" }, "sales": { $push: "$sales" } } }
... ])
[
{
_id: ObjectId("617064519be05d9f1cbab346"),
item: 1,
sales: [
{ soldDate: ISODate("2021-10-04T00:00:00.000Z"), status: 1 },
{ soldDate: ISODate("2021-10-05T00:00:00.000Z"), status: 2 }
]
},
{
_id: ObjectId("617064519be05d9f1cbab348"),
item: 3,
sales: [
{ soldDate: ISODate("2021-06-01T00:00:00.000Z"), status: 3 },
{ soldDate: ISODate("2021-06-07T00:00:00.000Z"), status: 1 },
{ soldDate: ISODate("2021-06-12T00:00:00.000Z"), status: 2 }
]
}
]
But, to be complete here is a solution if you want the documents sorted (and the array not necessarily sorted).
Solution (Documents sorted)
db.items.aggregate([
{ $match: { "sales.status": 2} },
{ $sort: { "sales.soldDate": 1} }
])
Results
Enterprise replSet [primary] barrydb> db.items.aggregate([
... { $match: { "sales.status": 2} },
... { $sort: { "sales.soldDate": 1} }
... ])
[
{
_id: ObjectId("617064519be05d9f1cbab348"),
item: 3,
sales: [
{ soldDate: ISODate("2021-06-01T00:00:00.000Z"), status: 3 },
{ soldDate: ISODate("2021-06-12T00:00:00.000Z"), status: 2 },
{ soldDate: ISODate("2021-06-07T00:00:00.000Z"), status: 1 }
]
},
{
_id: ObjectId("617064519be05d9f1cbab346"),
item: 1,
sales: [
{ soldDate: ISODate("2021-10-04T00:00:00.000Z"), status: 1 },
{ soldDate: ISODate("2021-10-05T00:00:00.000Z"), status: 2 }
]
}
]
EDIT - After re-reading I believe you want only where the record having a status of 2 is also has the greatest date in the array
Solution (Only last having status of value 2 - docs and array unsorted)
db.items.aggregate([
{ $unwind: "$sales" },
{ $sort: { "item": 1, "sales.soldDate": -1} },
{ $group: { "_id": "$_id", "item": { $first: "$item" }, "sales": { $push: "$sales" } } },
{ $match : { "sales.0.status" : 2 } }
])
Results
Enterprise replSet [primary] barrydb> db.items.aggregate([
... { $unwind: "$sales" },
... { $sort: { "item": 1, "sales.soldDate": -1} },
... { $group: { "_id": "$_id", "item": { $first: "$item" }, "sales": { $push: "$sales" } } },
... { $match : { "sales.0.status" : 2 } }
... ])
[
{
_id: ObjectId("617064519be05d9f1cbab346"),
item: 1,
sales: [
{ soldDate: ISODate("2021-10-05T00:00:00.000Z"), status: 2 },
{ soldDate: ISODate("2021-10-04T00:00:00.000Z"), status: 1 }
]
},
{
_id: ObjectId("617064519be05d9f1cbab348"),
item: 3,
sales: [
{ soldDate: ISODate("2021-06-12T00:00:00.000Z"), status: 2 },
{ soldDate: ISODate("2021-06-07T00:00:00.000Z"), status: 1 },
{ soldDate: ISODate("2021-06-01T00:00:00.000Z"), status: 3 }
]
}
]
EDIT - Add Self Referencing Lookup
db.items.aggregate([
{ $unwind: "$sales" },
{ $sort: { "item": 1, "sales.soldDate": -1} },
{ $group: { "_id": "$_id", "item": { $first: "$item" }, "sales": { $push: "$sales" } } },
{ $match : { "sales.0.status" : 2 } },
{ $lookup : {
from: "items",
localField: "_id",
foreignField: "_id",
as: "results"
}
},
{ $unwind: "$results" },
{ $replaceRoot: { "newRoot": "$results" } }
])
With the self-referencing lookup we are treating MongoDB as a relational database. We find the documents that meet our requirements, but in doing so we have destroyed the original shape and content. By performing a lookup on the same records we can restore the shape but at a performance penalty.
Retain Copy
Rather than performing a lookup, which has a performance concern, a different approach is to leverage memory on the server. Keep a copy of the original while moving through the pipeline and manipulating the original to identify desired records...
db.items.aggregate([
{ $addFields: { "_original": "$$ROOT" } },
{ $unwind: "$sales" },
{ $sort: { "item": 1, "sales.soldDate": -1} },
{ $group: { "_id": "$_id", "_original": { $first: "$_original" }, "sales_status": { $push: "$sales.status" } } },
{ $match : { "sales_status.0" : 2 } },
{ $replaceRoot: { "newRoot": "$_original" } }
])
In this example we keep a copy of the original in the field _original then once we have identified the records we want we pivot the root back to _original. This may put pressure on the WiredTiger cache as we are keeping a duplicate of all selected records in memory during the execution of the pipeline. A $lookup approach also has this memory concern. Two queries would eliminate the cache pressure issues, but behaves like a $lookup and would not perform as well.
I have the following resolver:
const result = await UserPassage.aggregate([
{ $sort: { createdAt: -1 } },
{
$group: {
_id: '$level',
level: { $first: '$level' },
passageId: { $first: '$passageId' },
userId: { $first: '$userId' },
type: { $first: '$type' },
category: { $first: '$category' },
score: { $first: '$score' },
completedStage: { $first: '$completedStage' },
userPassageStatsId: {
_id: { $first: '$_id' },
stats: {
readingTime: { $first: '$readingTime' },
qtdVocab: { $first: '$qtdVocab' },
qtdTestDone: { $first: '$qtdTestDone' },
totalQuiz: { $first: '$totalQuiz' },
progress: { $first: '$progress' },
},
},
},
},
{ $sort: { level: 1 } },
]);
await UserPassageStats.populate(result, { path: 'userPassageStatsId' });
The problem is that I need to populate 'userPassageStatsId' and return it but it's not working well returning the following error:
MongoError: The field 'userPassageStatsId' must be an accumulator object
does anyone knows what I am doing wrong?
$group can only contain _id or accumulator objects like $first, $last, $sum etc. In your case your building nested object and that syntax is not allowed - accumulator has to be on a top level. You can try two approaches, either return flat structure from $group and then reshape using $project:
{
$group: {
_id: '$level',
level: { $first: '$level' },
passageId: { $first: '$passageId' },
userId: { $first: '$userId' },
type: { $first: '$type' },
category: { $first: '$category' },
score: { $first: '$score' },
completedStage: { $first: '$completedStage' },
userPassageStatsId_id: { $first: '$_id' },
readingTime: { $first: '$readingTime' },
qtdVocab: { $first: '$qtdVocab' },
qtdTestDone: { $first: '$qtdTestDone' },
totalQuiz: { $first: '$totalQuiz' },
progress: { $first: '$progress' }
}
},
{
$project: {
_id: 1,
level: 1,
...,
userPassageStatsId: {
_id: "$userPassageStatsId_id",
stats: {
readingTime: "$readingTime",
...
}
}
}
}
or use $$ROOT to capture first object for every group and reshape it using $project:
{
$group: {
_id: '$level',
d: { $first: "$$ROOT" }
}
},
{
$project: {
_id: 1,
level: "$d.level",
...,
userPassageStatsId: {
_id: "$d._id",
stats: {
readingTime: "$d.readingTime",
...
}
}
}
}
I have a collection with fields like "servicereqesttype", "zipcode", "date"
I want to fing the 3 most common "servicerequesttype" per zipcode for a specific day.
db.event.aggregate([
{
$match: {
creationdate: "2011-01-01"
}
},
{
$project: {
zipcode: "$zipcode",
servicerequesttype: "$servicerequesttype"
}
},
{
$group: {
_id: {
zipcode: "$zipcode",
servicerequesttype: "$servicerequesttype"
},
zipcode: {
$first: "$zipcode"
},
servicerequesttype: {
$first: "$servicerequesttype"
},
count: {$sum: 1}
}
},
{
$sort: {
"zipcode": -1,
"count": -1
}
},
{
$project: {
_id: 0,
zipcode: "$zipcode",
servicerequesttype: "$servicerequesttype",
count: "$count"
}
}
])
now all I have to is to select only 3 per zipcode and I need some help, maybe I have to use $bucket or $map...
db.event.aggregate([
{
$match: {
creationdate: "2011-01-01"
}
},
{
$project: {
zipcode: "$zipcode",
servicerequesttype: "$servicerequesttype"
}
},
{
$group: {
_id: {
zipcode: "$zipcode",
servicerequesttype: "$servicerequesttype"
},
zipcode: {
$first: "$zipcode"
},
servicerequesttype: {
$first: "$servicerequesttype"
},
count: {$sum: 1}
}
},
{
$sort: {
"zipcode": -1,
"count": -1
}
},
{
$project: {
_id: 0,
zipcode: "$zipcode",
servicerequesttype: "$servicerequesttype",
count: "$count",
arrayOfTypes: "$array1",
arrayOfIncidents: "$array2"
}
},
{
$group: {
_id: "$zipcode",
arrayOfTypes: {
$push: {type: "$servicerequesttype", count: "$count"}
}
}
},
{
$project: {
_id: "$_id",
array: {
$slice: ["$arrayOfTypes", 3]
}
}
},
{
$sort: {
"_id": -1
}
}
])
I am having a bad time trying to do an aggregation in MongoDB.
I need to cross some infos from each user and as a final result I want a list of users (where there is only one object for each user) and for each object there is some lists with distinct information.
1 - The createdAtList array must be ordered from the oldest to the newest date. The sumOfTotal means the current position total summed up with the previous sumOfTotal (Exemplified in the code below), not just the sum of the total's
2 - The categotyList must be ordered like: category1, category2, category3 ...
3 - The desired final result must be ordered like: user1, user2, user3 ...
Basically I need some help to do the following:
//List of docs from my collection:
[
{
_id: "doc1",
user: "user1",
category: "category1",
createdAt: "2018-01-01T00:00:00.000Z"
},
{
_id: "doc2",
user: "user1",
category: "category2",
createdAt: "2017-12-12T00:00:00.000Z",
},
{
_id: "doc3",
user: "user1",
category: "category1",
createdAt: "2017-12-12T00:00:00.000Z",
},
{
_id: "doc4",
user: "user1",
category: "category2",
createdAt: "2018-01-01T00:00:00.000Z"
},
{
_id: "doc5",
user: "user1",
category: "category3",
createdAt: "2017-11-11T00:00:00.000Z"
}
]
//Desired result:
{
user: "user1",
createdAtList: [ //list ordered by createdAt
{
createdAt: "2017-11-11T00:00:00.000Z",
total: 1,
sumOfTotal: 0
}
{
createdAt: "2017-12-12T00:00:00.000Z",
total: 2,
sumOfTotal: 3 //summed up with the previous
}
{
createdAt: "2018-01-01T00:00:00.000Z",
total: 2,
sumOfTotal: 5 //summed up with the previous
}
],
categotyList: [ //list ordered by category
{
category: "category1",
total: 2
},
{
category: "category2",
total: 2
},
{
category: "category3",
total: 1
}
]
},
...
Is possible to do this in the same aggregate?
I do not think it really makes sense to have the createdAtList.sumOfTotal field. I do not think the fields in an array should be dependent upon a particular order of the elements. If you want some field to contain the sum of the createdAtList.total field, I think there should only be one field (outside of the array). That being said, here is the query I came up with to give you the desired results (using "users" as the name of the collection):
db.users.aggregate([
{
$group: {
_id: {
user: "$user",
createdAt: "$createdAt"
},
total: { $sum: 1 },
category: { $push: "$category" }
}
},
{
$project: {
_id: 0,
user: "$_id.user",
createdAt: "$_id.createdAt",
total: "$total",
category: 1
}
},
{ $unwind: "$category" },
{
$group: {
_id: {
user: "$user",
category: "$category"
},
catTotal: { $sum: 1 },
createdAtList: {
$push: {
createdAt: "$createdAt",
total: "$total"
}
}
}
},
{
$project: {
_id: 0,
user: "$_id.user",
createdAtList: 1,
category: "$_id.category",
catTotal: 1
}
},
{ $unwind: "$createdAtList" },
{
$group: {
_id: "$user",
createdAtList: {
$addToSet: "$createdAtList"
},
categoryList: {
$addToSet: {
category: "$category",
total: "$catTotal"
}
}
}
},
{ $unwind: "$createdAtList" },
{ $sort: { "createdAtList.createdAt": 1 } },
{
$group: {
_id: "$_id",
createdAtList: {
$push: "$createdAtList"
},
categoryList: {
$first: "$categoryList"
}
}
},
{ $unwind: "$categoryList" },
{ $sort: { "categoryList.category": 1 } },
{
$group: {
_id: "$_id",
createdAtList: {
$first: "$createdAtList"
},
categoryList: {
$push: "$categoryList"
}
}
},
{
$project: {
_id: 0,
user: "$_id",
createdAtList: 1,
sumOfTotal: { $sum: "$createdAtList.total" },
categoryList: 1
}
},
{ $sort: { user: 1 } },
]).pretty()