How to count the documents duplicated in mongodb? - mongodb

I tried to search how to count the documents duplicated in mongodb and i got this function, it return the documents duplicated.
db.job_crawler_models_jobs_crawlings.aggregate(
{ $group: {
_id: { field1: "$field1", field2: "$field2" },
count: { $sum: 1 }
}},
{ $match: {
count: { $gt : 1 }
}}
)
But i want to get the number of documents duplicated. How can i do that?

You could try adding another $group in the pipeline. Not sure this is exactly what you are looking for though.
db.job_crawler_models_jobs_crawlings.aggregate(
{ $group: {
_id: { field1: "$field1", field2: "$field2" },
count: { $sum: 1 }
}},
{ $match: {
count: { $gt : 1 }
}},
{ $group: { _id: null, duplicatedCounts: { $sum:1 } } }
)

Related

Finding top 3 students in each subject MongoDB

I have tried searching for ways to solve my problem, except that my database is set up differently,
My documents in my collection are something like this:
{name:"MAX",
date:"2020-01-01"
Math:98,
Science:60,
English:80},
{name:"JANE",
date:"2020-01-01"
Math:80,
Science:70,
English:79},
{name:"ALEX",
date:"2020-01-01"
Math:95,
Science:68,
English:70},
{name:"JOHN",
date:"2020-01-01"
Math:95,
Science:68,
English:70}
{name:"MAX",
date:"2020-06-01"
Math:97,
Science:78,
English:90},
{name:"JANE",
date:"2020-06-01"
Math:78,
Science:76,
English:66},
{name:"ALEX",
date:"2020-06-01"
Math:93,
Science:75,
English:82},
{name:"JOHN",
date:"2020-06-01"
Math:92,
Science:80,
English:50}
I want to find the top 3 students for each subject without regard for the dates. I only managed to find the top 3 students in 1 subject.
So i group the students by name first, and add a column for max scores of a subject. Math in this case. Sort it in descending order and limit results to 3.
db.student_scores.aggregate(
[
{$group:{
_id: "$name",
maxMath: { $max: "$Math" }}},
{$sort:{"maxMath":-1}},
{$limit : 3}
]
)
Is there any way to get the top 3 students for each subject?
So, it would be top 3 for math, top 3 for science, top 3 for english
{
Math:{MAX, JANE, JOHN},
Science:{JOHN, ALEX, JANE},
English:{JANE, MAX, JOHN}
}
I just applied your code 3 times, using $facet
If you prefer a more compact result add
{$project:{English:"$Eng._id", Science:"$sci._id", Math:"$math._id"}}
PLAYGROUND
PIPELINE
db.collection.aggregate([
{
"$facet": {
"math": [
{
$group: {
_id: "$name",
maxMath: {
$max: "$Math"
}
}
},
{
$sort: {
"maxMath": -1
}
},
{
$limit: 3
}
],
"sci": [
{
$group: {
_id: "$name",
maxSci: {
$max: "$Science"
}
}
},
{
$sort: {
"maxSci": -1
}
},
{
$limit: 3
}
],
"Eng": [
{
$group: {
_id: "$name",
maxEng: {
$max: "$English"
}
}
},
{
$sort: {
"maxEng": -1
}
},
{
$limit: 3
}
]
}
}
])
Your question is not clear, but i can predict 2 scenario,
Get repetitive students along with date:
$project to show required fields and convert subjects object to array using $objectToArray
$unwind subjects array
$sort by subjects name in descending order
$group by subject name and get array of students
$project to get latest 3 students from students array
db.collection.aggregate([
{
$project: {
name: "$name",
date: "$date",
subjects: {
$objectToArray: {
Math: "$Math",
Science: "$Science",
English: "$English"
}
}
}
},
{ $unwind: "$subjects" },
{ $sort: { "subjects.v": -1 } },
{
$group: {
_id: "$subjects.k",
students: {
$push: {
name: "$name",
date: "$date",
score: "$subjects.v"
}
}
}
},
{
$project: {
_id: 0,
subject: "$_id",
students: { $slice: ["$students", 3] }
}
}
])
Playground
Sum of all date's score (means unique students):
$group by name, and get sum of all subjects using $sum,
$project to convert subjects object to array using $objectToArray
$unwind subjects array
$sort by subjects name in descending order
$group by subject name and get array of students
$project to get latest 3 students from students array
db.collection.aggregate([
{
$group: {
_id: "$name",
Math: { $sum: "$Math" },
Science: { $sum: "$Science" },
English: { $sum: "$English" }
}
},
{
$project: {
subjects: {
$objectToArray: {
Math: "$Math",
Science: "$Science",
English: "$English"
}
}
}
},
{ $unwind: "$subjects" },
{ $sort: { "subjects.v": -1 } },
{
$group: {
_id: "$subjects.k",
students: {
$push: {
name: "$_id",
score: "$subjects.v"
}
}
}
},
{
$project: {
_id: 0,
subject: "$_id",
students: { $slice: ["$students", 3] }
}
}
])
Playground

Aggregate by all days of month mongodb

Hey i need to get the sum of all totalPrice group by days
I get this result
but i need to fetch all rest days of month even if it returns 0
i need solution
this is my code
Order.aggregate([
{ $project: { yearMonthDay: { $dateToString: { format: "%Y-%m-%d", date: '$created' }}, totalPrice:"$totalPrice" }},
{ $group: { _id: "$yearMonthDay", count: { $sum: 1 }, total: {"$sum": "$totalPrice"} }},
{ $sort: { _id: -1 } },
{ $group: { _id: null, stats: { $push: "$$ROOT" }}},
{
$project: {
results: {
$map: {
input:{ $range:[16,31] },
as: 'day',
in: {
$let: {
vars: {
dateIndex: {
"$indexOfArray": ["$stats._id", {$dateToString:{ date:{$dateFromParts:{'year':2020, 'month':5, 'day':"$$day"}}, format:'%Y-%m-%d'}}]
}
},
in: {
$cond: {
if: { $ne: ["$$dateIndex", -1] },
then: { $arrayElemAt: ["$stats", "$$dateIndex"] },
else: { _id: {$dateToString:{ date:{$dateFromParts:{'year':2020, 'month':5, 'day':"$$day"}}, format:'%Y-%m-%d'}, count: 0, total: 0 } }
}
}
}
}
}
}
}
},
{ $unwind: "$results" },
{ $replaceRoot: { newRoot: "$results"}}
]
This query should work for you.
db.collectionName.aggregate([
{ $project: { yearMonthDay: { $dateToString: { format: "%Y-%m-%d", date: '$created' }}, totalPrice:"$totalPrice" }},
{ $group: { _id: "$yearMonthDay", count: { $sum: 1 }, total: {"$sum": "$totalPrice"} }},
{ $sort: { _id: -1 } },
{ $group: { _id: null, stats: { $push: "$$ROOT" }},
{
$project: {
results: {
$map: {
input: ["2020-05-16","2020-05-15","2020-05-14","2020-05-13","2020-05-12"],
as: "date",
in: {
$let: {
vars: {
dateIndex: {
"$indexOfArray": ["$stats._id", "$$date"]
}
},
in: {
$cond: {
if: { $ne: ["$$dateIndex", -1] },
then: { $arrayElemAt: ["$stats", "$$dateIndex"] },
else: { _id: "$$date", count: 0, total: 0 }
}
}
}
}
}
}
}
},
{ $unwind: "$results" },
{ $replaceRoot: { newRoot: "$results"}}
])
The First 3 steps is same as yours.
{ $group: { _id: null, stats: { $push: "$$ROOT" }} will push previous stage results into an arrray stats which we will use for lookup in later stage.
In last stage, we will create possible date range and iterate over that.
for each key in range.
"$indexOfArray": ["$stats._id", "$$date"] will check if date is present in stats array or not
Then we will use that index to fetch value from stats array otherwise push default values.
As these results are still under results, we will unwind that array and move to root.
If you server version is above 3.6,
we can simplify date range creation part as well. let's initialize input arrays as days using $range.
input:{ $range:[16,31] },
as: 'day'
and modifiy dateIndex part like this
dateIndex: {
"$indexOfArray": ["$stats._id", {$dateToString:{ date:{$dateFromParts:{'year':2020, 'month':5, 'day':"$$day"}}, format:'%Y-%m-%d'}]
}
And change default value part as well similarly.
else: { _id: {$dateToString:{ date:{$dateFromParts:{'year':2020, 'month':5, 'day':"$$day"}}, format:'%Y-%m-%d'}}, count: 0, total: 0 }
Or alternatively, we can also use concat for generating keys
dateIndex: {
"$indexOfArray": ["$stats._id", {$concat:["2020-05","-", {$convert:{input:"$$day", to:"string"}}]}]
}
// And default value
else: { _id: {$concat:["2020-05","-", {$convert:{input:"$$day", to:"string"}}]}, count: 0, total: 0 }
Similarly, you can run another loop for months as well.

MongoDB Aggregate $unwind on sub-Documents

I'm struggling when I $unwind more than one field from a Sub-Document.
Here's what the data looks like:-
{
resp: {
field1: 'yes',
field2: ''
},
{
resp: {
field1: 'yes',
field2: ''
}
etc,etc...
If I process an Aggregation Pipeline for ONE field, it works OK, so this works...
{ $unwind: "$resp" },
{ $unwind: "$resp.field1" },
{ $project: { field1: "$resp.field1" } }
{ $group: {
_id: 1,
field1: { $sum: { $cond: [{ $eq: ["$field1","yes"] },1,0] } }
}
}
But if I now want to return field 2 in the same aggregation, using the following, it will return a count of Zero for both fields, whereas previously field1 had a count > Zero.
{ $unwind: "$resp" },
{ $unwind: "$resp.field1" },
{ $unwind: "$resp.field2" },
{
$project: {
field1: "$resp.field1",
field2: "$resp.field2"
},
{ $group: {
_id: 1,
field1: { $sum: { $cond: [{ $eq: ["$field1","yes"] },1,0] } },
field2: { $sum: { $cond: [{ $eq: ["$field2","yes"] },1,0] } }
}
}
Any suggestions would be much appreciated.
it seems the above is the correct way to do this, but I'd happily take alternative suggestions. The error was in may mapping of the fields in the $project stage. When typing the issue into SO I realised where the problem was !

Sum value when satisfy condition in MongoDB

I am trying to get sum of values when certain condition is satisfied in the document.
In the below query i want to get sum of currentValue only when componentId = "ABC"
db.Pointnext_Activities.aggregate(
{ $project: {
_id: 0,
componentId:1,
currentValue:1
}
},
{ $group:
{ _id: "$componentId",
total: { $sum: "$currentValue" }
}
}
)
Please try this :
db.Pointnext_Activities.aggregate([{ $match: { componentId: 'ABC' } },
{
$group:
{
_id: "$componentId",
total: { $sum: "$currentValue" }
}
}, { $project: { 'componentId': '$_id', total: 1, _id: 0 } }])
If you just need the total value & doesn't care about componentId to be returned try this :
db.Pointnext_Activities.aggregate([{ $match: { componentId: 'ABC' } },
{
$group:
{
_id: "",
total: { $sum: "$currentValue" }
}
}, {$project :{total :1, _id:0}}])
It would be ideal in aggregation, if you always start with filter operation i.e; $match, as it would persist only needed documents for further steps.

MongoDB -- Find duplicate documents by multiple keys

I have a collection with documents that look like the following:
{
"_id" : ObjectId("55b377cb66b393427367c3e2"),
"comment" : "This is a comment",
"url_key" : "55b377cb66b393427367c3df", //This is an ObjectId from another record in a different collection
}
I need to find records in this collection that contain duplicate values for the both the comment AND the url_key.
I can easily generate (using aggregate) duplicate records for the same, single, key (eg: comment), but I can't figure out how to group by/aggregate for multiple keys.
Here's my current aggregation pipeline:
db.comments.aggregate([ { $group: { _id: { comment: "$comment" }, uniqueIds: { $addToSet: "$_id" }, count: { $sum: 1 } } }, { $match: { count: { $gte: 2 } } }, { $sort: { count : -1} }, {$limit 10 } ]);
Is it as simple as grouping by multiple keys or did I misunderstand your question?
...
{ $group: { _id: { id: "$_id", comment: "$comment" }, count: { $sum: 1 } } },
{ $match: { count: { $gte: 2 } } },
...