Workaround 16MB BSON limit to delete multiple documents - mongodb

I have my MongoDB data like that
Please look at the last field - time, as you can see, I have some "duplicate" data which have been marked with color.
For the small database, I can remove the duplicate values with below code
// Group readings that share index/unit/min/max/node and fall within the same
// minute (year + dayOfYear + hour + minute of "time"), recording one _id per
// group as the document to keep.
var cursor = db.getCollection("light").aggregate([
{$group : {
"_id": {
index: "$index",
unit: "$unit",
min: "$min",
max: "$max",
node: "$node",
year: { "$year": "$time" },
dayOfYear: { "$dayOfYear": "$time" },
hour: { "$hour": "$time" },
minute: { "$minute": "$time" }
},
// $last: the _id of the last document encountered in each group is the survivor
_id_not_delete: { $last: "$_id" }
}}
],
{
// allow the aggregation to use temporary files on disk for large stages
"allowDiskUse" : true
}
)
// Collect every surviving _id into one in-memory array (one entry per group).
var ids_not_delete = cursor.map(function (doc) { return doc._id_not_delete; });
// The whole array is embedded in a single $nin filter document; with millions of
// groups that filter is what exceeds the BSON size limit reported in the error.
db.getCollection("light").remove({"_id": { "$nin": ids_not_delete }});
But my database has more than 20 million records, so I receive this error:
E QUERY [js] Error: Converting from JavaScript to BSON failed: Object size 23146644 exceeds limit of 16793600 bytes. :
Bulk/addToOperationsList#src/mongo/shell/bulk_api.js:611:28
Bulk/findOperations.remove#src/mongo/shell/bulk_api.js:743:24
DBCollection.prototype.remove#src/mongo/shell/collection.js:404:13
#(shell):1:1
I know that the root cause is
The maximum BSON document size is 16 megabytes
I think I should change below code, but I don't have any good solution.
var ids_not_delete = cursor.map(function (doc) { return doc._id_not_delete; });
Do you have any ideas to optimize my code?
Example documents in the collection:
{
"_id" : ObjectId("5be22d5808c08300545effee"),
"index" : "LIGHT",
"unit" : "LUX",
"min" : NumberInt(5),
"max" : NumberInt(6),
"avg" : 5.5,
"node" : "TH",
"time" : ISODate("2018-11-07T00:10:00.091+0000")
},
{
"_id" : ObjectId("5be22b0052122e0047c3467c"),
"index" : "LIGHT",
"unit" : "LUX",
"min" : NumberInt(3),
"max" : NumberInt(5),
"avg" : NumberInt(4),
"node" : "TH",
"time" : ISODate("2018-11-07T00:00:00.204+0000")
},
{
"_id" : ObjectId("5be22b0008c08300545eff79"),
"index" : "LIGHT",
"unit" : "LUX",
"min" : NumberInt(3),
"max" : NumberInt(5),
"avg" : NumberInt(4),
"node" : "TH",
"time" : ISODate("2018-11-07T00:00:00.081+0000")
}
MongoDB shell version v4.0.2
MongoDB 4.0.0

You can invert your aggregation to select ids you want to delete, rather than ones you want to keep:
// Build one { deleteOne: { filter: { _id } } } operation for every duplicate
// document, i.e. every member of a group except the first one collected.
const toDelete = db.getCollection("light").aggregate(
  [
    // Same grouping key as the question: identical reading within the same minute.
    {
      $group: {
        _id: {
          index: "$index",
          unit: "$unit",
          min: "$min",
          max: "$max",
          node: "$node",
          year: { $year: "$time" },
          dayOfYear: { $dayOfYear: "$time" },
          hour: { $hour: "$time" },
          minute: { $minute: "$time" }
        },
        // every _id in the group; element 0 is the one that will be kept
        ids: { $push: "$_id" }
      }
    },
    // Drop element 0 and keep up to 10000 of the remaining ids; groups with a
    // single document yield an empty array and disappear at the $unwind.
    { $project: { _id: { $slice: ["$ids", 1, 10000] } } },
    { $unwind: "$_id" },
    // Reshape each id into the operation document that bulkWrite expects.
    { $project: { _id: 0, deleteOne: { filter: { _id: "$_id" } } } }
  ],
  // Grouping a collection of this size exceeds the in-memory limit of a
  // pipeline stage, so the stage must be allowed to spill to disk.
  { allowDiskUse: true }
).toArray();
Here 10,000 is an arbitrary number; it only needs to be significantly greater than the expected number of duplicates within a group.
Then you can use bulkWrite:
// Execute the deleteOne operations collected in toDelete as one bulk write.
db.getCollection("light").bulkWrite(toDelete);
The driver will split the array into batches of 100,000 deletions each.

Related

How to group two or more fields using mongo aggregation to find the distincts

My Appointment Document looks like this
[{
"_id" : ObjectId("5f25686c946376355468caab"),
"status" : "approved",
"slot" : ObjectId("5ee751ab85596308c0272fa2"),
"student" : ObjectId("5eddc7d7cc5d3608c0393ce1"),
"teacher" : ObjectId("5eccfd6d4f5d8d48ac567a5d"),
"cost" : 49,
"createdAt" : ISODate("2020-08-01T13:04:44.696Z"),
"updatedAt" : ISODate("2020-08-01T13:20:36.164Z"),
"decisionTime" : ISODate("2020-08-01T13:20:36.161Z")
},
{
"_id" : ObjectId("5f25687b946376355468caac"),
"status" : "approved",
"slot" : ObjectId("5ee751ab85596308c0272fa3"),
"student" : ObjectId("5eddc7d7cc5d3608c0393ce1"),
"teacher" : ObjectId("5eccfd6d4f5d8d48ac567a5d"),
"cost" : 49,
"createdAt" : ISODate("2020-08-01T13:04:59.125Z"),
"updatedAt" : ISODate("2020-08-01T13:06:12.289Z"),
"decisionTime" : ISODate("2020-08-01T13:06:12.288Z")
},
{
"_id" : ObjectId("5f2ad883f0971a0c3c7d6e6f"),
"status" : "approved",
"slot" : ObjectId("5ee751ab85596308c0272fa4"),
"student" : ObjectId("5eddc7f4cc5d3608c0393ce3"),
"teacher" : ObjectId("5eccfd6d4f5d8d48ac567a5d"),
"cost" : 49,
"createdAt" : ISODate("2020-08-05T16:04:19.437Z"),
"updatedAt" : ISODate("2020-08-05T16:04:52.616Z"),
"decisionTime" : ISODate("2020-08-05T16:04:52.615Z")
}]
I want to group total number of distinct student, total number of appointment, total cost on a particular date(createdAt) using mongo aggregation.
How do I get Distinct Student on a distinct Date
Expected Output :
[
{
"_id": "01-08-2020",
"appointments": 2,
"totalCost": 98,
"totalStudents": 1
},
{
"_id": "05-08-2020",
"appointments": 1,
"totalCost": 49,
"totalStudents": 1
}
]
The problem here is that I want to find total number of distinct students
Group by the day, month and year of the createdAt field using the $dateFromParts operator, and sum up the cost field.
To count distinct students, use the $addToSet operator to collect the student field into a set while grouping, and in the $project stage output the size of that set.
Finally, format the createdAt field with the $dateToString operator using the format you need, %d-%m-%Y.
db.collection.aggregate([
  // Stage 1: one bucket per calendar day of createdAt.
  {
    $group: {
      // Rebuild a date from day/month/year only, discarding the time of day.
      _id: {
        $dateFromParts: {
          day: { $dayOfMonth: "$createdAt" },
          month: { $month: "$createdAt" },
          year: { $year: "$createdAt" }
        }
      },
      // Any createdAt from the bucket; used only to format the label below.
      createdAt: { $first: "$createdAt" },
      totalAppointments: { $sum: 1 },
      totalCost: { $sum: "$cost" },
      // A set, so each student is recorded once per day.
      students: { $addToSet: "$student" }
    }
  },
  // Stage 2: rename fields and turn the set of students into a count.
  {
    $project: {
      _id: { $dateToString: { date: "$createdAt", format: "%d-%m-%Y" } },
      appointments: "$totalAppointments",
      totalCost: "$totalCost",
      totalStudents: { $size: "$students" }
    }
  }
]);
Giving output:
[
{
"_id": "05-08-2020",
"appointments": 1,
"totalCost": 49,
"totalStudents": 1
},
{
"_id": "01-08-2020",
"appointments": 2,
"totalCost": 98,
"totalStudents": 1
}
]
MongoDb playground

Filter by nested arrays/objects values (on different levels) and $push by multiple level - MongoDB Aggregate

I have a document with multiple levels of embedded subdocuments, each of which has some nested arrays. Using $unwind and $sort, I sort by day in descending order and then use $push to combine the unwound rows back into a single array. This works at only one level, i.e. only one $push is allowed. If I try to do the same thing at the nested level while retaining the top-level data, I get "errmsg" : "Unrecognized expression '$push'".
{
"_id" : ObjectId("5f5638d0ff25e01482432803"),
"name" : "XXXX",
"mobileNo" : 323232323,
"payroll" : [
{
"_id" : ObjectId("5f5638d0ff25e01482432801"),
"month" : "Jan",
"salary" : 18200,
"payrollDetails" : [
{
"day" : "1",
"salary" : 200,
},
{
"day" : "2",
"salary" : 201,
}
]
},
{
"_id" : ObjectId("5f5638d0ff25e01482432802"),
"month" : "Feb",
"salary" : 8300,
"payrollDetails" : [
{
"day" : "1",
"salary" : 300,
},
{
"day" : "2",
"salary" : 400,
}
]
}
],
}
Expected Result:
{
"_id" : ObjectId("5f5638d0ff25e01482432803"),
"name" : "XXXX",
"mobileNo" : 323232323,
"payroll" : [
{
"_id" : ObjectId("5f5638d0ff25e01482432801"),
"month" : "Jan",
"salary" : 18200,
"payrollDetails" : [
{
"day" : "2",
"salary" : 201
},
{
"day" : "1",
"salary" : 200
}
]
},
{
"_id" : ObjectId("5f5638d0ff25e01482432802"),
"month" : "Feb",
"salary" : 8300,
"payrollDetails" : [
{
"day" : "2",
"salary" : 400
},
{
"day" : "1",
"salary" : 300
}
]
}
],
}
Just day will be sorted and remaining things are same
I have tried but it got unrecognized expression '$push'
// Attempt: flatten both array levels, sort by day descending, then rebuild the
// nested structure in a single $group.
db.employee.aggregate([
{$unwind: '$payroll'},
{$unwind: '$payroll.payrollDetails'},
{$sort: {'payroll.payrollDetails.day': -1}},
// The inner $push sits inside the expression of the outer $push; this is the
// construct reported as "Unrecognized expression '$push'".
{$group: {_id: '$_id', payroll: {$push: {payrollDetails:{$push:
'$payroll.payrollDetails'} }}}}])
This requires two $group stages, because you can't nest one $push inside another for the same field:
1. $group by main id and payroll id, and construct the payrollDetails array
2. $sort by payroll id (you can skip this if it is not required)
3. $group by main id and construct the payroll array
db.employee.aggregate([
  // Flatten both array levels so every payrollDetails entry is its own document.
  { $unwind: "$payroll" },
  { $unwind: "$payroll.payrollDetails" },
  // Descending by day. "day" is stored as a string in the sample data, so this
  // is a string comparison ("9" sorts after "10").
  { $sort: { "payroll.payrollDetails.day": -1 } },
  // First regroup: one document per (employee, payroll entry), rebuilding the
  // payrollDetails array in the sorted order.
  {
    $group: {
      _id: {
        _id: "$_id",
        pid: "$payroll._id"
      },
      name: { $first: "$name" },
      mobileNo: { $first: "$mobileNo" },
      payrollDetails: { $push: "$payroll.payrollDetails" },
      month: { $first: "$payroll.month" },
      salary: { $first: "$payroll.salary" }
    }
  },
  // After the $group above the payroll id lives at "_id.pid"; "payroll._id" no
  // longer exists. Ascending order keeps payroll entries in the order shown in
  // the expected result. This stage can be removed if the order is irrelevant.
  { $sort: { "_id.pid": 1 } },
  // Second regroup: one document per employee, rebuilding the payroll array.
  {
    $group: {
      _id: "$_id._id",
      name: { $first: "$name" },
      mobileNo: { $first: "$mobileNo" },
      payroll: {
        $push: {
          _id: "$_id.pid",
          month: "$month",
          salary: "$salary",
          payrollDetails: "$payrollDetails"
        }
      }
    }
  }
])
Playground

Mongodb aggregate by day and delete duplicate value

I'm trying to clean a huge database.
Sample DB :
{
"_id" : ObjectId("59fc5249d5ab401d99f3de7f"),
"addedAt" : ISODate("2017-11-03T11:26:01.744Z"),
"__v" : 0,
"check" : 17602,
"lastCheck" : ISODate("2018-04-05T11:47:00.609Z"),
"tracking" : [
{
"timeCheck" : ISODate("2017-11-06T13:17:20.861Z"),
"_id" : ObjectId("5a0060e00f3c330012bafe39"),
"rank" : 2395,
},
{
"timeCheck" : ISODate("2017-11-06T13:22:31.254Z"),
"_id" : ObjectId("5a0062170f3c330012bafe77"),
"rank" : 2395,
},
{
"timeCheck" : ISODate("2017-11-06T13:27:40.551Z"),
"_id" : ObjectId("5a00634c0f3c330012bafebe"),
"rank" : 2379,
},
{
"timeCheck" : ISODate("2017-11-06T13:32:41.084Z"),
"_id" : ObjectId("5a0064790f3c330012baff03"),
"rank" : 2395,
},
{
"timeCheck" : ISODate("2017-11-06T13:37:51.012Z"),
"_id" : ObjectId("5a0065af0f3c330012baff32"),
"rank" : 2379,
},
{
"timeCheck" : ISODate("2017-11-07T13:37:51.012Z"),
"_id" : ObjectId("5a0065af0f3c330012baff34"),
"rank" : 2379,
}]
}
I have a lot of duplicate value but I need to clean only by day.
To obtain this for example :
{
"_id" : ObjectId("59fc5249d5ab401d99f3de7f"),
"addedAt" : ISODate("2017-11-03T11:26:01.744Z"),
"__v" : 0,
"check" : 17602,
"lastCheck" : ISODate("2018-04-05T11:47:00.609Z"),
"tracking" : [
{
"timeCheck" : ISODate("2017-11-06T13:17:20.861Z"),
"_id" : ObjectId("5a0060e00f3c330012bafe39"),
"rank" : 2395,
},
{
"timeCheck" : ISODate("2017-11-06T13:27:40.551Z"),
"_id" : ObjectId("5a00634c0f3c330012bafebe"),
"rank" : 2379,
},
{
"timeCheck" : ISODate("2017-11-07T13:37:51.012Z"),
"_id" : ObjectId("5a0065af0f3c330012baff34"),
"rank" : 2379,
}]
}
How can I aggregate by day and then delete the later duplicate values within each day?
I need to keep the values for each day even if they are identical to the values of another day.
The aggregation framework cannot update data at this stage. However, you can use the following aggregation pipeline in order to get the desired output and then use e.g. a bulk replace to update all your documents:
// Produce each document with its "tracking" array reduced to the earliest entry
// per (rank, calendar day). The result is only computed, not written back.
db.collection.aggregate([
  {
    $unwind: "$tracking" // flatten the "tracking" array into separate documents
  },
  {
    $sort: {
      "tracking.timeCheck": 1 // oldest first, so $first below picks the earliest entry
    }
  },
  {
    $group: {
      _id: { // group by
        "_id": "$_id", // the parent document,
        "rank": "$tracking.rank", // the rank and
        "date": { // the calendar date of "timeCheck" (time of day discarded)
          $dateFromParts: {
            year: { $year: "$tracking.timeCheck" },
            month: { $month: "$tracking.timeCheck" },
            // day of the month, not day of the week: otherwise dates exactly
            // one week apart would fall into the same group
            day: { $dayOfMonth: "$tracking.timeCheck" }
          }
        }
      },
      "doc": { $first: "$$ROOT" } // only keep the first document per group
    }
  },
  {
    $sort: {
      "doc.tracking.timeCheck": 1 // restore ascending order before rebuilding the array
    }
  },
  {
    $group: {
      _id: "$_id._id", // merge everything again per parent "_id"
      "addedAt": { $first: "$doc.addedAt" },
      "__v": { $first: "$doc.__v" },
      "check": { $first: "$doc.check" },
      "lastCheck": { $first: "$doc.lastCheck" },
      "tracking": { $push: "$doc.tracking" } // join the surviving entries into an array again
    }
  }
])

MongoDB - Aggregate max/min/average for multiple variables at once

I am logging data into MongoDB in the following format:
{ "_id" : ObjectId("54f2393f80b72b00079d1a53"), "outT" : 10.88, "inT3" : 22.3, "light" : 336, "humidity" : 41.4, "pressure" : 990.31, "inT1" : 22.81, "logtime" : ISODate("2015-02-28T21:55:11.838Z"), "inT2" : 21.5 }
{ "_id" : ObjectId("54f2394580b72b00079d1a54"), "outT" : 10.88, "inT3" : 22.3, "light" : 338, "humidity" : 41.4, "pressure" : 990.43, "inT1" : 22.75, "logtime" : ISODate("2015-02-28T21:55:17.690Z"), "inT2" : 311.72 }
...
As you can see there is a single time element and multiple readings logged. I want to aggregate across all of the readings to provide a max min and average for each variable grouped by hour of day. I have managed to do this for a single variable using the following aggregation script:
// Hourly average / max / min of outT for a fixed logtime window.
db.logs.aggregate([
  // Restrict to the half-open window [2015-03-01, 2015-03-03).
  {
    $match: {
      logtime: {
        $gte: ISODate("2015-03-01T00:00:00.000Z"),
        $lt: ISODate("2015-03-03T00:00:00.000Z")
      }
    }
  },
  // Carry forward only the timestamp and the one reading being summarised.
  { $project: { _id: 0, logtime: 1, outT: 1 } },
  // One bucket per (day of year, hour of day).
  {
    $group: {
      _id: {
        day: { $dayOfYear: "$logtime" },
        hour: { $hour: "$logtime" }
      },
      average: { $avg: "$outT" },
      max: { $max: "$outT" },
      min: { $min: "$outT" }
    }
  }
])
which produces:
{ "_id" : { "day" : 61, "hour" : 22 }, "average" : 3.1878750000000116, "max" : 3.44, "min" : 3 }
{ "_id" : { "day" : 61, "hour" : 14 }, "average" : 13.979541666666638, "max" : 17.81, "min" : 8.81 }
...
I would like to produce output which looks like:
{"outT": { output from working aggregation above },
"inT1": { ... },
...
}
Everything I try seems to throw an error in the mongo console. Can anyone help?
Thanks
You can do this by including each statistic in your $group with a different name and then following that with a $project stage to reshape it into your desired format:
// Hourly average / max / min for several readings at once, reshaped so that
// each reading gets its own sub-document.
db.logs.aggregate([
  // Restrict to the half-open window [2015-02-28, 2015-03-03).
  {
    $match: {
      logtime: {
        $gte: ISODate("2015-02-28T00:00:00.000Z"),
        $lt: ISODate("2015-03-03T00:00:00.000Z")
      }
    }
  },
  // Carry forward only the timestamp and the readings being summarised.
  { $project: { _id: 0, logtime: 1, outT: 1, inT1: 1 } },
  // One bucket per (day of year, hour of day); every statistic gets a flat,
  // uniquely named field because $group accumulators must be top-level fields.
  {
    $group: {
      _id: {
        day: { $dayOfYear: "$logtime" },
        hour: { $hour: "$logtime" }
      },
      outT_average: { $avg: "$outT" },
      outT_max: { $max: "$outT" },
      outT_min: { $min: "$outT" },
      inT1_average: { $avg: "$inT1" },
      inT1_max: { $max: "$inT1" },
      inT1_min: { $min: "$inT1" }
    }
  },
  // Fold the flat fields into one sub-document per reading.
  {
    $project: {
      outT: {
        average: "$outT_average",
        max: "$outT_max",
        min: "$outT_min"
      },
      inT1: {
        average: "$inT1_average",
        max: "$inT1_max",
        min: "$inT1_min"
      }
    }
  }
])
This gives you output that looks like:
{
"_id" : {
"day" : 59,
"hour" : 21
},
"outT" : {
"average" : 10.88,
"max" : 10.88,
"min" : 10.88
},
"inT1" : {
"average" : 22.78,
"max" : 22.81,
"min" : 22.75
}
}
$max in Mongodb gets the maximum of the corresponding values from all documents in the collection. $min gets the minimum values from all documents in the collection. $avg gets the average value from the collection.
you must go through the Mongodb link for sample examples.

MongoDB aggregate Timezone for date add

My problem is that MongoDB's aggregation always works in the UTC timezone. I have looked at the solutions in many existing questions, but it is still not working. My code is as follows:
MongoDB version : 2.2
Data
{ "_id" : ObjectId("52a3c9df46c6a9627eeb0337"), "Counting" : { "id" : "b1a93dfda46c47848f9862031300d24c", "group" : "Salary", "user_id" : "4d4ad2d37a464ad09d9aca2fee4c760c", "subGroup" : "e–ae3?", "bank_id" : "97e0fecc322b49b48c4eb3c8425fea77", "fee" : 646, "isIncome" : "true", "payment" : "", "consumeDate" : ISODate("2013-08-15T16:00:00Z"), "createDate" : ISODate("2013-12-08T01:22:39.008Z"), "bank_name" : "9edb6897-cdb8-4ce4-8f08-f5792cfa83d9" } }
{ "_id" : ObjectId("52a3c9df46c6a9627eeb0338"), "Counting" : { "id" : "33b341fc71314daebe851397c5cbaa40", "group" : "Salary", "user_id" : "cb9e06649cf943e5b368f6b05fc126c6", "subGroup" : "e–ae3?", "bank_id" : "e8da8cdae3ae495ca76f873fb3460b6d", "fee" : 647, "isIncome" : "true", "payment" : "", "consumeDate" : ISODate("2013-02-28T16:00:00Z"), "createDate" : ISODate("2013-12-08T01:22:39.016Z"), "bank_name" : "6913b48a-1a95-48c5-81f5-6920031358d7"} }
{ "_id" : ObjectId("52a3c9df46c6a9627eeb033a"), "Counting" : { "id" : "f0d41ed9f29f47e7b68a05c378cf939d", "group" : "Salary", "user_id" : "847cadbf55f84615af3ee63922446b54", "subGroup" : "e–ae3?", "bank_id" : "f45d62b5e62f4b7fa8172870cd992f19", "fee" : 623, "isIncome" : "true", "payment" : "", "consumeDate" : ISODate("2013-04-18T16:00:00Z"), "createDate" : ISODate("2013-12-08T01:22:39.152Z"), "bank_name" : "30dd169e-723e-4748-93cd-2d7a45b4a3b7"} }
// Attempt: sum "fee" per local calendar day by shifting consumeDate forward by
// 28800000 ms (8 hours) before extracting year / month / day of month.
db.Product.aggregate([{
"$group": {
"_id": {
"tyear": {
"$year": [{
// adding a number to a date with $add is what raises
// "exception: $add does not support dates"
"$add": ["$Counting.consumeDate", 28800000]
}]
},
"tMonth": {
"$month": [{
"$add": ["$Counting.consumeDate", 28800000]
}]
},
"tDate": {
"$dayOfMonth": [{
"$add": ["$Counting.consumeDate", 28800000]
}]
},
},
// despite the name, this is the total of fee per group, not a document count
"count": {
"$sum": "$Counting.fee"
}
} }])
Error Message :
"errmsg" : "exception: $add does not support dates"
Reference
How to agregate by year-month-day on a different timezone
I'd recommend doing this in two-steps as a project then a group.
// Offset between UTC and the local timezone, in milliseconds.
var millisecondsFromUTC = 8 * 60 * 60 * 1000; // PST is -8 hours from UTC
db.Product.aggregate([
  // Step 1: shift the stored UTC date into local time and keep only what the
  // grouping needs.
  {
    $project: {
      consumeDateLocal: {
        $subtract: ["$Counting.consumeDate", millisecondsFromUTC]
      },
      fee: "$Counting.fee"
    }
  },
  // Step 2: group by the local calendar day and total the fees.
  {
    $group: {
      _id: {
        "tyear": { $year: "$consumeDateLocal" },
        "tMonth": { $month: "$consumeDateLocal" },
        // the "$" prefix makes this a field reference rather than a string literal
        "tDate": { $dayOfMonth: "$consumeDateLocal" }
      },
      count: {
        $sum: "$fee"
      }
    }
  }
]);
I do it like this.
millisecondsFromUTC = 8 * 3600 * 1000
Db.collection.aggreagte([
{$match: query},
{
$group: {
_id: {
$dateToString: {
format: "%Y-%m-%d",
date: {$add: ["$date", millisecondsFromUTC]}
},
click: {$sum: '$click'},
money: {$sum: {$divide: ['$money', 10000]}},
pv: {$sum: '$pv'},
req: {$sum: '$req'},
date: {$last: '$date'}
}
}]