Delete document that has size greater than a specific value - mongodb

I have a collection which contains a multiple documents whose size has increased from 16MBs or is about to reach 16MBs.
I want query that finds documents which have size greater than 10MBs and delete all of them.
I am using following to find the size of document.
Object.bsonsize(db.test.findOne({type:"auto"}))
Is there a way to embed this query inside db.test.deleteMany() query?

This following query deletes the documents with size greater than the specified size (the size is specified in bytes). This query is valid with MongoDB v4.4 or higher.
db.collection.deleteMany( {
$expr: { $gt: [ { $bsonSize: "$$ROOT" }, SIZE_LIMIT ] },
type: "auto"
} )
The following script runs for MongoDB v4.2 or earlier:
const SIZE_LIMIT = 75 // substitute your value here in bytes
let idsToDelete = [ ]
let crsr = db.collection.find()
while(crsr.hasNext()) {
let doc= crsr.next()
if (Object.bsonsize(doc) > SIZE_LIMIT) {
idsToDelete.push(doc._id)
}
}
db.collection.deleteMany( { _id: { $in: idsToDelete } } )

I think you have to do it like this:
db.test.aggregate([
{ $match: { type: "auto" } },
{ $project: { bsonSize: { $bsonSize: "$$ROOT" } } },
{ $match: { bsonSize: { $gt: 16e6 } } },
]).forEach(function (doc) {
db.test.deleteOne({ _id: doc._id });
})
Or if you prefer deleteMany:
var ids = db.test.aggregate([
{ $match: { type: "auto" } },
{ $project: { bsonSize: { $bsonSize: "$$ROOT" } } },
{ $match: { bsonSize: { $lt: 16e6 } } }
]).toArray().map(x => x._id);
db.test.deleteMany({ _id: { $in: ids } });

Related

MongoDB sorting does not work with inner array

I'm trying to query specific fields in my document and sort them by one of the fields, however, the engine seems to completely ignore the sort.
I use the query:
db.symbols.find({_id:'AAPL'}, {'income_statement.annual.totalRevenue':1,'income_statement.annual.fiscalDateEnding':1}).sort({'income_statement.annual.totalRevenue': 1})
This is the output:
[
{
_id: 'AAPL',
income_statement: {
annual: [
{
fiscalDateEnding: '2021-09-30',
totalRevenue: '363172000000'
},
{
fiscalDateEnding: '2020-09-30',
totalRevenue: '271642000000'
},
{
fiscalDateEnding: '2019-09-30',
totalRevenue: '256598000000'
},
{
fiscalDateEnding: '2018-09-30',
totalRevenue: '265595000000'
},
{
fiscalDateEnding: '2017-09-30',
totalRevenue: '229234000000'
}
]
}
}
]
I would expect to have the entries sorted by fiscalDateEnding, starting with 2017-09-30 ascending.
However, the order is fixed, even if I use -1 for sorting.
Any ideas?
The sort you are using is for the ordering of documents in the result set. This is different from the ordering of array elements inside the document.
For your case, if you are using a newer version of MongoDB (5.2+), you can use the $sortArray.
db.symbols.aggregate([
{
$project: {
_id: 1,
annual: {
$sortArray: {
input: "$income_statement.annual",
sortBy: {
fiscalDateEnding: 1
}
}
}
}
}
])
If you are using older version of MongoDB, you can do the followings to perform the sorting.
db.collection.aggregate([
{
"$unwind": "$income_statement.annual"
},
{
$sort: {
"income_statement.annual.fiscalDateEnding": 1
}
},
{
$group: {
_id: "$_id",
annual: {
$push: "$income_statement.annual"
}
}
},
{
"$project": {
_id: 1,
income_statement: {
annual: "$annual"
}
}
}
])
Here is the Mongo Playground for your reference.

How do you consistently migrate a large MongoDB collection?

I was trying to migrate a large MongoDB of ~600k documents, like so:
for await (const doc of db.collection('collection').find({
legacyProp: { $exists: true },
})) {
// additional data fetching from separate collections here
const newPropValue = await fetchNewPropValue(doc._id)
await db.collection('collection').findOneAndUpdate({ _id: doc._id }, [{ $set: { newProp: newPropValue } }, { $unset: ['legacyProp'] }])
}
}
When the migration script finished, data was still being updated for about 30 minutes or so. I've concluded this by computing document count of documents containing legacyProp property:
db.collection.countDocuments({ legacyProp: { $exists: true } })
which was decreasing on subsequent calls. After a while, the updates stopped and the final document count of documents containing legacy prop was around 300k, so the update failed silently resulting in a data loss. I'm curious what exactly happened, and most importantly, how do you update large MongoDB collections without any data loss? Keep in mind, there is additional data fetching involved before every update operation.
My first attempt would be to build function of fetchNewPropValue() in an aggregation pipeline.
Have a look at Aggregation Pipeline Operators
If this is not possible then you can try to put all newPropValue's into array and use it like this. 600k properties should fit easily into your RAM.
const newPropValues = await fetchNewPropValue() // getting all new properties as array [{_id: ..., val: ...}, {_id: ..., val: ...}, ...]
db.getCollection('collection').updateMany(
{ legacyProp: { $exists: true } },
[
{
$set: {
newProp: {
$first: {
$filter: { input: newPropValues, cond: { $eq: ["$_id", "$$this._id"] } }
}
}
}
},
{ $set: { legacyProp: "$$REMOVE", newProp: "$$newProp.val" } }
]
)
Or you can try bulkWrite:
let bulkOperations = []
db.getCollection('collection').find({ legacyProp: { $exists: true } }).forEach(doc => {
const newPropValue = await fetchNewPropValue(doc._id);
bulkOperations.push({
updateOne: {
filter: { _id: doc._id },
update: {
$set: { newProp: newPropValue },
$unset: { legacyProp: "" }
}
}
});
if (bulkOperations.length > 10000) {
db.getCollection('collection').bulkWrite(bulkOperations, { ordered: false });
bulkOperations = [];
}
})
if (bulkOperations.length > 0)
db.getCollection('collection').bulkWrite(bulkOperations, { ordered: false })

MongoDB: How to speed up my data reorganisation query/operation?

I'm trying to analyse some data and I thought my queries would be faster ultimately by storing a relationship between my collections instead. So I wrote something to do the data normalisation, which is as follows:
var count = 0;
db.Interest.find({'PersonID':{$exists: false}, 'Data.DateOfBirth': {$ne: null}})
.toArray()
.forEach(function (x) {
if (null != x.Data.DateOfBirth) {
var peep = { 'Name': x.Data.Name, 'BirthMonth' :x.Data.DateOfBirth.Month, 'BirthYear' :x.Data.DateOfBirth.Year};
var person = db.People.findOne(peep);
if (null == person) {
peep._id = db.People.insertOne(peep).insertedId;
//print(peep._id);
}
db.Interest.updateOne({ '_id': x._id }, {$set: { 'PersonID':peep._id }})
++count;
if ((count % 1000) == 0) {
print(count + ' updated');
}
}
})
This script is just passed to mongo.exe.
Basically, I attempt to find an existing person, if they don't exist create them. In either case, link the originating record with the individual person.
However this is very slow! There's about 10 million documents and at the current rate it will take about 5 days to complete.
Can I speed this up simply? I know I can multithread it to cut it down, but have I missed something?
In order to insert new persons into People collection, use this one:
db.Interest.aggregate([
{
$project: {
Name: "$Data.Name",
BirthMonth: "$Data.DateOfBirth.Month",
BirthYear: "$Data.DateOfBirth.Year",
_id: 0
}
},
{
$merge: {
into: "People",
// requires an unique index on {Name: 1, BirthMonth: 1, BirthYear: 1}
on: ["Name", "BirthMonth", "BirthYear"]
}
}
])
For updating PersonID in Interest collection use this pipeline:
db.Interest.aggregate([
{
$lookup: {
from: "People",
let: {
name: "$Data.Name",
month: "$Data.DateOfBirth.Month",
year: "$Data.DateOfBirth.Year"
},
pipeline: [
{
$match: {
$expr: {
$and: [
{ $eq: ["$Name", "$$name"] },
{ $eq: ["$BirthMonth", "$$month"] },
{ $eq: ["$BirthYear", "$$year"] }
]
}
}
},
{ $project: { _id: 1 } }
],
as: "interests"
}
},
{
$set: {
PersonID: { $first: "$interests._id" },
interests: "$$REMOVE"
}
},
{ $merge: { into: "Interest" } }
])
Mongo Playground

mongodb : find documents that contains the most a specific kind of property

I have a collection "MyCollection" where each document could have 0,1 or many properties named "propertyX" where X is the nth time the property appears in the document.
For example, my collection could looks like this :
{
"_id":ObjectId("XXXX")
"property1":xxxxxx
"property2":xxxxxx
"property3":xxxxxx
}
{
"_id":ObjectId("XXXX")
"property1":xxxxxx
"property2":xxxxxx
"property3":xxxxxx
"property4":xxxxxx
"property5":xxxxxx
}
{
"_id":ObjectId("XXXX")
"property1":xxxxxx
"property2":xxxxxx
"property3":xxxxxx
"property4":xxxxxx
"property5":xxxxxx
"property6":xxxxxx
"property7":xxxxxx
}
I'm trying to sort documents with a mongo query by the number of propertyX they had, so in my example, the third object must be at the top of my result (7 properties), then the second one (5 properties), then the first one (only 3 properties). What I've done for the moment, is implementing it in javascript (using mongo driver) and do a count of keys with the Object.Keys(myObject).filter(key=>key.startsWith('property')) but this is taking way too much time. Is there a way to do it directly with a mongo query, thanks
You can achieve this by using the aggregation tool.
First create a new property with the number of properties in you document, so you can use it to sort on a later stage:
db.getCollection("MyCollection").aggregate([
{
$addFields: {
properties: { $objectToArray: '$$ROOT' },
}
}
]);
At this stage your documents on the pipeline would look something like this:
{
"_id":ObjectId("XXXX")
"property1":xxxxxx
"property2":xxxxxx
"property3":xxxxxx
"properties: [
{ k: "_id", v: ObjectId("XXXX")},
{ k: "property1", v: xxxxxx },
{ k: "property2", v: xxxxxx },
{ k: "property3", v: xxxxxx },
]
}
Using the $size operator you can get the length of this array:
...
{
$addFields: {
properties: { $size: { $objectToArray: '$$ROOT' }},
}
}
...
So now you would have a property that shows the number of keys in your document:
{
"_id":ObjectId("XXXX"),
"property1":xxxxxx,
"property2":xxxxxx,
"property3":xxxxxx,
"properties: 4,
}
Which would make it possible to sort the documents by properties number:
...
{
$addFields: {
properties: { $size: { $objectToArray: '$$ROOT' }},
}
},
{
$sort: { properties: -1 }
}
...
To finish you can get rid of the property properties with $unset:
...
{
$addFields: {
properties: { $size: { $objectToArray: '$$ROOT' }},
}
},
{
$sort: { properties: -1 }
},
{
$unset: 'properties'
},
You would end up with the following:
[
{
"_id":ObjectId("XXXX"),
"property1":xxxxxx,
"property2":xxxxxx,
"property3":xxxxxx,
"property4":xxxxxx,
"property5":xxxxxx,
"property6":xxxxxx,
"property7":xxxxxx,
},
{
"_id":ObjectId("XXXX"),
"property1":xxxxxx,
"property2":xxxxxx,
"property3":xxxxxx,
"property4":xxxxxx,
"property5":xxxxxx,
},
{
"_id":ObjectId("XXXX"),
"property1":xxxxxx,
"property2":xxxxxx,
"property3":xxxxxx,
}
]

Mongodb aggregation pass argument to element size of $sample

Hello every body here any one can help me with query below
I want to get quiz list with random amount
the amount of rendom will
base on each lesson
The problem is
mongodb not allow to pass argument to element size of $sample
Any one can give me the solution
lessonModel.aggregate([
{ $match : {'quiz.status':1 } },
{
$lookup : {
from : 'quiz',
let : { 'lesson_id' : '$_id','limit' : '$quiz.amount' },
pipeline : [
{
$match: {
$expr: {
$eq: [ "$lesson_id", "$$lesson_id" ]
}
}
},
{
$project: {
title:1,
check_list:1,
duration:1
}
},
{ $sample: { size: '$$limit' } }
],
as: 'quiz'
}
},
{$unwind: '$quiz'},
{ $replaceRoot: { newRoot: "$quiz" } }
]).exec();
The error said size argument to $sample must be a number
Here is my sample data
UPDATE
I think your main problem is to randomly pick amount number of quizs under each lesson. Since $sample is not helpful use $function (New in version MongoDB 4.4).
Solution
Inside $function operator write some logic to
Shuffle the questions (You can change it to your requirement).
Slice it to return the number(amount) of questions required.
db.lessons.aggregate([
{ $match: { "quiz.status": 1 } },
{
$lookup: {
from: "quiz",
let: { "lesson_id": "$_id" },
pipeline: [
{
$match: {
$expr: { $eq: ["$lesson_id", "$$lesson_id"] }
}
},
{
$project: {
title: 1,
check_list: 1,
duration: 1
}
}
],
as: "questions"
}
},
{
$project: {
quiz: {
$function: {
body: function(questions, amount) {
if (amount == 0) return [];
for (let i = questions.length - 1; i > 0; i--) {
const j = Math.floor(Math.random() * (i + 1));
[questions[i], questions[j]] = [questions[j], questions[i]];
}
return questions.slice(0, amount);
},
args: ["$questions", { $ifNull: ["$quiz.amount", 0] }],
lang: "js"
}
}
}
},
{ $unwind: "$quiz" },
{ $replaceRoot: { newRoot: "$quiz" } }
]);
$sample does not support variables. A number must be specified explicitly like:
{
$sample: { size: 1 }
}
Also replace your let as shown below because last lesson document has no amount filed in the quiz object
let: {
'lesson_id': '$_id',
'limit': { $ifNull: ['$quiz.amount', 0] } // or any other number.
},
Wrong:
{
$sample: { size: "$$limit" } // Wont work!
}