As the title says, I have a performance issue: I need to get all documents from a collection, but it takes too long. The players collection contains around 300k small documents, and the query in my service looks like this:
async getAllPlayers() {
const players = await this.playersCollection.find({}, {projection: { playerId: 1, name: 1, surname: 1, shirtNumber: 1, position: 1 }}).toArray();
return players;
}
Overall size is 6.4MB. I'm using Fastify adapter, fastify-compress and mongodb native driver. If I remove projection, it takes almost a minute.
Any idea how to improve this?
The best time I get is 8 seconds, where fast-json-stringify gives me a boost of more than 10 seconds over 300k records:
'use strict'
// run fresh mongo
// docker run --name temp --rm -p 27017:27017 mongo
const fastify = require('fastify')({ logger: true })
const fjs = require('fast-json-stringify')
// Precompiled serializer for a single player document.
// fast-json-stringify only emits properties declared in the schema, so every
// projected field that should reach the client has to be listed here.
const toString = fjs({
  type: 'object',
  properties: {
    playerId: { type: 'integer' },
    name: { type: 'string' },
    surname: { type: 'string' },
    shirtNumber: { type: 'integer' },
    // `position` is part of the query projection but was missing from the
    // schema, so it was silently dropped from the output.
    // NOTE(review): assumes position is stored as a string — confirm.
    position: { type: 'string' }
  }
})
// Register the fastify-mongodb plugin against the local `mydb` database.
// The route handlers below reach the database through `fastify.mongo.db`.
fastify.register(require('fastify-mongodb'), {
forceClose: true,
url: 'mongodb://localhost/mydb'
})
// GET / — streams the projected documents of `foo` as newline-delimited JSON,
// serializing each one with the precompiled `toString` serializer.
fastify.get('/', (request, reply) => {
  const findOptions = {
    limit: 300000,
    projection: { playerId: 1, name: 1, surname: 1, shirtNumber: 1, position: 1 }
  }
  // One JSON document per line.
  const serializeLine = (doc) => toString(doc) + '\n'

  const cursor = fastify.mongo.db.collection('foo').find({}, findOptions)
  const lines = cursor.stream({ transform: serializeLine })

  reply.type('application/jsonl')
  reply.send(lines)
})
// GET /insert — fills the `foo` collection with 300000 generated players
// through a single ordered bulk operation and returns its `result`.
fastify.get('/insert', async (request, reply) => {
  const bulk = fastify.mongo.db.collection('foo').initializeOrderedBulkOp()

  let id = 0
  while (id < 300000) {
    bulk.insert({
      playerId: id,
      name: `Name ${id}`,
      surname: `surname ${id}`,
      shirtNumber: id
    })
    id += 1
  }

  const outcome = await bulk.execute()
  return outcome.result
})
// listen() returns a promise when no callback is passed; handle a failed
// startup (e.g. port already in use) instead of leaving the rejection unhandled.
fastify.listen(8080).catch((err) => {
  fastify.log.error(err)
  process.exit(1)
})
In any case, you should consider either:
paginating your output,
or pushing the data into a bucket (like S3) and returning to the client a URL from which it can download the file directly; this speeds up the process a lot and spares your Node.js process from streaming the data itself.
Note that compression in Node.js is a heavy process, so it slows the response down a lot. An nginx proxy can add compression for you without the need to implement it in your business-logic server.
Related
Question
I have provided my code below for reference. I'm using MongoDB and discord.js v12. So basically, I have made a !info command which shows some general info of the user.
What this code does is, it checks through the member's roles, and regarding which role they have, it calculates their total claim time (for giveaways etc.). The problem here, is with the donator role. I can't figure out why I can't use the donates variable outside the db.findOne block. Here, data.content.length shows the total donates of the users, which means donates * 5 is +5 claim time for each donate.
My Code
const moment = require('moment');
module.exports = {
name: 'info',
async execute(client, message, args, Discord){
const member = message.mentions.members.first() || message.guild.members.cache.get(args[0]) || message.member;
const db = require('../models/d-schema');
db.findOne({ guildid: message.guild.id, user: member.user.id }, async(err, data)=>{
if(err) throw err;
if(data){
const donates = parseInt(data.content.length);
}
})
var DefaultTime = 10;
var support = 0;
var donate = 0;
var boost = 0;
const userRoles = member.roles.cache.map((r) => r.name);
if (userRoles.includes("୨・supporter")) {
support = 3;
}
if (userRoles.includes("୨・donator")) {
donate = donates * 5;
}
if (userRoles.includes("୨・booster")) {
boost = 10;
}
const TotalTime = DefaultTime + support + donate + boost;
const embed = new Discord.MessageEmbed()
.setThumbnail(member.user.displayAvatarURL( {dynamic: true} ))
.addFields(
{name: member.user.tag, value: member.user, inline: true},
{name: 'Nickname', value: `${member.nickname !== null ? member.nickname : 'None'}`, inline: true},
{name: 'Is Bot', value: member.user.bot, inline: true},
{name: 'Joined', value: `${moment.utc(member.joinedAt).format("MMMM Do YYYY")}`, inline: true},
{name: 'Created', value: `${moment.utc(member.user.createdAt).format("MMMM Do YYYY")}`, inline: true},
{name: 'Claim Time', value: `${TotalTime} seconds`, inline: true},
)
.setFooter(`ID : ${member.user.id}`)
.setTimestamp()
.setColor('00ffcc')
message.channel.send(embed)
}
}
You cannot use the donates variable because you are declaring it inside the db.findOne() callback. This is called variable scope. For a better understanding, you can read this answer.
If you want to use it outside of that block, you have to declare it beforehand, like this:
// Declare the counter in the outer scope and await the query, so the value is
// already set when the code that follows reads it. With a callback, the
// assignment would only happen after the surrounding code had finished.
let donates = 0; // stays 0 when the user has no record
const data = await db.findOne({ guildid: message.guild.id, user: member.user.id });
if (data) {
  donates = data.content.length;
}
Now the variable is in scope outside of the db.findOne() block. Keep in mind that it only holds a value once the query has completed :)
Edit:
Alternative way:
It would be easier to use the function asynchronously. That way, everything can be scoped in the same block!
Example:
These two methods will give the same results:
// Promise style: the result is available right after the await, in the same scope.
// (`{ ... }` stands in for the real query filter.)
const data = await Model.findOne({ ... });
console.log(data);
// Callback style: the result is only available inside the callback.
Model.findOne({ ... }, (err, data) => {
console.log(data);
});
Suggestion from Lioness100
I have a Meteor app which is connected to MongoDB.
In Mongo I have a collection with ~700k records.
I have a weekly cron job in which I read all the records from the collection (using a Mongo cursor) and, in batches of 10k, insert them into Elasticsearch so that they are indexed.
let articles = []
Collections.Articles.find({}).forEach(function (doc) {
  // Every document adds two entries: the bulk action line and the document itself.
  articles.push({
    index: { _index: 'main', _type: 'article', _id: doc.id }
  },
  doc);
  if (articles.length >= 10000) {
    // NOTE: the bulk request is not awaited, so forEach keeps reading and
    // further requests can pile up while this one is still in flight.
    client.bulk({ maxRetries: 5, index: 'main', type: 'article', body: articles })
    // Start a fresh batch. The original assigned to an undeclared `data`,
    // so `articles` kept growing and every request resent all earlier entries.
    articles = []
  }
})
Since forEach is synchronous and moves on to the next record without waiting, while client.bulk is asynchronous, this overloads the Elasticsearch server and it crashes with an Out of Memory exception.
Is there a way to pause the forEach while the insert is being done? I tried async/await, but that does not seem to work either.
let articles = []
Collections.Articles.find({}).forEach(async function (doc) {
  // Every document adds two entries: the bulk action line and the document itself.
  articles.push({
    index: { _index: 'main', _type: 'article', _id: doc.id }
  },
  doc);
  if (articles.length >= 10000) {
    // Detach the full batch BEFORE awaiting: forEach does not wait for this
    // async callback, so later invocations keep pushing into `articles`
    // while the request is in flight.
    const batch = articles
    // The original assigned to an undeclared `data`, so the batch never reset.
    articles = []
    await client.bulk({ maxRetries: 5, index: 'main', type: 'article', body: batch })
  }
})
Any way how to achieve this?
EDIT: I am trying to achieve something like this - if i use promises
let articles = []
Collections.Articles.find({}).forEach(function (doc) {
  // Every document adds two entries: the bulk action line and the document itself.
  articles.push({
    index: { _index: 'main', _type: 'article', _id: doc.id }
  },
  doc);
  if (articles.length >= 10000) {
    // Pause FETCHING rows with forEach
    client.bulk({ maxRetries: 5, index: 'main', type: 'article', body: articles }).then(() => {
      console.log('inserted')
      // RESUME FETCHING rows with forEach
      console.log("RESUME READING");
    })
    // Start a fresh batch. The original assigned to an undeclared `data`,
    // so `articles` was never reset. The request above keeps its own
    // reference to the old array, so reassigning here is safe.
    articles = []
  }
})
Managed to get this working with ES2018 Async iteration
Got an idea from
Using async/await with a forEach loop
Here is the code that is working
let articles = []
const cursor = Collections.Articles.find({})
// `for await` suspends the iteration while each bulk request is awaited,
// so at most one request is in flight at any time.
for await (const doc of cursor) {
  // Every document adds two entries: the bulk action line and the document itself.
  articles.push({
    index: { _index: 'main', _type: 'article', _id: doc.id }
  },
  doc);
  if (articles.length === 10000) {
    await client.bulk({ maxRetries: 5, index: 'trusted', type: 'artikel', body: articles })
    articles = []
  }
}
// Send whatever is left over; without this, the final partial batch
// would never reach Elasticsearch.
if (articles.length > 0) {
  await client.bulk({ maxRetries: 5, index: 'trusted', type: 'artikel', body: articles })
}
This works correctly and it manages to insert all the records into Elastic Search without crashing.
If you are concerned about the unthrottled iteration, you can use the internal Meteor._sleepForMs method, which lets you put an async timeout into your sync-styled code:
// Log each document and then sleep before the next one is visited.
// `timeout` is not defined in this snippet; it must be supplied by the
// surrounding code (see the wrapped version that takes it as a parameter).
Collections.Articles.find().forEach((doc, index) => {
console.log(index, doc._id)
Meteor._sleepForMs(timeout)
})
Now this works fine within the Meteor environment (Meteor.startup, Meteor.methods, Meteor.publish).
Your cron job is likely not running within this environment (= Fiber), so you may write a wrapper that binds the environment:
// Wrap a function with Meteor.bindEnvironment so it can be called from
// code that runs outside the Meteor environment.
const bound = fn => Meteor.bindEnvironment(fn)

// Visit every article, sleeping `delayMs` milliseconds after each one.
const iterateSlow = bound(function (delayMs) {
  const cursor = Collections.Articles.find()
  cursor.forEach((article, position) => {
    console.log(position, article._id)
    Meteor._sleepForMs(delayMs)
  })
  return true
})

iterateSlow(50) // iterates with 50ms timeout
Here is a complete minimal example, that you can reproduce with a fresh project:
// minimal collection used by the example
const MyDocs = new Mongo.Collection('myDocs')

// on startup, insert 100 empty documents
Meteor.startup(() => {
  let remaining = 100
  while (remaining > 0) {
    MyDocs.insert({})
    remaining -= 1
  }
})

// helper that binds a function to the Meteor environment
const bound = fn => Meteor.bindEnvironment(fn)

// walk over the documents, sleeping `delayMs` milliseconds after each one
const iterateSlow = bound(function (delayMs) {
  const cursor = MyDocs.find()
  cursor.forEach((entry, position) => {
    console.log(position, entry._id)
    Meteor._sleepForMs(delayMs)
  })
  return true
})

// call from a plain timer to simulate an external caller such as a cron job
setTimeout(() => iterateSlow(50), 2000)
I have this route in the backend express server:
// POST /fillInformationAssetsSeverityEvaluation
// Empties the collection, inserts the first element of the request body, and
// answers only once the insert has finished. Any failure is logged and
// reported to the client with a 500 status.
router.route('/fillInformationAssetsSeverityEvaluation').post((req, res) => {
  const rows = [req.body[0]];

  // deleteMany replaces the deprecated remove(). Chaining the promises
  // guarantees the insert runs only after a successful delete; the original
  // brace-less `else` let the insert run even when the delete had failed.
  informationAssetsSeverityEvaluationRow.deleteMany({})
    // `multi` is not an insertMany option, so it has been dropped.
    .then(() => informationAssetsSeverityEvaluationRow.insertMany(rows))
    .then((documentsInserted) => {
      console.log('[req.body[0]]: ', rows);
      console.log('documentsInserted: ', documentsInserted);
      console.log('You have succesfully inserted ', documentsInserted.length, ' documents in informationAssetsSeverityEvaluation collection');
      res.json('information Assets Severity Evaluation data has been received on the server side');
    })
    .catch((err) => {
      // Previously a failed delete left the request without a response and
      // a failed insert was an unhandled rejection.
      console.log(err);
      res.status(500).json('information Assets Severity Evaluation data could not be stored');
    });
})
For the sake of simplicity, I am inserting only one document.
[req.body[0]]
{ REF: 'REFSHIT',
confFin: 'A',
confRep: 'A'}
But, in the real applications, I am inserting multiple documents similar to that.
This consoleLog :
console.log('documentsInserted: ', documentsInserted);
logs:
documentsInserted: [ { _id: 5d3453afc302d718e4870b53,
REF: 'REFSHIT',
confFin: 'A',
confRep: 'A'}]
As you see the id is automatically generated:
> _id: 5d3453afc302d718e4870b53
What I would like is: The ids of the different documents to be "numerically ordered". I.e:
Document 0 would have id 0
Document 1 would have id 1
Document 2 would have id 2
And so on and so forth.
After doing some research, I found out that I can do this by setting the id manually on each object passed to insertMany.
However, since I receive the documents objects from the request body, this is not a viable solution.
Any help?
Finally after trying four modules and a couple of days of trying for something that should be native to mongodb, I have found a simple solution. I hope it helps someone.
1/ Install mongoose-plugin-autoinc
2/
import mongoose from 'mongoose';
import { autoIncrement } from 'mongoose-plugin-autoinc';
// Dedicated connection on which the Book model is compiled.
const connection = mongoose.createConnection("mongodb://localhost/myDatabase");

const BookSchema = new mongoose.Schema({
  // `Schema` is never imported on its own, so reach it through `mongoose`;
  // the bare `Schema.Types.ObjectId` threw a ReferenceError.
  author: { type: mongoose.Schema.Types.ObjectId, ref: 'Author' },
  title: String,
  genre: String,
  publishDate: Date
});

// Attach the auto-increment plugin to the schema under the model name 'Book'.
BookSchema.plugin(autoIncrement, 'Book');

const Book = connection.model('Book', BookSchema);
3/ In my case, I have the models defined in models.js and the connection defined in server.js, so I had to write this:
BookSchema.plugin(autoIncrement, 'Book');
in models.js
and instead of
const Book = connection.model('Book', BookSchema);
I have:
// Export the model compiled with mongoose.model() from the
// `informationAssetsRow` schema, so server.js can require it.
module.exports = {
informationAssetsRow: mongoose.model('informationAssetsRow', informationAssetsRow),
};
And in server.js:
const {
informationAssetsRow,
} = require('./models/models')
I have three schemas, one which references two others:
userSchema
{ name: String }
postSchema
{ content: String }
commentSchema
{
content: String,
user: { ObjectID, ref: 'User' },
post: { ObjectID, ref: 'Post' }
}
How can I seed this database in a sane, scalable way? Even using bluebird promises it quickly becomes a nightmare to write.
My attempt so far involves multiple nested promises and is very hard to maintain:
// Seed one user and one post, then a comment that references both.
User
  .create([{ name: 'alice' }])
  .then(() => {
    return Post.create([{ content: 'foo' }])
  })
  .then(() => {
    // Return every inner promise so the outer chain waits for the comment
    // to be created and so that failures propagate to the caller.
    return User.find().then(users => {
      return Post.find().then(posts => {
        // `users` is reachable here through the closure, but only at the
        // price of nesting one callback inside the other.
        return Comment.create({ content: 'bar', user: users[0], post: posts[0] })
      })
    })
  })
This is clearly not the correct way of doing this. What am I missing?
Not sure about bluebird, but the nodejs Promise.all should do the job:
// Create the user and the post in parallel, then the comment that links them.
const seedUsers = User.create([{ name: 'alice' }])
const seedPosts = Post.create([{ content: 'foo' }])

Promise.all([seedUsers, seedPosts]).then((created) => {
  const [users, posts] = created
  return Comment.create([
    { content: 'bar', user: users[0], post: posts[0] }
  ])
})
If you want to seed database with automatically references, use Seedgoose.
This is the easiest seeder for you to use. You don't need to write any program files, but only data files. And Seedgoose handles smart references for you. And by the way, I'm the author and maintainer of this package.
Try this; it will work fine:
Note: Promise.all makes sure that both queries have completed and then returns the results in an array: [Users, Posts].
If any of the queries fails during execution, the error is handled by the catch block attached to Promise.all.
// Run both creations in parallel; Promise.all resolves with their results
// in the same order as the array.
const queryArray = [
  User.create([{ name: 'alice' }]),
  Post.create([{ content: 'foo' }]),
];

Promise.all(queryArray).then(([users, posts]) => {
  // The original destructured `Posts` but read `posts`, which threw a
  // ReferenceError; the names now match.
  const comments = [
    { content: 'bar', user: users[0], post: posts[0] }
  ];
  return Comment.create(comments);
}).catch((err) => {
  // Named `err` so the global Error constructor is not shadowed.
  console.log("Error: ", err);
});
I'm working with the Mongodb native driver using a Map reduce function. Basically I have a mediaId as a key and want to count how many medias loaded and started per mediaId.
So what I've done was:
// Map step: emit one record per document, keyed by media id.
// `played` is decided here, per document, because MongoDB does not call
// reduce for keys that have a single emitted value, and may call reduce
// again on its own output. Counting in reduce therefore yields wrong totals.
var map = function () {
  emit(this.media.id, {
    count: 1,
    played: this.state === "started" ? 1 : 0,
    ph: this.project.id,
    title: this.media.title,
    media: this.media.id,
    origin: this.origin,
    thumbnail: this.media.thumbnail,
    mediaDuration: this.media.mediaDuration,
    state: this.state
  });
};

// Reduce step: sum the counters and keep the descriptive fields of the last
// value seen. The output has the same shape as the emitted values, so
// feeding a previous result back in gives the same totals.
var reduce = function (k, vals) {
  // Declared with `var`: the original assigned to an implicit global.
  var result = {
    count: 0,
    played: 0,
    ph: '',
    title: '',
    media: '',
    origin: '',
    thumbnail: '',
    mediaDuration: 0,
    state: ''
  };
  vals.forEach(function (doc) {
    result.count += doc.count;
    // Sum the already-computed flag instead of re-deriving it from `state`,
    // which counted a partial result with played=155 as a single play.
    result.played += doc.played;
    result.ph = doc.ph;
    result.title = doc.title;
    result.media = doc.media;
    result.thumbnail = doc.thumbnail;
    result.mediaDuration = doc.mediaDuration;
    result.state = doc.state;
    result.origin = doc.origin;
  });
  return result;
};
In my test collection I have 2 different mediaIds: one with 553 objects and another with just 1 object. I've put all of them in the "started" state to test this, so the count should be equal to the number played.
When I run the Map/Reduce function it returns to me ( I used the "toArray" function of the mongodb native driver):
[ { _id: '12398asdsa9802193810asd120',
value:
{ count: 1,
played: 0,
ph: '123213ased12231',
title: 'xxxxxxxxxxxxxxxxxxxxxxxxxxx',
media: '1xxxxxxxxxxxxxxxxxxxxxxxxxxx1',
origin: 'http://www.google.com',
thumbnail: 'http://cache.ohinternet.com/images/0/0e/Forever_Alone.png',
mediaDuration: 12321321,
state: 'started' } },
{ _id: '2c9f94b42f5b5114012f5b92ea430066',
value:
{ count: 553,
played: 155,
ph: '316',
title: 'xxxxxxxxxxxxxxxxxxxxxxxxxxx',
media: '2xxxxxxxxxxxxxxxxxxxxxxxxxxx2',
origin: 'http://localhost:9000/views/index.html',
thumbnail: null,
mediaDuration: null,
state: 'started' } } ]
It seems that when I have just one object, the reduce function isn't called (I did some tests with another collection with more than 100 mediaIds and the behavior was identical). Does anyone have an idea of what is wrong with that?
Thanks A LOT for your time,
Cheers.
I sort of solved the "issue".
I did the filtering in the map function instead of the reduce function. Something like this:
// Map step: emit one record per document, keyed by media id, with the
// `played` flag already computed so that it no longer depends on reduce.
var map = function () {
  // NOTE(review): the flag is derived from `this.media.state` while the
  // emitted `state` field reads `this.state` — confirm which one is intended.
  var played = this.media.state === "started" ? 1 : 0;

  emit(this.media.id, {
    count: 1,
    played: played,
    ph: this.project.id,
    title: this.media.title,
    media: this.media.id,
    origin: this.origin,
    thumbnail: this.media.thumbnail,
    mediaDuration: this.media.mediaDuration,
    state: this.state
  });
};
Hope it helps anyone that is having the same "problem"