MongoDB disk read performance is very low

I have a MongoDB instance with a database and a collection holding 80 GB of data. It contains about 4M documents with a comparatively large average document size of about 20 kB. Among other, more elementary fields, each document contains one list of 1024 elements and also 3-4 lists of 200 numbers.
I perform a simple batch find query over a properly indexed string field ('isbn'), intending to get 5000 documents (projected onto the relevant fields) in one batch. For this, I use the $in operator:
rows = COLLECTION.find({"isbn": {"$in": candidate_isbns}},
                       {"_id": 0, "isbn": 1, "other_stuff": 1})
The IXSCAN stage works as intended. Since the corresponding documents are not yet within the WiredTiger cache (and probably never will be, given my limited 32 GB of RAM), the data has to be read from disk during the FETCH stage in most cases. (Unfortunately, "other_stuff" is too large to put into an index that could cover this query.)
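For reference, a covering index for this query would look like the hypothetical sketch below; it is ruled out here precisely because "other_stuff" is too large to index:
// Hypothetical only: would cover the find above if "other_stuff" were small.
db.metadata.createIndex({ isbn: 1, other_stuff: 1 })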
The SSD attached to my virtual cloud machine has a read throughput of about 90 MB/s, which is not great but should be sufficient for now. However, when I monitor the disk read speed (via iostat, for example), it drops to roughly 3 MB/s during the query, which seems very poor. I can verify this poor behaviour in the profiler output (MongoDB seems to split the 5000 documents into further batches, so I show only the output for a sub-batch of 2094):
{
"op" : "getmore",
"ns" : "data.metadata",
"command" : {
"getMore" : NumberLong(7543502234201790529),
"collection" : "metadata",
"lsid" : {
"id" : UUID("2f410f2d-2f74-4d3a-9041-27c4ddc51bd2")
},
"$db" : "data"
},
"originatingCommand" : {
"$truncated" : "{ find: \"metadata\", filter: { isbn: { $in: [ \"9783927781313\", ..."
},
"cursorid" : NumberLong(7543502234201790529),
"keysExamined" : 4095,
"docsExamined" : 2095,
"numYield" : 803,
"nreturned" : 2094,
"locks" : {
"ReplicationStateTransition" : {
"acquireCount" : {
"w" : NumberLong(805)
}
},
"Global" : {
"acquireCount" : {
"r" : NumberLong(805)
}
},
"Database" : {
"acquireCount" : {
"r" : NumberLong(804)
}
},
"Collection" : {
"acquireCount" : {
"r" : NumberLong(804)
}
},
"Mutex" : {
"acquireCount" : {
"r" : NumberLong(1)
}
}
},
"flowControl" : {},
"storage" : {
"data" : {
"bytesRead" : NumberLong(65454770),
"timeReadingMicros" : NumberLong(21386543)
}
},
"responseLength" : 16769511,
"protocol" : "op_msg",
"millis" : 21745,
"planSummary" : "IXSCAN { isbn: 1 }",
"execStats" : {
"stage" : "PROJECTION_SIMPLE",
"nReturned" : 2196,
"executionTimeMillisEstimate" : 21126,
"works" : 4288,
"advanced" : 2196,
"needTime" : 2092,
"needYield" : 0,
"saveState" : 817,
"restoreState" : 817,
"isEOF" : 0,
"transformBy" : {},
"inputStage" : {
"stage" : "FETCH",
"nReturned" : 2196,
"executionTimeMillisEstimate" : 21116,
"works" : 4288,
"advanced" : 2196,
"needTime" : 2092,
"needYield" : 0,
"saveState" : 817,
"restoreState" : 817,
"isEOF" : 0,
"docsExamined" : 2196,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 2196,
"executionTimeMillisEstimate" : 531,
"works" : 4288,
"advanced" : 2196,
"needTime" : 2092,
"needYield" : 0,
"saveState" : 817,
"restoreState" : 817,
"isEOF" : 0,
"keyPattern" : {
"isbn" : 1.0
},
"indexName" : "isbn_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"isbn" : []
},
"isUnique" : true,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"isbn" : [
"[\"9780230391451\", \"9780230391451\"]",
"[\"9780230593206\", \"9780230593206\"]",
... ]
},
"keysExamined" : 4288,
"seeks" : 2093,
"dupsTested" : 0,
"dupsDropped" : 0
}
}
},
"ts" : ISODate("2022-01-24T07:57:12.132Z"),
"client" : "my_ip",
"allUsers" : [
{
"user" : "myUser",
"db" : "data"
}
],
"user" : "myUser#data"
}
Indeed, the ratio of bytesRead to timeReadingMicros confirms this poor read speed of about 3 MB/s.
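Working that out from the storage numbers in the profiler document above:
// 65454770 bytes read in 21386543 µs:
65454770 / 21386543   // ≈ 3.06 bytes/µs, i.e. roughly 3 MB/s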
My question: Why does this degradation in speed take place? Is it pathological, meaning I need to investigate further, or is it the expected behaviour given the data setup described above?
Any help is highly appreciated!

Related

MongoDB contains-query with empty result is slow

I have around 10 million documents in MongoDB.
I'm trying to search for text inside the db with db.outMessage.find({ "text" : /.*m.*/ }), but it takes too long (around 30 seconds) with no result, whereas searching for existing text takes less than a second.
I tried putting an index on text, with the same result.
db.outMessage.find({ "text" : /.*m.*/}).explain(true)
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "notification_center.outMessage",
"indexFilterSet" : false,
"parsedQuery" : {
"text" : {
"$regex" : ".*m.*"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"filter" : {
"text" : {
"$regex" : ".*m.*"
}
},
"keyPattern" : {
"text" : 1
},
"indexName" : "text",
"isMultiKey" : false,
"multiKeyPaths" : {
"text" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"text" : [
"[\"\", {})",
"[/.*m.*/, /.*m.*/]"
]
}
}
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 0,
"executionTimeMillis" : 14354,
"totalKeysExamined" : 10263270,
"totalDocsExamined" : 0,
"executionStages" : {
"stage" : "FETCH",
"nReturned" : 0,
"executionTimeMillisEstimate" : 12957,
"works" : 10263271,
"advanced" : 0,
"needTime" : 10263270,
"needYield" : 0,
"saveState" : 80258,
"restoreState" : 80258,
"isEOF" : 1,
"invalidates" : 0,
"docsExamined" : 0,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"filter" : {
"text" : {
"$regex" : ".*m.*"
}
},
"nReturned" : 0,
"executionTimeMillisEstimate" : 12461,
"works" : 10263271,
"advanced" : 0,
"needTime" : 10263270,
"needYield" : 0,
"saveState" : 80258,
"restoreState" : 80258,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"text" : 1
},
"indexName" : "text",
"isMultiKey" : false,
"multiKeyPaths" : {
"text" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"text" : [
"[\"\", {})",
"[/.*m.*/, /.*m.*/]"
]
},
"keysExamined" : 10263270,
"seeks" : 1,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
},
"allPlansExecution" : [ ]
},
"serverInfo" : {
"host" : "acsdptest.arabiacell.net",
"port" : 27017,
"version" : "3.4.7",
"gitVersion" : "cf38c1b8a0a8dca4a11737581beafef4fe120bcd"
},
The index is essentially a list of all the values of the text field in lexicographic order, i.e. sorted character by character starting from the first one.
Since the query executor has no way to predict which values might contain an 'm', it must examine all of the index entries.
In the case of this query, that means 10,263,270 index keys were examined, after being read from disk if the index was not already in the cache.
If this is actually a keyword search and not a single-letter match, then instead of $regex you might be able to use the $text query operator, which requires a text index.
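A hedged sketch of both alternatives, using the collection and field names from the question ("somekeyword" is a placeholder):
// Keyword search via a text index ($text matches whole words, not substrings).
db.outMessage.createIndex({ text: "text" })
db.outMessage.find({ $text: { $search: "somekeyword" } })
// Alternatively, an anchored prefix regex can use tight index bounds
// on the existing ordinary index on text.
db.outMessage.find({ text: /^m/ })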

Difficulty optimizing Mongo distinct query to use indexes

I am having difficulty persuading Mongo to run a distinct query that looks like it should be covered by the indexes without fetching a large number of documents in the collection.
My documents have the general form:
{
_tenantId: 'someString',
_productCategory: 'some string from a smallish set'
...
}
I have an index on (_tenantId, _productCategory).
I want to find out what the set of distinct product categories is for a given tenant, so the query is:
db.products.distinct( '_productCategory', { _tenantId: '463171c3-d15f-4699-893d-3046327f8e1f'})
This runs rather slowly: several seconds for a collection of around half a million products against a local DB (Mongo 3.2.9). Against our pre-production SaaS-based Mongo (which is probably more memory-constrained than my local instance, which has free run of my machine), it takes several tens of seconds for the same data.
Explaining the query yields:
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "engage-prod.products",
"indexFilterSet" : false,
"parsedQuery" : {
"_tenantId" : {
"$eq" : "463171c3-d15f-4699-893d-3046327f8e1f"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"_tenantId" : 1,
"_productCategory" : 1
},
"indexName" : "_tenantId_1__productCategory_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"_tenantId" : [
"[\"463171c3-d15f-4699-893d-3046327f8e1f\", \"463171c3-d15f-4699-893d-3046327f8e1f\"]"
],
"_productCategory" : [
"[MinKey, MaxKey]"
]
}
}
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 406871,
"executionTimeMillis" : 358,
"totalKeysExamined" : 406871,
"totalDocsExamined" : 406871,
"executionStages" : {
"stage" : "FETCH",
"nReturned" : 406871,
"executionTimeMillisEstimate" : 80,
"works" : 406872,
"advanced" : 406871,
"needTime" : 0,
"needYield" : 0,
"saveState" : 3178,
"restoreState" : 3178,
"isEOF" : 1,
"invalidates" : 0,
"docsExamined" : 406871,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 406871,
"executionTimeMillisEstimate" : 40,
"works" : 406872,
"advanced" : 406871,
"needTime" : 0,
"needYield" : 0,
"saveState" : 3178,
"restoreState" : 3178,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"_tenantId" : 1,
"_productCategory" : 1
},
"indexName" : "_tenantId_1__productCategory_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"_tenantId" : [
"[\"463171c3-d15f-4699-893d-3046327f8e1f\", \"463171c3-d15f-4699-893d-3046327f8e1f\"]"
],
"_productCategory" : [
"[MinKey, MaxKey]"
]
},
"keysExamined" : 406871,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
},
"serverInfo" : {
"host" : "Stevens-MacBook-Pro.local",
"port" : 27017,
"version" : "3.2.9",
"gitVersion" : "22ec9e93b40c85fc7cae7d56e7d6a02fd811088c"
},
"ok" : 1
}
Note that even though it runs an IXSCAN it still returns over 400K documents (nReturned).
If I create a compound field _productTenantAndCategory containing a lexical concatenation (with a : separator) and index that, so it's a single-field index, then the query:
db.products.explain('executionStats').distinct( '_productTenantAndCategory', { _productTenantAndCategory: {$gte: '463171c3-d15f-4699-893d-3046327f8e1f',$lt: '463171c3-d15f-4699-893d-3046327f8e1g'}})
works entirely within the index and yields:
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "engage-prod.products",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"_productTenantAndCategory" : {
"$lt" : "463171c3-d15f-4699-893d-3046327f8e1g"
}
},
{
"_productTenantAndCategory" : {
"$gte" : "463171c3-d15f-4699-893d-3046327f8e1f"
}
}
]
},
"winningPlan" : {
"stage" : "PROJECTION",
"transformBy" : {
"_id" : 0,
"_productTenantAndCategory" : 1
},
"inputStage" : {
"stage" : "DISTINCT_SCAN",
"keyPattern" : {
"_productTenantAndCategory" : 1
},
"indexName" : "_productTenantAndCategory_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"_productTenantAndCategory" : [
"[\"463171c3-d15f-4699-893d-3046327f8e1f\", \"463171c3-d15f-4699-893d-3046327f8e1g\")"
]
}
}
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 62,
"executionTimeMillis" : 0,
"totalKeysExamined" : 63,
"totalDocsExamined" : 0,
"executionStages" : {
"stage" : "PROJECTION",
"nReturned" : 62,
"executionTimeMillisEstimate" : 0,
"works" : 63,
"advanced" : 62,
"needTime" : 0,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 1,
"invalidates" : 0,
"transformBy" : {
"_id" : 0,
"_productTenantAndCategory" : 1
},
"inputStage" : {
"stage" : "DISTINCT_SCAN",
"nReturned" : 62,
"executionTimeMillisEstimate" : 0,
"works" : 63,
"advanced" : 62,
"needTime" : 0,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"_productTenantAndCategory" : 1
},
"indexName" : "_productTenantAndCategory_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"_productTenantAndCategory" : [
"[\"463171c3-d15f-4699-893d-3046327f8e1f\", \"463171c3-d15f-4699-893d-3046327f8e1g\")"
]
},
"keysExamined" : 63
}
}
},
"serverInfo" : {
"host" : "Stevens-MacBook-Pro.local",
"port" : 27017,
"version" : "3.2.9",
"gitVersion" : "22ec9e93b40c85fc7cae7d56e7d6a02fd811088c"
},
"ok" : 1
}
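For completeness, a hedged sketch of how such a manually compounded field and its single-field index might be created (the backfill shape is an assumption, not taken from the question):
// One-off backfill of the compounded key, then an index on it.
db.products.find().forEach(function (doc) {
  db.products.update(
    { _id: doc._id },
    { $set: { _productTenantAndCategory: doc._tenantId + ':' + doc._productCategory } }
  )
})
db.products.createIndex({ _productTenantAndCategory: 1 })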
Having to build single-field indexes with manually compounded keys for all the aggregation queries I need is not a very desirable path to follow. Since all the information is present in the compound index I started with, why can't Mongo execute the original distinct query covered by that index? Is there anything I can do in the way of query optimization to overcome this?
Note: This is actually a sub-problem of a slightly more complex one involving an aggregation pipeline that counts the number of occurrences of each category. For now I am restricting my question to the simpler distinct query, since it seems to capture the essence of the failure to use an index that should cover things (which I was also seeing in the aggregation pipeline case) while being a simpler overall query.
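For context, a hedged sketch of the aggregation that note alludes to, counting occurrences per category for one tenant (names taken from the question; the exact pipeline is an assumption):
db.products.aggregate([
  // Filter to one tenant, then count documents per category.
  { $match: { _tenantId: '463171c3-d15f-4699-893d-3046327f8e1f' } },
  { $group: { _id: '$_productCategory', count: { $sum: 1 } } }
])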

MongoDB multikey index performance

Background
I have a collection of users with structure of documents like this:
{
"_id" : ObjectId("54e61137cca5d2ff0a8b4567"),
"login" : "test1",
"emails" : [
{
"email" : "test1#example.com",
"is_primary" : true,
"_id" : ObjectId("57baf3e97323afb2688e639c")
},
{
"email" : "test1_1#example.com",
"is_primary" : false,
"_id" : ObjectId("57baf3e97323afb2688e639d")
}
]
}
Indexes:
{
"v" : 1,
"key" : {
"login" : 1
},
"name" : "login_1",
"ns" : "mydb.users",
"background" : true
},
{
"v" : 1,
"key" : {
"emails.email" : 1
},
"name" : "emails.email_1",
"ns" : "mydb.users"
}
The collection contains ~700,000 documents.
Scenario
To explain the search for users by login, I run this:
rs0:PRIMARY> db.users.explain('executionStats').find({'login' : /test123123123/})
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "mydb.users",
"indexFilterSet" : false,
"parsedQuery" : {
"login" : /test123123123/
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"filter" : {
"login" : /test123123123/
},
"keyPattern" : {
"login" : 1
},
"indexName" : "login_1",
"isMultiKey" : false,
"direction" : "forward",
"indexBounds" : {
"login" : [
"[\"\", {})",
"[/test123123123/, /test123123123/]"
]
}
}
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 0,
"executionTimeMillis" : 1040,
"totalKeysExamined" : 698993,
"totalDocsExamined" : 0,
"executionStages" : {
"stage" : "FETCH",
"nReturned" : 0,
"executionTimeMillisEstimate" : 930,
"works" : 698994,
"advanced" : 0,
"needTime" : 698993,
"needFetch" : 0,
"saveState" : 5460,
"restoreState" : 5460,
"isEOF" : 1,
"invalidates" : 0,
"docsExamined" : 0,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"filter" : {
"login" : /test123123123/
},
"nReturned" : 0,
"executionTimeMillisEstimate" : 920,
"works" : 698993,
"advanced" : 0,
"needTime" : 698993,
"needFetch" : 0,
"saveState" : 5460,
"restoreState" : 5460,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"login" : 1
},
"indexName" : "login_1",
"isMultiKey" : false,
"direction" : "forward",
"indexBounds" : {
"login" : [
"[\"\", {})",
"[/test123123123/, /test123123123/]"
]
},
"keysExamined" : 698993,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0,
"matchTested" : 0
}
}
},
"serverInfo" : {
"host" : "myhost",
"port" : 27017,
"version" : "3.0.12",
"gitVersion" : "33934938e0e95d534cebbaff656cde916b9c3573"
},
"ok" : 1
}
As you can see, executionStats.executionStages.inputStage.nReturned is 0 and executionStats.totalDocsExamined is also 0. That's fine; I guess there are no documents with a login like the one entered. But if I want to search users by email, I do this:
rs0:PRIMARY> db.users.explain('executionStats').find({'emails.email' : /test123123123/})
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "mydb.users",
"indexFilterSet" : false,
"parsedQuery" : {
"emails.email" : /test123123123/
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"emails.email" : /test123123123/
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"emails.email" : 1
},
"indexName" : "emails.email_1",
"isMultiKey" : true,
"direction" : "forward",
"indexBounds" : {
"emails.email" : [
"[\"\", {})",
"[/test123123123/, /test123123123/]"
]
}
}
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 0,
"executionTimeMillis" : 7666,
"totalKeysExamined" : 699016,
"totalDocsExamined" : 698993,
"executionStages" : {
"stage" : "FETCH",
"filter" : {
"emails.email" : /test123123123/
},
"nReturned" : 0,
"executionTimeMillisEstimate" : 7355,
"works" : 699017,
"advanced" : 0,
"needTime" : 699016,
"needFetch" : 0,
"saveState" : 5462,
"restoreState" : 5462,
"isEOF" : 1,
"invalidates" : 0,
"docsExamined" : 698993,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 698993,
"executionTimeMillisEstimate" : 1630,
"works" : 699016,
"advanced" : 698993,
"needTime" : 23,
"needFetch" : 0,
"saveState" : 5462,
"restoreState" : 5462,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"emails.email" : 1
},
"indexName" : "emails.email_1",
"isMultiKey" : true,
"direction" : "forward",
"indexBounds" : {
"emails.email" : [
"[\"\", {})",
"[/test123123123/, /test123123123/]"
]
},
"keysExamined" : 699016,
"dupsTested" : 699016,
"dupsDropped" : 23,
"seenInvalidated" : 0,
"matchTested" : 0
}
}
},
"serverInfo" : {
"host" : "myhost",
"port" : 27017,
"version" : "3.0.12",
"gitVersion" : "33934938e0e95d534cebbaff656cde916b9c3573"
},
"ok" : 1
}
And here executionStats.executionStages.inputStage.nReturned (and executionStats.totalDocsExamined) equals 698993 (executionStats.nReturned is 0, as in the first query).
Question
Why does a search using the multikey index (emails.email) return my whole collection at the IXSCAN stage, so that the FETCH stage touches the whole collection, while a search using the non-multikey index (login) scans only the expected keys, and at the FETCH stage I get what I want?
UPD: When I use a regular expression of the form /^smth/ rather than /smth/, the scan on the emails.email field also returns 0 elements. Why do a multikey and an ordinary index give me different results for a regular expression like /smth/?
Because it is a multikey index, as explained here:
When a query filter specifies an exact match for an array as a whole, MongoDB can use the multikey index to look up the first element of the query array but cannot use the multikey index scan to find the whole array. Instead, after using the multikey index to look up the first element of the query array, MongoDB retrieves the associated documents and filters for documents whose array matches the array in the query.
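A hedged sketch of the workaround the question's update already points at: anchoring the regex as a prefix lets the planner use tight bounds even on the multikey index (names from the question):
// Anchored prefix regex: tight index bounds instead of a full index scan.
db.users.find({ 'emails.email': /^test123123123/ })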

MongoDB, can query fields slow down a query even if they form a partition?

Assuming I have only males and females in my user collection, is the following:
User.find({ gender: { $in: ['male','female'] }})
slower than this one:
User.find()
I feel like it would be, but I don't really know how MongoDB works internally. Both requests return the entire collection. I'm building a filter feature, and I'd like to simplify my API code by considering that every call is filtered somehow.
It is a good question, as it touches on basic query-planning capabilities.
Comparing the explain results, we can see that using $in invokes a collection scan with a filter on the specified query parameter, which is more expensive than a plain document dump when querying without parameters.
db.User.find({ gender: { $in: ['male','female'] }}).explain("executionStats")
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.User",
"indexFilterSet" : false,
"parsedQuery" : {
"gender" : {
"$in" : [
"female",
"male"
]
}
},
"winningPlan" : {
"stage" : "COLLSCAN",
"filter" : {
"gender" : {
"$in" : [
"female",
"male"
]
}
},
"direction" : "forward"
},
"rejectedPlans" : []
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 24,
"executionTimeMillis" : 0,
"totalKeysExamined" : 0,
"totalDocsExamined" : 24,
"executionStages" : {
"stage" : "COLLSCAN",
"filter" : {
"gender" : {
"$in" : [
"female",
"male"
]
}
},
"nReturned" : 24,
"executionTimeMillisEstimate" : 0,
"works" : 26,
"advanced" : 24,
"needTime" : 1,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 1,
"invalidates" : 0,
"direction" : "forward",
"docsExamined" : 24
}
},
"serverInfo" : {
"host" : "greg",
"port" : 27017,
"version" : "3.2.3",
"gitVersion" : "b326ba837cf6f49d65c2f85e1b70f6f31ece7937"
},
"ok" : 1
}
db.User.find().explain("executionStats")
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.User",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : []
},
"winningPlan" : {
"stage" : "COLLSCAN",
"filter" : {
"$and" : []
},
"direction" : "forward"
},
"rejectedPlans" : []
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 24,
"executionTimeMillis" : 0,
"totalKeysExamined" : 0,
"totalDocsExamined" : 24,
"executionStages" : {
"stage" : "COLLSCAN",
"filter" : {
"$and" : []
},
"nReturned" : 24,
"executionTimeMillisEstimate" : 0,
"works" : 26,
"advanced" : 24,
"needTime" : 1,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 1,
"invalidates" : 0,
"direction" : "forward",
"docsExamined" : 24
}
},
"serverInfo" : {
"host" : "greg",
"port" : 27017,
"version" : "3.2.3",
"gitVersion" : "b326ba837cf6f49d65c2f85e1b70f6f31ece7937"
},
"ok" : 1
}
When querying without a condition, MongoDB returns all the documents without checking them. If you add a condition, it compiles the condition into BSON and matches it against the data in the database, which is slower. But if you create an index on gender, you will not see any difference in time between the two cases.
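A hedged sketch of that last point (collection name from the question):
// With an index on gender the planner can satisfy the $in via an IXSCAN;
// since this query matches the whole collection anyway, the timing
// difference is negligible either way.
db.User.createIndex({ gender: 1 })
db.User.find({ gender: { $in: ['male', 'female'] } }).explain("executionStats")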

How to improve MongoDB performance on a single node with a large dataset?

I just started with MongoDB. I am working with a 38 GB data set (68 million documents) on SSD storage.
But performance is going down both with and without indexing, and a simple find query on two fields uses a lot of RAM with no CPU use.
It takes 18 minutes to fetch 1.6 million records. Which factors help to improve MongoDB performance on a single node?
My Document looks like this:
{ "_id" : ObjectId("55e7eec02756dd0f1e693b72"),
"categorieId" : 2,
"title" : "AntiMalware",
"messageValue" : " #\"Antimalware: \"Windows Defender\" is Not Updated and Running\"#",
"timestamp" : "8/19/2015 11:06:24 AM",
"resultStatusId" : 2,
"messageFormat" : "Text",
"titleId" : 1,
"resultStatus" : "Warning",
"antiMalwareName" : "Comodo Antivirus",
"categories" : "Security" }
My index is a compound index on titleId and resultStatusId.
My query is:
db.collection.find({"titleId":21, resultStatusId:1})
The explain output is:
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "techHealLogAnalysis.techHealTestLogData",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"resultStatusId" : {
"$eq" : 1
}
},
{
"titleId" : {
"$eq" : 21
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"titleId" : 1,
"resultStatusId" : 1
},
"indexName" : "titleId_1_resultStatusId_1",
"isMultiKey" : false,
"direction" : "forward",
"indexBounds" : {
"titleId" : [
"[21.0, 21.0]"
],
"resultStatusId" : [
"[1.0, 1.0]"
]
}
}
},
"rejectedPlans" : [ ]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 1671842,
"executionTimeMillis" : 1108805,
"totalKeysExamined" : 1671842,
"totalDocsExamined" : 1671842,
"executionStages" : {
"stage" : "FETCH",
"nReturned" : 1671842,
"executionTimeMillisEstimate" : 177670,
"works" : 2143234,
"advanced" : 1671842,
"needTime" : 0,
"needFetch" : 471391,
"saveState" : 471391,
"restoreState" : 471391,
"isEOF" : 1,
"invalidates" : 0,
"docsExamined" : 1671842,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 1671842,
"executionTimeMillisEstimate" : 1470,
"works" : 1671843,
"advanced" : 1671842,
"needTime" : 0,
"needFetch" : 0,
"saveState" : 471391,
"restoreState" : 471391,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"titleId" : 1,
"resultStatusId" : 1
},
"indexName" : "titleId_1_resultStatusId_1",
"isMultiKey" : false,
"direction" : "forward",
"indexBounds" : {
"titleId" : [
"[21.0, 21.0]"
],
"resultStatusId" : [
"[1.0, 1.0]"
]
},
"keysExamined" : 1671842,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0,
"matchTested" : 0
}
}
},
"serverInfo" : {
"host" : "instance-7",
"port" : 27017,
"version" : "3.0.6",
"gitVersion" : "1ef45a23a4c5e3480ac919b28afcba3c615488f2"
},
"ok" : 1
}
Database systems with large data sets and high-throughput applications can challenge the capacity of a single server: large data sets can exceed the storage capacity of a single machine, and working sets larger than the system's RAM stress the I/O capacity of its disk drives. Deploying sharding may really be useful in your case. Check out the following link:
http://docs.mongodb.org/manual/core/sharding-introduction/
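A hedged sketch of what sharding this collection could look like (database and collection names are taken from the explain output above; the shard-key choice is an assumption, not a recommendation):
// Run against mongos; assumes a sharded cluster is already deployed.
sh.enableSharding("techHealLogAnalysis")
sh.shardCollection("techHealLogAnalysis.techHealTestLogData",
                   { titleId: 1, resultStatusId: 1 })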