$in slower when using indexed column - mongodb

I am trying to optimise my query and have found that when using $in on a non-indexed column that the performance appears to be faster than when on an indexed column.
For example:
I have added an index on myCollection: {"entryVals.col1" : 1}.
To confirm:
db.myCollection.getIndexes()
returns:
[
{
"v" : 2,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "myDb.myCollection"
},
{
"v" : 2,
"key" : {
"entryVals.col1" : 1
},
"name" : "entryVals.col1_1",
"ns" : "myDb.myCollection"
} ]
I then run a count with a query (printing the time taken) on both the indexed and non-indexed columns.
Count on indexed column
var a = new Date().getTime();
db.myCollection.count({"entryVals.col1": {$in:["a","b","c","d"]}});
new Date().getTime() - a;
returns
96 (time in ms)
Count on non-indexed column
var a = new Date().getTime();
db.myCollection.count({"entryVals.col2": {$in:["a","b","c","d"]}});
new Date().getTime() - a;
returns
60 (time in ms)
Please bare in mind that I ran the queries several times and took an average (there were little to no anomalies) .
Is anyone able to help enlighten me as to why the query on the column that is indexed is slower please?
Thanks in advance.
Explains
Count on indexed column
db.myCollection.explain().count({"entryVals.col1": {$in:["a","b","c","d"]}})
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "myDb.myCollection",
"indexFilterSet" : false,
"parsedQuery" : {
"entryVals.col1" : {
"$in" : [
"a",
"b",
"c",
"d"
]
}
},
"winningPlan" : {
"stage" : "COUNT",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"entryVals.col1" : 1
},
"indexName" : "entryVals.col1_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"entryVals.col1" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"entryVals.col1" : [
"[\"a\", \"a\"]",
"[\"b\", \"b\"]",
"[\"c\", \"c\"]",
"[\"d\", \"d\"]"
]
}
}
}
},
"rejectedPlans" : [ ]
},
"serverInfo" : {
"host" : "obfuscated",
"port" : obfuscated,
"version" : "3.4.6-1.7",
"gitVersion" : "obfuscated"
},
"ok" : 1
}
Count on non-indexed column
db.myCollection.explain().count({"entryVals.col2": {$in:["a","b","c","d"]}})
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "myDb.myCollection",
"indexFilterSet" : false,
"parsedQuery" : {
"entryVals.col2" : {
"$in" : [
"a",
"b",
"c",
"d"
]
}
},
"winningPlan" : {
"stage" : "COUNT",
"inputStage" : {
"stage" : "COLLSCAN",
"filter" : {
"entryVals.col2" : {
"$in" : [
"a",
"b",
"c",
"d"
]
}
},
"direction" : "forward"
}
},
"rejectedPlans" : [ ]
},
"serverInfo" : {
"host" : "obfuscated",
"port" : obfuscated,
"version" : "3.4.6-1.7",
"gitVersion" : "obfuscated"
},
"ok" : 1
}

Related

When you have an index is using `min` and `max` faster than `$gte` and `$lt`? Why?

Sorry for the basic question, I'm new to MongoDB.
Suppose you have a collection called "students" with an index on a field called "grade". Which of these would be faster?
db.students.find({"grade": {$gte: 50}, "grade": {$lt: 90}})
db.students.find().min("grade": 50).max("grade": 90)
Other than the ability to provide a hint to the second option, is there and advantage to the second option?
The first query will be faster because it allows for bounding on the index. This is best seen when using explain.
For example:
db.stack.find({ "grade" : { "$lt" : 90, "$gt" : 50 } }).explain()
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.stack",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"grade" : {
"$lt" : 90
}
},
{
"grade" : {
"$gt" : 50
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"grade" : 1
},
"indexName" : "grade_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"grade" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"grade" : [
"(50.0, 90.0)"
]
}
}
},
"rejectedPlans" : [ ]
},
The indexBounds field above shows that the query is only scanning a subset of the index, specifically the keys between 50 and 90.
In comparison, the other form of the query scans the range of index and then parses the resulting cursor to perform the min and max functions:
db.stack.find().min({ "grade" : 50 }).max({"grade" : 90 }).explain()
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.stack",
"indexFilterSet" : false,
"parsedQuery" : {
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"grade" : 1
},
"indexName" : "grade_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"grade" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
}
}
},
"rejectedPlans" : [ ]
},
Note how indexBounds above is empty.
Make sense?
One other important note: the query listed in your question will not work as expected, as it will only apply the $lte : 90 filter in its current form.
Queries that apply multiple filters on a single field will need to use the $and operator to perform a logical AND across the multiple conditions. In my examples above, I instead combined the multiple filters into a single condition:
{ "grade" : { "$lt" : 90, "$gt" : 50 } }
This should be the same as:
{
"$and" : [
{ grade : { "$gt" : 50 } },
{ grade: { "$lt" : 90 } }
]
}

How Mongo Query planner choose his index

Mongo version : 3.4.3-10-g865d2fb
I have a request like this :
db.getCollection('c_zop_operations').find({
"a":{
"$in":[
"O",
"S",
"P"
]
},
"$or":[
{
"b":"008091",
"c":"1187",
"d":"F",
"e":ISODate("2018-07-22T22:00:00.000Z")
},
... x 39 elements in $or statement
]
}).explain("executionStats")
The request during 16 seconds and explain returns these results :
155769 documents parse in index !!!
{
"queryPlanner" : {
"plannerVersion" : 1,
...
"indexFilterSet" : false,
"parsedQuery" : {
...
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"$or" : [
{
"$and" : [
{
"c" : {
"$eq" : "1187"
}
},
{
"d" : {
"$eq" : "F"
}
},
{
"b" : {
"$eq" : "008091"
}
},
{
"e" : {
"$eq" : ISODate("2018-07-22T22:00:00.000Z")
}
}
]
},
x 39 times
...
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"a" : 1
},
"indexName" : "a",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"a" : [
"[\"O\", \"O\"]",
"[\"P\", \"P\"]",
"[\"S\", \"S\"]"
]
}
}
},
"rejectedPlans" : [
...
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 0,
"executionTimeMillis" : 16010,
"totalKeysExamined" : 155769,
"totalDocsExamined" : 155769,
...
}
...
}
In my collection, I have a lot of indexes (65) and my collection contains 3 millions of documents.
Only two indexes interest me here :
aIndex : { "a" : 1 }
And
beIndex: { "b" : 1, "e" : 1 }
By default, mongo use { "a" : 1 } and the request takes 16 seconds.
If I use hint(beIndex), the request takes 0,011 second and totalKeysExamined = 0 and totalDocsExamined = 0.
Why MongoDB don't use the beIndex that is more effective ?
This behavior is a known bug. See SERVER-13732. This was fixed in MongoDB 3.6.
As a workaround, distributing the top-level filter on a into each $or clause should allow the query planner to make a better index choice.

MongoDB Shard-Key Performance Simple

I have a cluster with 3 config dbs and 3 shards. I am querying against a db with 106M records, each with 410 fields. I imported this data using a shard key of:
{state: 1, zipCode: 1}.
When I run the following queries individually each one completes in less than 5sec. (SC = 1.6M records, NC = 5.2M records)
db.records.find( { "state" : "NC" } ).count()
db.records.find( { "state" : "SC" } ).count()
db.records.find( { "state" : { $in : ["NC"] } } ).count()
db.records.find( { "state" : { $in : ["SC"] } } ).count()
However when I query both states using an $in or an $or the query takes over an hour to complete.
db.records.find( "state" : { $in : [ "NC" , "SC" ] } ).count()
db.records.find({ $or : [ { "state" : "NC" }, { "state" : "SC" } ).count()
The entirety of both states exist on 1 shard. Below are the results of .explain() using the $in query:
db.records.find({"state":{$in:["NC","SC"]}}).explain()
{
"queryPlanner" : {
"mongosPlannerVersion" : 1,
"winningPlan" : {
"stage" : "SINGLE_SHARD",
"shards" : [
{
"shardName" : "s2",
"connectionString" : "s2/192.168.2.17:27000,192.168.2.17:27001",
"serverInfo" : {
"host" : "MonDbShard2",
"port" : 27000,
"version" : "3.4.7",
"gitVersion" : "cf38c1b8a0a8dca4a11737581beafef4fe120bcd"
},
"plannerVersion" : 1,
"namespace" : "DBNAME.records",
"indexFilterSet" : false,
"parsedQuery" : {
"state" : {
"$in" : [
"NC",
"SC"
]
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "SHARDING_FILTER",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"state" : 1,
"zipCode" : 1
},
"indexName" : "state_1_zipCode_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"state" : [ ],
"zipCode" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"state" : [
"[\"NC\", \"NC\"]",
"[\"SC\", \"SC\"]"
],
"zipCode" : [
"[MinKey, MaxKey]"
]
}
}
}
},
"rejectedPlans" : [ ]
}
]
}
},
"ok" : 1
}
Why would querying two states at one time cause such a drastic difference in completion time? Also, querying a single zipCode without also including the corresponding state, results in the same drastic difference in completion time. I feel like I am misunderstanding how the shard-key actually operates. Any thoughts?

MongoDB sort winingplan overrides hint

I create a collection with three fields as described below. After that, I create an index over second field and executed a search using sort and hint operations.
Why - even using a hint over index created previously - MongoDB set sort as winningPlan?
I believe that if we filter data with some criteria and sort the result could be better, right?
Collection
> db.values.find()
{ "_id" : ObjectId("5763ffebe5a81f569b1005e5"), "field1" : "A", "field2" : "B", "field3" : "C" }
Indexes
> db.values.getIndexes()
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "peftest.values"
},
{
"v" : 1,
"key" : {
"field2" : 1
},
"name" : "field2_1",
"ns" : "peftest.values"
}
]
Query and Explain
> db.values.find({field2:"B"}).sort({field1:1}).hint({field2:1}).explain()
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "peftest.values",
"indexFilterSet" : false,
"parsedQuery" : {
"field2" : {
"$eq" : "B"
}
},
"winningPlan" : {
"stage" : "SORT",
"sortPattern" : {
"field1" : 1
},
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"field2" : 1
},
"indexName" : "field2_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"field2" : [
"[\"B\", \"B\"]"
]
}
}
}
}
},
"rejectedPlans" : [ ]
},
"serverInfo" : {
"host" : "apstrd14501d.intraservice.corp",
"port" : 27017,
"version" : "3.2.4",
"gitVersion" : "e2ee9ffcf9f5a94fad76802e28cc978718bb7a30"
},
"ok" : 1
}
I think the plan is what you expect but you look at it from the wrong perspective :)
The input stage of the sort is an index scan so the query plan uses the index at first and the pass the result data to the sort.

Mongo $group too slow

I have a mongo db collections of about 168,200,000 documents. I am trying to get the average of a certain field with $group, and I am using $match before the $group in the pipeline to use the index on client.city. But the query is taking about 5 minutes to run, which is very slow.
Here are the things I tried:
db.ar12.aggregate(
{$match:{'client.city':'New York'}},
{'$group':{'_id':'client.city', 'avg':{'$avg':'$length'}}}
)
db.ar12.aggregate(
{$match:{'client.city':'New York'}},
{'$group':{'_id':null, 'avg':{'$avg':'$length'}}}
)
db.ar12.aggregate(
{$match:{'client.city':'New York'}},
{$project: {'length':1}},
{'$group':{'_id':null, 'avg':{'$avg':'$length'}}}
)
All 3 queries take about the same time, number of documents with client.city = to New York is 1,231,672, find({'client.city':'New York').count() takes a second to run
> db.version()
3.2.0
EDIT
Here's the explain result... As for the comment for adding a compound index with length, would that help, although I am not search by length I want all lengthes...
{
"waitedMS" : NumberLong(0),
"stages" : [
{
"$cursor" : {
"query" : {
"client.city" : "New York"
},
"fields" : {
"length" : 1,
"_id" : 1
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "clients.ar12",
"indexFilterSet" : false,
"parsedQuery" : {
"client.city" : {
"$eq" : "New York"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"client.city" : 1
},
"indexName" : "client.city_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"client.city" : [
"[\"New York\", \"New York\"]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$project" : {
"length" : true
}
},
{
"$group" : {
"_id" : {
"$const" : null
},
"total" : {
"$avg" : "$length"
}
}
}
],
"ok" : 1
}
EDIT 2
I have added a compound index of client.city and length, but to no avail the speed is still too slow, I tried these 2 queries:
db.ar12.aggregate(
{$match: {'client.city':'New York'}},
{$project: {'client.city':1, 'length':1}},
{'$group':{'_id':'$client.city', 'avg':{'$avg':'$length'}}}
)
The above query wasn't using the compound index, so I tried this to force using it, and still nothing changed:
db.ar12.aggregate(
{$match: { $and : [{'client.city':'New York'}, {'length':{'$gt':0}}]}},
{$project: {'client.city':1, 'length':1}},
{'$group':{'_id':'$client.city', 'avg':{'$avg':'$length'}}}
)
below is the explain of the last query:
{
"waitedMS" : NumberLong(0),
"stages" : [
{
"$cursor" : {
"query" : {
"$and" : [
{
"client.city" : "New York"
},
{
"length" : {
"$gt" : 0
}
}
]
},
"fields" : {
"client.city" : 1,
"length" : 1,
"_id" : 1
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "clients.ar12",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"client.city" : {
"$eq" : "New York"
}
},
{
"length" : {
"$gt" : 0
}
}
]
},
"winningPlan" : {
"stage" : "CACHED_PLAN",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"client.city" : 1,
"length" : 1
},
"indexName" : "client.city_1_length_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"client.city" : [
"[\"New York\", \"New York\"]"
],
"length" : [
"(0.0, inf.0]"
]
}
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$project" : {
"client" : {
"city" : true
},
"length" : true
}
},
{
"$group" : {
"_id" : "$client.city",
"avg" : {
"$avg" : "$length"
}
}
}
],
"ok" : 1
}
I have found a work around, length goes from 1 till 70. So what I did is in python I iterated from 1 to 70, and found the count of each length for each city,
db.ar12.find({'client.city':'New York', 'length':i}).count()
which is very fast, then calculated the average in python, it is taking about 2 seconds to run.
This is not the best solution, since I have other queries to run, I don't know if I can find a work around for all of them...