MongoDB sort winningPlan overrides hint - mongodb

I created a collection with three fields as described below. After that, I created an index on the second field and executed a search using the sort and hint operations.
Why - even when using a hint on the previously created index - does MongoDB show SORT as the winningPlan?
I believe it would be better to first filter the data by the criteria and then sort the result, right?
Collection
> db.values.find()
{ "_id" : ObjectId("5763ffebe5a81f569b1005e5"), "field1" : "A", "field2" : "B", "field3" : "C" }
Indexes
> db.values.getIndexes()
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "peftest.values"
},
{
"v" : 1,
"key" : {
"field2" : 1
},
"name" : "field2_1",
"ns" : "peftest.values"
}
]
Query and Explain
> db.values.find({field2:"B"}).sort({field1:1}).hint({field2:1}).explain()
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "peftest.values",
"indexFilterSet" : false,
"parsedQuery" : {
"field2" : {
"$eq" : "B"
}
},
"winningPlan" : {
"stage" : "SORT",
"sortPattern" : {
"field1" : 1
},
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"field2" : 1
},
"indexName" : "field2_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"field2" : [
"[\"B\", \"B\"]"
]
}
}
}
}
},
"rejectedPlans" : [ ]
},
"serverInfo" : {
"host" : "apstrd14501d.intraservice.corp",
"port" : 27017,
"version" : "3.2.4",
"gitVersion" : "e2ee9ffcf9f5a94fad76802e28cc978718bb7a30"
},
"ok" : 1
}

I think the plan is what you expect but you look at it from the wrong perspective :)
The input stage of the sort is an index scan, so the query plan uses the index first and then passes the resulting data to the sort.

Related

Why will Mongodb only use my index if I use hint

I have a database with an ISODate() type field with an index (I also tried this experiment with string fields - same result). I am using the open source version of MongoDB (4.x) and when I do a query / sort to find the max _finish_time, the index is not used unless I specify a hint.
My query is:
db.getCollection("test").find({}, { _finish_time: 1}).sort({_finish_time: -1}).limit(1)
which explains as:
{
"queryPlanner" : {
"plannerVersion" : 1.0,
"namespace" : "vdm-service-ts-staging.test",
"indexFilterSet" : false,
"parsedQuery" : {
},
"winningPlan" : {
"stage" : "PROJECTION",
"transformBy" : {
"_finish_time" : 1.0
},
"inputStage" : {
"stage" : "SORT",
"sortPattern" : {
"_finish_time" : -1.0
},
"limitAmount" : 1.0,
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "COLLSCAN",
"direction" : "forward"
}
}
}
},
"rejectedPlans" : [
]
},
"serverInfo" : {
"host" : "ip-10-82-245-45.us-west-2.compute.internal",
"port" : 27017.0,
"version" : "4.0.1",
"gitVersion" : "54f1582fc6eb01de4d4c42f26fc133e623f065fb"
},
"ok" : 1.0,
"operationTime" : Timestamp(1573220526, 1),
"$clusterTime" : {
"clusterTime" : Timestamp(1573220526, 1),
"signature" : {
"hash" : BinData(0, "blIkiGcam87SDdbKeZKex/9JXBU="),
"keyId" : NumberLong(6715502669504446467)
}
}
}
Which scans the entire collection. When I specify a hint for my available index, as in:
db.getCollection("test").find({}, { _finish_time: 1}).sort({_finish_time: -1}).limit(1).hint("_finish_time")
I get the query plan:
{
"queryPlanner" : {
"plannerVersion" : 1.0,
"namespace" : "vdm-service-ts-staging.test",
"indexFilterSet" : false,
"parsedQuery" : {
},
"winningPlan" : {
"stage" : "LIMIT",
"limitAmount" : 1.0,
"inputStage" : {
"stage" : "PROJECTION",
"transformBy" : {
"_finish_time" : 1.0
},
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"_finish_time" : -1.0
},
"indexName" : "_finish_time",
"isMultiKey" : false,
"multiKeyPaths" : {
"_finish_time" : [
]
},
"isUnique" : false,
"isSparse" : true,
"isPartial" : false,
"indexVersion" : 2.0,
"direction" : "forward",
"indexBounds" : {
"_finish_time" : [
"[MaxKey, MinKey]"
]
}
}
}
}
},
"rejectedPlans" : [
]
},
"serverInfo" : {
"host" : "ip-10-82-245-45.us-west-2.compute.internal",
"port" : 27017.0,
"version" : "4.0.1",
"gitVersion" : "54f1582fc6eb01de4d4c42f26fc133e623f065fb"
},
"ok" : 1.0,
"operationTime" : Timestamp(1573220603, 3),
"$clusterTime" : {
"clusterTime" : Timestamp(1573220603, 3),
"signature" : {
"hash" : BinData(0, "qsGhD1DpI306XbqtNZDYVINPid8="),
"keyId" : NumberLong(6715502669504446467)
}
}
}
Which uses the index. I would prefer not to have to add hint() to my queries and I am perplexed why it refuses to use the index.
My index is sparse and not unique.
I have tried other indexes and searching around, but I cannot find any reference to this problem in Stack overflow or elsewhere.
According to the MongoDB documentation:
If a sparse index would result in an incomplete result set for queries
and sort operations, MongoDB will not use that index unless a hint()
explicitly specifies the index.
To use the sparse index, explicitly specify the index with hint(),
as hint() forces the query optimizer to use that index when executing the query.

Why aggregation framework is slower than simple find query

I am new to mongodb and came across some strange behaviour of aggregation framework.
I have a collection named 'billingData', this collection has approximately 2M documents.
I am comparing two queries which give me the same output, but their execution times are different.
Query 1:
db.billingData.find().sort({"_id":-1}).skip(100000).limit(50)
Execution Plan:
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "billingDetails.billingData",
"indexFilterSet" : false,
"parsedQuery" : {},
"winningPlan" : {
"stage" : "LIMIT",
"limitAmount" : 50,
"inputStage" : {
"stage" : "SKIP",
"skipAmount" : 100000,
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"_id" : 1
},
"indexName" : "_id_",
"isMultiKey" : false,
"multiKeyPaths" : {
"_id" : []
},
"isUnique" : true,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "backward",
"indexBounds" : {
"_id" : [
"[MaxKey, MinKey]"
]
}
}
}
}
},
"rejectedPlans" : []
},
"serverInfo" : {
"host" : "ip-172-60-62-125",
"port" : 27017,
"version" : "3.6.3",
"gitVersion" : "9586e557d54ef70f9ca4b43c26892cd55257e1a5"
},
"ok" : 1.0
}
Query 2:
db.billingData.aggregate([
{$sort : {"_id":-1}},
{$skip:100000},
{$limit:50}
])
Execution Plan:
{
"stages" : [
{
"$cursor" : {
"query" : {},
"sort" : {
"_id" : -1
},
"limit" : NumberLong(100050),
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "billingDetails.billingData",
"indexFilterSet" : false,
"parsedQuery" : {},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"_id" : 1
},
"indexName" : "_id_",
"isMultiKey" : false,
"multiKeyPaths" : {
"_id" : []
},
"isUnique" : true,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "backward",
"indexBounds" : {
"_id" : [
"[MaxKey, MinKey]"
]
}
}
},
"rejectedPlans" : []
}
}
},
{
"$skip" : NumberLong(100000)
}
],
"ok" : 1.0
}
I was expecting the same performance from the aggregation framework and the find query, but the find query returned results in 2 sec and the aggregation took 16 sec.
Although in both the queries, I am sorting my documents in descending order(on the basis of _id) and fetching 50 records after skipping 100,000 records.
Can someone explain to me why the aggregation framework is working this way?
What can I do to make it performance wise similar to find query?

When you have an index is using `min` and `max` faster than `$gte` and `$lt`? Why?

Sorry for the basic question, I'm new to MongoDB.
Suppose you have a collection called "students" with an index on a field called "grade". Which of these would be faster?
db.students.find({"grade": {$gte: 50}, "grade": {$lt: 90}})
db.students.find().min("grade": 50).max("grade": 90)
Other than the ability to provide a hint to the second option, is there any advantage to the second option?
The first query will be faster because it allows for bounding on the index. This is best seen when using explain.
For example:
db.stack.find({ "grade" : { "$lt" : 90, "$gt" : 50 } }).explain()
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.stack",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"grade" : {
"$lt" : 90
}
},
{
"grade" : {
"$gt" : 50
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"grade" : 1
},
"indexName" : "grade_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"grade" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"grade" : [
"(50.0, 90.0)"
]
}
}
},
"rejectedPlans" : [ ]
},
The indexBounds field above shows that the query is only scanning a subset of the index, specifically the keys between 50 and 90.
In comparison, the other form of the query scans the range of index and then parses the resulting cursor to perform the min and max functions:
db.stack.find().min({ "grade" : 50 }).max({"grade" : 90 }).explain()
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.stack",
"indexFilterSet" : false,
"parsedQuery" : {
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"grade" : 1
},
"indexName" : "grade_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"grade" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
}
}
},
"rejectedPlans" : [ ]
},
Note how indexBounds above is empty.
Make sense?
One other important note: the query listed in your question will not work as expected, as it will only apply the $lt : 90 filter in its current form. The query document contains the key "grade" twice, and the second occurrence overwrites the first.
Queries that apply multiple filters on a single field will need to use the $and operator to perform a logical AND across the multiple conditions. In my examples above, I instead combined the multiple filters into a single condition:
{ "grade" : { "$lt" : 90, "$gt" : 50 } }
This should be the same as:
{
"$and" : [
{ grade : { "$gt" : 50 } },
{ grade: { "$lt" : 90 } }
]
}

Explain why results from mongo are being returned in reverse ObjectId order?

I have a list of news article items which I am tagging for entities, and topic tags.
my query
db["fmetadata"].find({'$and': [{'$text': {'$search': 'apple trump'}}, {'$or':
[{'entities': {'$elemMatch': {'$regex': 'apple|trump'}}}, {'tags': {'$elemMatch': {'$regex': 'apple|trump'}}}]}]}).explain()
query plan
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "dfabric.fmetadata",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"$or" : [
{
"entities" : {
"$elemMatch" : {
"$regex" : "apple|trump"
}
}
},
{
"tags" : {
"$elemMatch" : {
"$regex" : "apple|trump"
}
}
}
]
},
{
"$text" : {
"$search" : "apple trump",
"$language" : "english",
"$caseSensitive" : false,
"$diacriticSensitive" : false
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"$or" : [
{
"entities" : {
"$elemMatch" : {
"$regex" : "apple|trump"
}
}
},
{
"tags" : {
"$elemMatch" : {
"$regex" : "apple|trump"
}
}
}
]
},
"inputStage" : {
"stage" : "TEXT",
"indexPrefix" : {
},
"indexName" : "title_text_tags_text_entities_text",
"parsedTextQuery" : {
"terms" : [
"appl",
"trump"
],
"negatedTerms" : [ ],
"phrases" : [ ],
"negatedPhrases" : [ ]
},
"textIndexVersion" : 3,
"inputStage" : {
"stage" : "TEXT_MATCH",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "OR",
"inputStages" : [
{
"stage" : "IXSCAN",
"keyPattern" : {
"_fts" : "text",
"_ftsx" : 1
},
"indexName" : "title_text_tags_text_entities_text",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "backward",
"indexBounds" : {
}
},
{
"stage" : "IXSCAN",
"keyPattern" : {
"_fts" : "text",
"_ftsx" : 1
},
"indexName" : "title_text_tags_text_entities_text",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "backward",
"indexBounds" : {
}
}
]
}
}
}
}
},
"rejectedPlans" : [ ]
},
"serverInfo" : {
"host" : "fabric-dev",
"port" : 27017,
"version" : "4.0.2",
"gitVersion" : "fc1573ba18aee42f97a3bb13b67af7d837826b47"
},
"ok" : 1
}
I see that
["queryPlanner"]["winningPlan"]["inputStage"]["inputStage"]["inputStages"]
"stage": "IXSCAN"
"direction": "backward"
Can this please be explained why?
I was developing a pagination cursor using the "> lastId" and limit technique. But since results are being returned backwards, I have to use "< lastId", which seems counterintuitive.
If I don't sort my results in the natural order, can it be guaranteed that it will always be backwards/reverse?
Edit: as mentioned in the comment below
My objective here is to get the intuition as to why the index was scanned backwards - is it the way I formulated my query, or something else entirely? The ordering itself - forwards or backwards - doesn't matter as much as its consistency: it should always remain the same, either always forwards or always backwards.
I came across this question on stackoverflow, and I believe the accepted answer, with the comments below satisfactorily gives me the intuition I was looking for.
How does MongoDB sort records when no sort order is specified?

$in slower when using indexed column

I am trying to optimise my query and have found that when using $in on a non-indexed column, the performance appears to be faster than when using it on an indexed column.
For example:
I have added an index on myCollection: {"entryVals.col1" : 1}.
To confirm:
db.myCollection.getIndexes()
returns:
[
{
"v" : 2,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "myDb.myCollection"
},
{
"v" : 2,
"key" : {
"entryVals.col1" : 1
},
"name" : "entryVals.col1_1",
"ns" : "myDb.myCollection"
} ]
I then run a count with a query (printing the time taken) on both the indexed and non-indexed columns.
Count on indexed column
var a = new Date().getTime();
db.myCollection.count({"entryVals.col1": {$in:["a","b","c","d"]}});
new Date().getTime() - a;
returns
96 (time in ms)
Count on non-indexed column
var a = new Date().getTime();
db.myCollection.count({"entryVals.col2": {$in:["a","b","c","d"]}});
new Date().getTime() - a;
returns
60 (time in ms)
Please bear in mind that I ran the queries several times and took an average (there were few to no anomalies).
Is anyone able to help enlighten me as to why the query on the column that is indexed is slower please?
Thanks in advance.
Explains
Count on indexed column
db.myCollection.explain().count({"entryVals.col1": {$in:["a","b","c","d"]}})
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "myDb.myCollection",
"indexFilterSet" : false,
"parsedQuery" : {
"entryVals.col1" : {
"$in" : [
"a",
"b",
"c",
"d"
]
}
},
"winningPlan" : {
"stage" : "COUNT",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"entryVals.col1" : 1
},
"indexName" : "entryVals.col1_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"entryVals.col1" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"entryVals.col1" : [
"[\"a\", \"a\"]",
"[\"b\", \"b\"]",
"[\"c\", \"c\"]",
"[\"d\", \"d\"]"
]
}
}
}
},
"rejectedPlans" : [ ]
},
"serverInfo" : {
"host" : "obfuscated",
"port" : obfuscated,
"version" : "3.4.6-1.7",
"gitVersion" : "obfuscated"
},
"ok" : 1
}
Count on non-indexed column
db.myCollection.explain().count({"entryVals.col2": {$in:["a","b","c","d"]}})
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "myDb.myCollection",
"indexFilterSet" : false,
"parsedQuery" : {
"entryVals.col2" : {
"$in" : [
"a",
"b",
"c",
"d"
]
}
},
"winningPlan" : {
"stage" : "COUNT",
"inputStage" : {
"stage" : "COLLSCAN",
"filter" : {
"entryVals.col2" : {
"$in" : [
"a",
"b",
"c",
"d"
]
}
},
"direction" : "forward"
}
},
"rejectedPlans" : [ ]
},
"serverInfo" : {
"host" : "obfuscated",
"port" : obfuscated,
"version" : "3.4.6-1.7",
"gitVersion" : "obfuscated"
},
"ok" : 1
}