Why is regex prefix query on indexed array slow in MongoDB?

I am trying to perform a regex query on an array of strings in a MongoDB collection. I could only find this limitation in the docs:
$regex can only use an index efficiently when the regular expression
has an anchor for the beginning (i.e. ^) of a string and is a
case-sensitive match.
Let's make a test:
> for (var i=0; i<100000; i++) db.test.insert({f: ['a_0_'+i, 'a_1_2']})
> db.test.count()
100000
> db.test.ensureIndex({f: 1})
> db.test.find({f: /^a_(0)?_12$/ })
{ "_id" : ObjectId("514ac59886f004fe03ef2a96"), "f" : [ "a_0_12", "a_1_2" ] }
> db.test.find({f: /^a_(0)?_12$/ }).explain()
{
"cursor" : "BtreeCursor f_1 multi",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 200000,
"nscanned" : 200000,
"nscannedObjectsAllPlans" : 200000,
"nscannedAllPlans" : 200000,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 482,
"indexBounds" : {
"f" : [
[
"a_",
"a`"
],
[
/^a_(0)?_12$/,
/^a_(0)?_12$/
]
]
},
"server" : "someserver:27017"
}
The query is slow. On the other hand, this query is optimal (but doesn't suit my use case):
> db.test.find({f: 'a_0_12' }).explain()
{
"cursor" : "BtreeCursor f_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"f" : [
[
"a_0_12",
"a_0_12"
]
]
},
"server" : "someserver:27017"
}
Why is the regex query scanning all (sub)records when it has an index? What am I missing?

Your test case has several characteristics that are unhelpful for regex and index usage:
each document includes an array of two values, both starting with "a_". Because of the optional (0)? group, the longest literal prefix the query planner can extract from /^a_(0)?_12$/ is "a_", so the index bounds ["a_", "a`"] span every index entry, and each of the 200k values has to be compared against the full regex.
Since you have a multikey (array) index, the number of index comparisons is actually worse than just doing a full table scan of the 100k documents. You can test with a $natural hint to see:
db.test.find({f: /^a_(0|)12$/ }).hint({$natural:1}).explain()
{
"cursor" : "BasicCursor",
"isMultiKey" : false,
"n" : 0,
"nscannedObjects" : 100000,
"nscanned" : 100000,
"nscannedObjectsAllPlans" : 100000,
"nscannedAllPlans" : 100000,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 192,
"indexBounds" : {
},
}
More random data or a more selective regex will result in fewer comparisons.
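As a sketch of that point (not run against this exact dataset): expanding the optional group into explicit $or alternatives gives the planner a full literal prefix per clause, since each $or clause is planned with its own index bounds.
// Each anchored alternative yields tight bounds ("a_0_12" and "a__12")
// instead of the shared two-character prefix "a_".
db.test.find({ $or: [ { f: /^a_0_12$/ }, { f: /^a__12$/ } ] }).explain()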

Related

MongoDB - Index scan low performance

I'm very new to MongoDB and I'm trying to test some performance in order to understand if my structure is fine.
I have a collection with 5 fields (3 dates, one int, and one pointer to another ObjectId).
In this collection I've created an index on two fields:
_p_monitor_ref Asc (this is the pointer)
collected Desc (this is one Date field)
The index name is: _p_monitor_ref_1_collected_-1
I created this index at the beginning and populated the collection with some records. After that, I duplicated the records many times with this script:
var bulk = db.measurements.initializeUnorderedBulkOp();
db.measurements.find().limit(1483570).forEach(function(document) {
    document._id = new ObjectId();
    bulk.insert(document);
});
bulk.execute();
Now the collection has 3 million documents.
Now I run explain to see if the query uses the index and how much time it needs to execute. This is the query:
db.measurements.find({ "_p_monitor_ref": "Monitors$iKNoB6Ga5P" }).sort({collected: -1}).explain()
As you can see, I use _p_monitor_ref to search for all documents by pointer, and then I sort by collected descending (this matches the index).
This is the first result when I run it. MongoDB uses the index (BtreeCursor _p_monitor_ref_1_collected_-1) but the execution time is very high, "millis" : 120286:
{
"cursor" : "BtreeCursor _p_monitor_ref_1_collected_-1",
"isMultiKey" : false,
"n" : 126862,
"nscannedObjects" : 126862,
"nscanned" : 126862,
"nscannedObjectsAllPlans" : 126862,
"nscannedAllPlans" : 126862,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 23569,
"nChunkSkips" : 0,
"millis" : 120286,
"indexBounds" : {
"_p_monitor_ref" : [
[
"Monitors$iKNoB6Ga5P",
"Monitors$iKNoB6Ga5P"
]
],
"collected" : [
[
{
"$maxElement" : 1
},
{
"$minElement" : 1
}
]
]
},
"server" : "my-pc",
"filterSet" : false
}
{
"cursor" : "BasicCursor",
"isMultiKey" : false,
"n" : 2967141,
"nscannedObjects" : 2967141,
"nscanned" : 2967141,
"nscannedObjectsAllPlans" : 2967141,
"nscannedAllPlans" : 2967141,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 27780,
"nChunkSkips" : 0,
"millis" : 11501,
"server" : "my-pc",
"filterSet" : false
}
Now, if I execute the explain again, this is the result, and the time is "millis" : 201:
{
"cursor" : "BtreeCursor _p_monitor_ref_1_collected_-1",
"isMultiKey" : false,
"n" : 126862,
"nscannedObjects" : 126862,
"nscanned" : 126862,
"nscannedObjectsAllPlans" : 126862,
"nscannedAllPlans" : 126862,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 991,
"nChunkSkips" : 0,
"millis" : 201,
"indexBounds" : {
"_p_monitor_ref" : [
[
"Monitors$iKNoB6Ga5P",
"Monitors$iKNoB6Ga5P"
]
],
"collected" : [
[
{
"$maxElement" : 1
},
{
"$minElement" : 1
}
]
]
},
"server" : "my-pc",
"filterSet" : false
}
{
"cursor" : "BasicCursor",
"isMultiKey" : false,
"n" : 2967141,
"nscannedObjects" : 2967141,
"nscanned" : 2967141,
"nscannedObjectsAllPlans" : 2967141,
"nscannedAllPlans" : 2967141,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 23180,
"nChunkSkips" : 0,
"millis" : 651,
"server" : "my-pc",
"filterSet" : false
}
Why do I have these two very different results? Maybe the second execution takes the data from some kind of cache...
Now the collection has 3 million records... what if the collection grows to 10/20/30 million?
I don't know if I'm doing something wrong. Sure, I'm executing it on my laptop (I don't have an SSD).
The reason you see a smaller execution time on the second attempt is that the first attempt forced mongo to load the data into memory, and the data was still available in memory when the second attempt was executed.
When your collection grows, the index will grow as well - it could become too big to fit in free memory blocks, and the mongodb engine will have to load/unload parts of that index - so performance will vary.
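If you want to pay the disk-read cost up front rather than on the first query, the MMAPv1-era touch command loads a collection's documents and/or indexes into memory. A sketch, assuming MongoDB 2.2+ with the MMAPv1 storage engine:
// Preload both the documents and the indexes of 'measurements' into RAM,
// so the first real query doesn't have to fault pages in from disk.
db.runCommand({ touch: "measurements", data: true, index: true })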

Query with $in and $nin doesn't use index

When matching an attribute against both $in and $nin, Mongo doesn't use the index correctly.
If only $in is used, the query takes advantage of the index:
db.assets.find({
tags: {
$in: ['blah']
}
}).explain()
{
"cursor" : "BtreeCursor tags_1",
"isMultiKey" : true,
"n" : 6,
"nscannedObjects" : 6,
"nscanned" : 6,
"nscannedObjectsAllPlans" : 6,
"nscannedAllPlans" : 6,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"tags" : [
[ "blah", "blah" ]
]
}
}
However, if $nin is involved, instead of finding the documents that match $in and then filtering out those that fail the $nin condition, it scans nearly the entire index (the bounds become everything below and above "cat"):
db.assets.find({
tags: {
$in: ['blah'],
$nin: ['cat']
}
}).explain()
{
"cursor" : "BtreeCursor tags_1",
"isMultiKey" : true,
"n" : 75760,
"nscannedObjects" : 79974,
"nscanned" : 1197016,
"nscannedObjectsAllPlans" : 79974,
"nscannedAllPlans" : 1197130,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 9351,
"nChunkSkips" : 0,
"millis" : 2331,
"indexBounds" : {
"tags" : [
[ {"$minElement" : 1}, "cat" ],
[ "cat", {"$maxElement" : 1} ]
]
}
}
Is there a way to trick Mongo to do the right thing?
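One possible workaround, as a sketch only (not verified against the query planner): run just the selective $in through the index, and apply the $nin exclusion client-side.
// The $in predicate alone gets tight index bounds; the 'cat'
// exclusion is applied in shell/application code instead.
db.assets.find({ tags: { $in: ['blah'] } }).toArray().filter(function (doc) {
    return doc.tags.indexOf('cat') === -1;
});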

Mongodb indexing

I have a query
db.messages.find({'headers.Date':{'$gt': new Date(2001,3,1)}},{'headers.From':1, _id:0}).sort({'headers.From':1})
I have created an index on headers.From. Now, which part of the query will use this index - the find part or the sort part?
Explain output is
{
"cursor" : "BtreeCursor headers.From_1",
"isMultiKey" : false,
"n" : 83057,
"nscannedObjects" : 120477,
"nscanned" : 120477,
"nscannedObjectsAllPlans" : 120581,
"nscannedAllPlans" : 120581,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 250,
"indexBounds" : {
"headers.From" : [
[
{
"$minElement" : 1
},
{
"$maxElement" : 1
}
]
]
},
"server" : "Andrews-iMac.local:27017"
}
Any help is appreciated!
The index is being used for the sort part, not for the query: your query predicate is on headers.Date, which is not in the index, while your sort is on headers.From. That is also why the indexBounds run from $minElement to $maxElement - the whole index is traversed in sort order, and each document is checked against the date filter along the way.
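If you also want the date filter to benefit from an index, one option worth testing (an assumption on my part, not part of the original answer) is a compound index with the sort field first and the range field second, so the traversal stays in sort order while the date bounds prune entries under each headers.From value:
// Sort field first, range field second ("sort before range" rule).
db.messages.ensureIndex({ 'headers.From': 1, 'headers.Date': 1 })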

Multikey sparse index on empty array in MongoDB

What behavior is expected in this case? Will Mongo treat an empty array as null/undefined and include it in the sparse index, or will the document not be indexed at all if the array is empty?
Empty arrays are not treated the same as null in MongoDB. As you can see in the following Mongo shell session, a sparse index finds the empty array as an empty array rather than as null.
> c = db.docs
test.docs
> c.insert({a : []})
> c.ensureIndex({a : 1}, {sparse: true})
> c.find({a : []}).count()
1
> c.find({a : null}).count()
0
As with most interesting questions about MongoDB, using explain will provide a wealth of info. For example, you can see that the test actually uses the index and that the boundaries of the index are null and [], demonstrating their unique treatment.
> c.find({a : null}).explain()
{
"cursor" : "BtreeCursor a_1",
"isMultiKey" : false,
"n" : 0,
"nscannedObjects" : 0,
"nscanned" : 0,
"nscannedObjectsAllPlans" : 0,
"nscannedAllPlans" : 0,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"a" : [
[
null,
null
]
]
},
"server" : "new-host.home:27017"
}
> c.find({a : []}).explain()
{
"cursor" : "BtreeCursor a_1 multi",
"isMultiKey" : false,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"a" : [
[
null,
null
],
[
[ ],
[ ]
]
]
},
"server" : "new-host.home:27017"
}

Improve querying fields exist in MongoDB

I'm evaluating MongoDB for our customers. Per the requirements, we need to associate a variable set of name-value pairs with an entity ent.
db.ent.insert({'a':5775, 'b':'b1'})
db.ent.insert({'c':'its a c', 'b':'b2'})
db.ent.insert({'a':7557, 'c':'its a c'})
After this I need to query ent intensively for the presence of fields:
db.ent.find({'a':{$exists:true}})
db.ent.find({'c':{$exists:false}})
Per MongoDB docs:
$exists is not very efficient even with an index, and esp. with {$exists:true} since it will effectively have to scan all indexed values.
Can the experts here provide a more efficient way (even with a shift of paradigm) to deal quickly with varying name-value pairs?
You can redesign your schema like this:
{
    pairs: [
        {k: "a", v: 5775},
        {k: "b", v: "b1"}
    ]
}
Then you index the key:
db.ent.ensureIndex({"pairs.k" : 1})
After this you will be able to search by exact match:
db.ent.find({'pairs.k':"a"})
If you go with a sparse index and your current schema, as proposed by @WesFreeman, you will need to create an index on each key you want to search. That can affect write performance, and it will not be acceptable if your keys are not static.
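If you also need to look up a key together with its value under this k/v schema, a compound index plus $elemMatch would support that as well; a sketch extending the answer above:
// Compound index over key and value supports exact k/v lookups.
db.ent.ensureIndex({ 'pairs.k': 1, 'pairs.v': 1 })
db.ent.find({ pairs: { $elemMatch: { k: 'a', v: 5775 } } })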
Simply redesign your schema so that the query is indexable. Your use case is in fact analogous to the first example application given in MongoDB: The Definitive Guide.
If you want/need the convenience of result.a, just store the keys somewhere indexable.
Instead of the existing:
db.ent.insert({a:5775, b:'b1'})
do
db.ent.insert({a:5775, b:'b1', index: ['a', 'b']})
That's then an indexable query:
db.ent.find({index: "a"}).explain()
{
"cursor" : "BtreeCursor index_1",
"nscanned" : 1,
"nscannedObjects" : 1,
"n" : 1,
"millis" : 0,
"nYields" : 0,
"nChunkSkips" : 0,
"isMultiKey" : true,
"indexOnly" : false,
"indexBounds" : {
"index" : [
[
"a",
"a"
]
]
}
}
or if you're ever likely to query also by value:
db.ent.insert({
a:5775,
b:'b1',
index: [
{name: 'a', value: 5775},
{name: 'b', value: 'b1'}
]
})
That's also an indexable query:
db.end.find({"index.name": "a"}).explain()
{
"cursor" : "BtreeCursor index.name_",
"nscanned" : 1,
"nscannedObjects" : 1,
"n" : 1,
"millis" : 0,
"nYields" : 0,
"nChunkSkips" : 0,
"isMultiKey" : true,
"indexOnly" : false,
"indexBounds" : {
"index.name" : [
[
"a",
"a"
]
]
}
}
I think a sparse index is the answer to this, although you'll need an index for each field. http://www.mongodb.org/display/DOCS/Indexes#Indexes-SparseIndexes
Sparse indexes should help with $exists:true queries.
Even so, if your field is not really sparse (meaning it's mostly set), it's not going to help you that much.
Update: I guess I'm wrong. It looks like there's still an open issue (https://jira.mongodb.org/browse/SERVER-4187) where $exists doesn't use sparse indexes. However, you can do something like this with find and sort, which looks like it properly uses the sparse index:
db.ent.find({}).sort({a:1});
Here's a full demonstration of the difference, using your example values:
> db.ent.insert({'a':5775, 'b':'b1'})
> db.ent.insert({'c':'its a c', 'b':'b2'})
> db.ent.insert({'a':7557, 'c':'its a c'})
> db.ent.ensureIndex({a:1},{sparse:true});
Note that find({}).sort({a:1}) uses the index (BtreeCursor):
> db.ent.find({}).sort({a:1}).explain();
{
"cursor" : "BtreeCursor a_1",
"nscanned" : 2,
"nscannedObjects" : 2,
"n" : 2,
"millis" : 0,
"nYields" : 0,
"nChunkSkips" : 0,
"isMultiKey" : false,
"indexOnly" : false,
"indexBounds" : {
"a" : [
[
{
"$minElement" : 1
},
{
"$maxElement" : 1
}
]
]
}
}
And find({a:{$exists:true}}) does a full scan:
> db.ent.find({a:{$exists:true}}).explain();
{
"cursor" : "BasicCursor",
"nscanned" : 3,
"nscannedObjects" : 3,
"n" : 2,
"millis" : 0,
"nYields" : 0,
"nChunkSkips" : 0,
"isMultiKey" : false,
"indexOnly" : false,
"indexBounds" : {
}
}
Looks like you can also use .hint({a:1}) to force it to use the index.
> db.ent.find().hint({a:1}).explain();
{
"cursor" : "BtreeCursor a_1",
"nscanned" : 2,
"nscannedObjects" : 2,
"n" : 2,
"millis" : 0,
"nYields" : 0,
"nChunkSkips" : 0,
"isMultiKey" : false,
"indexOnly" : false,
"indexBounds" : {
"a" : [
[
{
"$minElement" : 1
},
{
"$maxElement" : 1
}
]
]
}
}
How about setting the non-existent fields to null? Then you can query them with {field: {$ne: null}}.
db.ent.insert({'a':5775, 'b':'b1', 'c': null})
db.ent.insert({'a': null, 'b':'b2', 'c':'its a c'})
db.ent.insert({'a':7557, 'b': null, 'c':'its a c'})
db.ent.ensureIndex({"a" : 1})
db.ent.ensureIndex({"b" : 1})
db.ent.ensureIndex({"c" : 1})
db.ent.find({'a':{$ne: null}}).explain()
Here's the output:
{
"cursor" : "BtreeCursor a_1 multi",
"isMultiKey" : false,
"n" : 4,
"nscannedObjects" : 4,
"nscanned" : 5,
"nscannedObjectsAllPlans" : 4,
"nscannedAllPlans" : 5,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"a" : [
[
{
"$minElement" : 1
},
null
],
[
null,
{
"$maxElement" : 1
}
]
]
},
"server" : "my-laptop"
}