Multikey sparse index on empty array in MongoDB - mongodb

What behavior is expected in this case? Will MongoDB treat an empty array as null/undefined and include it in the sparse index, or will a document with an empty array not be indexed at all?

Empty arrays are not treated the same as null in MongoDB. As you can see in the following code from the Mongo shell, a query using the sparse index matches the empty array as an empty array, not as null.
> c = db.docs
test.docs
> c.insert({a : []})
> c.ensureIndex({a : 1}, {sparse: true})
> c.find({a : []}).count()
1
> c.find({a : null}).count()
0
As with most interesting questions about MongoDB, using explain will provide a wealth of info. For example, you can see that the test actually uses the index and that the boundaries of the index are null and [], demonstrating their unique treatment.
> c.find({a : null}).explain()
{
"cursor" : "BtreeCursor a_1",
"isMultiKey" : false,
"n" : 0,
"nscannedObjects" : 0,
"nscanned" : 0,
"nscannedObjectsAllPlans" : 0,
"nscannedAllPlans" : 0,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"a" : [
[
null,
null
]
]
},
"server" : "new-host.home:27017"
}
> c.find({a : []}).explain()
{
"cursor" : "BtreeCursor a_1 multi",
"isMultiKey" : false,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"a" : [
[
null,
null
],
[
[ ],
[ ]
]
]
},
"server" : "new-host.home:27017"
}

Related

Query with $in and $nin doesn't use index

When matching an attribute against both $in and $nin, Mongo doesn't use the index correctly.
If only $in is used, then index takes advantage of that:
db.assets.find({
tags: {
$in: ['blah']
}
}).explain()
{
"cursor" : "BtreeCursor tags_1",
"isMultiKey" : true,
"n" : 6,
"nscannedObjects" : 6,
"nscanned" : 6,
"nscannedObjectsAllPlans" : 6,
"nscannedAllPlans" : 6,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"tags" : [
[ "blah", "blah" ]
]
}
}
However, if $nin is involved, instead of finding documents that match $in and then filtering out those that don't pass the $nin condition, it scans all documents.
db.assets.find({
tags: {
$in: ['blah'],
$nin: ['cat']
}
}).explain()
{
"cursor" : "BtreeCursor tags_1",
"isMultiKey" : true,
"n" : 75760,
"nscannedObjects" : 79974,
"nscanned" : 1197016,
"nscannedObjectsAllPlans" : 79974,
"nscannedAllPlans" : 1197130,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 9351,
"nChunkSkips" : 0,
"millis" : 2331,
"indexBounds" : {
"tags" : [
[ {"$minElement" : 1}, "cat" ],
[ "cat", {"$maxElement" : 1} ]
]
}
}
Is there a way to trick Mongo to do the right thing?

Why is regex prefix query on indexed array slow in MongoDB?

I am trying to perform regex query on an array of strings in MongoDB collection. I could only find this limitation in the docs:
$regex can only use an index efficiently when the regular expression
has an anchor for the beginning (i.e. ^) of a string and is a
case-sensitive match.
Let's make a test:
> for (var i=0; i<100000; i++) db.test.insert({f: ['a_0_'+i, 'a_1_2']})
> db.test.count()
100000
> db.test.ensureIndex({f: 1})
> db.test.find({f: /^a_(0)?_12$/ })
{ "_id" : ObjectId("514ac59886f004fe03ef2a96"), "f" : [ "a_0_12", "a_1_2" ] }
> db.test.find({f: /^a_(0)?_12$/ }).explain()
{
"cursor" : "BtreeCursor f_1 multi",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 200000,
"nscanned" : 200000,
"nscannedObjectsAllPlans" : 200000,
"nscannedAllPlans" : 200000,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 482,
"indexBounds" : {
"f" : [
[
"a_",
"a`"
],
[
/^a_(0)?_12$/,
/^a_(0)?_12$/
]
]
},
"server" : "someserver:27017"
}
The query is sloooow. On the other hand, this query is optimal: (but doesn't suit my use case)
> db.test.find({f: 'a_0_12' }).explain()
{
"cursor" : "BtreeCursor f_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"f" : [
[
"a_0_12",
"a_0_12"
]
]
},
"server" : "someserver:27017"
}
Why is regex query scanning all (sub)records when it has an index? What am I missing?
Your test case has several characteristics that are unhelpful for regex and index usage:
Each document includes an array of two values, both starting with "a_". Your regex /^a_(0)?_12$/ is looking for a string starting with "a_" followed by an optional "0", so it leads to a comparison of all index entries (200k values).
your regex also matches a value that every document has (a_1_2), so will end up matching all documents irrespective of the index
Since you have a multikey (array index), the number of index comparisons is actually worse than just doing a full table scan of the 100k documents. You can test with a $natural hint to see:
db.test.find({f: /^a_(0|)12$/ }).hint({$natural:1}).explain()
{
"cursor" : "BasicCursor",
"isMultiKey" : false,
"n" : 0,
"nscannedObjects" : 100000,
"nscanned" : 100000,
"nscannedObjectsAllPlans" : 100000,
"nscannedAllPlans" : 100000,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 192,
"indexBounds" : {
},
}
More random data or a more selective regex will result in fewer comparisons.

MongoDB outside range query

I am trying to query MongoDB to obtain something like:
"get persons with age not in the range [30,40]"
I am doing:
db.persons.find({'age' : {$nin : [{$lt : 30},{$gt : 40}]}})
which is not working for me. I know that I could do something like people with age<30 AND people with age>40 but I was wondering if I can use the "not in" operator...
thanks
What about using the OR conjunction like this:
db.persons.find({$or: [{'age': {$lt: 30}},{'age': {$gt : 40}}]})
$in / $nin are operators used for querying for discrete values in a list and can not be used for range searches.
In your example, the query with $nin would have to be
db.persons.find({age:{$nin:[30,31,32,33,34,35,36,37,38,39,40]}})
which is not at all practical and, furthermore, would not make use of an index:
db.persons.ensureIndex({age:1})
db.persons.find({age:{$nin:[30,31,32,33,34,35,36,37,38,39,40]}}).explain()
{
"cursor" : "BasicCursor",
"isMultiKey" : false,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
},
"server" : "Aspire-5750:27017"
}
Sgoettschkes' answer above is correct and would use the index:
db.persons.find({$or: [{'age': {$lt: 30}},{'age': {$gt : 40}}]}).explain()
{
"clauses" : [
{
"cursor" : "BtreeCursor age_1",
"isMultiKey" : false,
"n" : 0,
"nscannedObjects" : 0,
"nscanned" : 0,
"nscannedObjectsAllPlans" : 0,
"nscannedAllPlans" : 0,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 12,
"indexBounds" : {
"age" : [
[
-1.7976931348623157e+308,
30
]
]
}
},
{
"cursor" : "BtreeCursor age_1",
"isMultiKey" : false,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"age" : [
[
40,
1.7976931348623157e+308
]
]
}
}
],
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"millis" : 12,
"server" : "Aspire-5750:27017"
}
For more information on querying effectively, see http://docs.mongodb.org/manual/core/read-operations/

MongoDB: why doesn't sorting by multiple keys use an index?

Question:
I have a very large collection that is indexed by field ts: (timestamp)
> db.events.ensureIndex({'ts': -1})
I want to get last 5 entries. What surprises me is that the query doesn't use the index and is thus very slow:
> db.events.find().sort({'ts': -1, '_id': -1}).limit(5)
However, sorting just by ts or the other field uses index as it should:
> db.events.find().sort({'ts': -1}).limit(5)
> db.events.find().sort({'_id': -1}).limit(5)
Is this a bug in MongoDB, is this indeed a documented feature or am I doing something wrong?
Additional info:
> db.events.find().sort({'ts': -1, '_id': -1}).limit(5).explain()
{
"cursor" : "BasicCursor",
"nscanned" : 795609,
"nscannedObjects" : 795609,
"n" : 5,
"scanAndOrder" : true,
"millis" : 22866,
"nYields" : 73,
"nChunkSkips" : 0,
"isMultiKey" : false,
"indexOnly" : false,
"indexBounds" : {
}
}
> db.events.find().sort({'ts': -1}).limit(5).explain()
{
"cursor" : "BtreeCursor ts_-1",
"nscanned" : 5,
"nscannedObjects" : 5,
"n" : 5,
"millis" : 0,
"nYields" : 0,
"nChunkSkips" : 0,
"isMultiKey" : false,
"indexOnly" : false,
"indexBounds" : {
"ts" : [
[
{
"$maxElement" : 1
},
{
"$minElement" : 1
}
]
]
}
}
It's worth having a read of the Indexing Strategies section of the Indexing Advice & FAQ wiki page.
There are a few considerations that you may be missing:
MongoDB only uses one index per query
the sort column used must be the last column in the index
So, for your example you should add a compound index on ts and _id:
db.events.ensureIndex({'ts':-1, '_id':-1});
.. and confirm with explain() that the sort is now using the expected index:
> db.events.find().sort({'ts': -1, '_id':-1}).limit(5).explain()
{
"cursor" : "BtreeCursor ts_-1__id_-1",
"nscanned" : 5,
"nscannedObjects" : 5,
"n" : 5,
"millis" : 0,
"nYields" : 0,
"nChunkSkips" : 0,
"isMultiKey" : false,
"indexOnly" : false,
"indexBounds" : {
"ts" : [
[
{
"$maxElement" : 1
},
{
"$minElement" : 1
}
]
],
"_id" : [
[
{
"$maxElement" : 1
},
{
"$minElement" : 1
}
]
]
}
}

Improve querying fields exist in MongoDB

I'm in the process of evaluating MongoDB for our customers. Per the requirements, we need to associate a variable set of name-value pairs with some entity ent.
db.ent.insert({'a':5775, 'b':'b1'})
db.ent.insert({'c':'its a c', 'b':'b2'})
db.ent.insert({'a':7557, 'c':'its a c'})
After this I need to query ent intensively for the presence of fields:
db.ent.find({'a':{$exists:true}})
db.ent.find({'c':{$exists:false}})
Per MongoDB docs:
$exists is not very efficient even with an index, and esp. with {$exists:true} since it will effectively have to scan all indexed values.
Can the experts here suggest a more efficient way (even with a paradigm shift) to deal quickly with varying name-value pairs?
You can redesign your schema like this:
{
pairs:[
{k: "a", v: 5775},
{k: "b", v: "b1"},
]
}
Then index the key:
db.ent.ensureIndex({"pairs.k" : 1})
After this you will be able to search by exact match:
db.ent.find({'pairs.k':"a"})
If you go with a sparse index and your current schema, as proposed by @WesFreeman, you will need to create an index on each key you want to search. That can affect write performance, or will not be acceptable if your keys are not static.
Simply redesign your schema so that it's an indexable query. Your use case is in fact analogous to the first example application given in MongoDB: The Definitive Guide.
If you want/need the convenience of result.a just store the keys somewhere indexable.
instead of the existing:
db.ent.insert({a:5775, b:'b1'})
do
db.ent.insert({a:5775, b:'b1', index: ['a', 'b']})
That's then an indexable query:
db.ent.find({index: "a"}).explain()
{
"cursor" : "BtreeCursor index_1",
"nscanned" : 1,
"nscannedObjects" : 1,
"n" : 1,
"millis" : 0,
"nYields" : 0,
"nChunkSkips" : 0,
"isMultiKey" : true,
"indexOnly" : false,
"indexBounds" : {
"index" : [
[
"a",
"a"
]
]
}
}
or if you're ever likely to query also by value:
db.ent.insert({
a:5775,
b:'b1',
index: [
{name: 'a', value: 5775},
{name: 'b', value: 'b1'}
]
})
That's also an indexable query:
db.ent.find({"index.name": "a"}).explain()
{
"cursor" : "BtreeCursor index.name_",
"nscanned" : 1,
"nscannedObjects" : 1,
"n" : 1,
"millis" : 0,
"nYields" : 0,
"nChunkSkips" : 0,
"isMultiKey" : true,
"indexOnly" : false,
"indexBounds" : {
"index.name" : [
[
"a",
"a"
]
]
}
}
I think a sparse index is the answer to this, although you'll need an index for each field. http://www.mongodb.org/display/DOCS/Indexes#Indexes-SparseIndexes
Sparse indexes should help with $exists:true queries.
Even still, if your field is not really sparse (meaning it's mostly set), it's not going to help you that much.
Update: I guess I was wrong. It looks like there's still an open issue ( https://jira.mongodb.org/browse/SERVER-4187 ) that $exists doesn't use sparse indexes. However, you can do something like this with find and sort, which looks like it properly uses the sparse index:
db.ent.find({}).sort({a:1});
Here's a full demonstration of the difference, using your example values:
> db.ent.insert({'a':5775, 'b':'b1'})
> db.ent.insert({'c':'its a c', 'b':'b2'})
> db.ent.insert({'a':7557, 'c':'its a c'})
> db.ent.ensureIndex({a:1},{sparse:true});
Note that find({}).sort({a:1}) uses the index (BtreeCursor):
> db.ent.find({}).sort({a:1}).explain();
{
"cursor" : "BtreeCursor a_1",
"nscanned" : 2,
"nscannedObjects" : 2,
"n" : 2,
"millis" : 0,
"nYields" : 0,
"nChunkSkips" : 0,
"isMultiKey" : false,
"indexOnly" : false,
"indexBounds" : {
"a" : [
[
{
"$minElement" : 1
},
{
"$maxElement" : 1
}
]
]
}
}
And find({a:{$exists:true}}) does a full scan:
> db.ent.find({a:{$exists:true}}).explain();
{
"cursor" : "BasicCursor",
"nscanned" : 3,
"nscannedObjects" : 3,
"n" : 2,
"millis" : 0,
"nYields" : 0,
"nChunkSkips" : 0,
"isMultiKey" : false,
"indexOnly" : false,
"indexBounds" : {
}
}
Looks like you can also use .hint({a:1}) to force it to use the index.
> db.ent.find().hint({a:1}).explain();
{
"cursor" : "BtreeCursor a_1",
"nscanned" : 2,
"nscannedObjects" : 2,
"n" : 2,
"millis" : 0,
"nYields" : 0,
"nChunkSkips" : 0,
"isMultiKey" : false,
"indexOnly" : false,
"indexBounds" : {
"a" : [
[
{
"$minElement" : 1
},
{
"$maxElement" : 1
}
]
]
}
}
How about setting the non-existent fields to null? Then you can query them with {field: {$ne: null}}.
db.ent.insert({'a':5775, 'b':'b1', 'c': null})
db.ent.insert({'a': null, 'b':'b2', 'c':'its a c'})
db.ent.insert({'a':7557, 'b': null, 'c':'its a c'})
db.ent.ensureIndex({"a" : 1})
db.ent.ensureIndex({"b" : 1})
db.ent.ensureIndex({"c" : 1})
db.ent.find({'a':{$ne: null}}).explain()
Here's the output:
{
"cursor" : "BtreeCursor a_1 multi",
"isMultiKey" : false,
"n" : 4,
"nscannedObjects" : 4,
"nscanned" : 5,
"nscannedObjectsAllPlans" : 4,
"nscannedAllPlans" : 5,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"a" : [
[
{
"$minElement" : 1
},
null
],
[
null,
{
"$maxElement" : 1
}
]
]
},
"server" : "my-laptop"
}