MongoDB Aggregation is slow on Indexed fields - mongodb

I have a collection with ~2.5m documents, the collection size is 14,1GB, storage size 4.2GB and average object size 5,8KB. I created two separate indexes on two of the fields dataSourceName and version (text fields) and tried to make an aggregate query to list their 'grouped by' values.
(Trying to achieve this: select dsn, v from collection group by dsn, v).
db.getCollection("the-collection").aggregate(
[
{
"$group" : {
"_id" : {
"dataSourceName" : "$dataSourceName",
"version" : "$version"
}
}
}
],
{
"allowDiskUse" : false
}
);
Even though MongoDB eats ~10GB RAM on the server, the fields are indexed and nothing else is running at all, the aggregation takes ~40 seconds.
I tried to make a new index, which contains both fields in order, but still, the query does not seem to use the index:
{
"stages" : [
{
"$cursor" : {
"query" : {
},
"fields" : {
"dataSourceName" : NumberInt(1),
"version" : NumberInt(1),
"_id" : NumberInt(0)
},
"queryPlanner" : {
"plannerVersion" : NumberInt(1),
"namespace" : "db.the-collection",
"indexFilterSet" : false,
"parsedQuery" : {
},
"winningPlan" : {
"stage" : "COLLSCAN",
"direction" : "forward"
},
"rejectedPlans" : [
]
}
}
},
{
"$group" : {
"_id" : {
"dataSourceName" : "$dataSourceName",
"version" : "$version"
}
}
}
],
"ok" : 1.0
}
I am using MongoDB 3.6.5 64bit on Windows, so it should use the indexes: https://docs.mongodb.com/master/core/aggregation-pipeline/#pipeline-operators-and-indexes
As #Alex-Blex suggested, I tried it with sorting, but I an get OOM error:
The following error occurred while attempting to execute the aggregate query
Mongo Server error (MongoCommandException): Command failed with error 16819: 'Sort exceeded memory limit of 104857600 bytes, but did not opt in to external sorting. Aborting operation. Pass allowDiskUse:true to opt in.' on server server-address:port.
The full response is:
{
"ok" : 0.0,
"errmsg" : "Sort exceeded memory limit of 104857600 bytes, but did not opt in to external sorting. Aborting operation. Pass allowDiskUse:true to opt in.",
"code" : NumberInt(16819),
"codeName" : "Location16819"
}
My bad, I tried it on the wrong collection... Adding the same sort as the index works, now it is using the index. Still not fast thought, took ~10 seconds to give me the results.
The new exaplain:
{
"stages" : [
{
"$cursor" : {
"query" : {
},
"sort" : {
"dataSourceName" : NumberInt(1),
"version" : NumberInt(1)
},
"fields" : {
"dataSourceName" : NumberInt(1),
"version" : NumberInt(1),
"_id" : NumberInt(0)
},
"queryPlanner" : {
"plannerVersion" : NumberInt(1),
"namespace" : "....",
"indexFilterSet" : false,
"parsedQuery" : {
},
"winningPlan" : {
"stage" : "PROJECTION",
"transformBy" : {
"dataSourceName" : NumberInt(1),
"version" : NumberInt(1),
"_id" : NumberInt(0)
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"dataSourceName" : NumberInt(1),
"version" : NumberInt(1)
},
"indexName" : "dataSourceName_1_version_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"dataSourceName" : [
],
"version" : [
]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "forward",
"indexBounds" : {
"dataSourceName" : [
"[MinKey, MaxKey]"
],
"version" : [
"[MinKey, MaxKey]"
]
}
}
},
"rejectedPlans" : [
]
}
}
},
{
"$group" : {
"_id" : {
"dataSourceName" : "$dataSourceName",
"version" : "$version"
}
}
}
],
"ok" : 1.0
}

The page you are referring to says exactly opposite:
The $match and $sort pipeline operators can take advantage of an index
Your first stage is $group, which is neither $match nor $sort.
Try to sort it on the first stage to trigger use of the index:
db.getCollection("the-collection").aggregate(
[
{ $sort: { dataSourceName:1, version:1 } },
{
"$group" : {
"_id" : {
"dataSourceName" : "$dataSourceName",
"version" : "$version"
}
}
}
],
{
"allowDiskUse" : false
}
);
Please note, it should be a single compound index with the same fields and sorting:
db.getCollection("the-collection").createIndex({ dataSourceName:1, version:1 })

Related

MongoDB $group + $sum aggregation is very slow

I have an aggregation query in MongoDB:
[{
$group: {
_id: '$status',
status: {
$sum: 1
}
}
}]
It is running on a collection that has ~80 million documents. The status field is indexed, yet the query is very slow and runs for around 60 seconds or more.
I did an explain() on the query, but still got almost nowhere:
{
"explainVersion" : "1",
"stages" : [
{
"$cursor" : {
"queryPlanner" : {
"namespace" : "loa.document",
"indexFilterSet" : false,
"parsedQuery" : {
},
"queryHash" : "B9878693",
"planCacheKey" : "8EAA28C6",
"maxIndexedOrSolutionsReached" : false,
"maxIndexedAndSolutionsReached" : false,
"maxScansToExplodeReached" : false,
"winningPlan" : {
"stage" : "PROJECTION_SIMPLE",
"transformBy" : {
"status" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "COLLSCAN",
"direction" : "forward"
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$status",
"status" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"serverInfo" : {
"host" : "rack-compute-2",
"port" : 27017,
"version" : "5.0.6",
"gitVersion" : "212a8dbb47f07427dae194a9c75baec1d81d9259"
},
"serverParameters" : {
"internalQueryFacetBufferSizeBytes" : 104857600,
"internalQueryFacetMaxOutputDocSizeBytes" : 104857600,
"internalLookupStageIntermediateDocumentMaxSizeBytes" : 104857600,
"internalDocumentSourceGroupMaxMemoryBytes" : 104857600,
"internalQueryMaxBlockingSortMemoryUsageBytes" : 104857600,
"internalQueryProhibitBlockingMergeOnMongoS" : 0,
"internalQueryMaxAddToSetBytes" : 104857600,
"internalDocumentSourceSetWindowFieldsMaxMemoryBytes" : 104857600
},
"command" : {
"aggregate" : "document",
"pipeline" : [
{
"$group" : {
"_id" : "$status",
"status" : {
"$sum" : 1
}
}
}
],
"explain" : true,
"cursor" : {
},
"lsid" : {
"id" : UUID("a07e17fe-65ff-4d38-966f-7517b7a5d3f2")
},
"$db" : "loa"
},
"ok" : 1
}
I see that it does a full COLLSCAN, I just can't understand why.
I plan on supporting a couple hundred million (or even a billion) documents in that collection, but this problem hijacks my plans for seemingly no reason.
You can advice the query planner to use the index as follow:
db.test.explain("executionStats").aggregate(
[
{$group:{ _id:"$status" ,status:{$sum:1} }}
],
{hint:"status_1"}
)
Make sure the index name in the hint is same as created ...
(db.test.getIndexes() will show you the exact index name )

MongoDB Shard-Key Performance Simple

I have a cluster with 3 config dbs and 3 shards. I am querying against a db with 106M records, each with 410 fields. I imported this data using a shard key of:
{state: 1, zipCode: 1}.
When I run the following queries individually each one completes in less than 5sec. (SC = 1.6M records, NC = 5.2M records)
db.records.find( { "state" : "NC" } ).count()
db.records.find( { "state" : "SC" } ).count()
db.records.find( { "state" : { $in : ["NC"] } } ).count()
db.records.find( { "state" : { $in : ["SC"] } } ).count()
However when I query both states using an $in or an $or the query takes over an hour to complete.
db.records.find( "state" : { $in : [ "NC" , "SC" ] } ).count()
db.records.find({ $or : [ { "state" : "NC" }, { "state" : "SC" } ).count()
The entirety of both states exist on 1 shard. Below are the results of .explain() using the $in query:
db.records.find({"state":{$in:["NC","SC"]}}).explain()
{
"queryPlanner" : {
"mongosPlannerVersion" : 1,
"winningPlan" : {
"stage" : "SINGLE_SHARD",
"shards" : [
{
"shardName" : "s2",
"connectionString" : "s2/192.168.2.17:27000,192.168.2.17:27001",
"serverInfo" : {
"host" : "MonDbShard2",
"port" : 27000,
"version" : "3.4.7",
"gitVersion" : "cf38c1b8a0a8dca4a11737581beafef4fe120bcd"
},
"plannerVersion" : 1,
"namespace" : "DBNAME.records",
"indexFilterSet" : false,
"parsedQuery" : {
"state" : {
"$in" : [
"NC",
"SC"
]
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "SHARDING_FILTER",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"state" : 1,
"zipCode" : 1
},
"indexName" : "state_1_zipCode_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"state" : [ ],
"zipCode" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"state" : [
"[\"NC\", \"NC\"]",
"[\"SC\", \"SC\"]"
],
"zipCode" : [
"[MinKey, MaxKey]"
]
}
}
}
},
"rejectedPlans" : [ ]
}
]
}
},
"ok" : 1
}
Why would querying two states at one time cause such a drastic difference in completion time? Also, querying a single zipCode without also including the corresponding state, results in the same drastic difference in completion time. I feel like I am misunderstanding how the shard-key actually operates. Any thoughts?

Tune Up Mongo Query

I am new to Mongo and was trying to get distinct count of users. The field Id and Status are not individually Indexed columns but there exists a composite index on both the field. My current query is something like this where the match conditions changes depending on the requirements.
DBQuery.shellBatchSize = 1000000;
db.getCollection('username').aggregate([
{$match:
{ Status: "A"
} },
{"$group" : {_id:"$Id", count:{$sum:1}}}
]);
Is there anyway we can optimize this query more or add parallel runs on collection so that we can achieve results faster ?
Regards
You can tune your aggregation pipelines by passing in an option of explain=true in the aggregate method.
db.getCollection('username').aggregate([
{$match: { Status: "A" } },
{"$group" : {_id:"$Id", count:{$sum:1}}}],
{ explain: true });
This will then output the following to work with
{
"stages" : [
{
"$cursor" : {
"query" : {
"Status" : "A"
},
"fields" : {
"Id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.usernames",
"indexFilterSet" : false,
"parsedQuery" : {
"Status" : {
"$eq" : "A"
}
},
"winningPlan" : {
"stage" : "EOF"
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$Id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
So to speed up our query we need a index to help the match part of the pipeline, so let's create a index on Status
> db.usernames.createIndex({Status:1})
{
"createdCollectionAutomatically" : true,
"numIndexesBefore" : 1,
"numIndexesAfter" : 2,
"ok" : 1
}
If we now run the explain again we'll get the following results
{
"stages" : [
{
"$cursor" : {
"query" : {
"Status" : "A"
},
"fields" : {
"Id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.usernames",
"indexFilterSet" : false,
"parsedQuery" : {
"Status" : {
"$eq" : "A"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"Status" : 1
},
"indexName" : "Status_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"Status" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"Status" : [
"[\"A\", \"A\"]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$Id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
We can now see straight away this is using a index.
https://docs.mongodb.com/manual/reference/explain-results/

MongoDB optimize indexes for aggregation

I have an aggregate on a collection with about 1.6M of registers. That consult is a simple example of other more complex, but illustrate the poor optimization of index used in my opinion.
db.getCollection('cbAlters').runCommand("aggregate", {pipeline: [
{
$match: { cre_carteraId: "31" }
},
{
$group: { _id: { ca_tramomora: "$cre_tramoMora" },
count: { $sum: 1 } }
}
]})
That query toke about 5 sec. The colleccion have 25 indexes configured to differents consults. The one used according to query explain is:
{
"v" : 1,
"key" : {
"cre_carteraId" : 1,
"cre_periodo" : 1,
"cre_tramoMora" : 1,
"cre_inactivo" : 1
},
"name" : "cartPerTramInact",
"ns" : "basedatos.cbAlters"
},
I created an index adjusted to this particular query:
{
"v" : 1,
"key" : {
"cre_carteraId" : 1,
"cre_tramoMora" : 1
},
"name" : "cartPerTramTest",
"ns" : "basedatos.cbAlters"
}
The query optimizer reject this index, and suggests me to use the initial index. Output of my query explain seem like this:
{
"waitedMS" : NumberLong(0),
"stages" : [
{
"$cursor" : {
"query" : {
"cre_carteraId" : "31"
},
"fields" : {
"cre_tramoMora" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "basedatos.cbAlters",
"indexFilterSet" : false,
"parsedQuery" : {
"cre_carteraId" : {
"$eq" : "31"
}
},
"winningPlan" : {
"stage" : "PROJECTION",
"transformBy" : {
"cre_tramoMora" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"cre_carteraId" : 1,
"cre_periodo" : 1,
"cre_tramoMora" : 1,
"cre_inactivo" : 1
},
"indexName" : "cartPerTramInact",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"cre_carteraId" : [
"[\"31\", \"31\"]"
],
"cre_periodo" : [
"[MinKey, MaxKey]"
],
"cre_tramoMora" : [
"[MinKey, MaxKey]"
],
"cre_inactivo" : [
"[MinKey, MaxKey]"
]
}
}
},
"rejectedPlans" : [
{
"stage" : "PROJECTION",
"transformBy" : {
"cre_tramoMora" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"cre_carteraId" : 1,
"cre_tramoMora" : 1
},
"indexName" : "cartPerTramTest",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"cre_carteraId" : [
"[\"31\", \"31\"]"
],
"cre_tramoMora" : [
"[MinKey, MaxKey]"
]
}
}
}
]
}
}
},
{
"$group" : {
"_id" : {
"ca_tramomora" : "$cre_tramoMora"
},
"count" : {
"$sum" : {
"$const" : 1.0
}
}
}
}
],
"ok" : 1.0
}
Then, why optimizer prefers an index less adjusted? Should indexFilterSet (result filtered for index) be true for this aggregate?
How can I improve this index, or something goes wrong with the query?
I do not have much experience with mongoDB, I appreciate any help
As long as you have index cartPerTramInact, optimizer won't use your cartPerTramTest index because first fields are same and in same order.
This goes with other indexes too. When there is indexes what have same keys at same order (like a.b.c.d, a.b.d, a.b) and you query use fields a.b, it will favour that a.b.c.d. Anyway you don't need that index a.b because you already have two indexes what covers a.b (a.b.c.d and a.b.d)
Index a.b.d is used only when you do query with those fields a.b.d, BUT if a.b is already very selective, it's probably faster to do select with index a.b.c.d using only part a.b and do "full table scan" to find that d
There is a hint option for aggregations that can help with the index...
See https://www.mongodb.com/docs/upcoming/reference/method/db.collection.aggregate/#mongodb-method-db.collection.aggregate

Mongo $group too slow

I have a mongo db collections of about 168,200,000 documents. I am trying to get the average of a certain field with $group, and I am using $match before the $group in the pipeline to use the index on client.city. But the query is taking about 5 minutes to run, which is very slow.
Here are the things I tried:
db.ar12.aggregate(
{$match:{'client.city':'New York'}},
{'$group':{'_id':'client.city', 'avg':{'$avg':'$length'}}}
)
db.ar12.aggregate(
{$match:{'client.city':'New York'}},
{'$group':{'_id':null, 'avg':{'$avg':'$length'}}}
)
db.ar12.aggregate(
{$match:{'client.city':'New York'}},
{$project: {'length':1}},
{'$group':{'_id':null, 'avg':{'$avg':'$length'}}}
)
All 3 queries take about the same time, number of documents with client.city = to New York is 1,231,672, find({'client.city':'New York').count() takes a second to run
> db.version()
3.2.0
EDIT
Here's the explain result... As for the comment for adding a compound index with length, would that help, although I am not search by length I want all lengthes...
{
"waitedMS" : NumberLong(0),
"stages" : [
{
"$cursor" : {
"query" : {
"client.city" : "New York"
},
"fields" : {
"length" : 1,
"_id" : 1
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "clients.ar12",
"indexFilterSet" : false,
"parsedQuery" : {
"client.city" : {
"$eq" : "New York"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"client.city" : 1
},
"indexName" : "client.city_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"client.city" : [
"[\"New York\", \"New York\"]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$project" : {
"length" : true
}
},
{
"$group" : {
"_id" : {
"$const" : null
},
"total" : {
"$avg" : "$length"
}
}
}
],
"ok" : 1
}
EDIT 2
I have added a compound index of client.city and length, but to no avail the speed is still too slow, I tried these 2 queries:
db.ar12.aggregate(
{$match: {'client.city':'New York'}},
{$project: {'client.city':1, 'length':1}},
{'$group':{'_id':'$client.city', 'avg':{'$avg':'$length'}}}
)
The above query wasn't using the compound index, so I tried this to force using it, and still nothing changed:
db.ar12.aggregate(
{$match: { $and : [{'client.city':'New York'}, {'length':{'$gt':0}}]}},
{$project: {'client.city':1, 'length':1}},
{'$group':{'_id':'$client.city', 'avg':{'$avg':'$length'}}}
)
below is the explain of the last query:
{
"waitedMS" : NumberLong(0),
"stages" : [
{
"$cursor" : {
"query" : {
"$and" : [
{
"client.city" : "New York"
},
{
"length" : {
"$gt" : 0
}
}
]
},
"fields" : {
"client.city" : 1,
"length" : 1,
"_id" : 1
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "clients.ar12",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"client.city" : {
"$eq" : "New York"
}
},
{
"length" : {
"$gt" : 0
}
}
]
},
"winningPlan" : {
"stage" : "CACHED_PLAN",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"client.city" : 1,
"length" : 1
},
"indexName" : "client.city_1_length_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"client.city" : [
"[\"New York\", \"New York\"]"
],
"length" : [
"(0.0, inf.0]"
]
}
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$project" : {
"client" : {
"city" : true
},
"length" : true
}
},
{
"$group" : {
"_id" : "$client.city",
"avg" : {
"$avg" : "$length"
}
}
}
],
"ok" : 1
}
I have found a work around, length goes from 1 till 70. So what I did is in python I iterated from 1 to 70, and found the count of each length for each city,
db.ar12.find({'client.city':'New York', 'length':i}).count()
which is very fast, then calculated the average in python, it is taking about 2 seconds to run.
This is not the best solution, since I have other queries to run, I don't know if I can find a work around for all of them...