MongoDB inconsistent aggregate call between queries - mongodb

I have two collections: videos and youtubes. I want to do a $lookup on videos.youtube, matching it to youtubes._id, and then $match the result based on a youtubes field. This works fine, but there are huge inconsistencies in execution time between queries that should perform identically, or at the very least close to it.
Query 1: returns 8261 documents. Takes [40, 50]ms to execute
db.getCollection('videos').aggregate([
{ '$sort': { date: -1 } },
{
'$lookup': {
from: 'youtubes',
localField: 'youtube',
foreignField: '_id',
as: 'youtube'
}
},
{ '$match': { 'youtube.talent': true } },
])
Query 2: returns 760 documents. Takes [470, 500]ms to execute
db.getCollection('videos').aggregate([
{ '$sort': { date: -1 } },
{
'$lookup': {
from: 'youtubes',
localField: 'youtube',
foreignField: '_id',
as: 'youtube'
}
},
{ '$match': { 'youtube.id': 7 } },
])
Query 3: returns 760 documents. Takes [90, 100]ms to execute
db.getCollection('videos').aggregate([
// { '$sort': { date: -1 } },
{
'$lookup': {
from: 'youtubes',
localField: 'youtube',
foreignField: '_id',
as: 'youtube'
}
},
{ '$match': { 'youtube.id': 7 } },
])
All fields used in the queries are indexed. What stands out is that the $sort stage in Query 2 apparently takes roughly 400ms to execute, yet Query 1, which uses the same $sort stage in the same location in the pipeline, takes only [40, 50]ms.
I've used the { explain: true } option to look for differences between Query 1 and Query 2 that could explain the speed differences, but they are identical except for the $match portion.
Any solution/suggestions for bringing Query 2 up to speed with Query 1? Or at the very least an explanation for the huge differences in speed?
Another weird thing discovered while making this post
Query 4: returns 9378 documents. Takes [25, 35]ms to execute
db.getCollection('videos').aggregate([
{ '$sort': { date: -1 } },
{
'$lookup': {
from: 'youtubes',
localField: 'youtube',
foreignField: '_id',
as: 'youtube'
}
},
{ '$match': { 'youtube.clipper': true } }
])
Query 5: returns 9378 documents. Takes [600, 680]ms to execute
db.getCollection('videos').aggregate([
//{ '$sort': { date: -1 } },
{
'$lookup': {
from: 'youtubes',
localField: 'youtube',
foreignField: '_id',
as: 'youtube'
}
},
{ '$match': { 'youtube.clipper': true } }
])
At this point I'm stumped as to what is happening. Originally I thought it had to do with Number vs Boolean, but as Query 4 and Query 5 show, that clearly has no impact, and the behavior seems random.
Indexes just in case (for youtubes)
[
{
"v" : 2,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "holo-watcher.youtubes"
},
{
"v" : 2,
"unique" : true,
"key" : {
"id" : 1
},
"name" : "id_1",
"ns" : "holo-watcher.youtubes",
"background" : true
},
{
"v" : 2,
"key" : {
"name" : 1
},
"name" : "name_1",
"ns" : "holo-watcher.youtubes",
"background" : true
},
{
"v" : 2,
"unique" : true,
"key" : {
"channelId" : 1
},
"name" : "channelId_1",
"ns" : "holo-watcher.youtubes",
"background" : true
},
{
"v" : 2,
"key" : {
"clipper" : 1
},
"name" : "clipper_1",
"ns" : "holo-watcher.youtubes",
"background" : true
},
{
"v" : 2,
"key" : {
"talent" : 1
},
"name" : "talent_1",
"ns" : "holo-watcher.youtubes",
"background" : true
},
{
"v" : 2,
"key" : {
"debut" : 1
},
"name" : "debut_1",
"ns" : "holo-watcher.youtubes",
"background" : true
}
]
indexes (for videos)
[
{
"v" : 2,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "holo-watcher.videos"
},
{
"v" : 2,
"unique" : true,
"key" : {
"videoId" : 1
},
"name" : "videoId_1",
"ns" : "holo-watcher.videos",
"background" : true
},
{
"v" : 2,
"key" : {
"title" : 1
},
"name" : "title_1",
"ns" : "holo-watcher.videos",
"background" : true
},
{
"v" : 2,
"key" : {
"date" : 1
},
"name" : "date_1",
"ns" : "holo-watcher.videos",
"background" : true
}
]
{ explain: true } output for Query 5 (nearly identical to Query 1 and Query 2):
{
"stages" : [
{
"$cursor" : {
"query" : {},
"sort" : {
"date" : -1
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "holo-watcher.videos",
"indexFilterSet" : false,
"parsedQuery" : {},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"date" : 1
},
"indexName" : "date_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"date" : []
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "backward",
"indexBounds" : {
"date" : [
"[MaxKey, MinKey]"
]
}
}
},
"rejectedPlans" : []
}
}
},
{
"$lookup" : {
"from" : "youtubes",
"as" : "youtube",
"localField" : "youtube",
"foreignField" : "_id"
}
},
{
"$match" : {
"youtube.clipper" : {
"$eq" : true
}
}
}
],
"ok" : 1.0
}

Related

MongoDB $group + $sum aggregation is very slow

I have an aggregation query in MongoDB:
[{
$group: {
_id: '$status',
status: {
$sum: 1
}
}
}]
It is running on a collection that has ~80 million documents. The status field is indexed, yet the query is very slow and runs for around 60 seconds or more.
I did an explain() on the query, but still got almost nowhere:
{
"explainVersion" : "1",
"stages" : [
{
"$cursor" : {
"queryPlanner" : {
"namespace" : "loa.document",
"indexFilterSet" : false,
"parsedQuery" : {
},
"queryHash" : "B9878693",
"planCacheKey" : "8EAA28C6",
"maxIndexedOrSolutionsReached" : false,
"maxIndexedAndSolutionsReached" : false,
"maxScansToExplodeReached" : false,
"winningPlan" : {
"stage" : "PROJECTION_SIMPLE",
"transformBy" : {
"status" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "COLLSCAN",
"direction" : "forward"
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$status",
"status" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"serverInfo" : {
"host" : "rack-compute-2",
"port" : 27017,
"version" : "5.0.6",
"gitVersion" : "212a8dbb47f07427dae194a9c75baec1d81d9259"
},
"serverParameters" : {
"internalQueryFacetBufferSizeBytes" : 104857600,
"internalQueryFacetMaxOutputDocSizeBytes" : 104857600,
"internalLookupStageIntermediateDocumentMaxSizeBytes" : 104857600,
"internalDocumentSourceGroupMaxMemoryBytes" : 104857600,
"internalQueryMaxBlockingSortMemoryUsageBytes" : 104857600,
"internalQueryProhibitBlockingMergeOnMongoS" : 0,
"internalQueryMaxAddToSetBytes" : 104857600,
"internalDocumentSourceSetWindowFieldsMaxMemoryBytes" : 104857600
},
"command" : {
"aggregate" : "document",
"pipeline" : [
{
"$group" : {
"_id" : "$status",
"status" : {
"$sum" : 1
}
}
}
],
"explain" : true,
"cursor" : {
},
"lsid" : {
"id" : UUID("a07e17fe-65ff-4d38-966f-7517b7a5d3f2")
},
"$db" : "loa"
},
"ok" : 1
}
I see that it does a full COLLSCAN, I just can't understand why.
I plan on supporting a couple hundred million (or even a billion) documents in that collection, but this problem hijacks my plans for seemingly no reason.
You can advise the query planner to use the index as follows:
db.test.explain("executionStats").aggregate(
[
{$group:{ _id:"$status" ,status:{$sum:1} }}
],
{hint:"status_1"}
)
Make sure the index name in the hint is the same as the one that was created
(db.test.getIndexes() will show you the exact index name).

mongodb aggregate query not using index sort

Hi, an example of my documents is shown below.
{
"_id" : ObjectId("5ee2234fde52e50d9520f6d7"),
"ClientMessageId" : "EAAB38DD88551BF180FA005056BB48A4",
"ClientMessageType" : "UpdateSalesModelAndVendor",
"Sku" : "HBV00000XXXXX",
"TraceId" : "EAAB38DD885504F180FA005056BB48A4",
"Data" : [
{
"_id" : ObjectId("5ee2234fde52e50d9520f6d8"),
"Sku" : "HBV00000XXXXX",
"IsActive" : false,
"Version" : NumberInt(1),
"CommandName" : "UpdateSalesModelAndVendorCommand",
"TraceId" : "EAAB38DD885504F180FA005056BB48A4",
"ClientMessageType" : "UpdateSalesModelAndVendor",
"ClientMessageId" : "EAAB38DD88551BF180FA005056BB48A4",
"ClientUsername" : "USER",
"ClientDateTime" : ISODate("2020-06-10T16:38:55.000+0000"),
"ReceivedAt" : ISODate("2020-06-10T16:42:23.992+0000"),
"InfoMessages" : null,
"WarningMessages" : null,
"ErrorMessages" : [
]
}
],
"MessageType" : "Listing.RetailListingCreated",
"__v" : NumberInt(0)
}
I have these indexes (the default _id index plus two on Data fields):
[
{
"v" : 2.0,
"key" : {
"_id" : 1.0
},
"name" : "_id_"
},
{
"v" : 2.0,
"key" : {
"Data.Sku" : 1.0
},
"name" : "Data.Sku_1",
"background" : true
},
{
"v" : 2.0,
"key" : {
"Data.Sku" : 1.0,
"Data.ReceivedAt" : -1.0
},
"name" : "Data.Sku_1_Data.ReceivedAt_-1",
"background" : true
}
]
This query takes 0.8 ms (index used):
db.eventlogs.aggregate([
{ $sort: { "Data.Sku": -1 } },
{ $unwind: "$Data" },
{ "$skip": 0 }, { "$limit": 50 },
],
{ allowDiskUse: true }
)
But this query takes over 2 minutes (index not used):
db.eventlogs.aggregate([
{ $sort: { "Data.ReceivedAt": -1 } },
{ $unwind: "$Data" },
{ "$skip": 0 }, { "$limit": 50 },
],
{ allowDiskUse: true }
)
How can I solve this problem? Should I create a new index for ReceivedAt?
db.eventlogs.createIndex( { "Data.ReceivedAt": -1}, { "background": true});

MongoDB aggregate count is too much slow

I have around 60 thousand documents in the users collection, and have the following query:
db.getCollection('users').aggregate([
{"$match":{"userType":"employer"}},
{"$lookup":{"from":"companies","localField":"_id","foreignField":"owner.id","as":"company"}},
{"$unwind":"$company"},
{"$lookup":{"from":"companytypes","localField":"company.type.id","foreignField":"_id","as":"companyType"}},
{"$unwind":"$companyType"},
{ $group: { _id: null, count: { $sum: 1 } } }
])
It takes around 12 seconds to count. I even call the count function before the list function, but my list function with limit: 10 responds faster than the count.
And the following is the explain result:
{
"stages" : [
{
"$cursor" : {
"query" : {
"userType" : "employer"
},
"fields" : {
"company" : 1,
"_id" : 1
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "jobs.users",
"indexFilterSet" : false,
"parsedQuery" : {
"userType" : {
"$eq" : "employer"
}
},
"winningPlan" : {
"stage" : "COLLSCAN",
"filter" : {
"userType" : {
"$eq" : "employer"
}
},
"direction" : "forward"
},
"rejectedPlans" : []
}
}
},
{
"$lookup" : {
"from" : "companies",
"as" : "company",
"localField" : "_id",
"foreignField" : "owner.id",
"unwinding" : {
"preserveNullAndEmptyArrays" : false
}
}
},
{
"$match" : {
"$nor" : [
{
"company" : {
"$eq" : []
}
}
]
}
},
{
"$group" : {
"_id" : {
"$const" : null
},
"total" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : false,
"total" : true
}
}
],
"ok" : 1.0
}
$lookup operations are slow since they mimic left join behavior. From the docs:
$lookup performs an equality match on the localField to the
foreignField from the documents of the from collection
Hence, if there are no indexes on the fields used for joining the collections, MongoDB is forced to do a collection scan.
Adding an index on the foreignField attributes should prevent a collection scan and can improve performance by as much as an order of magnitude.

Mongo aggregation performance

I am new to Mongo, and the query below performs really slowly with a record set of over 2 million records.
Query
db.testCollection.aggregate({
$match: {
active: {
$ne: false
}
}
}, {
$group: {
_id: {
productName: "$productName",
model: "$model",
version: "$version",
uid: "$uid"
},
total: {
$sum: 1
}
}
}, {
$project: {
total: 1,
model: "$_id.model",
version: "$_id.version",
uid: "$_id.uid",
productName: "$_id.productName"
}
}, {
$sort: {
model: 1
}
})
explain()
{
"stages" : [
{
"$cursor" : {
"query" : {
"active" : {
"$ne" : false
}
},
"fields" : {
"version" : 1,
"productName" : 1,
"model" : 1,
"uid" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "fms2.device",
"indexFilterSet" : false,
"parsedQuery" : {
"$nor" : [
{
"active" : {
"$eq" : false
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"active" : 1
},
"indexName" : "active",
"isMultiKey" : false,
"multiKeyPaths" : {
"active" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"active" : [
"[MinKey, false)",
"(false, MaxKey]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : {
"productName" : "$productName",
"model" : "$model",
"version" : "$version",
"uid" : "$uid"
},
"total" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : true,
"total" : true,
"model" : "$_id.model",
"version" : "$_id.version",
"uid" : "$_id.uid",
"productName" : "$_id.productName"
}
},
{
"$sort" : {
"sortKey" : {
"model" : 1
}
}
}
],
"ok" : 1
}
Is there a way to optimize this query further? I had a look at https://docs.mongodb.com/manual/core/aggregation-pipeline-optimization/ as well, but most of the suggestions there are not applicable to this query.
Not sure if it matters, but the result of this aggregation ends up with only 20-30 records.

Realtime Mongo aggregation is slow

I am developing the dashboard of our application and I was wondering if it's normal for MongoDB to be so slow for some aggregate queries. I am pasting below a simplified version of one of our queries:
db.data_items.aggregate([
{'$match':
{
'organization_subscription_waves.wave_id':ObjectId('5617fe6abecf0500f9c6e125')
},
{'$group':
{'_id': '$data_type_value_type', 'count': {'$sum': 1}}
}
])
The result is this:
{ "_id" : "PHOTO", "count" : 76 }
{ "_id" : "MULTI_SELECT", "count" : 1607 }
{ "_id" : "TIME", "count" : 659 }
{ "_id" : "MULTIPLE_CHOICE", "count" : 78321 }
{ "_id" : "DATE", "count" : 649 }
{ "_id" : "NUMBER", "count" : 2679 }
and takes over two minutes, which I believe is too long for such a simple aggregation. I am wondering if there is anything I can do to improve the query efficiency or whether I must do some offline optimization to improve the performance.
Among others, I have this index on the collection:
{
"v" : 1,
"name" : "organization_subscription_waves.wave_id_1",
"key" : {
"organization_subscription_waves.wave_id" : 1
},
"ns" : "gigwalk_apps_1.data_items"
},
Yet I am not sure if it's being used. explain has the following output:
{
"stages" : [
{
"$cursor" : {
"query" : {
"organization_subscription_waves.wave_id" : ObjectId("5617fe6abecf0500f9c6e125")
},
"fields" : {
"data_type_value_type" : 1,
"_id" : 0
},
"plan" : {
"cursor" : "BtreeCursor ",
"isMultiKey" : false,
"scanAndOrder" : false,
"indexBounds" : {
"organization_subscription_waves.wave_id" : [
[
ObjectId("5617fe6abecf0500f9c6e125"),
ObjectId("5617fe6abecf0500f9c6e125")
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor ",
"isMultiKey" : false,
"scanAndOrder" : false,
"indexBounds" : {
"organization_subscription_waves.wave_id" : [
[
ObjectId("5617fe6abecf0500f9c6e125"),
ObjectId("5617fe6abecf0500f9c6e125")
]
]
}
}
]
}
}
},
{
"$group" : {
"_id" : "$data_type_value_type",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}