Related
Recently, I started to investigate the performance of MongoDB with AIS Data.
I used a collection with 19m documents with proper field types as described in the definition.
I also created a new geoloc field with type: (Point) from coordinates (lon,lat) in this same collection.
The query under investigation is:
db.nari_dynamic.explain('executionStats').aggregate
(
[
{
"$match": {
"geoloc": {
"$geoWithin": {
"$geometry": {
"type" : "Polygon" ,
"coordinates": [ [ [ -5.00, 45.00 ], [ +0.00, 45.00 ], [ +0.00, 50.00 ], [ -5.00, 50.00 ], [ -5.00, 45.00 ] ] ]
}}}}
},
{ "$group": {"_id": "$sourcemmsi", "PointCount": {"$sum" : 1}, "MinDatePoint": {"$min" : {"date": "$t3" }}, "MaxDatePoint": {"$max" : {"date": "$t3" }} }},
{ "$sort": {"_id":1} },
{ "$limit":100 },
{ "$project": {"_id":1, "PointCount":1, "MinDatePoint":1, "MaxDatePoint":1} }
],
{ explain:true}
)
During investigation and testing I found the following:
Without any index: 94s
With geoloc-2dsphere index: 280s
Here are the Execution Stats:
Without the Index
{ stages:
[ { '$cursor':
{ queryPlanner:
{ plannerVersion: 1,
namespace: 'mscdata.nari_dynamic',
indexFilterSet: false,
parsedQuery:
{ geoloc:
{ '$geoWithin':
{ '$geometry':
{ type: 'Polygon',
coordinates: [ [ [ -5, 45 ], [ 0, 45 ], [ 0, 50 ], [ -5, 50 ], [ -5, 45 ] ] ] } } } },
queryHash: '6E2EAB94',
planCacheKey: '6E2EAB94',
winningPlan:
{ stage: 'PROJECTION_SIMPLE',
transformBy: { sourcemmsi: 1, t3: 1, _id: 0 },
inputStage:
{ stage: 'COLLSCAN',
filter:
{ geoloc:
{ '$geoWithin':
{ '$geometry':
{ type: 'Polygon',
coordinates: [ [ [ -5, 45 ], [ 0, 45 ], [ 0, 50 ], [ -5, 50 ], [ -5, 45 ] ] ] } } } },
direction: 'forward' } },
rejectedPlans: [] } } },
{ '$group':
{ _id: '$sourcemmsi',
PointCount: { '$sum': { '$const': 1 } },
MinDatePoint: { '$min': { date: '$t3' } },
MaxDatePoint: { '$max': { date: '$t3' } } } },
{ '$sort': { sortKey: { _id: 1 }, limit: 100 } },
{ '$project':
{ _id: true,
PointCount: true,
MaxDatePoint: true,
MinDatePoint: true } } ],
serverInfo:
{ host: 'ubuntu16',
port: 27017,
version: '4.4.1',
gitVersion: 'ad91a93a5a31e175f5cbf8c69561e788bbc55ce1' },
ok: 1 }
Here are the Execution Stats: With the Index
{ stages:
[ { '$cursor':
{ queryPlanner:
{ plannerVersion: 1,
namespace: 'mscdata.nari_dynamic',
indexFilterSet: false,
parsedQuery:
{ geoloc:
{ '$geoWithin':
{ '$geometry':
{ type: 'Polygon',
coordinates: [ [ [ -5, 45 ], [ 0, 45 ], [ 0, 50 ], [ -5, 50 ], [ -5, 45 ] ] ] } } } },
queryHash: '6E2EAB94',
planCacheKey: 'F35B194B',
winningPlan:
{ stage: 'PROJECTION_SIMPLE',
transformBy: { sourcemmsi: 1, t3: 1, _id: 0 },
inputStage:
{ stage: 'FETCH',
filter:
{ geoloc:
{ '$geoWithin':
{ '$geometry':
{ type: 'Polygon',
coordinates: [ [ [ -5, 45 ], [ 0, 45 ], [ 0, 50 ], [ -5, 50 ], [ -5, 45 ] ] ] } } } },
inputStage:
{ stage: 'IXSCAN',
keyPattern: { geoloc: '2dsphere' },
indexName: 'geoloc-field',
isMultiKey: false,
multiKeyPaths: { geoloc: [] },
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds:
{ geoloc:
[ '[936748722493063168, 936748722493063168]',
'[954763121002545152, 954763121002545152]',
'[959266720629915648, 959266720629915648]',
'[960392620536758272, 960392620536758272]',
'[960674095513468928, 960674095513468928]',
'[960744464257646592, 960744464257646592]',
'[960762056443691008, 960762056443691008]',
'[960766454490202112, 960766454490202112]',
'[960767554001829888, 960767554001829888]',
'[960767828879736832, 960767828879736832]',
'[960767897599213568, 960767897599213568]',
'[960767914779082752, 960767914779082752]',
'[960767919074050048, 960767919074050048]',
'[960767920147791872, 960767920147791872]',
'[960767920416227328, 960767920416227328]',
'[960767920483336192, 960767920483336192]',
'[960767920500113408, 960767920500113408]',
'[960767920504307712, 960767920504307712]',
'[960767920505356288, 960767920505356288]',
'[960767920505618432, 960767920505618432]',
'[960767920505683968, 960767920505683968]',
'[960767920505683969, 960767920505716735]',
'[1345075088707977217, 1345075088708009983]',
'[1345075088708009984, 1345075088708009984]',
'[1345075088708075520, 1345075088708075520]',
'[1345075088708337664, 1345075088708337664]',
'[1345075088709386240, 1345075088709386240]',
'[1345075088713580544, 1345075088713580544]',
'[1345075088730357760, 1345075088730357760]',
'[1345075088797466624, 1345075088797466624]',
'[1345075089065902080, 1345075089065902080]',
'[1345075090139643904, 1345075090139643904]',
'[1345075094434611200, 1345075094434611200]',
'[1345075111614480384, 1345075111614480384]',
'[1345075180333957120, 1345075180333957120]',
'[1345075455211864064, 1345075455211864064]',
'[1345076554723491840, 1345076554723491840]',
'[1345080952770002944, 1345080952770002944]',
'[1345098544956047360, 1345098544956047360]',
'[1345168913700225024, 1345168913700225024]',
'[1345450388676935680, 1345450388676935680]',
'[1346576288583778304, 1346576288583778304]',
'[1351079888211148800, 1351079888211148800]',
'[1369094286720630784, 1369094286720630784]',
'[5116089176692883456, 5116089176692883456]',
'[5170132372221329408, 5170132372221329408]',
'[5179139571476070401, 5179702521429491711]',
'[5179702521429491713, 5180265471382913023]',
'[5180265471382913024, 5180265471382913024]',
'[5183643171103440896, 5183643171103440896]',
'[5187020870823968768, 5187020870823968768]',
'[5187020870823968769, 5187583820777390079]',
'[5187583820777390081, 5188146770730811391]',
'[5188146770730811393, 5197153969985552383]',
'[5206161169240293376, 5206161169240293376]',
'[5218264593238851584, 5218264593238851584]',
'[5218264593238851585, 5218405330727206911]',
'[5218546068215562240, 5218546068215562240]',
'[5218546068215562241, 5219109018168983551]',
'[5219671968122404864, 5219671968122404864]',
'[5220234918075826177, 5220797868029247487]',
'[5220797868029247488, 5220797868029247488]',
'[5220938605517602817, 5221079343005958143]',
'[5221079343005958144, 5221079343005958144]',
'[5260204364768739328, 5260204364768739328]' ] } } } },
rejectedPlans: [] } } },
{ '$group':
{ _id: '$sourcemmsi',
PointCount: { '$sum': { '$const': 1 } },
MinDatePoint: { '$min': { date: '$t3' } },
MaxDatePoint: { '$max': { date: '$t3' } } } },
{ '$sort': { sortKey: { _id: 1 }, limit: 100 } },
{ '$project':
{ _id: true,
MinDatePoint: true,
MaxDatePoint: true,
PointCount: true } } ],
serverInfo:
{ host: 'ubuntu16',
port: 27017,
version: '4.4.1',
gitVersion: 'ad91a93a5a31e175f5cbf8c69561e788bbc55ce1' },
ok: 1 }
Of course, I understand that it is more complex because the query includes a grouping stage, but the expectation is that a query usually gets faster, not slower, with an index — unless the index causes different ordering inside the engine, as geoNear does.
Also, MongoDB has published a complete analysis of how query and index improvements impact performance, but there is not much information for geoWithin. MongoDB states that results are not sorted with geoWithin, so I cannot find the reason for the slowdown.
https://www.mongodb.com/blog/post/geospatial-performance-improvements-in-mongodb-3-2
Any ideas or opinions, why the query with the index is slower?
After a lot of investigation, it appears that once a query requests more than about 70% of the dataset (in this case, 95%), having an index is slower than not having that index.
This situation is also present with other indexes than geospatial, like simple indexes in numeric or descriptive columns (ship_name, ship_number, or timestamp).
This is happening because the database has to traverse the index keys and then also fetch the corresponding documents, which results in higher execution times.
On the other hand, this should not be happening, as the MongoDB query planner should be able to detect this situation and avoid using the index, keeping the number of key accesses low.
The issue opened in MongoDB support, and can be found here:
https://jira.mongodb.org/browse/SERVER-53709
Sample Data:
[
{type: 'partial', jobId: '121', browser: 'chrome', status:'true', jobName:'one'},
{type: 'partial', jobId: '122', browser: 'chrome', status:'false', jobName:'two'},
{type: 'partial', jobId: '121', browser: 'firefox', status:'false', jobName:'one'},
{type: 'partial', jobId: '122', browser: 'firefox', status:'true', jobName:'two'},
{type: 'full', jobId: '123', browser: 'chrome', status:'true', jobName:'three'},
{type: 'full', jobId: '123', browser: 'chrome', status:'true', jobName:'three'},
{type: 'full', jobId: '123', browser: 'chrome', status:'false', jobName:'three'},
{type: 'full', jobId: '124', browser: 'firefox', status:'false', jobName:'four'},
]
Output Needed:
[
{
"type": "partial",
"browsers": [
{
"browser": "chrome",
"jobIds": [
{
"jobId": "121",
"results": [
{
"jobName": "one",
"status": "true",
},
]
},
{
"jobId": "122",
"results": [
{
"jobName": "two",
"status": "false"
},
]
}
]
},
{
"browser": "firefox",
"testIds": [
{
"jobId": "121",
"results": [
{
"jobName": "one",
"status": "false"
},
]
},
{
"jobId": "122",
"results": [
{
"jobName": "two",
"status": "true"
},
]
}
]
}
]
},
{
"type": "full",
"browsers": [
{
"browser": "chrome",
"jobIds": [
{
"jobId": "123",
"results": [
{
"jobName": "three",
"status": "true"
},
{
"jobName": "three",
"status": "true"
},
{
"jobName": "three",
"status": "false"
}
]
},
]
},
{
"browser": "firefox",
"testIds": [
{
"jobId": "124",
"results": [
{
"jobName": "four",
"status": "false"
},
]
},
]
}
]
}
]
I understand how to use group, but I don't understand how to do the nested grouping. I tried the query below, but it is not fetching the needed results, and I don't know how to proceed further.
db.collection.aggregate([
{
$match: {
jobId: {
"$exists": true
}
}
},
{
$sort: {
_id: -1
}
},
{
$group: {
_id: {
type: "$type",
browser: "$browser",
jobId: "$jobId"
},
results: {
$push: {
jobName: "$jobName",
status: "$status",
type: "$type",
jobId: "$jobId"
}
}
}
},
{
$addFields: {
results: {
$slice: [
"$results",
30
]
}
}
},
{
$group: {
_id: "$_id.browser",
results: {
$push: {
results: "$results"
}
}
}
},
])
I need to fetch the most recent 30 results; that's why I added the $addFields stage to the query.
https://mongoplayground.net/p/pt3H1O445GA
$group by type, browser and jobId and make results array
$group by type and browser and make jobs array
$group by type and make browsers array
db.collection.aggregate([
{ $match: { jobId: { $exists: true } } },
{ $sort: { _id: -1 } },
{
$group: {
_id: {
type: "$type",
browser: "$browser",
jobId: "$jobId"
},
results: {
$push: {
jobName: "$jobName",
status: "$status"
}
}
}
},
{ $addFields: { results: { $slice: ["$results", 30] } } },
{
$group: {
_id: {
type: "$_id.type",
browser: "$_id.browser"
},
browser: { $first: "$_id.browser" },
jobIds: {
$push: {
jobId: "$_id.jobId",
results: "$results"
}
}
}
},
{
$group: {
_id: "$_id.type",
type: { $first: "$_id.type" },
browsers: {
$push: {
browser: "$_id.browser",
jobIds: "$jobIds"
}
}
}
},
{ $project: { _id: 0 } }
])
Playground
I am using MongoDB 4.2.6 and writing an aggregation to obtain the count of filtered documents from collections with 40,000+ documents. Before applying the $count stage, I need to $lookup an extra collection as well.
Here is my aggregate
db.exams.aggregate([{
$match: {
schoolId: ObjectId("5d91c9ec098506001b426cb5")
}
}, {
$lookup: {
from: 'students',
localField: 'studentId',
foreignField: '_id',
as: 'student'
}
}, {
$unwind: "$student"
}, {
$match: {
"student.gender": 1
}
},{
$count: 'count'
}])
But it takes more than 10 seconds. I have already added indexes on every ID field: exams._id, students._id, exams.studentId, exams.schoolId, student.gender, etc.
Can someone give me some suggestions to make the query faster?
Explains:
{
stages: [
{
$cursor: {
query: {
schoolId: ObjectId('5d91c9ec098506001b426cb5')
},
fields: {
_id: 1
},
queryPlanner: {
plannerVersion: 1,
namespace: 'happya.exams',
indexFilterSet: false,
parsedQuery: {
schoolId: {
$eq: ObjectId('5d91c9ec098506001b426cb5')
}
},
queryHash: '9533F340',
planCacheKey: 'CE7F9610',
winningPlan: {
stage: 'FETCH',
inputStage: {
stage: 'IXSCAN',
keyPattern: {
schoolId: 1
},
indexName: 'schoolId_1',
isMultiKey: false,
multiKeyPaths: {
schoolId: []
},
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
schoolId: [
"[ObjectId('5d91c9ec098506001b426cb5'), ObjectId('5d91c9ec098506001b426cb5')]"
]
}
}
},
rejectedPlans: [
{
stage: 'FETCH',
inputStage: {
stage: 'IXSCAN',
keyPattern: {
schoolId: 1,
referenceNo: 1
},
indexName: 'schoolId_1_referenceNo_1',
isMultiKey: false,
multiKeyPaths: {
schoolId: [],
referenceNo: []
},
isUnique: true,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
schoolId: [
"[ObjectId('5d91c9ec098506001b426cb5'), ObjectId('5d91c9ec098506001b426cb5')]"
],
referenceNo: ['[MinKey, MaxKey]']
}
}
}
]
}
}
},
{
$lookup: {
from: 'students',
as: 'student',
localField: 'studentId',
foreignField: '_id',
unwinding: {
preserveNullAndEmptyArrays: false
},
matching: {
gender: {
$eq: 1
}
}
}
},
{
$group: {
_id: {
$const: null
},
count: {
$sum: {
$const: 1
}
}
}
},
{
$project: {
_id: false,
count: true
}
}
],
serverInfo: {
host: 'a98010d6dcf4',
port: 27017,
version: '4.2.6',
gitVersion: '20364840b8f1af16917e4c23c1b5f5efd8b352f8'
},
ok: 1,
$clusterTime: {
clusterTime: Timestamp(1597720010, 1),
signature: {
hash: BinData(0, '1PiNaAzDzNRrnZl/mpVJP4oneyU='),
keyId: NumberLong('6819213090182135813')
}
},
operationTime: Timestamp(1597720010, 1)
};
Below is a sample document:
{
'uid': 1,
'sent': [
{
'mid': 100,
'date': 20171210,
},
{
'mid': 101,
'date': 20171210,
}
],
'open': [
{
'mid': 100,
'date': 20171220,
},
{
'mid': 101,
'date': 20171220,
}
]
}
I want to group on 'uid' and nested 'mid' fields.
My desired output is :
{
'uid': 1,
'mid': 100,
'sent': [ 20171210 ],
'open': [ 20171220 ]
}
{
'uid': 1,
'mid': 101,
'sent': [ 20171210 ],
'open': [ 20171220 ]
}
Is there any efficient way of Aggregation which can give me above result?
You can $unwind the one array, then use $filter to keep only the matching entries in the second array. Then $unwind the second array and $group.
db.temp.aggregate(
[
{
$unwind: {
'path': '$sent',
}
},
{
$project: {
'uid': 1,
'sent': 1,
'open': { $filter: {
input: '$open',
as: 'this',
cond: { $eq: [ '$sent.mid', '$$this.mid' ] }
} }
}
},
{
$unwind: {
'path': '$open',
}
},
{
$group: {
'_id': { 'uid': '$uid', 'mid': '$sent.mid' },
'sent': { '$push': '$sent.date' },
'open': { '$push': '$open.date' }
}
},
{
$project: {
'_id': 0,
'uid': '$_id.uid',
'mid': '$_id.mid',
'sent': 1,
'open': 1
}
},
]
);
I have the following set of objects:
[
{
id: 1,
clientId: 1,
cost: 200
},
{
id: 1,
clientId: 2,
cost: 500
},
{
id: 1,
clientId: 2,
cost: 800
},
{
id: 2,
clientId: 1,
cost: 600
},
{
id: 2,
clientId: 2,
cost: 100
}
]
And I made a group of that with:
db.collection.aggregate(
{
'$group': {
'_id': '$id',
'clients': {
'$addToSet': {
'id': '$clientId',
'cost': '$cost'
}
}
}
}
)
So I obtained the following:
[
{
'_id': 1,
'clients': [
{
id: 1,
cost: 200
},
{
id: 2,
cost: 500
},
{
id: 2,
cost: 800
}
]
},
{
'_id': 2,
'clients': [
{
id: 1,
cost: 600
},
{
id: 2,
cost: 100
}
]
}
]
As you can see in the clients array of the first result, I have 2 repeated client ids, and what I want is to have 1 entry with the costs added together. So instead of having:
'clients': [
{
id: 1,
cost: 200
},
{
id: 2,
cost: 500
},
{
id: 2,
cost: 800
}
]
I need:
'clients': [
{
id: 1,
cost: 200
},
{
id: 2,
cost: 1300
}
]
So my question is: how can I do that? Neither $addToSet nor $push allows $sum.
You can use aggregation operators to get the expected output as follows:
db.collection.aggregate({
"$group": {
"_id": {
"mainId": "$id",
"client": "$clientId"
},
"cost": {
"$sum": "$cost"
}
}
}, {
"$project": {
"mainId": "$_id.mainId",
"clients": {
"clientId": "$_id.client",
"cost": "$cost"
},
"_id": 0
}
}, {
"$group": {
"_id": "$mainId",
"clients": {
"$push": "$clients"
}
}
})