MongoDB Time-Spatial Query is slower with 2dsphere Index

Recently, I started to investigate the performance of MongoDB with AIS data.
I used a collection of 19 million documents with proper field types, as described in the definition.
I also created a new geoloc field of GeoJSON type Point in the same collection, built from the (lon, lat) coordinates.
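For reference, here is a minimal sketch of how such a field and the matching 2dsphere index can be created (assuming the source longitude and latitude live in fields named lon and lat; the real field names may differ):
// Pipeline-style update (MongoDB 4.2+): build a GeoJSON Point from lon/lat.
db.nari_dynamic.updateMany(
  {},
  [ { $set: { geoloc: { type: "Point", coordinates: [ "$lon", "$lat" ] } } } ]
)
// 2dsphere index on the new field; the name matches the one in the explain output below.
db.nari_dynamic.createIndex({ geoloc: "2dsphere" }, { name: "geoloc-field" })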
The query under investigation is:
db.nari_dynamic.explain('executionStats').aggregate(
  [
    { "$match": {
        "geoloc": {
          "$geoWithin": {
            "$geometry": {
              "type": "Polygon",
              "coordinates": [ [ [ -5.00, 45.00 ], [ +0.00, 45.00 ], [ +0.00, 50.00 ], [ -5.00, 50.00 ], [ -5.00, 45.00 ] ] ]
            } } } }
    },
    { "$group": { "_id": "$sourcemmsi", "PointCount": { "$sum": 1 }, "MinDatePoint": { "$min": { "date": "$t3" } }, "MaxDatePoint": { "$max": { "date": "$t3" } } } },
    { "$sort": { "_id": 1 } },
    { "$limit": 100 },
    { "$project": { "_id": 1, "PointCount": 1, "MinDatePoint": 1, "MaxDatePoint": 1 } }
  ],
  { explain: true }
)
During investigation and testing I found the following:
Without any index: 94s
With geoloc-2dsphere index: 280s
Here are the execution stats without the index:
{ stages:
[ { '$cursor':
{ queryPlanner:
{ plannerVersion: 1,
namespace: 'mscdata.nari_dynamic',
indexFilterSet: false,
parsedQuery:
{ geoloc:
{ '$geoWithin':
{ '$geometry':
{ type: 'Polygon',
coordinates: [ [ [ -5, 45 ], [ 0, 45 ], [ 0, 50 ], [ -5, 50 ], [ -5, 45 ] ] ] } } } },
queryHash: '6E2EAB94',
planCacheKey: '6E2EAB94',
winningPlan:
{ stage: 'PROJECTION_SIMPLE',
transformBy: { sourcemmsi: 1, t3: 1, _id: 0 },
inputStage:
{ stage: 'COLLSCAN',
filter:
{ geoloc:
{ '$geoWithin':
{ '$geometry':
{ type: 'Polygon',
coordinates: [ [ [ -5, 45 ], [ 0, 45 ], [ 0, 50 ], [ -5, 50 ], [ -5, 45 ] ] ] } } } },
direction: 'forward' } },
rejectedPlans: [] } } },
{ '$group':
{ _id: '$sourcemmsi',
PointCount: { '$sum': { '$const': 1 } },
MinDatePoint: { '$min': { date: '$t3' } },
MaxDatePoint: { '$max': { date: '$t3' } } } },
{ '$sort': { sortKey: { _id: 1 }, limit: 100 } },
{ '$project':
{ _id: true,
PointCount: true,
MaxDatePoint: true,
MinDatePoint: true } } ],
serverInfo:
{ host: 'ubuntu16',
port: 27017,
version: '4.4.1',
gitVersion: 'ad91a93a5a31e175f5cbf8c69561e788bbc55ce1' },
ok: 1 }
Here are the execution stats with the index:
{ stages:
[ { '$cursor':
{ queryPlanner:
{ plannerVersion: 1,
namespace: 'mscdata.nari_dynamic',
indexFilterSet: false,
parsedQuery:
{ geoloc:
{ '$geoWithin':
{ '$geometry':
{ type: 'Polygon',
coordinates: [ [ [ -5, 45 ], [ 0, 45 ], [ 0, 50 ], [ -5, 50 ], [ -5, 45 ] ] ] } } } },
queryHash: '6E2EAB94',
planCacheKey: 'F35B194B',
winningPlan:
{ stage: 'PROJECTION_SIMPLE',
transformBy: { sourcemmsi: 1, t3: 1, _id: 0 },
inputStage:
{ stage: 'FETCH',
filter:
{ geoloc:
{ '$geoWithin':
{ '$geometry':
{ type: 'Polygon',
coordinates: [ [ [ -5, 45 ], [ 0, 45 ], [ 0, 50 ], [ -5, 50 ], [ -5, 45 ] ] ] } } } },
inputStage:
{ stage: 'IXSCAN',
keyPattern: { geoloc: '2dsphere' },
indexName: 'geoloc-field',
isMultiKey: false,
multiKeyPaths: { geoloc: [] },
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds:
{ geoloc:
[ '[936748722493063168, 936748722493063168]',
'[954763121002545152, 954763121002545152]',
'[959266720629915648, 959266720629915648]',
'[960392620536758272, 960392620536758272]',
'[960674095513468928, 960674095513468928]',
'[960744464257646592, 960744464257646592]',
'[960762056443691008, 960762056443691008]',
'[960766454490202112, 960766454490202112]',
'[960767554001829888, 960767554001829888]',
'[960767828879736832, 960767828879736832]',
'[960767897599213568, 960767897599213568]',
'[960767914779082752, 960767914779082752]',
'[960767919074050048, 960767919074050048]',
'[960767920147791872, 960767920147791872]',
'[960767920416227328, 960767920416227328]',
'[960767920483336192, 960767920483336192]',
'[960767920500113408, 960767920500113408]',
'[960767920504307712, 960767920504307712]',
'[960767920505356288, 960767920505356288]',
'[960767920505618432, 960767920505618432]',
'[960767920505683968, 960767920505683968]',
'[960767920505683969, 960767920505716735]',
'[1345075088707977217, 1345075088708009983]',
'[1345075088708009984, 1345075088708009984]',
'[1345075088708075520, 1345075088708075520]',
'[1345075088708337664, 1345075088708337664]',
'[1345075088709386240, 1345075088709386240]',
'[1345075088713580544, 1345075088713580544]',
'[1345075088730357760, 1345075088730357760]',
'[1345075088797466624, 1345075088797466624]',
'[1345075089065902080, 1345075089065902080]',
'[1345075090139643904, 1345075090139643904]',
'[1345075094434611200, 1345075094434611200]',
'[1345075111614480384, 1345075111614480384]',
'[1345075180333957120, 1345075180333957120]',
'[1345075455211864064, 1345075455211864064]',
'[1345076554723491840, 1345076554723491840]',
'[1345080952770002944, 1345080952770002944]',
'[1345098544956047360, 1345098544956047360]',
'[1345168913700225024, 1345168913700225024]',
'[1345450388676935680, 1345450388676935680]',
'[1346576288583778304, 1346576288583778304]',
'[1351079888211148800, 1351079888211148800]',
'[1369094286720630784, 1369094286720630784]',
'[5116089176692883456, 5116089176692883456]',
'[5170132372221329408, 5170132372221329408]',
'[5179139571476070401, 5179702521429491711]',
'[5179702521429491713, 5180265471382913023]',
'[5180265471382913024, 5180265471382913024]',
'[5183643171103440896, 5183643171103440896]',
'[5187020870823968768, 5187020870823968768]',
'[5187020870823968769, 5187583820777390079]',
'[5187583820777390081, 5188146770730811391]',
'[5188146770730811393, 5197153969985552383]',
'[5206161169240293376, 5206161169240293376]',
'[5218264593238851584, 5218264593238851584]',
'[5218264593238851585, 5218405330727206911]',
'[5218546068215562240, 5218546068215562240]',
'[5218546068215562241, 5219109018168983551]',
'[5219671968122404864, 5219671968122404864]',
'[5220234918075826177, 5220797868029247487]',
'[5220797868029247488, 5220797868029247488]',
'[5220938605517602817, 5221079343005958143]',
'[5221079343005958144, 5221079343005958144]',
'[5260204364768739328, 5260204364768739328]' ] } } } },
rejectedPlans: [] } } },
{ '$group':
{ _id: '$sourcemmsi',
PointCount: { '$sum': { '$const': 1 } },
MinDatePoint: { '$min': { date: '$t3' } },
MaxDatePoint: { '$max': { date: '$t3' } } } },
{ '$sort': { sortKey: { _id: 1 }, limit: 100 } },
{ '$project':
{ _id: true,
MinDatePoint: true,
MaxDatePoint: true,
PointCount: true } } ],
serverInfo:
{ host: 'ubuntu16',
port: 27017,
version: '4.4.1',
gitVersion: 'ad91a93a5a31e175f5cbf8c69561e788bbc55ce1' },
ok: 1 }
Of course, I understand that this query is more complex because it includes a grouping stage, but the point is that an index should usually make a query faster, not slower, unless the index forces a different ordering inside the engine, as geoNear does.
There is also a complete analysis from MongoDB of how query and index improvements affect geospatial queries, but not much information about $geoWithin. MongoDB states that results are not sorted with $geoWithin, so I don't see a reason for the delay:
https://www.mongodb.com/blog/post/geospatial-performance-improvements-in-mongodb-3-2
Any ideas or opinions on why the query with the index is slower?

After a lot of investigation, it appears that once a query requests more than about 70% of the dataset (in this case roughly 95%), having an index is slower than not having it.
This also happens with indexes other than geospatial ones, such as simple indexes on numeric or descriptive fields (ship_name, ship_number, or timestamp).
It happens because the database engine has to traverse the index keys and then fetch the corresponding documents as well, which results in higher execution times.
On the other hand, this should not be happening: the MongoDB query planner should be able to detect this situation and avoid using the index, keeping key accesses low.
The issue was opened with MongoDB support and can be found here:
https://jira.mongodb.org/browse/SERVER-53709
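In the meantime, a possible workaround (a sketch, not part of the original investigation) is to force the collection scan explicitly so the planner skips the 2dsphere index for such a broad polygon; this assumes the server accepts { $natural: 1 } as an aggregate hint, as it does for find() (pipeline abbreviated):
db.nari_dynamic.aggregate(
  [
    { "$match": { "geoloc": { "$geoWithin": { "$geometry": {
        "type": "Polygon",
        "coordinates": [ [ [ -5, 45 ], [ 0, 45 ], [ 0, 50 ], [ -5, 50 ], [ -5, 45 ] ] ]
    } } } } },
    { "$group": { "_id": "$sourcemmsi", "PointCount": { "$sum": 1 } } },
    { "$sort": { "_id": 1 } },
    { "$limit": 100 }
  ],
  // Hint a COLLSCAN instead of the geoloc-field index.
  { hint: { $natural: 1 } }
)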

Related

Comparing 2 fields from $project in a mongoDB pipeline

In a previous post I created a MongoDB query projecting the number of elements matching a condition in an array. Now I need to filter based on this number of elements, depending on another field.
This is my db:
db={
"fridges": [
{
_id: 1,
items: [
{
itemId: 1,
name: "beer"
},
{
itemId: 2,
name: "chicken"
}
],
brand: "Bosch",
size: 195,
cooler: true,
color: "grey",
nbMax: 2
},
{
_id: 2,
items: [
{
itemId: 1,
name: "beer"
},
{
itemId: 2,
name: "chicken"
},
{
itemId: 3,
name: "lettuce"
}
],
brand: "Electrolux",
size: 200,
cooler: true,
color: "white",
nbMax: 2
},
]
}
This is my query:
db.fridges.aggregate([
{
$match: {
$and: [
{
"brand": {
$in: [
"Bosch",
"Electrolux"
]
}
},
{
"color": {
$in: [
"grey",
"white"
]
}
}
]
}
},
{
$project: {
"itemsNumber": {
$size: {
"$filter": {
"input": "$items",
"as": "item",
"cond": {
$in: [
"$$item.name",
[
"beer",
"lettuce"
]
]
}
}
}
},
brand: 1,
cooler: 1,
color: 1,
nbMax: 1
}
}
])
Which gives me this:
[
{
"_id": 1,
"brand": "Bosch",
"color": "grey",
"cooler": true,
"itemsNumber": 1,
"nbMax": 2
},
{
"_id": 2,
"brand": "Electrolux",
"color": "white",
"cooler": true,
"itemsNumber": 2,
"nbMax": 2
}
]
What I expect is to keep only the results having an itemsNumber different from nbMax. In this instance, the second fridge with _id: 2 would not match the condition and should not be returned. How can I modify my query to get this:
[
{
"_id": 1,
"brand": "Bosch",
"color": "grey",
"cooler": true,
"itemsNumber": 1,
"nbMax": 2
}
]
You can add a $match stage with an expression condition at the end of your query, using $ne to check that the two fields are not equal:
{
$match: {
$expr: { $ne: ["$nbMax", "$itemsNumber"] }
}
}
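Putting it together, a sketch of the full pipeline with this extra stage appended (the $and from the question is collapsed into an implicit AND for brevity):
db.fridges.aggregate([
  { $match: { brand: { $in: ["Bosch", "Electrolux"] }, color: { $in: ["grey", "white"] } } },
  { $project: {
      itemsNumber: { $size: { $filter: {
        input: "$items",
        as: "item",
        cond: { $in: ["$$item.name", ["beer", "lettuce"]] }
      } } },
      brand: 1, cooler: 1, color: 1, nbMax: 1
  } },
  // Keep only fridges whose computed itemsNumber differs from nbMax.
  { $match: { $expr: { $ne: ["$nbMax", "$itemsNumber"] } } }
])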

MongoDB aggregation, group by value interval

MongoDB documents:
[{
_id: '123213',
elevation: 2300,
area: 25
},
{
_id: '343221',
elevation: 1600,
area: 35,
},
{
_id: '545322',
elevation: 500,
area: 12,
},
{
_id: '234234',
elevation: null,
area: 5
}]
I want to group these on a given interval on elevation and summarize the area property.
Group 1: < 0
Group 2: 0 - 1500
Group 3: 1501 - 3000
Group 4: > 3000
So the expected output would be:
[{
interval: '1501-3000',
count: 2,
summarizedArea: 60
},
{
interval: '0-1500',
count: 1,
summarizedArea: 12,
},
{
interval: 'N/A',
count: 1,
summarizedArea: 5
}]
If possible, I want to use the aggregation pipeline.
Maybe something with $range? Or a combination of $gte and $lte?
As Feliix suggested, $bucket should do the job, but the boundaries should be slightly different to play well with negative and N/A values:
db.collection.aggregate([
{
$bucket: {
groupBy: "$elevation",
boundaries: [ -Number.MAX_VALUE, 0, 1501, 3001, Number.POSITIVE_INFINITY ],
default: Number.NEGATIVE_INFINITY,
output: {
"count": { $sum: 1 },
"summarizedArea" : { $sum: "$area" }
}
}
}
])
The formatting stage below can be added to the pipeline to adjust shape of the response:
{ $group: {
_id: null,
documents: { $push: {
interval: { $let: {
vars: {
idx: { $switch: {
branches: [
{ case: { $eq: [ "$_id", -Number.MAX_VALUE ] }, then: 3 },
{ case: { $eq: [ "$_id", 0 ] }, then: 2 },
{ case: { $eq: [ "$_id", 1501 ] }, then: 1 },
{ case: { $eq: [ "$_id", 3001 ] }, then: 0 }
],
default: 4
} }
},
in: { $arrayElemAt: [ [ ">3000", "1501-3000", "0-1500", "<0", "N/A" ], "$$idx" ] }
} },
count: "$count",
summarizedArea: "$summarizedArea"
} }
} }
$group with _id: null $pushes all groups into an array within a single document.
$let maps $_id from the previous stage to the interval text labels defined in the array [ ">3000", "1501-3000", "0-1500", "<0", "N/A" ]. For that, it calculates idx, the index of the label, using $switch.
It would be much simpler to implement this logic at the application level unless you absolutely need to do it in the pipeline.
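For illustration, a minimal application-level sketch of the same bucketing (plain JavaScript over already-fetched documents; the helper names are made up for this example):
// Hypothetical helper: map an elevation value to the interval label used above.
function intervalLabel(elevation) {
  if (elevation == null) return "N/A";
  if (elevation < 0) return "<0";
  if (elevation <= 1500) return "0-1500";
  if (elevation <= 3000) return "1501-3000";
  return ">3000";
}

// Group fetched documents by interval and sum their areas.
function summarizeByInterval(docs) {
  var groups = {};
  docs.forEach(function (doc) {
    var key = intervalLabel(doc.elevation);
    if (!groups[key]) groups[key] = { interval: key, count: 0, summarizedArea: 0 };
    groups[key].count += 1;
    groups[key].summarizedArea += doc.area;
  });
  return Object.keys(groups).map(function (k) { return groups[k]; });
}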
You can use $bucket, introduced in MongoDB 3.4, to achieve this:
db.collection.aggregate([
{
$bucket: {
groupBy: "$elevation",
boundaries: [
0,
1500,
3000,
5000
],
default: 10000,
output: {
"count": {
$sum: 1
},
"summarizedArea": {
$sum: "$area"
}
}
}
}
])
output:
[
{
"_id": 0,
"count": 1,
"summarizedArea": 12
},
{
"_id": 1500,
"count": 2,
"summarizedArea": 60
},
{
"_id": 10000,
"count": 1,
"summarizedArea": 5
}
]
You can try it here: mongoplayground.net/p/xFe7ZygMqaY
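If the labelled intervals from the question are needed, a $project stage with $switch could be appended after the $bucket (a sketch based on the bucket _ids shown in the output above):
{
  $project: {
    _id: 0,
    count: 1,
    summarizedArea: 1,
    interval: {
      $switch: {
        branches: [
          { case: { $eq: ["$_id", 0] },    then: "0-1500" },
          { case: { $eq: ["$_id", 1500] }, then: "1501-3000" },
          { case: { $eq: ["$_id", 3000] }, then: ">3000" }
        ],
        // The default bucket (_id: 10000) holds the documents whose elevation is null/missing.
        default: "N/A"
      }
    }
  }
}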

MongoDB. Aggregate the sum of two arrays sizes

With MongoDB 3.4.10 and mongoose 4.13.6 I'm able to count sizes of two arrays on the User model:
User.aggregate()
.project({
'_id': 1,
'leftVotesCount': { '$size': '$leftVoted' },
'rightVotesCount': { '$size': '$rightVoted' }
})
where my Users are (per db.users.find())
{ "_id" : ObjectId("5a2b21e63023c6117085c240"), "rightVoted" : [ 2 ],
"leftVoted" : [ 1, 6 ] }
{ "_id" : ObjectId("5a2c0d68efde3416bc8b7020"), "rightVoted" : [ 2 ],
"leftVoted" : [ 1 ] }
Here I'm getting expected result:
[ { _id: '5a2b21e63023c6117085c240', leftVotesCount: 2, rightVotesCount: 1 },
{ _id: '5a2c0d68efde3416bc8b7020', leftVotesCount: 1, rightVotesCount: 1 } ]
Question: how can I get a cumulative value of the leftVotesCount and rightVotesCount data? I tried the following:
User.aggregate()
.project({
'_id': 1,
'leftVotesCount': { '$size': '$leftVoted' },
'rightVotesCount': { '$size': '$rightVoted' },
'votesCount': { '$add': [ '$leftVotesCount', '$rightVotesCount' ] },
'votesCount2': { '$sum': [ '$leftVotesCount', '$rightVotesCount' ] }
})
But votesCount is null and votesCount2 is 0 for both users. I'm expecting votesCount = 3 for User 1 and votesCount = 2 for User 2.
$leftVotesCount and $rightVotesCount become available only in the next stage. Try something like:
User.aggregate()
.project({
'_id': 1,
'leftVotesCount': { '$size': '$leftVoted' },
'rightVotesCount': { '$size': '$rightVoted' }
})
.project({
'_id': 1,
'leftVotesCount': 1,
'rightVotesCount': 1,
'votesCount': { '$add': [ '$leftVotesCount', '$rightVotesCount' ] },
'votesCount2': { '$sum': [ '$leftVotesCount', '$rightVotesCount' ] }
})
You can't reference the project variables created in the same project stage.
You can wrap the variables in a $let expression.
User.aggregate().project({
"$let": {
"vars": {
"leftVotesCount": {
"$size": "$leftVoted"
},
"rightVotesCount": {
"$size": "$rightVoted"
}
},
"in": {
"votesCount": {
"$add": [
"$$leftVotesCount",
"$$rightVotesCount"
]
},
"leftVotesCount": "$$leftVotesCount",
"rightVotesCount": "$$rightVotesCount"
}
}
})
It turned out that $add supports nested expressions, so I was able to solve the issue by excluding intermediate variables:
User.aggregate().project({
'_id': 1,
'votesCount': { '$add': [ { '$size': '$leftVoted' }, { '$size': '$rightVoted' } ] }
});
// [ {_id: '...', votesCount: 3}, {_id: '...', votesCount: 2} ]

MongoDB aggregate with two percentages by $group

My dataset:
{
"codepostal": 84000,
"siren": 520010234,
"type": "home"
},
{
"codepostal": 84000,
"siren": 0,
"type": "home"
},
{
"codepostal": 84000,
"siren": 450123003,
"type": "appt"
} ...
My pipeline (total is an integer):
var pipeline = [
{
$match: { codepostal: 84000 }
},
{
$group: {
_id: { type: "$type" },
count: { $sum: 1 }
}
},
{
$project: {
percentage: { $multiply: ["$count", 100 / total] }
}
},
{
$sort: { _id: 1 }
}
];
Results:
[ { _id: { type: 'appt' }, percentage: 66 },
{ _id: { type: 'home' }, percentage: 34 } ]
The expected result is to also count whether "siren" is set to 0 or to another number:
Count of siren = 0 => part
Count of siren != 0 => pro
[ { _id: { type: 'appt' }, totalPercent: 66, proPercent: 20, partPercent: 80},
{ _id: { type: 'home' }, totalPercent: 34, proPercent: 45, partPercent: 55 } ]
Thanks a lot for your help!!
You can use $cond to get 0 or 1 for pro/part documents depending on the value of the siren field. Then it's easy to calculate totals for each type of document:
[
{
$match: { codepostal: 84000 }
},
{
$group: {
_id: { type: "$type" },
count: { $sum: 1 },
countPro: { $sum: {$cond: [{$eq:["$siren",0]}, 0, 1]} },
countPart: {$sum: {$cond: [{$eq:["$siren",0]}, 1, 0]} }
}
},
{
$project: {
totalPercent: { $multiply: ["$count", 100 / total] },
proPercent: { $multiply: ["$countPro", {$divide: [100, "$count"]}] },
partPercent: { $multiply: ["$countPart", {$divide: [100, "$count"]}] }
}
},
{
$sort: { _id: 1 }
}
]
Note that I used $divide to calculate the pro/part percentages relative to the count of documents within each type group.
For your sample documents (total = 3), the output will be:
[
{
"_id" : { "type" : "appt" },
"totalPercent" : 33.3333333333333,
"proPercent" : 100,
"partPercent" : 0
},
{
"_id" : { "type" : "home" },
"totalPercent" : 66.6666666666667,
"proPercent" : 50,
"partPercent" : 50
}
]
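If you would rather not pass total in from the application, a variant (a sketch) can derive the grand total inside the pipeline with a second $group:
[
  { $match: { codepostal: 84000 } },
  { $group: {
      _id: { type: "$type" },
      count: { $sum: 1 },
      countPro:  { $sum: { $cond: [{ $eq: ["$siren", 0] }, 0, 1] } },
      countPart: { $sum: { $cond: [{ $eq: ["$siren", 0] }, 1, 0] } }
  } },
  // Collect all type groups into one document to compute the grand total in-pipeline.
  { $group: { _id: null, total: { $sum: "$count" }, groups: { $push: "$$ROOT" } } },
  { $unwind: "$groups" },
  { $project: {
      _id: "$groups._id",
      totalPercent: { $multiply: ["$groups.count",     { $divide: [100, "$total"] }] },
      proPercent:   { $multiply: ["$groups.countPro",  { $divide: [100, "$groups.count"] }] },
      partPercent:  { $multiply: ["$groups.countPart", { $divide: [100, "$groups.count"] }] }
  } },
  { $sort: { _id: 1 } }
]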

How to multi-update a nested array in MongoDB?

I have documents in a MongoDB 'playground' collection following the "schema" below:
{
"_id": ObjectId("54423b40c92f9fffb486a6d4"),
"ProjectFileId": 1,
"SourceLanguageId": 2,
"TargetSegments": [
{
"LanguageId": 1,
"Segment": "Something",
"Colors": [
1,
2,
3
],
"Heights": [
1,
2,
3
],
"Widths": [
1,
2,
3
]
},
{
"LanguageId": 1,
"Segment": "Something",
"Colors": [
1,
2,
3
],
"Heights": [
1,
2,
3
],
"Widths": [
1,
2,
3
]
}
]
}
And the following update query:
db.playground.update({
$and: [
{
"TargetSegments.Colors": {
$exists: true
}
},
{
"ProjectFileId": 1
},
{
"SourceLanguageId": 2
},
{
"TargetSegments": {
$elemMatch: {
"LanguageId": 1
}
}
}
]
},
{
$set: {
"TargetSegments.$.Segment": null,
"TargetSegments.$.Colors": [],
"TargetSegments.$.Widths": [],
"TargetSegments.$.Heights": []
}
},
false, true)
After the execution of the query the result is:
{
"_id": ObjectId("54423b40c92f9fffb486a6d4"),
"ProjectFileId": 1,
"SourceLanguageId": 2,
"TargetSegments": [
{
"LanguageId": 1,
"Segment": null,
"Colors": [],
"Heights": [],
"Widths": []
},
{
"LanguageId": 1,
"Segment": "Something",
"Colors": [
1,
2,
3
],
"Heights": [
1,
2,
3
],
"Widths": [
1,
2,
3
]
}
]
}
As you can see, only the first element of the "TargetSegments" array is updated.
How can I update all the elements of the TargetSegments array in one update query?
It's because you are using the $ operator: the positional $ operator identifies a single element (not multiple) in an array to update without explicitly specifying the position of the element in the array. To project, or return, an array element from a read operation, see the $ projection operator.
You can use the code below to do it:
db.playground.find({
$and: [
{
"TargetSegments.Colors": {
$exists: true
}
},
{
"ProjectFileId": 1
},
{
"SourceLanguageId": 2
},
{
"TargetSegments": {
$elemMatch: {
"LanguageId": 1
}
}
}
]
}).forEach(function(item)
{
var targets = item.TargetSegments;
for(var index = 0; index < targets.length; index++)
{
var target = targets[index];
target.Segment = null;
target.Colors = [];
target.Widths = [];
target.Heights = [];
}
db.playground.save(item);
});
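For reference, on MongoDB 3.6 or newer (later than what this answer targets), the all-positional operator $[] can do the same in a single update command; a sketch using the same filter:
db.playground.updateMany(
  {
    "TargetSegments.Colors": { $exists: true },
    "ProjectFileId": 1,
    "SourceLanguageId": 2,
    "TargetSegments": { $elemMatch: { "LanguageId": 1 } }
  },
  {
    $set: {
      // $[] applies the $set to every element of the TargetSegments array.
      "TargetSegments.$[].Segment": null,
      "TargetSegments.$[].Colors": [],
      "TargetSegments.$[].Widths": [],
      "TargetSegments.$[].Heights": []
    }
  }
)
If only the elements with LanguageId: 1 should be touched, arrayFilters with $[elem] can be used instead of $[].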