Mongo aggregation combines $match steps, resulting in slow query - mongodb

This question is a follow-up to Query with $in and $nin doesn't use index. I've tried using aggregation to declare the order of the steps.
db.assets.aggregate([
{
"$match": {
"tags": {
"$in": ["blah"]
}
}
},
{
"$match": {
"tags": {
"$nin": ["test"]
}
}
}
], {"explain": true})
You'd think Mongo would now understand that we want to filter by $in first. Well, you'd be surprised.
{
"stages" : [
{
"$cursor" : {
"query" : {
"$and" : [
{
"tags" : {
"$in" : [
"blah"
]
}
},
{
"tags" : {
"$nin" : [
"test"
]
}
}
]
},
"planError" : "InternalError No plan available to provide stats"
}
}
],
"ok" : 1
}
The planner doesn't even know what to do. It turns out it actually combines both $matches into one query, and then runs into the same problem as Query with $in and $nin doesn't use index, eventually returning the results in about 2-3 seconds (which corresponds to the 2331ms on the linked question).

It looks like you can trick the aggregator by inserting an empty skip step:
db.assets.aggregate([
{
"$match": {
"tags": {
"$in": ["blah"]
}
}
},
{
"$skip": 0
},
{
"$match": {
"tags": {
"$nin": ["test"]
}
}
}
], {"explain": true})
With that, the planner will use the index and the results are returned immediately.
{
"stages" : [
{
"$cursor" : {
"query" : {
"tags" : {
"$in" : [
"blah"
]
}
},
"plan" : {
"cursor" : "BtreeCursor ",
"isMultiKey" : false,
"scanAndOrder" : false,
"indexBounds" : {
"tags" : [
[ "blah", "blah" ]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor ",
"isMultiKey" : false,
"scanAndOrder" : false,
"indexBounds" : {
"tags" : [
[ "blah", "blah" ]
]
}
}
]
}
}
},
{
"$skip" : NumberLong(0)
},
{
"$match" : {
"tags" : {
"$nin" : [
"test"
]
}
}
}
],
"ok" : 1
}

Related

Partition data around a match query during aggregation

What I have been trying to get my head around is to perform some kind of partitioning(split by predicate) in a mongo query. My current query looks like:
db.posts.aggregate([
{"$match": { $and:[ {$or:[{"toggled":false},{"toggled":true, "status":"INACTIVE"}]} , {"updatedAt":{$gte:1549786260000}} ] }},
{"$unwind" :"$interests"},
{"$group" : {"_id": {"iid": "$interests", "pid":"$publisher"}, "count": {"$sum" : 1}}},
{"$project":{ _id: 0, "iid": "$_id.iid", "pid": "$_id.pid", "count": 1 }}
])
This results in the following output:
{
"count" : 3.0,
"iid" : "INT456",
"pid" : "P789"
}
{
"count" : 2.0,
"iid" : "INT789",
"pid" : "P789"
}
{
"count" : 1.0,
"iid" : "INT123",
"pid" : "P789"
}
{
"count" : 1.0,
"iid" : "INT123",
"pid" : "P123"
}
All good so far, but then I had realized that for the documents that match the specific filter {"toggled":true, "status":"INACTIVE"}, I would rather decrement the count (-1). (considering the eventual value can be negative as well.)
Is there a way to somehow partition the data after the match, to make sure a different grouping operation is performed on each of the two sets of documents?
Something that sounds similar to what I am looking for is
$mergeObjects, or maybe $reduce, but not much that I can relate from the documentation examples.
Note: I can sense, one straightforward way to deal with this would be to perform two queries, but I am looking for a single query to perform the operation.
Sample documents for the above output would be:
/* 1 */
{
"_id" : ObjectId("5d1f7******"),
"id" : "CON123",
"title" : "Game",
"content" : {},
"status" : "ACTIVE",
"toggle":false,
"publisher" : "P789",
"interests" : [
"INT456"
],
"updatedAt" : NumberLong(1582078628264)
}
/* 2 */
{
"_id" : ObjectId("5d1f8******"),
"id" : "CON456",
"title" : "Home",
"content" : {},
"status" : "INACTIVE",
"toggle":true,
"publisher" : "P789",
"interests" : [
"INT456",
"INT789"
],
"updatedAt" : NumberLong(1582078628264)
}
/* 3 */
{
"_id" : ObjectId("5d0e9******"),
"id" : "CON654",
"title" : "School",
"content" : {},
"status" : "ACTIVE",
"toggle":false,
"publisher" : "P789",
"interests" : [
"INT123",
"INT456",
"INT789"
],
"updatedAt" : NumberLong(1582078628264)
}
/* 4 */
{
"_id" : ObjectId("5d207*******"),
"id" : "CON789",
"title":"Stack",
"content" : { },
"status" : "ACTIVE",
"toggle":false,
"publisher" : "P123",
"interests" : [
"INT123"
],
"updatedAt" : NumberLong(1582078628264)
}
What I am looking forward to as a result though is
{
"count" : 1.0, (2-1)
"iid" : "INT456",
"pid" : "P789"
}
{
"count" : 0.0, (1-1)
"iid" : "INT789",
"pid" : "P789"
}
{
"count" : 1.0,
"iid" : "INT123",
"pid" : "P789"
}
{
"count" : 1.0,
"iid" : "INT123",
"pid" : "P123"
}
This aggregation gives the desired result.
db.posts.aggregate( [
{ $match: { updatedAt: { $gte: 1549786260000 } } },
{ $facet: {
FALSE: [
{ $match: { toggle: false } },
{ $unwind : "$interests" },
{ $group : { _id : { iid: "$interests", pid: "$publisher" }, count: { $sum : 1 } } },
],
TRUE: [
{ $match: { toggle: true, status: "INACTIVE" } },
{ $unwind : "$interests" },
{ $group : { _id : { iid: "$interests", pid: "$publisher" }, count: { $sum : -1 } } },
]
} },
{ $project: { result: { $concatArrays: [ "$FALSE", "$TRUE" ] } } },
{ $unwind: "$result" },
{ $replaceRoot: { newRoot: "$result" } },
{ $group : { _id : "$_id", count: { $sum : "$count" } } },
{ $project:{ _id: 0, iid: "$_id.iid", pid: "$_id.pid", count: 1 } }
] )
[ EDIT ADD ]
The output from the query using the input data from the question post:
{ "count" : 1, "iid" : "INT123", "pid" : "P789" }
{ "count" : 1, "iid" : "INT123", "pid" : "P123" }
{ "count" : 0, "iid" : "INT789", "pid" : "P789" }
{ "count" : 1, "iid" : "INT456", "pid" : "P789" }
[ EDIT ADD 2 ]
This query gets the same result with different approach (code):
db.posts.aggregate( [
{
$match: { updatedAt: { $gte: 1549786260000 } }
},
{
$unwind : "$interests"
},
{
$group : {
_id : {
iid: "$interests",
pid: "$publisher"
},
count: {
$sum: {
$switch: {
branches: [
{ case: { $eq: [ "$toggle", false ] },
then: 1 },
{ case: { $and: [ { $eq: [ "$toggle", true] }, { $eq: [ "$status", "INACTIVE" ] } ] },
then: -1 }
]
}
}
}
}
},
{
$project:{
_id: 0,
iid: "$_id.iid",
pid: "$_id.pid",
count: 1
}
}
] )
[ EDIT ADD 3 ]
NOTE:
The facet query runs the two facets (TRUE and FALSE) on the same set of documents; it is like two queries running in parallel. But, there is some duplication of code as well as additional stages for shaping the documents down the pipeline to get the desired output.
The second query avoids the code duplication, and there are far fewer stages in the aggregation pipeline. This will make a difference in terms of performance when the input dataset has a large number of documents to process. In general, fewer stages mean fewer iterations over the documents (as each stage has to scan the documents which are output from the previous stage).

mongodb nested array items exact match

I have a collection as below. What I want is to fetch the items that have an exact match of Tag="dolore". I tried different ways, but I am getting all the elements if any of the embedded elements has a tag of "dolore".
{
"_id" : 123,
"vendor" : "ut",
"boxes" : [
{
"boxRef" : 321,
"items" : [
{
"Tag" : "dolore",
},
{
"Tag" : "irure",
},
{
"Tag" : "labore",
}
]
},
{
"boxRef" : 789,
"items" : [
{
"Tag" : "incididunt",
},
{
"Tag" : "magna",
},
{
"Tag" : "laboris",
}
]
},
{
"boxRef" : 456,
"items" : [
{
"Tag" : "reprehenderit",
},
{
"Tag" : "reprehenderit",
},
{
"Tag" : "enim",
}
]
}
]
}
If you are expecting to get only the matching embedded documents, you have to $unwind, $match and then $group to reverse the $unwind. Like this:
db.getCollection('collectionName').aggregate([
{
$unwind:"$boxes"
},
{
$unwind:"$boxes.items"
},
{
$match:{
"boxes.items.Tag":"dolore"
}
},
{
$group:{
_id:{
boxRef:"$boxes.boxRef",
_id:"$_id"
},
vendor:{
"$first":"$vendor"
},
boxRef:{
"$first":"$boxes.boxRef"
},
items:{
$push:"$boxes.items"
}
}
},
{
$group:{
_id:"$_id._id",
vendor:{
"$first":"$vendor"
},
boxes:{
$push:{
boxRef:"$boxRef",
items:"$items"
}
}
}
},
])
Output:
{
"_id" : 123.0,
"vendor" : "ut",
"boxes" : [
{
"boxRef" : 321.0,
"items" : [
{
"Tag" : "dolore"
}
]
}
]
}

Mongodb Aggregation Pipeline Count Total size across multiple fields

`"ActivityScores" : {
"Spring" : [
{
"ActivityId" : "8fd38724-7e7d-4518-bd49-d38a8b4b3435",
"ActivityTime" : "2017-05-25T16:07:02.000-06:00"
}
],
"Winter" : [
{
"ActivityId" : "90d2a976-19d9-4ce0-aa88-d32c122d173b",
"ActivityTime" : "2017-02-14T22:50:00.000-06:00"
}
],
"Fall" : [
{
"ActivityId" : "84b8c41e-788f-4acd-abec-dc455285972b",
"ActivityTime" : "2016-11-15T22:37:02.000-06:00"
},
{
"ActivityId" : "157af880-d47b-42fc-8ecf-ecfc1bbb56b1",
"ActivityTime" : "2016-09-01T22:50:05.000-06:00"
}
]
},
"Grade" : "2",
"GradeTag" : "GRADE_2", `
I am looking for aggregation query to get Total of ActivityIds. I tried various combination of $group, $unwind, $size $addToset but none of them seems to be working . I need to find total activities using aggregation framework only. I don't want to go through each document using javascript or python to get the total counts. Is there any easy way around?
Thanks. We are on version 3.2. Finally, the combination below worked. ActivityScores was a field under entity.SchoolYears in our schema. Here is the working aggregation pipeline for me:
db.studentcontentareadocument.aggregate(
[
{
$project: {
"SpringActivitiesPerDoc" : {
"$size" : "$entity.SchoolYears.ActivityScores.Spring"
},
"WinterActivitiesPerDoc" : {
"$size" : "$entity.SchoolYears.ActivityScores.Winter"
},
"FallActivitiesPerDoc" : {
"$size" : "$entity.SchoolYears.ActivityScores.Fall"
}
}
},
{
$project: {
"TotalActivitiesPerDoc" : {
"$add" : [
"$SpringActivitiesPerDoc",
"$WinterActivitiesPerDoc",
"$FallActivitiesPerDoc"
]
}
}
},
{
$group: {
"_id" : null,
"TotalActivities" : {
"$sum" : "$TotalActivitiesPerDoc"
}
}
},
{
$project: {
"_id" : 0,
"TotalSGPActivities" : "$TotalActivities"
}
}
],
{
cursor: {
batchSize: 50
},
allowDiskUse: true
}
);

MongoDB. How to set up indexes?

Please help me with indexes in mongoDB.
There is a collection in which 800,000 documents.
There is a request that is very long runs. About 5 seconds!
{
"$or":[
{
"performer":"534ba408f9cd0ecb51711673",
"$or":[
{
"performersRole":"534ba30bf9cd0ec151a69522"
},
{
"performersRole":{
"$exists":false
}
}
]
},
{
"performersRole":"534ba30bf9cd0ec151a69522",
"notShowInToDo":{
"$ne":true
}
}
],
"taskTime":{
"$gte":1409774400,
"$lt":1409860799
},
"$and":[
{
"$or":[
{
"department":{
"$in":[
"5356134ef9cd0e4805672a15",
"53561368f9cd0e4b05645f3f",
"53a0357ff9cd0e670537c4b7",
"53a03594f9cd0e6705389449"
]
}
},
{
"department":{
"$exists":false
}
}
]
},
{
"$or":[
{
"salon":"534f7b3bf9cd0e311e77896f"
},
{
"salon":{
"$exists":false
}
}
]
}
],
"isDone":{
"$ne":true
}
}
Which indexes to add to optimize? Thanks for any advice!
Almost all documents about this format:
{
"_id": "541da66cf535a4a8569dd0ed",
"title": "test task",
"taskTime": NumberLong(1411229292),
"client": "53f876b2f535a4187f9e1264",
"salon": "534f7c3cf9cd0e91206dd948",
"track": "541da66cf535a4a8569dd0ec",
"department": "53a0357ff9cd0e670537c4b7",
"type": "invitePBP",
"performersRole": [
"534ba30bf9cd0ec151a69522"
],
"notShowInToDo": true,
"#createTime": NumberLong(1411229292),
"#updateTime": NumberLong(1411229292)
}
Before the creation of index, consider following points:
1. Cut down the depth of the query hierarchy as much as you can;
2. Avoid using $and and $or if possible;
3. Avoid using $exists if possible, as it will access the collection even when there is an index on the field;
4. Design the index according to the execution sequence you want.
Suppose I have understood your requirements correctly, then I reconstruct the query as below:
var query = {
"taskTime" : {
"$gte" : 1409774400,
"$lt" : 1409860799
},
"isDone" : {
"$ne" : true
},
"$and" : [
{
"salon" : {
"$in" : [ null, "534f7b3bf9cd0e311e77896f" ]
}
}, {
"department" : {
"$in" : [ null,
"5356134ef9cd0e4805672a15",
"53561368f9cd0e4b05645f3f",
"53a0357ff9cd0e670537c4b7",
"53a03594f9cd0e6705389449" ]
}
}],
"$or" : [ {
"performer" : "534ba408f9cd0ecb51711673",
"performersRole" : {
"$in" : [ null, "534ba30bf9cd0ec151a69522" ]
}
}, {
"performersRole" : "534ba30bf9cd0ec151a69522",
"notShowInToDo" : {
"$ne" : true
}
} ]
};
Be careful of null:
Note that {"salon" : {"$in" : [ null, "534f7b3bf9cd0e311e77896f" ]}} can work entirely on the index {salon:1} in v2.4 but will still access the collection in v2.6. I don't know the exact reason, but my guess is that the definition of null may have been changed (to include the undefined type).
To avoid this issue in v2.6, an alternative is to initialize a real value to field salon instead of doing nothing.
You can try creating the index this way; your feedback is appreciated, since I don't have the real data to test with.
db.c.ensureIndex({taskTime:1, isDone:1, salon:1, department:1}, {name:"bigIndex"});
Adding my test result - 1,010,000 documents
var a = {
"taskTime" : {
"$gte" : 1410443932781,
"$lt" : 1412443932781
},
"isDone" : {
"$ne" : true
},
"$and" : [
{
"salon" : {
"$in" : [ null, "534f7b3bf9cd0e311e77896f", "5420ecdc218ba2fb5353ad5b" ]
}
}, {
"department" : {
"$in" : [ null,
"5356134ef9cd0e4805672a15",
"53561368f9cd0e4b05645f3f",
"53a0357ff9cd0e670537c4b7", "5420ecdc218ba2fb5353ad5d",
"53a03594f9cd0e6705389449" ]
}
}],
"$or" : [ {
"performer" : "534ba408f9cd0ecb51711673",
"performersRole" : {
"$in" : [ null, "5420ecdc218ba2fb5353ad5e" ]
}
}, {
"performersRole" : "5420ecdc218ba2fb5353ad5e",
"notShowInToDo" : {
"$ne" : true
}
} ]
};
db.c.find(a).explain();
{
"cursor" : "BtreeCursor bigIndex",
"isMultiKey" : false,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 54290,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 54290,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 425,
"nChunkSkips" : 0,
"millis" : 261,
"indexBounds" : {
"taskTime" : [
[
1410443932781,
1412443932781
]
],
"isDone" : [
[
{
"$minElement" : 1
},
true
],
[
true,
{
"$maxElement" : 1
}
]
],
"salon" : [
[
null,
null
],
[
"534f7b3bf9cd0e311e77896f",
"534f7b3bf9cd0e311e77896f"
],
[
"5420ecdc218ba2fb5353ad5b",
"5420ecdc218ba2fb5353ad5b"
]
],
"department" : [
[
null,
null
],
[
"5356134ef9cd0e4805672a15",
"5356134ef9cd0e4805672a15"
],
[
"53561368f9cd0e4b05645f3f",
"53561368f9cd0e4b05645f3f"
],
[
"53a0357ff9cd0e670537c4b7",
"53a0357ff9cd0e670537c4b7"
],
[
"53a03594f9cd0e6705389449",
"53a03594f9cd0e6705389449"
],
[
"5420ecdc218ba2fb5353ad5d",
"5420ecdc218ba2fb5353ad5d"
]
]
},
"server" : "Mars-PC:27017",
"filterSet" : false
}

"InternalError No plan available to provide stats" on aggregate with explain

When I run my aggregation using explain, as described here I get the following...
{
"stages":[
{
"$cursor":{
...
"planError":"InternalError No plan available to provide stats"
}
Any thoughts on what is going on here? I really need to be able to see what (if any) index is being used in my $match stage.
This seems to be a MongoDB 2.6 bug. Check the JIRA ticket.
I tweaked your query just a bit (adding a match to the front, since I don't want to unwind the Tags array for all documents):
db.collection.aggregate(
[
{ $match: {$or: [{"Tags._id":"tag1"},{"Tags._id":"tag2"}]}},
{ $unwind : "$Tags" },
{ $match: {$or: [{"Tags._id":"tag1"},{"Tags._id":"tag2"}]}},
{ $group: { _id : "$_id", count: { $sum:1 } }},
{$sort: {"count":-1}}
],
{ explain: true }
)
And got:
{
"stages" : [
{
"$cursor" : {
"query" : {
"$or" : [
{
"Tags._id" : "tag1"
},
{
"Tags._id" : "tag2"
}
]
},
"plan" : {
"cursor" : "BtreeCursor ",
"isMultiKey" : false,
"scanAndOrder" : false,
"indexBounds" : {
"Tags._id" : [
[
"tag1",
"tag1"
],
[
"tag2",
"tag2"
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor ",
"isMultiKey" : false,
"scanAndOrder" : false,
"indexBounds" : {
"Tags._id" : [
[
"tag1",
"tag1"
],
[
"tag2",
"tag2"
]
]
}
}
]
}
}
},
{
"$unwind" : "$Tags"
},
{
"$match" : {
"$or" : [
{
"Tags._id" : "tag1"
},
{
"Tags._id" : "tag2"
}
]
}
},
{
"$group" : {
"_id" : "$_id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$sort" : {
"sortKey" : {
"count" : -1
}
}
}
],
"ok" : 1
}
While this doesn't quite address why your operation returns a planError, maybe it can help somehow.
Regards
Had the same issue in my Rails app, fixed it by restarting rails server.
MongoDB version is 2.6.4.
I worked around this by rebuilding all indexes on the collection. Not exactly elegant, but the error is gone now.