How can I put $group condition to $match? [duplicate] - mongodb

For what would be this query in SQL (to find duplicates):
SELECT userId, name FROM col GROUP BY userId, name HAVING COUNT(*)>1
I performed this simple query in MongoDB:
res = db.col.group({key:{userId:true,name:true},
reduce: function(obj,prev) {prev.count++;},
initial: {count:0}})
I've added a simple Javascript loop to go over the result set, and performed a filter to find all the fields with a count > 1 there, like so:
for (i in res) {if (res[i].count>1) printjson(res[i])};
Is there a better way to do this other than using javascript code in the client?
If this is the best/simplest way, say that it is, and this question will help someone :)

New answer using Mongo aggregation framework
After this question was asked and answered, 10gen released MongoDB version 2.2 with an aggregation framework. The new best way to do this query is:
db.col.aggregate( [
{ $group: { _id: { userId: "$userId", name: "$name" },
count: { $sum: 1 } } },
{ $match: { count: { $gt: 1 } } },
{ $project: { _id: 0,
userId: "$_id.userId",
name: "$_id.name",
count: 1}}
] )
10gen has a handy SQL to Mongo Aggregation conversion chart worth bookmarking.
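For illustration, here is a minimal sketch of what the pipeline does on the col collection; the sample documents and the shown output are made up for the example:
// Hypothetical sample data (made up for illustration)
db.col.insertMany([
    { userId: 1, name: "Alice" },
    { userId: 1, name: "Alice" },
    { userId: 2, name: "Bob" }
])
// Running the pipeline above would then return only the duplicated pair:
// { "userId" : 1, "name" : "Alice", "count" : 2 }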

The answer already given is apt, to be honest, and the use of projection makes it even better thanks to implicit optimisation working under the hood. I have made a small change, and I explain the benefit below.
The original command
db.getCollection('so').explain(1).aggregate( [
{ $group: { _id: { userId: "$userId", name: "$name" },
count: { $sum: 1 } } },
{ $match: { count: { $gt: 1 } } },
{ $project: { _id: 0,
userId: "$_id.userId",
name: "$_id.name",
count: 1}}
] )
Parts from the explain plan
{
"stages" : [
{
"$cursor" : {
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "5fa42c8b8778717d277f67c4_test.so",
"indexFilterSet" : false,
"parsedQuery" : {},
"queryHash" : "F301762B",
"planCacheKey" : "F301762B",
"winningPlan" : {
"stage" : "PROJECTION_SIMPLE",
"transformBy" : {
"name" : 1,
"userId" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "COLLSCAN",
"direction" : "forward"
}
},
"rejectedPlans" : []
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 6000,
"executionTimeMillis" : 8,
"totalKeysExamined" : 0,
"totalDocsExamined" : 6000,
The sample set is pretty small, just 6000 documents.
This query works on data in the WiredTiger internal cache, so if the collection is huge, all of that data has to be kept in the internal cache for the execution to take place. The WT cache is quite important, and if this command occupies that much of it, the cache will need to be sized larger to accommodate other operations.
Now for a small hack: the addition of an index.
db.getCollection('so').createIndex({userId : 1, name : 1})
New Command
db.getCollection('so').explain(1).aggregate( [
{$match : {name :{ "$ne" : null }, userId : { "$ne" : null } }},
{ $group: { _id: { userId: "$userId", name: "$name" },
count: { $sum: 1 } } },
{ $match: { count: { $gt: 1 } } },
{ $project: { _id: 0,
userId: "$_id.userId",
name: "$_id.name",
count: 1}}
] )
Explain Plan
{
"stages": [{
"$cursor": {
"queryPlanner": {
"plannerVersion": 1,
"namespace": "5fa42c8b8778717d277f67c4_test.so",
"indexFilterSet": false,
"parsedQuery": {
"$and": [{
"name": {
"$not": {
"$eq": null
}
}
},
{
"userId": {
"$not": {
"$eq": null
}
}
}
]
},
"queryHash": "4EF9C4D5",
"planCacheKey": "3898FC0A",
"winningPlan": {
"stage": "PROJECTION_COVERED",
"transformBy": {
"name": 1,
"userId": 1,
"_id": 0
},
"inputStage": {
"stage": "IXSCAN",
"keyPattern": {
"userId": 1.0,
"name": 1.0
},
"indexName": "userId_1_name_1",
"isMultiKey": false,
"multiKeyPaths": {
"userId": [],
"name": []
},
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": 2,
"direction": "forward",
"indexBounds": {
"userId": [
"[MinKey, undefined)",
"(null, MaxKey]"
],
"name": [
"[MinKey, undefined)",
"(null, MaxKey]"
]
}
}
},
"rejectedPlans": [{
"stage": "PROJECTION_SIMPLE",
"transformBy": {
"name": 1,
"userId": 1,
"_id": 0
},
"inputStage": {
"stage": "FETCH",
"filter": {
"userId": {
"$not": {
"$eq": null
}
}
},
"inputStage": {
"stage": "IXSCAN",
"keyPattern": {
"name": 1.0
},
"indexName": "name_1",
"isMultiKey": false,
"multiKeyPaths": {
"name": []
},
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": 2,
"direction": "forward",
"indexBounds": {
"name": [
"[MinKey, undefined)",
"(null, MaxKey]"
]
}
}
}
}]
},
"executionStats": {
"executionSuccess": true,
"nReturned": 6000,
"executionTimeMillis": 9,
"totalKeysExamined": 6000,
"totalDocsExamined": 0,
"executionStages": {
"stage": "PROJECTION_COVERED",
"nReturned": 6000,
Check the PROJECTION_COVERED part: this command is a covered query, which basically relies only on data held in the indexes.
This command won't need to pull documents into the WT internal cache because it never reads them at all; check the docs examined, it is 0. Since the data is in the index, the index alone drives the execution, which is a big positive for a system where the WT cache is already under pressure from other operations (a small verification sketch follows these notes).
If there is ever a requirement to search for specific names rather than scan the whole collection, this index becomes useful for that too :D
The disadvantage here is the addition of an index. If the index is also utilised by other operations there is honestly no downside, but if it is an extra addition it will take more space in the cache, and writes are impacted marginally by the additional index.
On the performance front, for 6000 records the time shown is 1 ms more, but for a larger dataset this may vary. It must be noted that the sample documents I inserted have just 3 fields: the two used here plus the default _id. If this collection has a bigger document size, the execution time for the original command will increase, and so will the volume it occupies in the cache.
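If you want to verify the covered plan yourself, here is a minimal mongosh sketch, assuming the same 'so' collection and the { userId: 1, name: 1 } index created above; the exact explain output shape can vary by server version:
// Minimal verification sketch; "so" and the index are the ones created above
var plan = db.getCollection('so').explain("executionStats").aggregate([
    { $match: { name: { $ne: null }, userId: { $ne: null } } },
    { $group: { _id: { userId: "$userId", name: "$name" }, count: { $sum: 1 } } },
    { $match: { count: { $gt: 1 } } }
])
// Look for "stage" : "PROJECTION_COVERED" / "IXSCAN" and "totalDocsExamined" : 0
printjson(plan)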

Related

Can I write an index for an $or query in mongodb?

I have a few different but similar mongodb queries that I need to write an index for. Is this the correct way to do it?
First Query:
{
$or: [
{_id: "..."},
{linkedListingID: "..."}
]
}
Second Query:
{
$or: [
{_id: "..."},
{linkedListingID: {$in: ["...", "..."]}}
]
}
Third Query:
$or: [
{
_id: {$in: ["...", "..."]},
linkedListingID: {
$exists: false,
},
},
{
linkedListingID: {$in: ["...", "..."]},
},
];
Index:
Listing.index(
{
_id: 1,
linkedListingID: 1
},
{name: "index_name"}
);
As per $or documentation,
When using indexes with $or queries, each clause of an $or can use its own index.
That means it will not use the compound index!
To support your query, rather than a compound index, you would create one index on _id and another index on linkedListingID:
Listing.index({ _id: 1 });
Listing.index({ linkedListingID: 1 });
For more details, run explain() with your query and check executionStats > executionStages > inputStage; for example:
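A minimal sketch of that check in the mongo shell (the collection name and the values are placeholders; the output structure may vary slightly by server version):
// "listings" and the values are placeholders
db.listings.find({
    $or: [
        { _id: "abc" },
        { linkedListingID: "bca" }
    ]
}).explain("executionStats").executionStats.executionStages
// With only the compound index the input stage is a COLLSCAN;
// with one index per field each $or clause gets its own IXSCAN (shown below)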
Explain output with the compound index:
Listing.index({ _id: 1, linkedListingID: 1 }, {name: "index_name"});
Result: this scans the full collection ("stage": "COLLSCAN")!
"winningPlan": {
"inputStage": {
"direction": "forward",
"filter": {
"$or": [
{
"_id": {
"$eq": "abc"
}
},
{
"linkedListingID": {
"$eq": "bca"
}
}
]
},
"stage": "COLLSCAN"
},
"stage": "SUBPLAN"
}
Explain output with single indexes: here _id does not need a new index, because _id has a unique index by default.
Listing.index({ linkedListingID: 1 });
Result: this will at least use each index individually ("stage": "IXSCAN")!
"winningPlan": {
"inputStage": {
"inputStage": {
"inputStages": [
{
"direction": "forward",
"indexBounds": {
"_id": [
"[\"abc\", \"abc\"]"
]
},
"indexName": "_id_",
"indexVersion": 2,
"isMultiKey": false,
"isPartial": false,
"isSparse": false,
"isUnique": true,
"keyPattern": {
"_id": 1
},
"multiKeyPaths": {
"_id": []
},
"stage": "IXSCAN"
},
{
"direction": "forward",
"indexBounds": {
"linkedListingID": [
"[\"bca\", \"bca\"]"
]
},
"indexName": "_linkedListingID",
"indexVersion": 2,
"isMultiKey": false,
"isPartial": false,
"isSparse": false,
"isUnique": false,
"keyPattern": {
"linkedListingID": 1
},
"multiKeyPaths": {
"linkedListingID": []
},
"stage": "IXSCAN"
}
],
"stage": "OR"
},
"stage": "FETCH"
},
"stage": "SUBPLAN"
}
By default, MongoDB creates a unique index on the _id field during the creation of a collection.
Your query could be as simple as:
{_id: "..."}
MongoDB ensures that unique-indexed fields do not store duplicate values; for that reason, a query on _id will always retrieve a unique document.
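You can see that default index from the shell; a small sketch (the collection name is a placeholder):
// "listings" is a placeholder; every collection gets the "_id_" index automatically
db.listings.getIndexes()
// [ { "v" : 2, "key" : { "_id" : 1 }, "name" : "_id_" }, ... ]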

Sort exceeds limit on text search?

I have defined an index like this:
db.imageProperties.createIndex(
{
"imageProperties.cameraMaker": "text",
"imageProperties.cameraModel": "text",
"imageProperties.dateTimeOriginal": -1,
},
{ name: "TextIndex" }
)
But, when I try to run a query with a sort like this:
db.imageProperties.find( { $text: { $search: "nikon" } }, {"imagePath" : 1, _id: 0 } ).sort( { "imageProperties.dateTimeOriginal": -1 } )
I get this error:
Error: error: {
"ok" : 0,
"errmsg" : "Executor error during find command :: caused by :: Sort operation used more than the maximum 33554432 bytes of RAM. Add an index, or specify a smaller limit.",
"code" : 96,
"codeName" : "OperationFailed"
It is my understanding from reading the documentation that it would be possible to combine text search with sorting by creating a combined index as I have done.
This is the output from .explain() on the above query:
> db.imageProperties.find( { $text: { $search: "nikon" } }, {"imagePath" : 1, _id: 0 } ).sort( { "imageProperties.dateTimeOriginal": -1 } ).explain()
{
"queryPlanner": {
"plannerVersion": 1,
"namespace": "olavt-images.imageProperties",
"indexFilterSet": false,
"parsedQuery": {
"$text": {
"$search": "nikon",
"$language": "english",
"$caseSensitive": false,
"$diacriticSensitive": false
}
},
"queryHash": "1DCFCE0B",
"planCacheKey": "650B3A8E",
"winningPlan": {
"stage": "PROJECTION_SIMPLE",
"transformBy": {
"imagePath": 1,
"_id": 0
},
"inputStage": {
"stage": "SORT",
"sortPattern": {
"imageProperties.dateTimeOriginal": -1
},
"inputStage": {
"stage": "SORT_KEY_GENERATOR",
"inputStage": {
"stage": "TEXT",
"indexPrefix": {
},
"indexName": "TextIndex",
"parsedTextQuery": {
"terms": [
"nikon"
],
"negatedTerms": [],
"phrases": [],
"negatedPhrases": []
},
"textIndexVersion": 3,
"inputStage": {
"stage": "TEXT_MATCH",
"inputStage": {
"stage": "FETCH",
"inputStage": {
"stage": "OR",
"inputStage": {
"stage": "IXSCAN",
"keyPattern": {
"_fts": "text",
"_ftsx": 1,
"imageProperties.dateTimeOriginal": -1
},
"indexName": "TextIndex",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": 2,
"direction": "backward",
"indexBounds": {
}
}
}
}
}
}
}
}
},
"rejectedPlans": []
},
"serverInfo": {
"host": "4794df1ed9c4",
"port": 27017,
"version": "4.2.5",
"gitVersion": "2261279b51ea13df08ae708ff278f0679c59dc32"
},
"ok": 1
}
How can I get the desired behavior?
The error indicates that sorting the result requires more memory than is configured.
The field imagePath that you want to project is not covered by the TextIndex; try adding a new index:
db.imageProperties.createIndex(
{
"imageProperties.cameraMaker": "text",
"imageProperties.cameraModel": "text",
"imageProperties.dateTimeOriginal": -1,
"imagePath": 1
}
)
Then try the following steps:
Check that the indexes are created successfully by running:
db.imageProperties.getIndexes()
Check whether the correct index is being used:
db.imageProperties.find( { $text: { $search: "nikon" } }, {"imagePath" : 1, _id: 0 } )
.sort( { "imageProperties.dateTimeOriginal": -1 } ).explain()
If you only want a limited number of results, also add a limit:
db.imageProperties.find( { $text: { $search: "nikon" } }, {"imagePath" : 1, _id: 0 } )
.sort( { "imageProperties.dateTimeOriginal": -1 } ).limit(100)
You can also allow disk use by running the query through the aggregation framework with allowDiskUse:
db.imageProperties.aggregate([{
$match: { $text: { $search: "nikon" } }
}, {
$sort: { "imageProperties.dateTimeOriginal": -1 }
} , {
$project: { imagePath: 1 }
}], {
allowDiskUse: true
})

Complex time series query time difference per action type

I have this data structure of documents in MongoDB, intended for complex data analysis from any point of view in relation to time series of different actions (a flat data log). I found it difficult to extract the time taken between specific types of changes per document using mongo queries and then applying the $graphLookup function (shown below). I'm a beginner in MongoDB and I need help with the query to fetch the required data.
The data structure of a single document (example):
{
"_id":NumberInt(1),
"Creation": ISODate("2018-11-19T06:30:42Z"),
"Creator": NumberInt(1),
"Replies": NumberInt(10),
//... other aggregated properties
"CurrentProperties":{ // a copy of the last update signifying the current state
"StatusId": NumberInt(8),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(5),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:17:20Z"),
"TimeDelta": NumberLong(3600000), //timespan from last change in MS
"ChangeType": NumberInt(4),
"UserId": NumberInt(1)
},
"ChangeHistory":[ // time series changes
{
"StatusId": NumberInt(8),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(1),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:14:20Z"),
"TimeDelta": NumberLong(0), //timespan from last change in MS
"ChangeType": NumberInt(0), // the changed property identifier (0= creation)
"UserId": NumberInt(1)
},
{
"StatusId": NumberInt(8),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(2),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:15:50Z"),
"TimeDelta": NumberLong(90000), //timespan from last change in MS
"ChangeType": NumberInt(4), // the changed property identifier (4= department)
"UserId": NumberInt(1)
},
{
"StatusId": NumberInt(2),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(2),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:16:20Z"),
"TimeDelta": NumberLong(30000), //timespan from last change in MS
"ChangeType": NumberInt(2), // the changed property identifier (2= status)
"UserId": NumberInt(1)
},
{
"StatusId": NumberInt(2),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(5),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:17:20Z"),
"TimeDelta": NumberLong(60000), //timespan from last change in MS
"ChangeType": NumberInt(4), // the changed property identifier (4= department)
"UserId": NumberInt(1)
}
]
}
The expected result for department changes in time:
[{
RecordID: 1,
Department: 1,
ChangeTime: ISODate("2018-11-19T10:15:50Z"),
TimeSpent: 90000
},
{
RecordID: 1,
Department: 2,
ChangeTime: ISODate("2018-11-19T10:17:20Z")
TimeSpent: 90000
},
{
RecordID: 1,
Department: 5,
ChangeTime: ISODate("2018-11-21T09:47:47Z") // Current Time
TimeSpent: 171027000 //difference between now and last change in departments
}]
and for status:
[{
RecordID: 1,
Status: 8,
ChangeTime: ISODate("2018-11-19T10:16:20Z"),
TimeDelta: 120000
},
{
RecordID: 1,
Status: 2,
ChangeTime: ISODate("2018-11-21T09:47:47Z"), // Current Time
TimeDelta: 171087000 //difference between now and last change in status
}]
What I tried so far
The best result I got so far was using the following aggregation to create a view and then applying a $graphLookup stage on the view:
db.test.aggregate([
{$project: {
_id:0,
RecordID: "$_id",
history: {
$filter: {
input: "$ChangeHistory",
as: "changeHistory",
cond: {$or:[
{$eq:["$$changeHistory.ChangeType",0]},
{$eq:["$$changeHistory.ChangeType",4]}
]}
}
}
}},
{$unwind: {
path: "$history",
includeArrayIndex:"order"
}}, {$project: {
_id:"$RecordID",
"RecordID": "$RecordID",
"departmentID": "$history.DepartmentId",
"actionOrder":"$order",
"nextAction":{$add:["$order",1]},
"time":"$history.ChangeTime"
}}
])
then applied the following:
db.TestView.aggregate([{
$graphLookup: {
from: 'TestView',
startWith: "$nextAction",
connectFromField: 'nextAction',
connectToField: 'actionOrder',
as: 'pair',
}
}, {
$unwind: {
path: "$pair"
}
}, {
$project: {
_id: 0,
RecordID: "$_id",
Department: "$departmentID",
ChangeTime: "$pair.time",
TimeSpent: {
$subtract: ["$pair.time", "$time"]
}
}
}
])
The problem with this is that it mixes the action pairing across different documents, doesn't include the time spent up to the current moment, and involves many passes on top of using a view in the middle.
The data structure can be modified a little if needed.
I actually took 2 days trying to figure out a solution for this before posting the question, and I solved it a few hours later.
Just wanted to share my solution; if anyone can optimize it for performance or anything else, please feel free to post your answers too.
Solution
It makes use of the $zip operator to form pairs of actions after the filter is applied, by zipping the original array of events with a copy of the same array that excludes its first element, so that the first element gets matched with the second, the second with the third, and so on. I also added a default of the current time so that the last element's delta is calculated against the current time.
db.test.aggregate([{
$project: {
RecordID: "$_id",
history: {
$filter: {
input: "$ChangeHistory",
as: "changeHistory",
cond: {
$or: [{
$eq: ["$$changeHistory.ChangeType", 0]
},
{
$eq: ["$$changeHistory.ChangeType", 2]
}
]
}
}
}
}
},
{
$addFields: {
pairs: {
$zip: { // here is the trick
inputs: ["$history", {
$slice: ["$history", 1, {
$size: "$history"
}]
}],
useLongestLength: true,
defaults: [0, {
ChangeTime: new Date()
}]
}
}
}
},
{
$unwind: {
path: "$pairs"
}
},
{
$project: {
id: "$_id",
old: {
$arrayElemAt: ["$pairs", 0]
},
new: {
$arrayElemAt: ["$pairs", 1]
}
}
},
{
$project: {
RecordID: "$id",
Status: "$old.StatusId",
TimeDeltaMS: {
$subtract: ["$new.ChangeTime", "$old.ChangeTime"]
},
ChangeTime: "$new.ChangeTime"
}
},
])
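The pipeline above gives the status breakdown (ChangeType 0 and 2). Per the earlier view-based attempt, the department breakdown only needs the filter and the final projection changed; a sketch along those lines, reusing the same collection and field names:
// Department variant: ChangeType 4 = department change (0 = creation), per the sample data
db.test.aggregate([
    { $project: {
        history: { $filter: {
            input: "$ChangeHistory",
            as: "changeHistory",
            cond: { $or: [
                { $eq: ["$$changeHistory.ChangeType", 0] },
                { $eq: ["$$changeHistory.ChangeType", 4] }
            ] }
        } }
    } },
    { $addFields: { pairs: { $zip: {
        inputs: ["$history", { $slice: ["$history", 1, { $size: "$history" }] }],
        useLongestLength: true,
        defaults: [0, { ChangeTime: new Date() }]
    } } } },
    { $unwind: { path: "$pairs" } },
    { $project: {
        old: { $arrayElemAt: ["$pairs", 0] },
        new: { $arrayElemAt: ["$pairs", 1] }
    } },
    { $project: {
        RecordID: "$_id",
        Department: "$old.DepartmentId",
        ChangeTime: "$new.ChangeTime",
        TimeSpentMS: { $subtract: ["$new.ChangeTime", "$old.ChangeTime"] }
    } }
])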

Unable to compute average time

Three days into MongoDB, and I am not finding it very fluid. I am simply trying to compute the average time for a field, but I keep running into all sorts of problems.
Here is my code:
db.results.group({
key:{"profile.Zend_Http_Client_Adapter_Socket::read==>fgets":{$exists:true}},
initial: {count: 0, total:0},
reduce: function(doc, out){
out.count++;
out.total += doc."profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt";
},
finalize: function(out){
out.avg = out.total/out.count;
}
});
The error:
SyntaxError: Unexpected String
The above is a very childish error, but I can't understand why it would state this; the only plausible reason I can think of is that the keys I have specified here are in quotes, hence Mongo is getting confused.
BUT, these keys ARE in quotes in my collection, so there should be no reason why I keep getting this syntax error, right?
Sample document:
{
"_id" : ObjectId("532a2a986803faba658b456b"),
"profile" : {
"main()==>register_shutdown_function" : {
"ct" : 1,
"wt" : 13,
"cpu" : 0,
"mu" : 1568,
"pmu" : 1000
},
"main()==>load::htdocs/index.php" : {
"ct" : 1,
"wt" : 17,
"cpu" : 0,
"mu" : 1736,
"pmu" : 4296
},
{"Zend_Http_Client_Adapter_Curl::write==>curl_exec" : {
"ct" : 3,
"wt" : 54782314,
"cpu" : 16001,
"mu" : 83288,
"pmu" : 49648
}, ....
As per the comment, your problem is one of forming valid JavaScript. Also, your "key" value would not seem to be what you really want. There is, however, the aggregate function, which you should be favoring over the use of "group":
db.results.aggregate([
{ "$match": {
"$and": [
{ "profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt": {
"$exists": true
}},
{ "profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt": {
"$not": { "$type": 2 }
}}
]
}},
{ "$group": {
"_id": null,
"total": { "$sum":
"$profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt"
},
"count": { "$sum": 1 }
}},
{ "$project": {
"_id": 0,
"avg": { "$divide": [ "$total", "$count" ] }
}}
])
The aggregation pipeline largely supersedes earlier functions such as group and distinct, and for all but trivial operations it should be your favored choice.
It will also run much faster, as it is processed in native code rather than the JavaScript engine.
Also see the SQL to aggregation mapping chart in the documentation.
Problems With Data
Your sample is not very complete. To sort out all issues I have to put in a document like this:
{
"profile": {
"Zend_Http_Client_Adapter_Socket::read==>fgets": {
"ct" : 3,
"wt" : 54782314,
"cpu" : 16001,
"mu" : 83288,
"pmu" : 49648
},
}
}
Also your document example has some invalid fields in it:
{
"_id" : ObjectId("532a2a986803faba658b456b"),
"profile" : {
"main()==>register_shutdown_function" : {
"ct" : 1,
"wt" : 13,
"cpu" : 0,
"mu" : 1568,
"pmu" : 1000
},
"main()==>load::htdocs/index.php" : { <-- Invalid
"ct" : 1,
"wt" : 17,
"cpu" : 0,
"mu" : 1736,
"pmu" : 4296
},
So that field cannot exist as it has a . in the field name, which for obvious sub-document reasons is not allowed.
Neil's answer led me to the correct solution:
db.results.aggregate([
{
$match: {
"profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt": {
"$exists": true
}
}
},
{
$group: {
"_id": null,
"total": {
$sum: "$profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt"
},
"count": {
$sum: 1
}
}
},
{
$project: {
"_id": 0,
"count": "$count",
"avg": {
$divide: [
"$total",
"$count"
]
}
}
}
]);
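As a side note, here is a sketch of an equivalent pipeline that uses the built-in $avg accumulator instead of dividing a sum by a count (not from the original answers, just an alternative):
// Alternative sketch using the $avg accumulator (same match as above)
db.results.aggregate([
    { $match: { "profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt": { $exists: true } } },
    { $group: {
        _id: null,
        count: { $sum: 1 },
        avg: { $avg: "$profile.Zend_Http_Client_Adapter_Socket::read==>fgets.wt" }
    } },
    { $project: { _id: 0, count: 1, avg: 1 } }
])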
