Check if every element in array matches condition - mongodb

I have a collection of documents:
{
    date: Date,
    users: [
        { user: 1, group: 1 },
        { user: 5, group: 2 }
    ]
},
{
    date: Date,
    users: [
        { user: 1, group: 1 },
        { user: 3, group: 2 }
    ]
}
I would like to query against this collection to find all documents where every user id in my array of users is in another array, [1, 5, 7]. In this example, only the first document matches.
The best solution I've been able to find is to do:
$where: function() {
var ids = [1, 5, 7];
return this.users.every(function(u) {
return ids.indexOf(u.user) !== -1;
});
}
Unfortunately, this seems to hurt performance, as stated in the $where docs:
$where evaluates JavaScript and cannot take advantage of indexes.
How can I improve this query?

The query you want is this:
db.collection.find({"users":{"$not":{"$elemMatch":{"user":{$nin:[1,5,7]}}}}})
This says find me all documents that don't have elements that are outside of the list 1,5,7.
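As a quick sanity check against the two sample documents in the question, running this should return only the first document, since user 3 in the second one falls outside the list:

db.collection.find(
    { "users": { "$not": { "$elemMatch": { "user": { "$nin": [ 1, 5, 7 ] } } } } }
).count()
// → 1  (only the document with users 1 and 5)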

I don't know about "better", but there are a few different ways to approach this, depending on the version of MongoDB you have available.
Not too sure if this is your intention or not, but the query as shown will match the first document example, because as your logic is implemented, every element within that document's array must be contained within the sample array.
So if you actually wanted the document to contain all of those elements, then the $all operator would be the obvious choice:
db.collection.find({ "users.user": { "$all": [ 1, 5, 7 ] } })
But working on the presumption that your logic is actually intended, you can at least "filter" those results by combining with the $in operator, so that fewer documents are subject to your $where condition in evaluated JavaScript:
db.collection.find({
"users.user": { "$in": [ 1, 5, 7 ] },
"$where": function() {
var ids = [1, 5, 7];
return this.users.every(function(u) {
return ids.indexOf(u.user) !== -1;
});
}
})
And you get an index, though what is actually scanned will be multiplied by the number of elements in the arrays from the matched documents; still better than without the additional filter.
Or you might even consider the logical abstraction of the $and operator used in combination with $or, and possibly the $size operator, depending on your actual array conditions:
db.collection.find({
"$or": [
{ "users.user": { "$all": [ 1, 5, 7 ] } },
{ "users.user": { "$all": [ 1, 5 ] } },
{ "users.user": { "$all": [ 1, 7 ] } },
{ "users": { "$size": 1 }, "users.user": 1 },
{ "users": { "$size": 1 }, "users.user": 5 },
{ "users": { "$size": 1 }, "users.user": 7 }
]
})
So this generates all of the possible permutations of your matching condition, but again performance will likely vary depending on your installed version.
NOTE: Actually a complete failure in this case, as it does something entirely different and in fact results in a logical $in
The alternatives use the aggregation framework; your mileage may vary on which is most efficient given the number of documents in your collection. One approach with MongoDB 2.6 and upwards:
db.problem.aggregate([
// Match documents that "could" meet the conditions
{ "$match": {
"users.user": { "$in": [ 1, 5, 7 ] }
}},
// Keep your original document and a copy of the array
{ "$project": {
"_id": {
"_id": "$_id",
"date": "$date",
"users": "$users"
},
"users": 1,
}},
// Unwind the array copy
{ "$unwind": "$users" },
// Just keeping the "user" element value
{ "$group": {
"_id": "$_id",
"users": { "$push": "$users.user" }
}},
// Compare to see if all elements are a member of the desired match
{ "$project": {
"match": { "$setEquals": [
{ "$setIntersection": [ "$users", [ 1, 5, 7 ] ] },
"$users"
]}
}},
// Filter out any documents that did not match
{ "$match": { "match": true } },
// Return the original document form
{ "$project": {
"_id": "$_id._id",
"date": "$_id.date",
"users": "$_id.users"
}}
])
So that approach uses some newly introduced set operators in order to compare the contents, though of course you need to restructure the array in order to make the comparison.
As pointed out, there is a direct operator to do this in $setIsSubset which does the equivalent of the combined operators above in a single operator:
db.collection.aggregate([
{ "$match": {
"users.user": { "$in": [ 1,5,7 ] }
}},
{ "$project": {
"_id": {
"_id": "$_id",
"date": "$date",
"users": "$users"
},
"users": 1,
}},
{ "$unwind": "$users" },
{ "$group": {
"_id": "$_id",
"users": { "$push": "$users.user" }
}},
{ "$project": {
"match": { "$setIsSubset": [ "$users", [ 1, 5, 7 ] ] }
}},
{ "$match": { "match": true } },
{ "$project": {
"_id": "$_id._id",
"date": "$_id.date",
"users": "$_id.users"
}}
])
Or with a different approach while still taking advantage of the $size operator from MongoDB 2.6:
db.collection.aggregate([
// Match documents that "could" meet the conditions
{ "$match": {
"users.user": { "$in": [ 1, 5, 7 ] }
}},
// Keep your original document and a copy of the array
// and a note of its current size
{ "$project": {
"_id": {
"_id": "$_id",
"date": "$date",
"users": "$users"
},
"users": 1,
"size": { "$size": "$users" }
}},
// Unwind the array copy
{ "$unwind": "$users" },
// Filter array contents that do not match
{ "$match": {
"users.user": { "$in": [ 1, 5, 7 ] }
}},
// Count the array elements that did match
{ "$group": {
"_id": "$_id",
"size": { "$first": "$size" },
"count": { "$sum": 1 }
}},
// Compare the original size to the matched count
{ "$project": {
"match": { "$eq": [ "$size", "$count" ] }
}},
// Filter out documents that were not the same
{ "$match": { "match": true } },
// Return the original document form
{ "$project": {
"_id": "$_id._id",
"date": "$_id.date",
"users": "$_id.users"
}}
])
Which of course can still be done, though a little more long winded in versions prior to 2.6:
db.collection.aggregate([
// Match documents that "could" meet the conditions
{ "$match": {
"users.user": { "$in": [ 1, 5, 7 ] }
}},
// Keep your original document and a copy of the array
{ "$project": {
"_id": {
"_id": "$_id",
"date": "$date",
"users": "$users"
},
"users": 1,
}},
// Unwind the array copy
{ "$unwind": "$users" },
// Group it back to get its original size
{ "$group": {
"_id": "$_id",
"users": { "$push": "$users" },
"size": { "$sum": 1 }
}},
// Unwind the array copy again
{ "$unwind": "$users" },
// Filter array contents that do not match
{ "$match": {
"users.user": { "$in": [ 1, 5, 7 ] }
}},
// Count the array elements that did match
{ "$group": {
"_id": "$_id",
"size": { "$first": "$size" },
"count": { "$sum": 1 }
}},
// Compare the original size to the matched count
{ "$project": {
"match": { "$eq": [ "$size", "$count" ] }
}},
// Filter out documents that were not the same
{ "$match": { "match": true } },
// Return the original document form
{ "$project": {
"_id": "$_id._id",
"date": "$_id.date",
"users": "$_id.users"
}}
])
That generally rounds out the different ways; try them out and see what works best for you. In all likelihood the simple combination of $in with your existing form is going to be the best one. But in all cases, make sure you have an index that can be selected:
db.collection.ensureIndex({ "users.user": 1 })
Which is going to give you the best performance as long as you are accessing that in some way, as all the examples here do.
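If you want to confirm that the index is actually selected, a quick check in the shell (the explain() output shape varies by server version, so treat it as a sketch):

db.collection.find({ "users.user": { "$in": [ 1, 5, 7 ] } }).explain()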
Verdict
I was intrigued by this so ultimately contrived a test case in order to see what had the best performance. So first some test data generation:
var batch = [];
for ( var n = 1; n <= 10000; n++ ) {
var elements = Math.floor(Math.random()*10)+1;
var obj = { date: new Date(), users: [] };
for ( var x = 0; x < elements; x++ ) {
var user = Math.floor(Math.random()*10)+1,
group = Math.floor(Math.random()*10)+1;
obj.users.push({ user: user, group: group });
}
batch.push( obj );
if ( n % 500 == 0 ) {
db.problem.insert( batch );
batch = [];
}
}
With 10000 documents in a collection, with random arrays from 1..10 in length holding random values of 1..10, I came to a match count of 430 documents (reduced from the 7749 matched by $in) with the following results (avg):
JavaScript with $in clause: 420ms
Aggregate with $size : 395ms
Aggregate with group array count : 650ms
Aggregate with two set operators : 275ms
Aggregate with $setIsSubset : 250ms
Noting that over the samples taken, all but the last two showed a peak variance of approximately 100ms faster, and the last two both exhibited 220ms responses. The largest variations were in the JavaScript query, which also exhibited results 100ms slower.
But the point here is relative to hardware, which on my laptop under a VM is not particularly great, but gives an idea.
So the aggregate, and specifically the MongoDB 2.6.1 version with set operators clearly wins on performance with the additional slight gain coming from $setIsSubset as a single operator.
This is particularly interesting given (as indicated by the 2.4 compatible method) the largest cost in this process will be the $unwind statement ( over 100ms avg ), so with the $in selection having a mean around 32ms the rest of the pipeline stages execute in less than 100ms on average. So that gives a relative idea of aggregation versus JavaScript performance.

I just spent a substantial portion of my day trying to implement Asya's solution above with object-comparisons rather than strict equality. So I figured I'd share it here.
Let's say you expanded your question from userIds to full users.
You want to find all documents where every item in its users array is present in another users array: [{user: 1, group: 3}, {user: 2, group: 5},...]
This won't work: db.collection.find({"users":{"$not":{"$elemMatch":{"$nin":[{user: 1, group: 3},{user: 2, group: 5},...]}}}}) because $nin only works for strict equality. So we need to find a different way of expressing "not in array" for arrays of objects. And using $where would slow down the query too much.
Solution:
db.collection.find({
    "users": {
        "$not": {
            "$elemMatch": {
                // if all of the OR-blocks are true, the element is not in the array
                "$and": [{
                    // each OR-block == true if the element != that user
                    "$or": [
                        { "user": { "$ne": 1 } },
                        { "group": { "$ne": 3 } }
                    ]
                }, {
                    "$or": [
                        { "user": { "$ne": 2 } },
                        { "group": { "$ne": 5 } }
                    ]
                }, {
                    // more users...
                }]
            }
        }
    }
})
To round out the logic: $elemMatch matches all documents that have a user not in the array. So $not will match all documents that have all of the users in the array.
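Hand-writing one block per allowed user gets tedious, so as a small sketch (the helper name and shape are my own, not from the original answer), the filter can be generated from an array:

// Build the "$not"/"$elemMatch" filter from an array of allowed { user, group } pairs
function everyUserIn(allowedUsers) {
    return {
        "users": {
            "$not": {
                "$elemMatch": {
                    "$and": allowedUsers.map(function(u) {
                        return { "$or": [
                            { "user": { "$ne": u.user } },
                            { "group": { "$ne": u.group } }
                        ]};
                    })
                }
            }
        }
    };
}

db.collection.find( everyUserIn([ { user: 1, group: 3 }, { user: 2, group: 5 } ]) )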

Related

mongodb find doc with matching values of element in object array [duplicate]

How to convert an array of documents to two dimensions array

I am making a query to MongoDB
db.getCollection('user_actions').aggregate([
{$match: {
type: 'play_started',
entity_id: {$ne: null}
}},
{$group: {
_id: '$entity_id',
view_count: {$sum: 1}
}},
])
and getting a list of docs with two fields, _id and view_count.
How can I get a list of lists with two items like
[[entity_id, view_count], [entity_id, view_count], ...]
Actually there are two different ways to do this, depending on your MongoDB server version.
The optimal way is in MongoDB 3.2, using the square brackets [] to directly create a new array field in the $project stage. This returns an array for each group. The next stage is another $group stage, where you group the documents and use the $push accumulator operator to return a two dimensional array.
db.getCollection('user_actions').aggregate([
{ "$match": {
"type": 'play_started',
"entity_id": { "$ne": null }
}},
{ "$group": {
"_id": "$entity_id",
"view_count": { "$sum": 1}
}},
{ "$project": {
"_id": 0,
"result": [ "$_id", "$view_count" ]
}},
{ "$group": {
"_id": null,
"result": { "$push": "$result" }
}}
])
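The final document from that pipeline holds the whole two dimensional array in a single result field, roughly of this shape (values illustrative):

{ "_id": null, "result": [ [ "entity1", 12 ], [ "entity2", 7 ] ] }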
From MongoDB 2.6 and prior to 3.2 you need a different approach. In order to create your array you need to use the $map operator. Because the $map "input" field must resolve to an array, you need to use the $literal operator to set a literal array value as the input. Of course the $cond operator here returns the "entity_id" or the "view_count" according to the boolean expression.
db.getCollection('user_actions').aggregate([
{ "$match": {
"type": 'play_started',
"entity_id": { "$ne": null }
}},
{ "$group": {
"_id": "$entity_id",
"view_count": { "$sum": 1}
}},
{ "$project": {
"_id": 0,
"result": {
"$map": {
"input": { "$literal": [ "A", "B"] },
"as": "el",
"in": {
"$cond": [
{ "$eq": [ "$$el", "A" ] },
"$_id",
"$view_count"
]
}
}
}
}},
{ "$group": {
"_id": null,
"result": { "$push": "$result" }
}}
])
It's worth noting that this will also work in MongoDB 2.4. If you are running MongoDB 2.2, you can use the undocumented $const operator, which does the same thing.
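For reference, on MongoDB 2.2 only the "input" line of the $map would change, swapping $literal for $const:

"input": { "$const": [ "A", "B" ] },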

Usage of mapreduce in mongodb [duplicate]

I have a query where I need to return 10 of "Type A" records, while returning all other records. How can I accomplish this?
Update: Admittedly, I could do this with two queries, but I wanted to avoid that, if possible, thinking it would be less overhead, and possibly more performant. My query already is an aggregation query that takes both kinds of records into account, I just need to limit the number of the one type of record in the results.
Update: the following is an example query that highlights the problem:
db.books.aggregate([
{$geoNear: {near: [-118.09771, 33.89244], distanceField: "distance", spherical: true}},
{$match: {"type": "Fiction"}},
{$project: {
'title': 1,
'author': 1,
'type': 1,
'typeSortOrder':
{$add: [
{$cond: [{$eq: ['$type', "Fiction"]}, 1, 0]},
{$cond: [{$eq: ['$type', "Science"]}, 0, 0]},
{$cond: [{$eq: ['$type', "Horror"]}, 3, 0]}
]},
}},
{$sort: {'typeSortOrder': 1}},
{$limit: 10}
])
db.books.aggregate([
{$geoNear: {near: [-118.09771, 33.89244], distanceField: "distance", spherical: true}},
{$match: {"type": "Horror"}},
{$project: {
'title': 1,
'author': 1,
'type': 1,
'typeSortOrder':
{$add: [
{$cond: [{$eq: ['$type', "Fiction"]}, 1, 0]},
{$cond: [{$eq: ['$type', "Science"]}, 0, 0]},
{$cond: [{$eq: ['$type', "Horror"]}, 3, 0]}
]},
}},
{$sort: {'typeSortOrder': 1}},
{$limit: 10}
])
db.books.aggregate([
{$geoNear: {near: [-118.09771, 33.89244], distanceField: "distance", spherical: true}},
{$match: {"type": "Science"}},
{$project: {
'title': 1,
'author': 1,
'type': 1,
'typeSortOrder':
{$add: [
{$cond: [{$eq: ['$type', "Fiction"]}, 1, 0]},
{$cond: [{$eq: ['$type', "Science"]}, 0, 0]},
{$cond: [{$eq: ['$type', "Horror"]}, 3, 0]}
]},
}},
{$sort: {'typeSortOrder': 1}},
{$limit: 10}
])
I would like to have all these records returned in one query, but limit the type to at most 10 of any category.
I realize that the typeSortOrder doesn't need to be conditional when the queries are broken out like this, I had it there for when the queries were one query, originally (which is where I would like to get back to).
I don't think this is presently (2.6) possible to do with one aggregation pipeline. It's difficult to give a precise argument as to why not, but basically the aggregation pipeline performs transformations of streams of documents, one document at a time. There's no awareness within the pipeline of the state of the stream itself, which is what you'd need in order to determine that you've hit the limit for A's, B's, etc. and need to drop further documents of the same type. $group does bring multiple documents together and allows their field values in aggregate to affect the resulting group document ($sum, $avg, etc.). Maybe this makes some sense, but it's admittedly not rigorous, since there are simple operations you could add to make limiting by type possible, e.g., adding a "$push x" accumulator to $group that only pushes the value if the array being pushed to has fewer than x elements.
Even if I did have a way to do it, I'd recommend just doing two aggregations. Keep it simple.
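For what it's worth, a minimal sketch of that approach in the shell (MongoDB 2.6+, where aggregate() returns a cursor), looping one aggregation per type with its own $limit:

var results = [];
[ "Fiction", "Science", "Horror" ].forEach(function(type) {
    db.books.aggregate([
        { "$geoNear": { "near": [-118.09771, 33.89244], "distanceField": "distance", "spherical": true } },
        { "$match": { "type": type } },
        { "$limit": 10 }
    ]).forEach(function(doc) { results.push(doc); });
});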
Problem
The results here are not impossible, but possibly impractical. The general note has been made that you cannot "slice" an array or otherwise "limit" the number of results pushed onto one, and the method for doing this per "type" essentially requires arrays.
The "impractical" part is usually about the number of results, where too large a result set will blow up the BSON document limit when "grouping". But I'm going to consider this along with some other recommendations on your "geo search", with the ultimate goal of returning at most 10 results per "type".
Principle
To first consider and understand the problem, let's look at a simplified "set" of data and the pipeline code necessary to return the "top 2 results" from each type:
{ "title": "Title 1", "author": "Author 1", "type": "Fiction", "distance": 1 },
{ "title": "Title 2", "author": "Author 2", "type": "Fiction", "distance": 2 },
{ "title": "Title 3", "author": "Author 3", "type": "Fiction", "distance": 3 },
{ "title": "Title 4", "author": "Author 4", "type": "Science", "distance": 1 },
{ "title": "Title 5", "author": "Author 5", "type": "Science", "distance": 2 },
{ "title": "Title 6", "author": "Author 6", "type": "Science", "distance": 3 },
{ "title": "Title 7", "author": "Author 7", "type": "Horror", "distance": 1 }
That's a simplified view of the data and somewhat representative of the state of documents after an initial query. Now comes the trick of how to use the aggregation pipeline to get the "nearest" two results for each "type":
db.books.aggregate([
{ "$sort": { "type": 1, "distance": 1 } },
{ "$group": {
"_id": "$type",
"1": {
"$first": {
"_id": "$_id",
"title": "$title",
"author": "$author",
"distance": "$distance"
}
},
"books": {
"$push": {
"_id": "$_id",
"title": "$title",
"author": "$author",
"distance": "$distance"
}
}
}},
{ "$project": {
"1": 1,
"books": {
"$cond": [
{ "$eq": [ { "$size": "$books" }, 1 ] },
{ "$literal": [false] },
"$books"
]
}
}},
{ "$unwind": "$books" },
{ "$project": {
"1": 1,
"books": 1,
"seen": { "$eq": [ "$1", "$books" ] }
}},
{ "$sort": { "_id": 1, "seen": 1 } },
{ "$group": {
"_id": "$_id",
"1": { "$first": "$1" },
"2": { "$first": "$books" },
"books": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$books", false ]
}
}
}},
{ "$project": {
"1": 1,
"2": 2,
"pos": { "$literal": [1,2] }
}},
{ "$unwind": "$pos" },
{ "$group": {
"_id": "$_id",
"books": {
"$push": {
"$cond": [
{ "$eq": [ "$pos", 1 ] },
"$1",
{ "$cond": [
{ "$eq": [ "$pos", 2 ] },
"$2",
false
]}
]
}
}
}},
{ "$unwind": "$books" },
{ "$match": { "books": { "$ne": false } } },
{ "$project": {
"_id": "$books._id",
"title": "$books.title",
"author": "$books.author",
"type": "$_id",
"distance": "$books.distance",
"sortOrder": {
"$add": [
{ "$cond": [ { "$eq": [ "$_id", "Fiction" ] }, 1, 0 ] },
{ "$cond": [ { "$eq": [ "$_id", "Science" ] }, 0, 0 ] },
{ "$cond": [ { "$eq": [ "$_id", "Horror" ] }, 3, 0 ] }
]
}
}},
{ "$sort": { "sortOrder": 1 } }
])
Of course that is just two results, but it outlines the process for getting n results, which naturally is done in generated pipeline code. Before moving on to the code, the process deserves a walk-through.
After any query, the first thing to do here is $sort the results, which you basically want to do by both the "grouping key", which is the "type", and by the "distance", so that the "nearest" items are on top.
The reason for this is shown in the $group stages that will repeat. What is done is essentially "popping" the $first result off of each grouping stack. So that other documents are not lost, they are placed in an array using $push.
Just to be safe, the next stage is really only required after the "first step", but could optionally be added for similar filtering in the repetition. The main check here is that the resulting "array" is larger than just one item. Where it is not, the contents are replaced with a single value of false. The reason for which is about to become evident.
After this "first step" the real repetition cycle beings, where that array is then "de-normalized" with $unwind and then a $project made in order to "match" the document that has been last "seen".
As only one of the documents will match this condition the results are again "sorted" in order to float the "unseen" documents to the top, while of course maintaining the grouping order. The next thing is similar to the first $group step, but where any kept positions are maintained and the "first unseen" document is "popped off the stack" again.
The document that was "seen" is then pushed back to the array not as itself but as a value of false. This is not going to match the kept value and this is generally the way to handle this without being "destructive" to the array contents where you don't want the operations to fail should there not be enough matches to cover the n results required.
Cleaning up when complete, the next "projection" adds an array to the final documents, now grouped by "type", representing each position in the n results required. When this array is unwound, the documents can again be grouped back together, but now all in a single array that possibly contains several false values yet is n elements long.
Finally unwind the array again, use $match to filter out the false values, and project to the required document form.
Practicality
The problem as stated earlier is with the number of results being filtered as there is a real limit on the number of results that can be pushed into an array. That is mostly the BSON limit, but you also don't really want 1000's of items even if that is still under the limit.
The trick here is keeping the initial "match" small enough that the "slicing operations" become practical. There are some things within the $geoNear pipeline process that can make this a possibility.
The obvious one is limit. By default this is 100, but you clearly want to have something in the range of:
(number of categories you can possibly match) × (required matches)
But if this is essentially a number not in the 1000's then there is already some help here.
The others are maxDistance and minDistance, where essentially you put upper and lower bounds on how "far out" to search. The max bound is the general limiter while the min bound is useful when "paging", which is the next helper.
When "upwardly paging", you can use the query argument in order to exclude the _id values of documents "already seen" using the $nin query. In much the same way, the minDistance can be populated with the "last seen" largest distance, or at least the smallest largest distance by "type". This allows some concept of filtering out things that have already been "seen" and getting another page.
Really a topic in itself, but those are the general things to look for in reducing that initial match in order to make the process practical.
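To illustrate those knobs together in a single $geoNear stage (the maxDistance value and the seenIds list are hypothetical placeholders):

var seenIds = [];   // _id values already returned on previous "pages"
var geoStage = { "$geoNear": {
    "near": [-118.09771, 33.89244],
    "distanceField": "distance",
    "spherical": true,
    "limit": 3 * 10,                          // categories × required matches
    "maxDistance": 0.005,                     // upper bound on how "far out" to search
    "query": { "_id": { "$nin": seenIds } }   // exclude documents already "seen"
}};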
Implementing
The general problem of returning "10 results at most, per type" is clearly going to want some code in order to generate the pipeline stages. No-one wants to type that out, and practically speaking you will probably want to change that number at some point.
So now to the code that can generate the monster pipeline. All code is in JavaScript, but easy to translate in principle:
var coords = [-118.09771, 33.89244];
var key = "$type";
var val = {
"_id": "$_id",
"title": "$title",
"author": "$author",
"distance": "$distance"
};
var maxLen = 10;
var stack = [];
var pipe = [];
var fproj = { "$project": { "pos": { "$literal": [] } } };
pipe.push({ "$geoNear": {
"near": coords,
"distanceField": "distance",
"spherical": true
}});
pipe.push({ "$sort": {
"type": 1, "distance": 1
}});
for ( var x = 1; x <= maxLen; x++ ) {
fproj["$project"][""+x] = 1;
fproj["$project"]["pos"]["$literal"].push( x );
var rec = {
"$cond": [ { "$eq": [ "$pos", x ] }, "$"+x ]
};
if ( stack.length == 0 ) {
rec["$cond"].push( false );
} else {
var lval = stack.pop();
rec["$cond"].push( lval );
}
stack.push( rec );
if ( x == 1) {
pipe.push({ "$group": {
"_id": key,
"1": { "$first": val },
"books": { "$push": val }
}});
pipe.push({ "$project": {
"1": 1,
"books": {
"$cond": [
{ "$eq": [ { "$size": "$books" }, 1 ] },
{ "$literal": [false] },
"$books"
]
}
}});
} else {
pipe.push({ "$unwind": "$books" });
var proj = {
"$project": {
"books": 1
}
};
proj["$project"]["seen"] = { "$eq": [ "$"+(x-1), "$books" ] };
var grp = {
"$group": {
"_id": "$_id",
"books": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$books", false ]
}
}
}
};
for ( var n = x; n >= 1; n-- ) {
if ( n != x )
proj["$project"][""+n] = 1;
grp["$group"][""+n] = ( n == x ) ? { "$first": "$books" } : { "$first": "$"+n };
}
pipe.push( proj );
pipe.push({ "$sort": { "_id": 1, "seen": 1 } });
pipe.push(grp);
}
}
pipe.push(fproj);
pipe.push({ "$unwind": "$pos" });
pipe.push({
"$group": {
"_id": "$_id",
"msgs": { "$push": stack[0] }
}
});
pipe.push({ "$unwind": "$books" });
pipe.push({ "$match": { "books": { "$ne": false } }});
pipe.push({
"$project": {
"_id": "$books._id",
"title": "$books.title",
"author": "$books.author",
"type": "$_id",
"distance": "$books",
"sortOrder": {
"$add": [
{ "$cond": [ { "$eq": [ "$_id", "Fiction" ] }, 1, 0 ] },
{ "$cond": [ { "$eq": [ "$_id", "Science" ] }, 0, 0 ] },
{ "$cond": [ { "$eq": [ "$_id", "Horror" ] }, 3, 0 ] },
]
}
}
});
pipe.push({ "$sort": { "sortOrder": 1, "distance": 1 } });
Alternate
Of course the end result here and the general problem with all above is that you really only want the "top 10" of each "type" to return. The aggregation pipeline will do it, but at the cost of keeping more than 10 and then "popping off the stack" until 10 is reached.
An alternate approach is to "brute force" this with mapReduce and "globally scoped" variables. Not as nice since the results all in arrays, but it may be a practical approach:
db.collection.mapReduce(
function () {
if ( !stash.hasOwnProperty(this.type) ) {
stash[this.type] = [];
}
if ( stash[this.type].length < maxLen ) {
stash[this.type].push({
"title": this.title,
"author": this.author,
"type": this.type,
"distance": this.distance
});
emit( this.type, 1 );
}
},
function(key,values) {
return 1; // really just want to keep the keys
},
{
"query": {
"location": {
"$nearSphere": [-118.09771, 33.89244]
}
},
"scope": { "stash": {}, "maxLen": 10 },
"finalize": function(key,value) {
return { "msgs": stash[key] };
},
"out": { "inline": 1 }
}
)
This is a real cheat which just uses the "global scope" to keep a single object whose keys are the grouping keys. The results are pushed onto an array in that global object until the maximum length is reached. Results are already sorted by nearest, so the mapper just gives up doing anything with the current document after the 10 are reached per key.
The reducer won't be called, since only one document per key is emitted. The finalize then just "pulls" the value from the global scope and returns it in the result.
Simple, but of course you don't have all the $geoNear options if you really need them, and this form has a hard limit of 100 documents as the output from the initial query.
This is a classic case for subquery/join which is not supported by MongoDB. All joins and subquery-like operations need to be implemented in the application logic. So multiple queries is your best bet. Performance of the multiple query approach should be good if you have an index on type.
Alternatively you can write a single aggregation query minus the type-matching and limit clauses and then process the stream in your application logic to limit documents per type.
This approach will perform poorly for large result sets, because documents may be returned in random order, so your limiting logic will need to traverse the entire result set.
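A rough sketch of that application-side limiting in the shell (field names assumed from the question):

var counts = {};    // documents kept so far, per type
var limited = [];
db.books.aggregate([
    { "$geoNear": { "near": [-118.09771, 33.89244], "distanceField": "distance", "spherical": true } }
]).forEach(function(doc) {
    counts[doc.type] = ( counts[doc.type] || 0 ) + 1;
    if ( counts[doc.type] <= 10 )
        limited.push(doc);
});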
I guess you can use cursor.limit() on a cursor to specify the maximum number of documents the cursor will return. limit() is analogous to the LIMIT statement in a SQL database.
You must apply limit() to the cursor before retrieving any documents from the database.
The limit function in the cursors can be used for limiting the number of records in the find.
I guess this example should help:
var myCursor = db.bios.find().limit( 5 );

MongoDb : Find common element from two arrays within a query

Let's say we have records of following structure in database.
{
"_id": 1234,
"tags" : [ "t1", "t2", "t3" ]
}
Now, I want to check if database contains a record with any of the tags specified in array tagsArray which is [ "t3", "t4", "t5" ]
I know about the $in operator, but I not only want to know whether any of the records in the database has any of the tags specified in tagsArray, I also want to know which tag of the record in the database matches any of the tags specified in tagsArray (i.e. t3 in the case of the record mentioned above).
That is, I want to compare two arrays (one of the record and other given by me) and find out the common element.
I need to have this expression along with many other expressions in the query, so projection operators like $ and $elemMatch won't be of much use. (Or is there a way they can be used without having to iterate over all records?)
I think I can use $where operator but I don't think that is the best way to do this.
How can this problem be solved?
There are a few approaches to do what you want; it just depends on your version of MongoDB. These are the shell responses, but the content is basically a JSON representation, which is not hard to translate into DBObject entities in Java, or into JavaScript to be executed on the server, so that really does not change.
The first and the fastest approach is with MongoDB 2.6 and greater where you get the new set operations:
var test = [ "t3", "t4", "t5" ];
db.collection.aggregate([
{ "$match": { "tags": {"$in": test } }},
{ "$project": {
"tagMatch": {
"$setIntersection": [
"$tags",
test
]
},
"sizeMatch": {
"$size": {
"$setIntersection": [
"$tags",
test
]
}
}
}},
{ "$match": { "sizeMatch": { "$gte": 1 } } },
{ "$project": { "tagMatch": 1 } }
])
The new operators there are $setIntersection that is doing the main work and also the $size operator which measures the array size and helps for the latter filtering. This ends up as a basic comparison of "sets" in order to find the items that intersect.
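Against the sample record, with test = [ "t3", "t4", "t5" ], the result would be something like:

{ "_id": 1234, "tagMatch": [ "t3" ] }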
If you have an earlier version of MongoDB then this is still possible, but you need a few more stages and this might affect performance somewhat depending if you have large arrays:
var test = [ "t3", "t4", "t5" ];
db.collection.aggregate([
{ "$match": { "tags": {"$in": test } }},
{ "$project": {
"tags": 1,
"match": { "$const": test }
}},
{ "$unwind": "$tags" },
{ "$unwind": "$match" },
{ "$project": {
"tags": 1,
"matched": { "$eq": [ "$tags", "$match" ] }
}},
{ "$match": { "matched": true }},
{ "$group": {
"_id": "$_id",
"tagMatch": { "$push": "$tags" },
"count": { "$sum": 1 }
}},
{ "$match": { "count": { "$gte": 1 } }},
{ "$project": { "tagMatch": 1 }}
])
Or if all of that seems too involved, or your arrays are large enough to make a performance difference, then there is always mapReduce:
var test = [ "t3", "t4", "t5" ];
db.collection.mapReduce(
function () {
var intersection = this.tags.filter(function(x){
return ( test.indexOf( x ) != -1 );
});
if ( intersection.length > 0 )
emit ( this._id, intersection );
},
function(){},
{
"query": { "tags": { "$in": test } },
"scope": { "test": test },
"output": { "inline": 1 }
}
)
Note that in all cases the $in operator still helps you to reduce the results even though it is not the full match. The other common element is checking the "size" of the intersection result to reduce the response.
All pretty easy to code up; convince the boss to switch to MongoDB 2.6 or greater, if you are not already there, for the best results.

how to create a mongodb query that finds if a number is between two fields?

The objects in the database have a 'properties' array, that can hold diverse objects. Some of them present numbers or ranges and look like:
{'value': 10}
or
{'minValue': 4, 'maxValue': 8}
When querying the collection for a specific number, like X, I want to find all documents where either value equals X or minValue <= X <= maxValue. My first attempt at a query looks like:
db.Pool0.find({$or: [{'properties.value': X}, {'properties.minValue': {$lte: X}, 'properties.maxValue': {$gte: X}}]}, {'_id': 1}).pretty()
The drawback is that if the properties array holds multiple objects specifying minValue and maxValue, X can be between any of them. For example
"properties" : [
{
"minValue" : 4,
"maxValue" : 6
},
{
"minValue" : 10,
"maxValue" : 20
}
]
will match for X = 8. Can the query be improved so the object structure within properties is respected?
You basically want to use $elemMatch combined with an $or condition in this form:
db.collection.find({
"$or": [
{
"properties": { "$elemMatch": {
"minValue": { "$lte": 8 },
"maxValue": { "$gte": 8 }
}}
},
{
"properties.value": 8
}
]
})
That covers matching the documents that contain the range, as well as the possible other key name for the field within properties.
But keep in mind that matching documents is different from matching elements in an array. So if you expect only the matching array elements to be returned, and you may have more than one, then use the aggregate form:
db.collection.aggregate([
// Matching documents is still good practice
{ "$match": {
"$or": [
{
"properties": { "$elemMatch": {
"minValue": { "$lte": 8 },
"maxValue": { "$gte": 8 }
}}
},
{
"properties.value": 8
}
]
}},
// Unwind to de-normalize
{ "$unwind": "$properties" },
// Then match to filter the elements
{ "$match": {
"$or": [
{
"properties.minValue": { "$lte": 8 },
"properties.maxValue": { "$gte": 8 }
},
{ "properties.value": 8 }
]
}},
// Reconstruct the filtered array
{ "$group": {
"_id": "$_id",
"properties": { "$push": "$properties" }
}}
])
But if there will only be one match that you are sure of, then simply use projection with find:
db.collection.find(
{
"$or": [
{
"properties": { "$elemMatch": {
"minValue": { "$lte": 8 },
"maxValue": { "$gte": 8 }
}}
},
{
"properties.value": 8
}
]
},
{ "properties.$": 1 }
)
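For instance, querying the sample document with X = 5 (which falls inside the first range) should return just the matched element via the positional projection, something like:

{ "_id": ..., "properties": [ { "minValue": 4, "maxValue": 6 } ] }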