MongoDB: Search minimum, maximum in nested object with dynamic field name

I want a query to get the minimum and maximum values over the sample data set below. In my case the field names are dynamic, like product_1, product_2, and so on:
{
    "_id" : NumberLong(540),
    "product_1" : {
        "orderCancelled" : 0,
        "orderDelivered" : 6
    },
    "product_2" : {
        "orderCancelled" : 3,
        "orderDelivered" : 16
    },
    "product_3" : {
        "orderCancelled" : 5,
        "orderDelivered" : 11
    }
}
I have no idea how I can do this in Mongo when the field names are dynamic, meaning that in the future other products such as product_4 and product_5 may also be created for the same id.
I need a query which gives me the minimum value for orderDelivered and the maximum value for orderCancelled; for example, for the document above the result would be orderDelivered: 16 & orderCancelled: 0.
Thanks for any ideas.

You should restructure your document so that all product documents are in an array:
{
    "_id": NumberLong(540),
    "products": [
        {
            "name": "product_1",
            "orderCancelled": 0,
            "orderDelivered": 6
        },
        {
            "name": "product_2",
            "orderCancelled": 3,
            "orderDelivered": 16
        },
        {
            "name": "product_3",
            "orderCancelled": 5,
            "orderDelivered": 11
        }
    ]
}
Then you'll be able to issue normal max/min queries like this:
db.test.aggregate([
    {
        $match: { "_id" : NumberLong(540) }
    },
    {
        $unwind: "$products"
    },
    {
        $group: {
            _id: "$_id",
            minDelivered: { $min: "$products.orderDelivered" },
            maxCancelled: { $max: "$products.orderCancelled" }
        }
    }
])
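On the restructured sample document this would return something like:
{ "_id" : NumberLong(540), "minDelivered" : 6, "maxCancelled" : 5 }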

You need to change your document structure by updating your documents. Loop through each document using the .forEach method, $unset every field whose name starts with product_, and add a new products field, an array of products, using the $set update operator. That being said, you should use "bulk" operations to update your documents for maximum efficiency:
var bulkOp = db.collection.initializeOrderedBulkOp();
var count = 0;
db.collection.find().forEach(function(doc) {
    var allproducts = [];
    for (var key in doc) {
        if (Object.prototype.hasOwnProperty.call(doc, key) && /^product_\d+/.test(key)) {
            allproducts.push({
                "name": key,
                "orderCancelled": doc[key]["orderCancelled"],
                "orderDelivered": doc[key]["orderDelivered"]
            });
            var unsetField = {};
            unsetField[key] = "";
            bulkOp.find({ "_id": doc._id }).update({ "$unset": unsetField });
            count++;
        }
    }
    bulkOp.find({ "_id": doc._id }).update({
        "$set": { "products": allproducts }
    });
    count++;
    if (count % 500 === 0) {
        // Execute per ~500 operations and re-initialize
        bulkOp.execute();
        bulkOp = db.collection.initializeOrderedBulkOp();
    }
});
// Flush any remaining queued operations
if (count % 500 !== 0) {
    bulkOp.execute();
}
Your documents will now look like this:
{
    "_id" : NumberLong(542),
    "products" : [
        {
            "name" : "product_1",
            "orderCancelled" : 0,
            "orderDelivered" : 6
        },
        {
            "name" : "product_2",
            "orderCancelled" : 3,
            "orderDelivered" : 16
        },
        {
            "name" : "product_3",
            "orderCancelled" : 5,
            "orderDelivered" : 11
        }
    ]
}
Then comes your aggregation query using the .aggregate() method:
db.collection.aggregate([
    { "$match": { "_id": 542 } },
    { "$unwind": "$products" },
    { "$group": {
        "_id": "$_id",
        "maxOrderCancelled": { "$max": "$products.orderCancelled" },
        "minOrderDelivered": { "$min": "$products.orderDelivered" }
    }}
])
Which returns:
{ "_id" : NumberLong(542), "maxOrderCancelled" : 5, "minOrderDelivvered" : 6 }
From version 3.2 you can use the $max and $min operators in your $project stage, which is a much better way to do this because there is no need to $unwind your array first.
db.collection.aggregate([
    { "$match": { "_id": 542 } },
    { "$project": {
        "maxOrderCancelled": {
            "$max": {
                "$map": {
                    "input": "$products",
                    "as": "orc",
                    "in": "$$orc.orderCancelled"
                }
            }
        },
        "minOrderDelivered": {
            "$min": {
                "$map": {
                    "input": "$products",
                    "as": "orc",
                    "in": "$$orc.orderDelivered"
                }
            }
        }
    }}
])
Which yields:
{ "_id" : NumberLong(542), "maxOrderCancelled" : 5, "minOrderDelivered" : 6 }

Related

Matching ObjectId to String for $graphLookup

I'm trying to run a $graphLookup like the one demonstrated in the screenshot from the original post (not reproduced here; it converted _id to a string via $convert before the $graphLookup stage).
The objective is, given a specific record (the commented $match there), to retrieve its full "path" through the immediateAncestors property. As you can see, it's not happening.
I introduced $convert here to deal with _id from the collection as a string, believing it could then "match" the _id values in the immediateAncestors records list (which are strings).
So, I did run another test with different data (no ObjectIds involved):
db.nodos.insert({"id":5,"name":"cinco","children":[{"id":4}]})
db.nodos.insert({"id":4,"name":"quatro","ancestors":[{"id":5}],"children":[{"id":3}]})
db.nodos.insert({"id":6,"name":"seis","children":[{"id":3}]})
db.nodos.insert({"id":1,"name":"um","children":[{"id":2}]})
db.nodos.insert({"id":2,"name":"dois","ancestors":[{"id":1}],"children":[{"id":3}]})
db.nodos.insert({"id":3,"name":"três","ancestors":[{"id":2},{"id":4},{"id":6}]})
db.nodos.insert({"id":7,"name":"sete","children":[{"id":5}]})
And the query:
db.nodos.aggregate( [
{ $match: { "id": 3 } },
{ $graphLookup: {
from: "nodos",
startWith: "$ancestors.id",
connectFromField: "ancestors.id",
connectToField: "id",
as: "ANCESTORS_FROM_BEGINNING"
}
},
{ $project: {
"name": 1,
"id": 1,
"ANCESTORS_FROM_BEGINNING": "$ANCESTORS_FROM_BEGINNING.id"
}
}
] )
...which outputs what I was expecting (the five records directly and indirectly connected to the one with id 3):
{
"_id" : ObjectId("5afe270fb4719112b613f1b4"),
"id" : 3.0,
"name" : "três",
"ANCESTORS_FROM_BEGINNING" : [
1.0,
4.0,
6.0,
5.0,
2.0
]
}
The question is: is there a way to achieve the objective I mentioned at the beginning?
I'm running Mongo 3.7.9 (from the official Docker image).
Thanks in advance!
You are currently using a development version of MongoDB which has some features enabled that are expected to be released with MongoDB 4.0 as an official release. Note that some features may be subject to change before the final release, so production code should take this into account before you commit to it.
Why $convert fails here
Probably the best way to explain this is to look at your altered sample, but replacing with ObjectId values for _id and "strings" for those under the arrays:
{
"_id" : ObjectId("5afe5763419503c46544e272"),
"name" : "cinco",
"children" : [ { "_id" : "5afe5763419503c46544e273" } ]
},
{
"_id" : ObjectId("5afe5763419503c46544e273"),
"name" : "quatro",
"ancestors" : [ { "_id" : "5afe5763419503c46544e272" } ],
"children" : [ { "_id" : "5afe5763419503c46544e277" } ]
},
{
"_id" : ObjectId("5afe5763419503c46544e274"),
"name" : "seis",
"children" : [ { "_id" : "5afe5763419503c46544e277" } ]
},
{
"_id" : ObjectId("5afe5763419503c46544e275"),
"name" : "um",
"children" : [ { "_id" : "5afe5763419503c46544e276" } ]
},
{
"_id" : ObjectId("5afe5763419503c46544e276"),
"name" : "dois",
"ancestors" : [ { "_id" : "5afe5763419503c46544e275" } ],
"children" : [ { "_id" : "5afe5763419503c46544e277" } ]
},
{
"_id" : ObjectId("5afe5763419503c46544e277"),
"name" : "três",
"ancestors" : [
{ "_id" : "5afe5763419503c46544e273" },
{ "_id" : "5afe5763419503c46544e274" },
{ "_id" : "5afe5763419503c46544e276" }
]
},
{
"_id" : ObjectId("5afe5764419503c46544e278"),
"name" : "sete",
"children" : [ { "_id" : "5afe5763419503c46544e272" } ]
}
That should give a general simulation of what you were trying to work with.
What you attempted was to convert the _id value into a "string" via $project before entering the $graphLookup stage. The reason this fails is that whilst you did an initial $project "within" this pipeline, the source for $graphLookup in the "from" option is still the unaltered collection, and therefore you don't get the correct details on the subsequent "lookup" iterations.
db.strcoll.aggregate([
{ "$match": { "name": "três" } },
{ "$addFields": {
"_id": { "$toString": "$_id" }
}},
{ "$graphLookup": {
"from": "strcoll",
"startWith": "$ancestors._id",
"connectFromField": "ancestors._id",
"connectToField": "_id",
"as": "ANCESTORS_FROM_BEGINNING"
}},
{ "$project": {
"name": 1,
"ANCESTORS_FROM_BEGINNING": "$ANCESTORS_FROM_BEGINNING._id"
}}
])
It therefore does not match on the "lookup":
{
"_id" : "5afe5763419503c46544e277",
"name" : "três",
"ANCESTORS_FROM_BEGINNING" : [ ]
}
"Patching" the problem
However, that is the core problem and not a failing of $convert or its aliases. In order to make this actually work, we can instead create a "view" which presents itself as a collection for the sake of input.
I'll do this the other way around and convert the "strings" to ObjectId via $toObjectId:
db.createView("idview","strcoll",[
{ "$addFields": {
"ancestors": {
"$ifNull": [
{ "$map": {
"input": "$ancestors",
"in": { "_id": { "$toObjectId": "$$this._id" } }
}},
"$$REMOVE"
]
},
"children": {
"$ifNull": [
{ "$map": {
"input": "$children",
"in": { "_id": { "$toObjectId": "$$this._id" } }
}},
"$$REMOVE"
]
}
}}
])
Using the "view" however means that the data is consistently seen with the values converted. So the following aggregation using the view:
db.idview.aggregate([
{ "$match": { "name": "três" } },
{ "$graphLookup": {
"from": "idview",
"startWith": "$ancestors._id",
"connectFromField": "ancestors._id",
"connectToField": "_id",
"as": "ANCESTORS_FROM_BEGINNING"
}},
{ "$project": {
"name": 1,
"ANCESTORS_FROM_BEGINNING": "$ANCESTORS_FROM_BEGINNING._id"
}}
])
Returns the expected output:
{
"_id" : ObjectId("5afe5763419503c46544e277"),
"name" : "três",
"ANCESTORS_FROM_BEGINNING" : [
ObjectId("5afe5763419503c46544e275"),
ObjectId("5afe5763419503c46544e273"),
ObjectId("5afe5763419503c46544e274"),
ObjectId("5afe5763419503c46544e276"),
ObjectId("5afe5763419503c46544e272")
]
}
Fixing the problem
With all of that said, the real issue here is that you have some data which "looks like" an ObjectId value and is in fact valid as an ObjectId, but it has been recorded as a "string". The basic issue with everything working as it should is that the two "types" are not the same, and this results in an equality mismatch as the "joins" are attempted.
So the real fix is still the same as it always has been, which is to instead go through the data and fix it so that the "strings" are actually also ObjectId values. These will then match the _id keys which they are meant to refer to, and you save a considerable amount of storage space, since an ObjectId takes up a lot less space to store than its string representation in hexadecimal characters.
Using MongoDB 4.0 methods, you "could" actually use "$toObjectId" in order to write a new collection, in much the same manner as we created the "view" earlier:
db.strcoll.aggregate([
{ "$addFields": {
"ancestors": {
"$ifNull": [
{ "$map": {
"input": "$ancestors",
"in": { "_id": { "$toObjectId": "$$this._id" } }
}},
"$$REMOVE"
]
},
"children": {
"$ifNull": [
{ "$map": {
"input": "$children",
"in": { "_id": { "$toObjectId": "$$this._id" } }
}},
"$$REMOVE"
]
}
}},
{ "$out": "fixedcol" }
])
Or of course where you "need" to keep the same collection, then the traditional "loop and update" remains the same as what has always been required:
var updates = [];
db.strcoll.find().forEach(doc => {
var update = { '$set': {} };
if ( doc.hasOwnProperty('children') )
update.$set.children = doc.children.map(e => ({ _id: new ObjectId(e._id) }));
if ( doc.hasOwnProperty('ancestors') )
update.$set.ancestors = doc.ancestors.map(e => ({ _id: new ObjectId(e._id) }));
updates.push({
"updateOne": {
"filter": { "_id": doc._id },
update
}
});
if ( updates.length > 1000 ) {
db.strcoll.bulkWrite(updates);
updates = [];
}
})
if ( updates.length > 0 ) {
db.strcoll.bulkWrite(updates);
updates = [];
}
Which is actually a bit of a "sledgehammer" since it overwrites the entire array in a single go. Not a great idea for a production environment, but sufficient as a demonstration for the purposes of this exercise.
Conclusion
So whilst MongoDB 4.0 will add these "casting" features which can indeed be very useful, their actual intent is not really for cases such as this. They are in fact much more useful as demonstrated in the "conversion" to a new collection using an aggregation pipeline than most other possible uses.
Whilst we "can" create a "view" which transforms the data types to enable things like $lookup and $graphLookup to work where the actual collection data differs, this really is only a "band-aid" on the real problem as the data types really should not differ, and should in fact be permanently converted.
Using a "view" actually means that the aggregation pipeline for construction needs to effectively run every time the "collection" ( actually a "view" ) is accessed, which creates a real overhead.
Avoiding overhead is usually a design goal, therefore correcting such data storage mistakes is imperative to getting real performance out of your application, rather than just working with "brute force" that will only slow things down.
A much safer "conversion" script applies "matched" updates to each array element instead. The code here requires NodeJS v10.x and a recent release of the MongoDB Node.js driver (3.1.x):
const { MongoClient, ObjectID: ObjectId } = require('mongodb');
const EJSON = require('mongodb-extended-json');
const uri = 'mongodb://localhost/';
const log = data => console.log(EJSON.stringify(data, undefined, 2));
(async function() {
try {
const client = await MongoClient.connect(uri);
let db = client.db('test');
let coll = db.collection('strcoll');
let fields = ["ancestors", "children"];
let cursor = coll.find({
$or: fields.map(f => ({ [`${f}._id`]: { "$type": "string" } }))
}).project(fields.reduce((o,f) => ({ ...o, [f]: 1 }),{}));
let batch = [];
for await ( let { _id, ...doc } of cursor ) {
let $set = {};
let arrayFilters = [];
for ( const f of fields ) {
if ( doc.hasOwnProperty(f) ) {
$set = { ...$set,
...doc[f].reduce((o,{ _id },i) =>
({ ...o, [`${f}.$[${f.substr(0,1)}${i}]._id`]: ObjectId(_id) }),
{})
};
arrayFilters = [ ...arrayFilters,
...doc[f].map(({ _id },i) =>
({ [`${f.substr(0,1)}${i}._id`]: _id }))
];
}
}
if (arrayFilters.length > 0)
batch = [ ...batch,
{ updateOne: { filter: { _id }, update: { $set }, arrayFilters } }
];
if ( batch.length > 1000 ) {
let result = await coll.bulkWrite(batch);
batch = [];
}
}
if ( batch.length > 0 ) {
log({ batch });
let result = await coll.bulkWrite(batch);
log({ result });
}
await client.close();
} catch(e) {
console.error(e)
} finally {
process.exit()
}
})()
Produces and executes bulk operations like these for the seven documents:
{
"updateOne": {
"filter": {
"_id": {
"$oid": "5afe5763419503c46544e272"
}
},
"update": {
"$set": {
"children.$[c0]._id": {
"$oid": "5afe5763419503c46544e273"
}
}
},
"arrayFilters": [
{
"c0._id": "5afe5763419503c46544e273"
}
]
}
},
{
"updateOne": {
"filter": {
"_id": {
"$oid": "5afe5763419503c46544e273"
}
},
"update": {
"$set": {
"ancestors.$[a0]._id": {
"$oid": "5afe5763419503c46544e272"
},
"children.$[c0]._id": {
"$oid": "5afe5763419503c46544e277"
}
}
},
"arrayFilters": [
{
"a0._id": "5afe5763419503c46544e272"
},
{
"c0._id": "5afe5763419503c46544e277"
}
]
}
},
{
"updateOne": {
"filter": {
"_id": {
"$oid": "5afe5763419503c46544e274"
}
},
"update": {
"$set": {
"children.$[c0]._id": {
"$oid": "5afe5763419503c46544e277"
}
}
},
"arrayFilters": [
{
"c0._id": "5afe5763419503c46544e277"
}
]
}
},
{
"updateOne": {
"filter": {
"_id": {
"$oid": "5afe5763419503c46544e275"
}
},
"update": {
"$set": {
"children.$[c0]._id": {
"$oid": "5afe5763419503c46544e276"
}
}
},
"arrayFilters": [
{
"c0._id": "5afe5763419503c46544e276"
}
]
}
},
{
"updateOne": {
"filter": {
"_id": {
"$oid": "5afe5763419503c46544e276"
}
},
"update": {
"$set": {
"ancestors.$[a0]._id": {
"$oid": "5afe5763419503c46544e275"
},
"children.$[c0]._id": {
"$oid": "5afe5763419503c46544e277"
}
}
},
"arrayFilters": [
{
"a0._id": "5afe5763419503c46544e275"
},
{
"c0._id": "5afe5763419503c46544e277"
}
]
}
},
{
"updateOne": {
"filter": {
"_id": {
"$oid": "5afe5763419503c46544e277"
}
},
"update": {
"$set": {
"ancestors.$[a0]._id": {
"$oid": "5afe5763419503c46544e273"
},
"ancestors.$[a1]._id": {
"$oid": "5afe5763419503c46544e274"
},
"ancestors.$[a2]._id": {
"$oid": "5afe5763419503c46544e276"
}
}
},
"arrayFilters": [
{
"a0._id": "5afe5763419503c46544e273"
},
{
"a1._id": "5afe5763419503c46544e274"
},
{
"a2._id": "5afe5763419503c46544e276"
}
]
}
},
{
"updateOne": {
"filter": {
"_id": {
"$oid": "5afe5764419503c46544e278"
}
},
"update": {
"$set": {
"children.$[c0]._id": {
"$oid": "5afe5763419503c46544e272"
}
}
},
"arrayFilters": [
{
"c0._id": "5afe5763419503c46544e272"
}
]
}
}
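Once converted, the collection can be queried directly. A sketch mirroring the earlier "view" pipeline, now run against strcoll itself:
db.strcoll.aggregate([
  { "$match": { "name": "três" } },
  { "$graphLookup": {
    "from": "strcoll",
    "startWith": "$ancestors._id",
    "connectFromField": "ancestors._id",
    "connectToField": "_id",
    "as": "ANCESTORS_FROM_BEGINNING"
  }},
  { "$project": {
    "name": 1,
    "ANCESTORS_FROM_BEGINNING": "$ANCESTORS_FROM_BEGINNING._id"
  }}
])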

Mongoid duplicate fields query causes FieldPath field names may not contain '.' [duplicate]

I have the following mongo data which looks like this
{
eventType : "mousedown",
eventArgs : {
type : "touchstart",
elementId : "id1"
},
creationDateTime : ISODate("2017-02-24T07:05:49.986Z")
}
I wrote the following query to perform a group count.
db.analytics.aggregate
(
{
$match :
{
$and :
[
{"eventArgs.type" : 'touchstart'},
{eventType : 'mousedown'},
{creationDateTime : {$gte : ISODate("2017-02-24T000:00:00.000Z")}}
]
}
},
{
$group :
{
_id :
{
"eventsArgs.elementId" : "$elementId"
},
count :
{
$sum : 1
}
}
}
);
I'm getting an error for $group, which states:
FieldPath field names may not contain '.'
If I'm not able to use '.' in
$group :
{
_id :
{
"eventsArgs.elementId" : "$elementId"
},
What is the correct way to do so?
Since you have a single group field, the best way is to just use the _id group key on that field, and then add another $project stage that reshapes the _id key from the previous stage into the desired subdocument. For example:
db.analytics.aggregate([
{
"$match": {
"eventArgs.type": 'touchstart',
"eventType": 'mousedown',
"creationDateTime": { "$gte": ISODate("2017-02-24T000:00:00.000Z") }
}
},
{
"$group": {
"_id": "$eventArgs.elementId",
"count": { "$sum": 1 }
}
},
{
"$project": {
"eventsArgs.elementId": "$_id",
"count": 1, "_id": 0
}
}
]);
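With the sample document above this yields output shaped like the following (the count is illustrative):
{ "eventArgs" : { "elementId" : "id1" }, "count" : 5 }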
The following should work as well:
db.analytics.aggregate([
{
"$match": {
"eventArgs.type": 'touchstart',
"eventType": 'mousedown',
"creationDateTime": { "$gte": ISODate("2017-02-24T000:00:00.000Z") }
}
},
{
"$group": {
"_id": {
"eventArgs": {
"elementId": "$eventArgs.elementId"
}
},
"count": { "$sum": 1 }
}
}
]);
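This form keeps the subdocument inside the group _id instead (again with an illustrative count):
{ "_id" : { "eventArgs" : { "elementId" : "id1" } }, "count" : 5 }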


Limit results in a Mongo Aggregation [duplicate]

I want to group all the documents according to a field but to restrict the number of documents grouped for each value.
Each message has a conversation_ID. I need to get 10 or lesser number of messages for each conversation_ID.
I am able to group with the following command, but can't figure out how to restrict the number of grouped documents apart from slicing the results:
Message.aggregate({'$group':{_id:'$conversation_ID',msgs:{'$push':{msgid:'$_id'}}}})
How to limit the length of msgs array for each conversation_ID to 10?
Modern
From MongoDB 3.6 there is a "novel" approach to this by using $lookup to perform a "self join" in much the same way as the original cursor processing demonstrated below.
Since in this release you can specify a "pipeline" argument to $lookup as a source for the "join", this essentially means you can use $match and $limit to gather and "limit" the entries for the array:
db.messages.aggregate([
{ "$group": { "_id": "$conversation_ID" } },
{ "$lookup": {
"from": "messages",
"let": { "conversation": "$_id" },
"pipeline": [
{ "$match": { "$expr": { "$eq": [ "$conversation_ID", "$$conversation" ] } }},
{ "$limit": 10 },
{ "$project": { "_id": 1 } }
],
"as": "msgs"
}}
])
You can optionally add additional projection after the $lookup in order to make the array items simply the values rather than documents with an _id key, but the basic result is there by simply doing the above.
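For example, a trailing $addFields using $map flattens the joined documents into plain _id values (a sketch extending the pipeline above):
db.messages.aggregate([
  { "$group": { "_id": "$conversation_ID" } },
  { "$lookup": {
    "from": "messages",
    "let": { "conversation": "$_id" },
    "pipeline": [
      { "$match": { "$expr": { "$eq": [ "$conversation_ID", "$$conversation" ] } }},
      { "$limit": 10 },
      { "$project": { "_id": 1 } }
    ],
    "as": "msgs"
  }},
  { "$addFields": {
    "msgs": { "$map": { "input": "$msgs", "in": "$$this._id" } }
  }}
])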
There is still the outstanding SERVER-9277 which actually requests a "limit to push" directly, but using $lookup in this way is a viable alternative in the interim.
NOTE: There is also $slice, which was introduced after this original answer was written and is mentioned by the "outstanding JIRA issue" in the original content. Whilst you can get the same result with small result sets, it still involves "pushing everything" into the array and then later limiting the final array output to the desired length.
So that's the main distinction, and why it's generally not practical to use $slice for large results. But of course it can alternately be used in cases where it is.
There are a few more details about either alternate usage at mongodb group values by multiple fields.
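For completeness, a $slice form on a release that supports it would look like the sketch below; note that it still pushes the full array first:
db.messages.aggregate([
  { "$group": {
    "_id": "$conversation_ID",
    "msgs": { "$push": "$_id" }
  }},
  { "$project": { "msgs": { "$slice": [ "$msgs", 10 ] } }}
])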
Original
As stated earlier, this is not impossible but certainly a horrible problem.
Actually, if your main concern is that your resulting arrays are going to be exceptionally large, then your best approach is to submit each distinct "conversation_ID" as an individual query and then combine your results. In very basic MongoDB 2.6 syntax, which might need some tweaking depending on what your language implementation actually is:
var results = [];
db.messages.aggregate([
{ "$group": {
"_id": "$conversation_ID"
}}
]).forEach(function(doc) {
db.messages.aggregate([
{ "$match": { "conversation_ID": doc._id } },
{ "$limit": 10 },
{ "$group": {
"_id": "$conversation_ID",
"msgs": { "$push": "$_id" }
}}
]).forEach(function(res) {
results.push( res );
});
});
But it all depends on whether that is what you are trying to avoid. So on to the real answer:
The first issue here is that there is no function to "limit" the number of items that are "pushed" into an array. It is certainly something we would like, but the functionality does not presently exist.
The second issue is that even when pushing all items into an array, you cannot use $slice, or any similar operator in the aggregation pipeline. So there is no present way to get just the "top 10" results from a produced array with a simple operation.
But you can actually produce a set of operations to effectively "slice" on your grouping boundaries. It is fairly involved, and for example here I will reduce the array elements "sliced" to "six" only. The main reason here is to demonstrate the process and show how to do this without being destructive with arrays that do not contain the total you want to "slice" to.
Given a sample of documents:
{ "_id" : 1, "conversation_ID" : 123 }
{ "_id" : 2, "conversation_ID" : 123 }
{ "_id" : 3, "conversation_ID" : 123 }
{ "_id" : 4, "conversation_ID" : 123 }
{ "_id" : 5, "conversation_ID" : 123 }
{ "_id" : 6, "conversation_ID" : 123 }
{ "_id" : 7, "conversation_ID" : 123 }
{ "_id" : 8, "conversation_ID" : 123 }
{ "_id" : 9, "conversation_ID" : 123 }
{ "_id" : 10, "conversation_ID" : 123 }
{ "_id" : 11, "conversation_ID" : 123 }
{ "_id" : 12, "conversation_ID" : 456 }
{ "_id" : 13, "conversation_ID" : 456 }
{ "_id" : 14, "conversation_ID" : 456 }
{ "_id" : 15, "conversation_ID" : 456 }
{ "_id" : 16, "conversation_ID" : 456 }
You can see there that when grouping by your conditions you will get one array with ten elements and another with "five". What you want to do here is reduce both to the top "six" without "destroying" the array that will only match "five" elements.
And the following query:
db.messages.aggregate([
{ "$group": {
"_id": "$conversation_ID",
"first": { "$first": "$_id" },
"msgs": { "$push": "$_id" },
}},
{ "$unwind": "$msgs" },
{ "$project": {
"msgs": 1,
"first": 1,
"seen": { "$eq": [ "$first", "$msgs" ] }
}},
{ "$sort": { "seen": 1 }},
{ "$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$msgs", false ]
}
},
"first": { "$first": "$first" },
"second": { "$first": "$msgs" }
}},
{ "$unwind": "$msgs" },
{ "$project": {
"msgs": 1,
"first": 1,
"second": 1,
"seen": { "$eq": [ "$second", "$msgs" ] }
}},
{ "$sort": { "seen": 1 }},
{ "$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$msgs", false ]
}
},
"first": { "$first": "$first" },
"second": { "$first": "$second" },
"third": { "$first": "$msgs" }
}},
{ "$unwind": "$msgs" },
{ "$project": {
"msgs": 1,
"first": 1,
"second": 1,
"third": 1,
"seen": { "$eq": [ "$third", "$msgs" ] },
}},
{ "$sort": { "seen": 1 }},
{ "$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$msgs", false ]
}
},
"first": { "$first": "$first" },
"second": { "$first": "$second" },
"third": { "$first": "$third" },
"forth": { "$first": "$msgs" }
}},
{ "$unwind": "$msgs" },
{ "$project": {
"msgs": 1,
"first": 1,
"second": 1,
"third": 1,
"forth": 1,
"seen": { "$eq": [ "$forth", "$msgs" ] }
}},
{ "$sort": { "seen": 1 }},
{ "$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$msgs", false ]
}
},
"first": { "$first": "$first" },
"second": { "$first": "$second" },
"third": { "$first": "$third" },
"forth": { "$first": "$forth" },
"fifth": { "$first": "$msgs" }
}},
{ "$unwind": "$msgs" },
{ "$project": {
"msgs": 1,
"first": 1,
"second": 1,
"third": 1,
"forth": 1,
"fifth": 1,
"seen": { "$eq": [ "$fifth", "$msgs" ] }
}},
{ "$sort": { "seen": 1 }},
{ "$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$msgs", false ]
}
},
"first": { "$first": "$first" },
"second": { "$first": "$second" },
"third": { "$first": "$third" },
"forth": { "$first": "$forth" },
"fifth": { "$first": "$fifth" },
"sixth": { "$first": "$msgs" },
}},
{ "$project": {
"first": 1,
"second": 1,
"third": 1,
"forth": 1,
"fifth": 1,
"sixth": 1,
"pos": { "$const": [ 1,2,3,4,5,6 ] }
}},
{ "$unwind": "$pos" },
{ "$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [
{ "$eq": [ "$pos", 1 ] },
"$first",
{ "$cond": [
{ "$eq": [ "$pos", 2 ] },
"$second",
{ "$cond": [
{ "$eq": [ "$pos", 3 ] },
"$third",
{ "$cond": [
{ "$eq": [ "$pos", 4 ] },
"$forth",
{ "$cond": [
{ "$eq": [ "$pos", 5 ] },
"$fifth",
{ "$cond": [
{ "$eq": [ "$pos", 6 ] },
"$sixth",
false
]}
]}
]}
]}
]}
]
}
}
}},
{ "$unwind": "$msgs" },
{ "$match": { "msgs": { "$ne": false } }},
{ "$group": {
"_id": "$_id",
"msgs": { "$push": "$msgs" }
}}
])
You get the top results in the array, up to six entries:
{ "_id" : 123, "msgs" : [ 1, 2, 3, 4, 5, 6 ] }
{ "_id" : 456, "msgs" : [ 12, 13, 14, 15 ] }
As you can see here, loads of fun.
After you have initially grouped, you basically want to "pop" the $first value off of the stack for the array results. To simplify this process a little, we actually do this in the initial operation. So the process becomes:
$unwind the array
Compare to the values already seen with an $eq equality match
$sort the results to "float" false unseen values to the top ( this still retains order )
$group back again and "pop" the $first unseen value as the next member on the stack. Also this uses the $cond operator to replace "seen" values in the array stack with false to help in the evaluation.
The final action with $cond is there to make sure that future iterations are not just adding the last value of the array over and over where the "slice" count is greater than the array members.
That whole process needs to be repeated for as many items as you wish to "slice". Since we already found the "first" item in the initial grouping, that means n-1 iterations for the desired slice result.
The final steps are really just an optional illustration of converting everything back into arrays for the result as finally shown. So really just conditionally pushing items or false back by their matching position and finally "filtering" out all the false values so the end arrays have "six" and "five" members respectively.
So there is not a standard operator to accommodate this, and you cannot just "limit" the push to 5 or 10 or whatever items in the array. But if you really have to do it, then this is your best approach.
You could possibly approach this with mapReduce and forsake the aggregation framework altogether. The approach I would take (within reasonable limits) would be to effectively have an in-memory hash-map on the server and accumulate arrays into it, while using JavaScript slice to "limit" the results:
db.messages.mapReduce(
function () {
if ( !stash.hasOwnProperty(this.conversation_ID) ) {
stash[this.conversation_ID] = [];
}
if ( stash[this.conversation_ID].length < maxLen ) {
stash[this.conversation_ID].push( this._id );
emit( this.conversation_ID, 1 );
}
},
function(key,values) {
return 1; // really just want to keep the keys
},
{
"scope": { "stash": {}, "maxLen": 10 },
"finalize": function(key,value) {
return { "msgs": stash[key] };
},
"out": { "inline": 1 }
}
)
So that just basically builds up the "in-memory" object matching the emitted "keys" with an array never exceeding the maximum size you want to fetch from your results. Additionally this does not even bother to "emit" the item when the maximum stack is met.
The reduce part actually does nothing other than essentially just reduce to "key" and a single value. So just in case our reducer did not get called, as would be true if only 1 value existed for a key, the finalize function takes care of mapping the "stash" keys to the final output.
The effectiveness of this varies on the size of the output, and JavaScript evaluation is certainly not fast, but possibly faster than processing large arrays in a pipeline.
Vote up the JIRA issues to actually have a "slice" operator or even a "limit" on "$push" and "$addToSet", which would both be handy. Personally hoping that at least some modification can be made to the $map operator to expose the "current index" value when processing. That would effectively allow "slicing" and other operations.
Really you would want to code this up to "generate" all of the required iterations. If the answer here gets enough love and/or other time pending that I have in tuits, then I might add some code to demonstrate how to do this. It is already a reasonably long response.
Code to generate pipeline:
var key = "$conversation_ID";
var val = "$_id";
var maxLen = 10;
var stack = [];
var pipe = [];
var fproj = { "$project": { "pos": { "$const": [] } } };
for ( var x = 1; x <= maxLen; x++ ) {
fproj["$project"][""+x] = 1;
fproj["$project"]["pos"]["$const"].push( x );
var rec = {
"$cond": [ { "$eq": [ "$pos", x ] }, "$"+x ]
};
if ( stack.length == 0 ) {
rec["$cond"].push( false );
} else {
lval = stack.pop();
rec["$cond"].push( lval );
}
stack.push( rec );
if ( x == 1) {
pipe.push({ "$group": {
"_id": key,
"1": { "$first": val },
"msgs": { "$push": val }
}});
} else {
pipe.push({ "$unwind": "$msgs" });
var proj = {
"$project": {
"msgs": 1
}
};
proj["$project"]["seen"] = { "$eq": [ "$"+(x-1), "$msgs" ] };
var grp = {
"$group": {
"_id": "$_id",
"msgs": {
"$push": {
"$cond": [ { "$not": "$seen" }, "$msgs", false ]
}
}
}
};
for ( var n = x; n >= 1; n-- ) {
if ( n != x )
proj["$project"][""+n] = 1;
grp["$group"][""+n] = ( n == x ) ? { "$first": "$msgs" } : { "$first": "$"+n };
}
pipe.push( proj );
pipe.push({ "$sort": { "seen": 1 } });
pipe.push(grp);
}
}
pipe.push(fproj);
pipe.push({ "$unwind": "$pos" });
pipe.push({
"$group": {
"_id": "$_id",
"msgs": { "$push": stack[0] }
}
});
pipe.push({ "$unwind": "$msgs" });
pipe.push({ "$match": { "msgs": { "$ne": false } }});
pipe.push({
"$group": {
"_id": "$_id",
"msgs": { "$push": "$msgs" }
}
});
That builds the basic iterative approach up to maxLen with the steps from $unwind to $group. Also embedded in there are details of the final projections required and the "nested" conditional statement. The last is basically the approach taken on this question:
Does MongoDB's $in clause guarantee order?
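The generated pipeline can then be inspected and run directly:
printjson(pipe);
db.messages.aggregate(pipe);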
Starting in Mongo 4.4, the $group stage has a new aggregation operator $accumulator allowing custom accumulation of documents as they get grouped, via JavaScript user-defined functions.
Thus, in order to only select n messages (for instance 2) for each conversation:
// { "conversationId" : 3, "messageId" : 14 }
// { "conversationId" : 5, "messageId" : 34 }
// { "conversationId" : 3, "messageId" : 39 }
// { "conversationId" : 3, "messageId" : 47 }
db.collection.aggregate([
{ $group: {
_id: "$conversationId",
messages: {
$accumulator: {
accumulateArgs: ["$messageId"],
init: function() { return [] },
accumulate:
function(messages, message) { return messages.concat(message).slice(0, 2); },
merge:
function(messages1, messages2) { return messages1.concat(messages2).slice(0, 2); },
lang: "js"
}
}
}}
])
// { "_id" : 5, "messages" : [ 34 ] }
// { "_id" : 3, "messages" : [ 14, 39 ] }
The accumulator:
accumulates on the field messageId (accumulateArgs)
is initialised to an empty array (init)
accumulates messageId items in an array and only keeps a maximum of 2 (accumulate and merge)
Starting in Mongo 5.2, it's a perfect use case for the new $topN aggregation accumulator:
// { "conversationId" : 3, "messageId" : 14 }
// { "conversationId" : 5, "messageId" : 34 }
// { "conversationId" : 3, "messageId" : 39 }
// { "conversationId" : 3, "messageId" : 47 }
db.collection.aggregate([
{ $group: {
_id: "$conversationId",
messages: { $topN: { n: 2, output: "$messageId", sortBy: { _id: 1 } } }
}}
])
// { "_id" : 5, "messages" : [ 34 ] }
// { "_id" : 3, "messages" : [ 14, 39 ] }
This applies a $topN group accumulation that:
takes for each group the top 2 (n: 2) elements
and for each grouped record extracts the field value (output: "$messageId")
the choice of the "top 2" is defined by sortBy: { _id: 1 } (I chose _id since you didn't specify an order).
The $slice operator was not an aggregation operator at the time, so you couldn't do this (as I had suggested in this answer, before the edit; $slice has since been added as an aggregation operator in 3.2):
db.messages.aggregate([
{ $group : {_id:'$conversation_ID',msgs: { $push: { msgid:'$_id' }}}},
{ $project : { _id : 1, msgs : { $slice : 10 }}}]);
Neil's answer is very detailed, but you can use a slightly different approach (if it fits your use case). You can aggregate your results and output them to a new collection:
db.messages.aggregate([
{ $group : {_id:'$conversation_ID',msgs: { $push: { msgid:'$_id' }}}},
{ $out : "msgs_agg" }
]);
The $out operator will write the results of the aggregation to a new collection. You can then use a regular find query to project your results with the $slice operator:
db.msgs_agg.find({}, { msgs : { $slice : 10 }});
For this test documents:
> db.messages.find().pretty();
{ "_id" : 1, "conversation_ID" : 123 }
{ "_id" : 2, "conversation_ID" : 123 }
{ "_id" : 3, "conversation_ID" : 123 }
{ "_id" : 4, "conversation_ID" : 123 }
{ "_id" : 5, "conversation_ID" : 123 }
{ "_id" : 7, "conversation_ID" : 1234 }
{ "_id" : 8, "conversation_ID" : 1234 }
{ "_id" : 9, "conversation_ID" : 1234 }
The result will be:
> db.msgs_agg.find({}, { msgs : { $slice : 10 }});
{ "_id" : 1234, "msgs" : [ { "msgid" : 7 }, { "msgid" : 8 }, { "msgid" : 9 } ] }
{ "_id" : 123, "msgs" : [ { "msgid" : 1 }, { "msgid" : 2 }, { "msgid" : 3 },
{ "msgid" : 4 }, { "msgid" : 5 } ] }
Edit
I assume this would mean duplicating the whole messages collection.
Isn't that overkill?
Well, obviously this approach won't scale with huge collections. But, since you're considering using large aggregation pipelines or large map-reduce jobs you probably won't use this for "real-time" requests.
There are many cons of this approach: 16 MB BSON limit if you're creating huge documents with aggregation, wasting disk space / memory with duplication, increased disk IO...
The pros of this approach: it's simple to implement and thus easy to change. If your collection is rarely updated, you can use this "out" collection like a cache. This way you wouldn't have to perform the aggregation operation multiple times, and you could then even support "real-time" client requests on the "out" collection. To refresh your data, you can periodically run the aggregation (e.g. in a background job that runs nightly).
As was said in the comments, this isn't an easy problem and there isn't a perfect solution for it (yet!). I've shown you another approach you can use; it's up to you to benchmark and decide what's most appropriate for your use case.
I hope this will work as you wanted:
db.messages.aggregate([
{ $group : {_id:'$conversation_ID',msgs: { $push: { msgid:'$_id' }}}},
{ $project : { _id : 1, msgs : { $slice : ["$msgs",0,10] }}}
]);

Weighted Average rating through mongodb

Is it possible to do a query to sort by "weighted average"?
There are 5 possible values, from 1-5. The weighted average is
(n5*5 + n4*4 + n3*3 + n2*2 + n1*1) / (n5+n4+n3+n2+n1)
where n5 is the count of objects with rating: 5, and so on.
I have the following example. If you can suggest a better structure to store this, I am happy to hear it.
{
"_id" : "wPg4jzJsEFXNxR5Wf",
"caveId" : "56424a93819e7419112c883e",
"data" : [
{
"value" : 1
},
{
"value" : 3
},
{
"value" : 4
},
{
"value" : 2
}
]
}
{
"_id" : "oSrtv33MgnkJFvNan",
"caveId" : "56424a93819e7419112c949f",
"data" : [
{
"value" : 1
},
{
"value" : 4
},
{
"value" : 4
},
{
"value" : 2
}
]
}
{
"_id" : "gJRMMQPwDwjFrL7zz",
"caveId" : "56424a93819e7419112c8727",
"data" : [
{
"value" : 5
},
{
"value" : 1
},
{
"value" : 4
}
]
}
Example for _id: oSrtv33MgnkJFvNan (the second one):
(2*4 + 1*2 + 1*1)/(2+1+1) = 2.75
Then I would want to sort all the documents by that value.
The order would be:
gJRMMQPwDwjFrL7zz: value: 3.33
oSrtv33MgnkJFvNan: value 2.75
wPg4jzJsEFXNxR5Wf: value 2.5
Well, the answer is really both "yes" and "no" with respect to whether MongoDB can sort data by a calculation like this. It can of course do it, but possibly not in a practical way for your purpose.
The two tools MongoDB has to do any sort of calculation are the aggregation framework and mapReduce. The former currently lacks the operators to really handle this in a practical way. The latter can be "tricked" into sorting, as an artifact of how mapReduce works, by putting the component to be sorted in the grouping key (even if there is no actual grouping).
So you can basically apply the math with something like this:
db.data.mapReduce(
function() {
var vals = this.data.map(function(el){ return el.value }),
uniq = {};
vals.forEach(function(el) {
if (!uniq.hasOwnProperty(el)) {
uniq[el] = 1;
} else {
uniq[el]++;
}
});
var weight = Array.sum(Object.keys(uniq).map(function(key) {
return uniq[key] * key
})) / Array.sum(Object.keys(uniq).map(function(key) {
return uniq[key];
}))
var id = this._id;
delete this._id;
emit({ "weight": weight, "orig": id },this);
},
function() {},
{ "out": { "inline": 1 } }
)
Which gives you this output:
{
"results" : [
{
"_id" : {
"weight" : 2.5,
"orig" : "wPg4jzJsEFXNxR5Wf"
},
"value" : {
"caveId" : "56424a93819e7419112c883e",
"data" : [
{
"value" : 1
},
{
"value" : 3
},
{
"value" : 4
},
{
"value" : 2
}
]
}
},
{
"_id" : {
"weight" : 2.75,
"orig" : "oSrtv33MgnkJFvNan"
},
"value" : {
"caveId" : "56424a93819e7419112c949f",
"data" : [
{
"value" : 1
},
{
"value" : 4
},
{
"value" : 4
},
{
"value" : 2
}
]
}
},
{
"_id" : {
"weight" : 3.3333333333333335,
"orig" : "gJRMMQPwDwjFrL7zz"
},
"value" : {
"caveId" : "56424a93819e7419112c8727",
"data" : [
{
"value" : 5
},
{
"value" : 1
},
{
"value" : 4
}
]
}
}
]
}
So all the results are sorted, but of course the restriction applies that mapReduce can only produce "inline" output that is under the 16MB BSON limit, or alternately write the results out to another collection.
Even with new features being added to the aggregation framework that can assist here (from the current development series, 3.1.x), this would still require some juggling with $unwind in order to get the "sum" of elements in any way (there is no such thing as a "reduce" function yet), which does not make it a stable or practical alternative.
So you can do it with mapReduce, but for my money I would have another process that calculates this periodically (or triggered on updates) and updates a standard "weight" field on the document, which could then be used directly for sorting.
Having a value in place in your documents is always the most performant option.
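A minimal shell sketch of that precompute approach (the weight field name is an assumption; note that the weighted average as defined here reduces to the plain mean of the values, since sum(count(r) * r) / sum(count(r)) is just the mean):
db.data.find().forEach(function(doc) {
    // mean of all rating values == the weighted average as defined above
    var vals = doc.data.map(function(el) { return el.value; });
    var weight = Array.sum(vals) / vals.length;
    db.data.update({ "_id": doc._id }, { "$set": { "weight": weight } });
});
// sorting then becomes a plain (indexable) query
db.data.find().sort({ "weight": 1 });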
For the curious, you can grab a development branch release of MongoDB (the 3.1.x series), or any release after that, and apply an aggregation pipeline like this:
db.data.aggregate([
{ "$project": {
"caveId": 1,
"data": 1,
"conv": {
"$setUnion": [
{ "$map": {
"input": "$data",
"as": "el",
"in": "$$el.value"
}},
[]
]
},
"orig": {
"$map": {
"input": "$data",
"as": "el",
"in": "$$el.value"
}
}
}},
{ "$project": {
"caveId": 1,
"data": 1,
"conv": 1,
"orig": 1,
"counts": { "$map": {
"input": "$conv",
"as": "el",
"in": {
"$size": {
"$filter": {
"input": "$orig",
"as": "o",
"cond": {
"$eq": [ "$$o", "$$el" ]
}
}
}
}
}}
}},
{ "$unwind": { "path": "$conv", "includeArrayIndex": true } },
{ "$group": {
"_id": "$_id",
"caveId": { "$first": "$caveId" },
"data": { "$first": "$data" },
"counts": { "$first": "$counts" },
"mult": {
"$sum": {
"$multiply": [
"$conv.value",
{ "$arrayElemAt": [ "$counts", "$conv.index" ] }
]
}
}
}},
{ "$unwind": "$counts" },
{ "$group": {
"_id": "$_id",
"caveId": { "$first": "$caveId" },
"data": { "$first": "$data" },
"count": { "$sum": "$counts" },
"mult": { "$first": "$mult" }
}},
{ "$project": {
"data": 1,
"weight": { "$divide": [ "$mult", "$count" ] }
}},
{ "$sort": { "weight": 1 } }
])
But even with helpers like $filter, "includeArrayIndex" in $unwind, and the $arrayElemAt operator using that index later to match up the distinct elements with their counts, the use of $unwind in any way makes this a non-performant solution.
It may become practical in the future if operators like $map can produce the index values needed for pairing, and with the introduction of any methods to similarly do an "in-line sum" or other math on array results without processing $unwind. But as of this writing that does not exist, even in development.