List values in MongoDB aggregation pipeline belonging to different documents - mongodb

I got this result coming from an Aggregation pipeline I did:
{
"_id" : ISODate("2015-05-14T08:22:09.441Z"),
"values" : {
"v1_min" : 15.872267760931187,
"v1" : 15.909139078185774,
"v1_max" : 20.6420184124931776
}
},
{
"_id" : ISODate("2015-05-13T08:22:09.441Z"),
"values" : {
"v1_min" : 2.872263320931187,
"v1" : 7.909132898185774,
"v1_max" :44.6498764124931776
}
},
{...}
do you think it's possible to get a structure like this one
{
"_id" : [ISODate("2015-05-14T08:22:09.441Z"),ISODate("2015-05-13T08:22:09.441Z")],
"values" : {
"v1_min" : [15.872267760931187, 2.872263320931187],
"v1" : [15.909139078185774, 7.909132898185774],
"v1_max" : [20.6420184124931776, 44.6498764124931776]
}
}
adding some others stages to my aggregation pipelines?
If so, how would you do?
I'd not like to handle this via code because I think MongoDB aggregation framework is faster and should do better than me.

Yes, it's quite possible indeed. The following aggregation pipeline will achieve the desired output:
// Collapse all input documents into a single result document whose fields
// are arrays holding one entry per input document.
db.collection.aggregate([
  {
    // A null _id places every document in the same group.
    "$group": {
      "_id": null,
      // $push rather than $addToSet: $addToSet returns its elements in an
      // unspecified order, so the ids could end up misaligned with the
      // value arrays below, which are all built with $push.
      "ids": { "$push": "$_id" },
      "v1_min": { "$push": "$values.v1_min" },
      "v1": { "$push": "$values.v1" },
      "v1_max": { "$push": "$values.v1_max" }
    }
  },
  {
    // Reshape to the requested layout: ids under _id, arrays under values.
    "$project": {
      "_id": "$ids",
      "values": {
        "v1_min": "$v1_min",
        "v1": "$v1",
        "v1_max": "$v1_max"
      }
    }
  }
]);
-- EDIT --
Use $push instead of $addToSet, as the latter only adds an element if the resulting array does not already contain it, so duplicate values would be dropped. (Thanks to @SylvainLeroux for the positive contributions)

Related

Concat String by Group

I want to group records by _id and create a string by combining client_id values.
Here are examples of my documents:
{
"_id" : ObjectId("59e955e633d64c81875bfd2f"),
"tag_id" : 1,
"client_id" : "10001"
}
{
"_id" : ObjectId("59e955e633d64c81875bfd30"),
"tag_id" : 1,
"client_id" : "10002"
}
I'd like to have this output:
{
"_id" : 1
"client_id" : "10001,10002"
}
You can do it with the aggregation framework as a "two step" operation: first accumulate the items into an array via $push within a $group pipeline stage, and then use $concat with $reduce on the produced array in a final projection:
// Step 1: gather every client_id of a tag into an array.
// Step 2: fold that array into a single comma-delimited string.
db.collection.aggregate([
  {
    $group: {
      _id: "$tag_id",
      client_id: { $push: "$client_id" }
    }
  },
  {
    $addFields: {
      client_id: {
        $reduce: {
          input: "$client_id",
          initialValue: "",
          // While the accumulated string is still empty, take the element
          // as-is so the result never starts with a comma.
          in: {
            $cond: [
              { $eq: ["$$value", ""] },
              "$$this",
              { $concat: ["$$value", ",", "$$this"] }
            ]
          }
        }
      }
    }
  }
])
We also apply $cond here to avoid concatenating an empty string with a comma in the results, so it looks more like a delimited list.
FYI: there is a JIRA issue, SERVER-29339, which asks for $reduce to be implemented as an accumulator expression to allow its use directly in a $group pipeline stage. It is not likely to happen any time soon, but it would theoretically replace $push in the above and make the operation a single pipeline stage. Sample proposed syntax is on the JIRA issue.
If you don't have $reduce ( requires MongoDB 3.4 ) then just post process the cursor:
// Group on the server, then join each group's client_id array into a
// comma-delimited string while iterating the cursor on the client.
db.collection.aggregate([
  {
    $group: {
      _id: "$tag_id",
      client_id: { $push: "$client_id" }
    }
  }
]).map(function (doc) {
  // Replace the array on the document with its joined form and return it.
  doc.client_id = doc.client_id.join(",");
  return doc;
})
Which then leads to the other alternative of doing this using mapReduce if you really must:
db.collection.mapReduce(
  // map: emit one (tag_id, client_id) pair per document.
  function () {
    emit(this.tag_id, this.client_id);
  },
  // reduce: each incoming value is split on "," before being re-joined, so a
  // value that is already a delimited string is handled like single ids.
  function (key, values) {
    const ids = values.reduce(function (collected, value) {
      return collected.concat(value.split(","));
    }, []);
    return ids.join(",");
  },
  { out: { inline: 1 } }
)
Which of course outputs in the specific mapReduce form of _id and value as the set of keys, but it is basically the output.
We use [].concat.apply([],values.map(...)) because the output of the "reducer" can be a "delimited string" because mapReduce works incrementally with large results and therefore output of the reducer can become "input" on another pass. So we need to expect that this can happen and treat it accordingly.
Starting Mongo 4.4, the $group stage has a new aggregation operator $accumulator allowing custom accumulations of documents as they get grouped:
// Sample input documents:
// { "tag_id" : 1, "client_id" : "10001" }
// { "tag_id" : 1, "client_id" : "10002" }
// { "tag_id" : 2, "client_id" : "9999" }
db.collection.aggregate([
// Group by tag_id and build the joined client_id string with a custom accumulator.
{ $group: {
_id: "$tag_id",
client_id: {
$accumulator: {
// The client_id of each document is passed to accumulate() as `id`.
accumulateArgs: ["$client_id"],
// The accumulator state starts as an empty array.
init: function() { return [] },
// Append the incoming id to the state array.
accumulate: function(ids, id) { return ids.concat(id) },
// Concatenate two state arrays into one.
merge: function(ids1, ids2) { return ids1.concat(ids2) },
// Turn the final array into a comma-delimited string.
finalize: function(ids) { return ids.join(",") },
lang: "js"
}
}
}}
])
// Resulting documents:
// { "_id" : 2, "client_id" : "9999" }
// { "_id" : 1, "client_id" : "10001,10002" }
The accumulator:
accumulates on the field client_id (accumulateArgs)
is initialised to an empty array (init)
accumulates by concatenating each new id onto the ids already seen (accumulate and merge)
and finally joins all ids as a string (finalize)

MongoDB - Operations with nested fields

I have twitter data that looks like this:
db.users.findOne()
{
"_id" : ObjectId("578ffa8e7eb9513f4f55a935"),
"user_name" : "koteras",
"retweet_count" : 0,
"tweet_followers_count" : 461,
"source" : "Twitter for iPhone",
"coordinates" : null,
"tweet_mentioned_count" : 1,
"tweet_ID" : "755891629932675072",
"tweet_text" : "RT #ochocinco: I beat them all for 10 straight hours #FIFA16KING",
"user" : {
"CreatedAt" : ISODate("2011-12-27T09:04:01Z"),
"FavouritesCount" : 5223,
"FollowersCount" : 461,
"FriendsCount" : 619,
"UserId" : 447818090,
"Location" : "501"
}
For example, I want to find the number of users that have "FollowersCount" greater than "FavouritesCount". How can I do that?
The $where operator is specifically designed for this.
db.users.find( { $where: function() { return (this.user.FollowersCount > this.user.FavouritesCount) } } );
But keep in mind that this would run single threaded JS code, and will be slower.
Another option is to use an aggregation pipeline projecting the difference, and then having a $match on the difference
// Compute followers minus favourites per document, then keep only the
// documents where that difference is positive.
db.users.aggregate([
  {
    "$project": {
      "diff": {
        "$subtract": ["$user.FollowersCount", "$user.FavouritesCount"]
      },
      // project remaining fields here
    }
  },
  {
    "$match": {
      "diff": { "$gt": 0 }
    }
  }
])
In my experience I have found the second one to be much faster than the first.
To get the number of users that have "FollowersCount" greater than "FavouritesCount", you could use the aggregation framework which has some operators that you can apply.
Consider the first use case which looks at manipulating the comparison operators within the $project pipeline and a subsequent $match pipeline to filter documents based on the $cmp value. You can then get the final user count by applying a $group pipeline that aggregates the filtered documents:
// Count the users whose FollowersCount is greater than their FavouritesCount.
db.users.aggregate([
  {
    // $cmp yields 1 when the first operand is greater than the second.
    $project: {
      hasMoreFollowersThanFavs: {
        $cmp: ["$user.FollowersCount", "$user.FavouritesCount"]
      }
    }
  },
  {
    $match: { hasMoreFollowersThanFavs: 1 }
  },
  {
    // A single group (null _id) adds 1 per remaining document.
    $group: {
      _id: null,
      count: { $sum: 1 }
    }
  }
])
Another option is using a single pipeline with $redact operator which incorporates the functionality of $project and $match as above and returns all documents which match a specified condition using $$KEEP system variable and discards those that don't match using the $$PRUNE system variable:
// Count the users whose FollowersCount is greater than their FavouritesCount,
// filtering with $redact instead of a $project + $match pair.
// Runs against the `users` collection, like the question and the other
// snippets for this problem.
db.users.aggregate([
  {
    "$redact": {
      "$cond": [
        {
          // $cmp yields 1 when FollowersCount is the larger value.
          "$eq": [
            { "$cmp": [ "$user.FollowersCount", "$user.FavouritesCount" ] },
            1
          ]
        },
        "$$KEEP",   // condition holds: keep the document
        "$$PRUNE"   // otherwise: drop it
      ]
    }
  },
  {
    // A single group (null _id) adds 1 per kept document.
    "$group": {
      "_id": null,
      "count": { "$sum": 1 }
    }
  }
])

how to count number of keys in subdocument using aggregation pipeline?

Suppose I have a document like this:
{
"_id" : ObjectId("57eb386e37b4842ff5f386c9"),
"lesson_id" : ObjectId("57e27cd190e6993e393f5c74"),
"student_id" : ObjectId("57d3c3f590e6995fe8de7932"),
"answer_records" : {
"1" : {
"answer" : [
"A"
]
},
"3" : {
"answer" : [
"C"
]
}
}
I want to count the number of answer records in the collection. Apparently, this document contributes two answer records, which are "1" and "3". So, my question is how to achieve this using the aggregation pipeline.
In your case, it is far easier to just use JS.
On the mongo shell :
var json=db.sof.findOne().answer_records;
Object.keys(json).length;
Prints 2 for the number of answer records in the said document.
For MongoDB 3.6 and newer, use the $objectToArray operator within an aggregation pipeline to convert the document to an array. The return array contains an element for each field/value pair in the original document. Each element in the return array is a document that contains two fields k and v.
On getting the array, you can then leverage the use of $addFields pipeline step to create a field that holds the counts and the actual count is derived with the use of the $size operator.
All this can be done in a single pipeline by nesting the expressions as follows:
// Add an answers_count field holding the number of keys in answer_records.
db.collection.aggregate([
  {
    $addFields: {
      // $objectToArray turns the sub-document into an array with one
      // element per field; $size then returns that array's length.
      answers_count: {
        $size: { $objectToArray: "$answer_records" }
      }
    }
  }
])
Sample Output
{
"_id" : ObjectId("57eb386e37b4842ff5f386c9"),
"lesson_id" : ObjectId("57e27cd190e6993e393f5c74"),
"student_id" : ObjectId("57d3c3f590e6995fe8de7932"),
"answer_records" : {
"1" : {
"answer" : [
"A"
]
},
"3" : {
"answer" : [
"C"
]
}
},
"answers_count": 2
}
For MongoDB server versions which do not support the above operators, you would need to change your schema design in order to carry out efficient queries with the aggregation framework. As it is currently you'd need
to preprocess the documents either on the client or server with JavaScript thus you won't be able to fully utilise MongoDB's better infrastructure built for faster querying.
The ideal design follows:
{
"_id" : ObjectId("57eb386e37b4842ff5f386c9"),
"lesson_id" : ObjectId("57e27cd190e6993e393f5c74"),
"student_id" : ObjectId("57d3c3f590e6995fe8de7932"),
"answer_records" : [
{ "id": "1", "answer": "A" },
{ "id": "3", "answer": "C" }
]
}
which you can then simply apply the aggregation's $project pipeline that uses the $size operator to return the length of the answer_records array per document:
// Per document: keep lesson_id and student_id and report the length of the
// answer_records array as `count`.
db.collection.aggregate([
  {
    $project: {
      lesson_id: 1,
      student_id: 1,
      count: { $size: "$answer_records" }
    }
  }
])
If you want the total number of answer records for the whole collection then add another $group pipeline to get the accumulated total for all the documents using an _id of null:
// Total number of answer records across all documents.
db.collection.aggregate([
  {
    // Length of each document's answer_records array.
    $project: {
      count: { $size: "$answer_records" }
    }
  },
  {
    // A single group (null _id) sums the per-document counts.
    $group: {
      _id: null,
      total_answers: { $sum: "$count" }
    }
  }
])
Otherwise with the current design your only option is MapReduce which is much slower:
db.collection.mapReduce(
// map: emit one pair per document, keyed by the document's _id, whose value
// is the number of keys in its answer_records sub-document.
function() {
emit(this._id, Object.keys(this.answer_records).length);
},
// reduce: intentionally empty.
// NOTE(review): this presumably relies on each _id being emitted only once,
// so there is never more than one value per key to combine - confirm.
function() { },
// Return the results inline instead of writing them to a collection.
{ "out": { "inline": 1 } }
)
Sample Output:
{
"results" : [
{
"_id" : ObjectId("57eb386e37b4842ff5f386c9"),
"value" : 2
}
],
....
}
To get the total for all the documents in the collection then run this mapReduce operation:
db.collection.mapReduce(
  // map: emit each document's answer_records key count under one shared
  // key (null) so all counts reach the same reduce call chain.
  function () {
    var keyCount = Object.keys(this.answer_records).length;
    emit(null, keyCount);
  },
  // reduce: add up the emitted counts.
  function (key, counts) {
    return Array.sum(counts);
  },
  { out: { inline: 1 } }
)

Combining group and project in mongoDB aggregation framework

my document looks like this:
{
"_id" : ObjectId("5748d1e2498ea908d588b65e"),
"some_item" : {
"_id" : ObjectId("5693afb1b49eb7d5ed97de14"),
"item_property_1" : 1.0,
"item_property_2" : 2.0,
},
"timestamp" : "2016-05-28",
"price_information" : {
"arbitrary_value" : 111,
"hourly_rates" : [
{
"price" : 74.45,
"hour" : "0"
},
{
"price" : 74.45,
"hour" : "1"
},
{
"price" : 74.45,
"hour" : "2"
},
]
}
}
I did average the price per day via:
db.hourly.aggregate([
{$match: {timestamp : "2016-05-28"}},
{$unwind: "$price_information.hourly_rates"},
{$group: { _id: "$unique_item_identifier", total_price: { $avg: "$price_information.hourly_rates.price"}}}
]);
I am struggling with bringing (projecting) other params into the result set. I would also like to have some_item and timestamp in the result set. I tried to use a $project: {some_item: 1, total_price: 1, ...} within the query, but that wasn't right.
My desired output would be like:
{
"_id" : ObjectId("5693afb1b49eb7d5ed97de27"),
"someItem" : {
"_id" : ObjectId("5693afb1b49eb7d5ed97de14"),
"item_property_1" : 1.0,
"item_property_2" : 2.0,
},
"timestamp" : "2016-05-28",
"price_information" : {
"avg_price": 34
}
}
If somebody could give me a hint, how to project the grouping and the other params into the result set, I would be thankful.
Best
Rob
If using MongoDB 3.2 and newer, you can use $avg in the $project pipeline since it returns the average of the specified expression or list of expressions for each document e.g
// Average the hourly prices of each document for the given day, keeping the
// item and the timestamp alongside the average.
db.hourly.aggregate([
  { "$match": { "timestamp": "2016-05-28" } },
  {
    "$project": {
      "price_information": {
        // $avg over the `price` values of the hourly_rates array.
        "avg_price": { "$avg": "$price_information.hourly_rates.price" }
      },
      // The stored field is `some_item`; `"someItem": 1` would try to include
      // a field that does not exist, so project it under the new name.
      "someItem": "$some_item",
      "timestamp": 1
    }
  }
]);
In previous versions of MongoDB, $avg is available in the $group stage only. So to include the other fields, use the $first operator in your grouping:
// Same result for servers where $avg is only available in $group:
// unwind the hourly rates, average them per document, then reshape.
db.hourly.aggregate([
  { "$match": { "timestamp": "2016-05-28" } },
  // One document per hourly_rates element.
  { "$unwind": "$price_information.hourly_rates" },
  {
    // Regroup by the original document _id; $first carries over fields that
    // are identical on every unwound copy of that document.
    "$group": {
      "_id": "$_id",
      "avg_price": { "$avg": "$price_information.hourly_rates.price" },
      "someItem": { "$first": "$some_item" },
      "timestamp": { "$first": "$timestamp" }
    }
  },
  {
    // Nest the average under price_information as in the desired output.
    "$project": {
      "price_information": { "avg_price": "$avg_price" },
      "someItem": 1,
      "timestamp": 1
    }
  }
]);
Note: Usage of the $first operator in a $group stage will largely depend on how the documents entering that stage are ordered, as well as on the group-by key. Because $first returns the value from the first document in each group of documents that share the same group-by key, the $group stage should logically be preceded by a $sort stage so that the input documents are in a defined order. $first is only sensible to use when you know the order in which the data is being processed.
However, as the above is grouping by the main document's _id key, the $first operator when applied to non-denormalized fields (and not the flattened price_information array fields) will guarantee the original value in the result. Hence no need for a pre-sort stage to define the order since it won't be necessary in this case.

mongodb sorting array documents

This is my document i want to sort array documents by ascending order to get so for that my queries are in following code.but i am not getting the docs in sorted way.
The query is
db.sample.find({_id: ObjectId("55b32f5957e47fabd30c5d2e")}).sort({'naresh.ts':1}).pretty();
This is the result I am getting
{
"_id" : ObjectId("55b32f5957e47fabd30c5d2e"),
"naresh" : [
{
"ts" : "hi",
"created_by" : 1437806425105
},
{
"ts" : "hello",
"created_by" : 1437806425105
},
{
"ts" : "waht",
"created_by" : 1437807757261
},
{
"ts" : "lefo",
"created_by" : 1437807768514
},
{
"ts" : "lefow",
"created_by" : 1437807775719
}
]
}
You can use the aggregation framework, as in the following query:
// Return the matched document with its `naresh` array sorted by `ts`.
// The stages are passed as a single pipeline array rather than as separate
// arguments to aggregate().
db.collection.aggregate([
  {
    "$match": {
      "_id": ObjectId("55b32f5957e47fabd30c5d2e")
    }
  },
  // One document per array element.
  { "$unwind": "$naresh" },
  // Order the unwound elements by their ts value, ascending.
  { "$sort": { "naresh.ts": 1 } },
  {
    // Rebuild the array in the sorted order.
    "$group": {
      "_id": "$_id",
      "naresh": { "$push": "$naresh" }
    }
  }
])
The cursor .sort() only looks at the values in the array to decide to use the "smallest" value of the specified field ( in ascending order ) to determine how to "sort" the documents in the response. This does not "sort" the array content itself.
In order to sort the array, you need to use the aggregation framework to manipulate the document:
// Return the matched document with its `naresh` array sorted by `ts`.
db.sample.aggregate([
  { "$match": { "_id": ObjectId("55b32f5957e47fabd30c5d2e") } },
  // One document per array element.
  { "$unwind": "$naresh" },
  // $sort keys are plain field paths, without the "$" prefix.
  { "$sort": { "naresh.ts": 1 } },
  // Rebuild the array in the sorted order.
  { "$group": {
    "_id": "$_id",
    "naresh": { "$push": "$naresh" }
  }}
])
That sorts the array.
Better yet, if you "always" want the results sorted, then do it as you update the document:
// Re-sort the stored `naresh` array of every document by `ts`, ascending.
// $push must name the array field; an empty $each adds no elements, so only
// the $sort modifier takes effect.
db.sample.update(
  {},
  { "$push": { "naresh": { "$each": [], "$sort": { "ts": 1 } } } },
  { "multi": true }
)
And use those same, $each and $sort modifiers when adding new elements to the array and the content will remain sorted.
If you want just query the collection and get the output sorted, then Blackes Seven's answer will work perfectly for you.
However if you want to update the documents in the sorted order, go with this update query:
// Re-sort the stored `naresh` array of one document in place.
// update() is called on the `sample` collection; a bare update() has no
// receiver in the shell.
db.sample.update(
  {
    _id: ObjectId("55b32f5957e47fabd30c5d2e")
  },
  {
    $push: {
      naresh: {
        // An empty $each adds no elements, so only the $sort modifier applies.
        $each: [],
        // Orders the elements by created_by, ascending; use { ts: 1 } instead
        // to order by the ts field used in the question's query.
        $sort: { created_by: 1 }
      }
    }
  }
)