Return specific field in aggregate - mongodb

i am trying to aggregate the following data:
{
"_id" : ObjectId("527a6b7c24a8874c078b9d10"),
"Name" : "FirstName",
"Link" : "www.mylink.com/123",
"year" : 2013
}
{
"_id" : ObjectId("527a6b7c24a8874c078b9d11"),
"Name" : "FirstName",
"Link" : "www.mylink.com/124",
"year" : 2013
}
{
"_id" : ObjectId("527a6b7c24a8874c078b9d12"),
"Name" : "SecondName",
"Link" : "www.mylink.com/125",
"year" : 2013
}
I want to count the number of occurrences of each Name value, but I also want to return the corresponding Link field in the output of the aggregate query. Currently I am doing it like this (which does not return the Link field in the output):
// Asker's original attempt: counts documents per Name; Link is not carried through any stage.
db.coll.aggregate([
// NOTE: the sample documents store the field as lowercase "year", so "Year" does not match it.
{ "$match": { "Year": 2013 } },
// Groups on a sub-document key, so each result has _id: { "Name": ... }.
{ "$group": {
"_id": {
"Name": "$Name"
},
"count": { "$sum": 1 }
}},
// Passes _id and count through unchanged.
{ "$project": {
"_id": "$_id",
"count": 1
}},
// Ascending by count.
{ $sort: {
count: 1
} }
])
The above returns only Name field and count. But how can I also return the corresponding Link field (could be several) in the output of aggregate query?
Best Regards

// Count the 2013 documents per Name and collect every Link seen for that name.
db.coll.aggregate([
  // Field names are case-sensitive: the documents store "year".
  { $match: { year: 2013 } },
  {
    $group: {
      _id: "$Name",
      // $push keeps every link; $addToSet would keep only distinct links.
      Link: { $push: "$Link" },
      count: { $sum: 1 }
    }
  },
  // Expose the grouping key as "Name" and drop the raw _id.
  { $project: { Name: "$_id", _id: 0, Link: 1, count: 1 } },
  // Least frequent names first.
  { $sort: { count: 1 } }
])
Results in:
{ "Link" : [ "www.mylink.com/125" ], "count" : 1, "Name" : "SecondName" }
{ "Link" : [ "www.mylink.com/123", "www.mylink.com/124" ], "count" : 2, "Name" : "FirstName" }
Ok so the $match was correct except for a typo with 'Year' --> 'year'
The $group could be simplified a little bit. I removed an extra set of brackets so that you get _id: 'FirstName' instead of _id: { 'Name': 'FirstName' }, since we can rename _id to 'Name' in the $project stage anyway.
You needed to add $push or $addToSet to maintain the $Link value in your grouping. $addToSet will allow for unique values in the array only, while $push will add all values, so use whichever at your discretion.
$project and $sort are straightforward, rename and include/exclude whichever fields you would like.

Related

MongoDB combining group aggregation and strLenBytes

I've a Mongo collection with documents like this:
{
"_id" : ObjectId("5a9d0d44c3a1ce5f14c6940a"),
"topic_id" : "5a7af30613b79405643e7da1",
"value" : "VMware Virtual Platform",
"timestamp" : "2018-03-05 09:26:25.136546",
"insert_ts" : "2018-03-05 09:26:25.136682",
"inserted_by" : 1
},
{
"_id" : ObjectId("5a9d0d44c3a1ce5f14c69409"),
"topic_id" : "5a7af30713b79479f82b4b84",
"value" : "VMware, Inc.",
"timestamp" : "2018-03-05 09:26:25.118931",
"insert_ts" : "2018-03-05 09:26:25.119081",
"inserted_by" : 1
},
{
"_id" : ObjectId("5a9d0d44c3a1ce5f14c69408"),
"topic_id" : "5a7af30713b7946d6d0a8772",
"value" : "Phoenix Technologies LTD 6.00 09/21/2015",
"timestamp" : "2018-03-05 09:26:25.101624",
"insert_ts" : "2018-03-05 09:26:25.101972",
"inserted_by" : 1
}
I would like to fetch some aggregated data from this collection. I want to know the oldest timestamp, the documents count and the total strlen of all values, but grouped by topic_id, where the document-id is greater than x.
In mysql, i would build a sql like this:
SELECT
MAX(_id) as max_id,
COUNT(*) as message_count,
MIN(timestamp) as min_timestamp,
LENGTH(GROUP_CONCAT(value)) as size
FROM `dev_topic_data_numeric`
WHERE _id > 22000
GROUP BY topic_id
How do i achieve this in MongoDB? I already tried to build it looking like this:
// Asker's attempt: per-topic max _id, earliest timestamp and document count
// for documents whose _id is >= the given ObjectId.
db.getCollection('topic_data_text').aggregate(
[
{
"$match":
{
"_id": {"$gte": ObjectId("5a9d0aefc3a1ce5f14c68c81") }
}
},
{
"$group":
{
"_id": "$topic_id",
"max_id": {"$max":"$_id"},
"min_timestamp": {"$min": "$timestamp"},
"message_count": {"$sum": 1},
/*"size": {"$strLenBytes": "$value" }*/
}
}
]
);
When I uncomment the $strLenBytes line, the query fails with an error saying that strLenBytes is not a group operator. The MongoDB documentation does not help me here. How do I have to write it to get the string length?
My expected result should look like this:
{
"_id" : "5a7af30613b79405643e7da1",
"max_id" : ObjectId("5a9d0d44c3a1ce5f14c6940a"),
"min_timestamp" : "2018-03-05 09:26:25.136546",
"message_count" : 1,
"size" : 23,
}
My MongoDB version is 3.4.4.
This is because $strLenBytes is not an accumulator, unlike $sum or $max. The $group stage accumulates values, so any operator that is valid directly in the $group stage is typically an accumulator.
$strLenBytes converts one value to another in a 1-1 fashion. This is typically an operator for the $project stage.
Adding a $project stage in your aggregation should give you the result you require. Note that you would also need to modify the $group stage slightly to pass on the required values:
> db.test.aggregate([
{
"$match":
{
"_id": {"$gte": ObjectId("5a9d0aefc3a1ce5f14c68c81") }
}
},
{
"$group":
{
"_id": {"topic_id": "$topic_id", value: "$value"},
"max_id": {"$max":"$_id"},
"min_timestamp": {"$min": "$timestamp"},
"message_count": {"$sum": 1}
}
},
{
"$project":
{
"_id": "$_id.topic_id",
"max_id": "$max_id",
"min_timestamp": "$min_timestamp",
"message_count": "$message_count",
size: {"$strLenBytes": "$_id.value" }
}
}
])
Output using your example documents:
{
"_id": "5a7af30613b79405643e7da1",
"max_id": ObjectId("5a9d0d44c3a1ce5f14c6940a"),
"min_timestamp": "2018-03-05 09:26:25.136546",
"message_count": 1,
"size": 23
}
{
"_id": "5a7af30713b79479f82b4b84",
"max_id": ObjectId("5a9d0d44c3a1ce5f14c69409"),
"min_timestamp": "2018-03-05 09:26:25.118931",
"message_count": 1,
"size": 12
}
{
"_id": "5a7af30713b7946d6d0a8772",
"max_id": ObjectId("5a9d0d44c3a1ce5f14c69408"),
"min_timestamp": "2018-03-05 09:26:25.101624",
"message_count": 1,
"size": 40
}
After testing #kevin-adistambha's answer and some further experimenting, I found another way to achieve my wanted result - and maybe it has a better performance - but that needs more testing to be sure about this.
// Per-topic statistics for every document newer than the given ObjectId.
// $strLenBytes is not an accumulator itself, but it may be used as the
// expression that the $sum accumulator adds up.
db.getCollection('topic_data_text').aggregate([
  { $match: { _id: { $gt: ObjectId("5a9f9d8bd5de3ac75f8cc269") } } },
  {
    $group: {
      _id: "$topic_id",
      max_id: { $max: "$_id" },
      min_timestamp: { $min: "$timestamp" },
      message_count: { $sum: 1 },
      // Total byte length of all values in the topic.
      size: { $sum: { $strLenBytes: "$value" } }
    }
  }
]);

MongoDB aggregate nested array correctly

OK I am very new to Mongo, and I am already stuck.
Db has the following structure (much simplified for sure):
{
{
"_id" : ObjectId("57fdfbc12dc30a46507044ec"),
"keyterms" : [
{
"score" : "2",
"value" : "AA",
},
{
"score" : "2",
"value" : "AA",
},
{
"score" : "4",
"value" : "BB",
},
{
"score" : "3",
"value" : "CC",
}
]
},
{
"_id" : ObjectId("57fdfbc12dc30a46507044ef"),
"keyterms" : [
...
There are several objects. Each object has an array "keyterms", and each entry of that array has a score and a value. There are some duplicates though (not really, since in the real db the keyterms entries have many more fields, but with respect to value and score they are duplicates).
Now I need a query, which
selects one object by id
groups its keyterms in by value
and counts the duplicates
sorts them by score
So I want to have something like that as result
// for Object 57fdfbc12dc30a46507044ec
"keyterms"; [
{
"score" : "4",
"value" : "BB",
"count" : 1
},
{
"score" : "3",
"value" : "CC",
"count" : 1
}
{
"score" : "2",
"value" : "AA",
"count" : 2
}
]
In SQL I would have written something like this
select
score, value, count(*) as count
from
all_keywords_table_or_some_join
group by
value
order by
score
But, sadly enough, it's not SQL.
In Mongo I managed to write this:
// Asker's attempt: flattens keyterms of one document, sorts the entries by score
// (descending) and rebuilds the array. Duplicate values are not merged or counted.
db.getCollection('tests').aggregate([
{$match: {'_id': ObjectId('57fdfbc12dc30a46507044ec')}},
{$unwind: "$keyterms"},
{$sort: {"keyterms.score": -1}},
// Re-assembles one document per original _id with the sorted entries.
{$group: {
'_id': "$_id",
'keyterms': {$push: "$keyterms"}
}},
// Keeps only score and value of each array entry.
{$project: {
'keyterms.score': 1,
'keyterms.value': 1
}}
])
But there is something missing: the grouping of the keyterms by their value. I cannot get rid of the feeling that this is the wrong approach altogether. How can I select the keyterms array, continue with that, and use an aggregate function only on it - that would be easy.
BTW I read this
(Mongo aggregate nested array)
but I can't figure it out for my example unfortunately...
You'd want an aggregation pipeline where after you $unwind the array, you group the flattened documents by the array's value and score keys, aggregate the counts using the $sum accumulator operator and retain the main document's _id with the $first operator.
The next pipeline stage should then group the documents from the previous stage by the original document's _id (kept as doc_id) so as to preserve the original schema, and recreate the keyterms array using the $push operator.
The following demonstration attempts to explain the above aggregation operation:
// Collapse duplicate keyterms of one document into (value, score, count) entries,
// ordered by score descending, and rebuild the original document shape.
db.tests.aggregate([
  { $match: { _id: ObjectId("57fdfbc12dc30a46507044ec") } },
  // One document per array entry.
  { $unwind: "$keyterms" },
  // One group per distinct (value, score) pair; remember the owning document.
  {
    $group: {
      _id: { value: "$keyterms.value", score: "$keyterms.score" },
      doc_id: { $first: "$_id" },
      count: { $sum: 1 }
    }
  },
  // Sort before the final $group so that $push sees the entries in score order.
  { $sort: { "_id.score": -1 } },
  // Re-assemble the keyterms array under the original document _id.
  {
    $group: {
      _id: "$doc_id",
      keyterms: {
        $push: { value: "$_id.value", score: "$_id.score", count: "$count" }
      }
    }
  }
])
Sample Output
{
"_id" : ObjectId("57fdfbc12dc30a46507044ec"),
"keyterms" : [
{
"value" : "BB",
"score" : "4",
"count" : 1
},
{
"value" : "CC",
"score" : "3",
"count" : 1
},
{
"value" : "AA",
"score" : "2",
"count" : 2
}
]
}
Demo
Meanwhile, I solved it myself:
// One result per distinct keyterm value of the selected document:
// { _id: <value>, score, value, count }, highest score first, at most 15 results.
db.getCollection('tests').aggregate([
  {$match: {'_id': ObjectId('57fdfbc12dc30a46507044ec')}},
  {$unwind: "$keyterms"},
  {$group: {
    '_id': "$keyterms.value",
    // Highest score seen for this value (same result as sorting descending and taking $first).
    'escore': {$max: "$keyterms.score"},
    'evalue': {$first: "$keyterms.value"},
    // Count entries directly instead of pushing whole sub-documents and taking $size.
    'ecount': {$sum: 1}
  }},
  // $group does not guarantee any output order, so the sort has to come after it;
  // otherwise the result is unsorted and $limit keeps an arbitrary 15 groups.
  {$sort: {"escore": -1}},
  {$limit: 15},
  {$project: {
    "score": "$escore",
    "value": "$evalue",
    "count": "$ecount"
  }}
])

Combining group and project in mongoDB aggregation framework

my document looks like this:
{
"_id" : ObjectId("5748d1e2498ea908d588b65e"),
"some_item" : {
"_id" : ObjectId("5693afb1b49eb7d5ed97de14"),
"item_property_1" : 1.0,
"item_property_2" : 2.0,
},
"timestamp" : "2016-05-28",
"price_information" : {
"arbitrary_value" : 111,
"hourly_rates" : [
{
"price" : 74.45,
"hour" : "0"
},
{
"price" : 74.45,
"hour" : "1"
},
{
"price" : 74.45,
"hour" : "2"
},
]
}
}
I did average the price per day via:
// Asker's attempt: average of the hourly prices for one day.
db.hourly.aggregate([
{$match: {timestamp : "2016-05-28"}},
// One document per hourly_rates entry.
{$unwind: "$price_information.hourly_rates"},
// NOTE(review): the sample document above shows no "unique_item_identifier" field - confirm the intended group key.
{$group: { _id: "$unique_item_identifier", total_price: { $avg: "$price_information.hourly_rates.price"}}}
]);
I am struggling with bringing (projecting) other params into the result set. I would also like to have some_item and timestamp in the result set. I tried to use a $project: {some_item: 1, total_price: 1, ...} within the query, but that wasn't right.
My desired output would be like:
{
"_id" : ObjectId("5693afb1b49eb7d5ed97de27"),
"someItem" : {
"_id" : ObjectId("5693afb1b49eb7d5ed97de14"),
"item_property_1" : 1.0,
"item_property_2" : 2.0,
},
"timestamp" : "2016-05-28",
"price_information" : {
"avg_price": 34
}
}
If somebody could give me a hint, how to project the grouping and the other params into the result set, I would be thankful.
Best
Rob
If using MongoDB 3.2 and newer, you can use $avg in the $project pipeline since it returns the average of the specified expression or list of expressions for each document e.g
// MongoDB 3.2+: $avg can be applied to an array expression inside $project,
// so no $unwind/$group is needed to average the hourly prices of a document.
db.hourly.aggregate([
  { "$match": { "timestamp": "2016-05-28" } },
  {
    "$project": {
      "price_information": {
        // "$price_information.hourly_rates.price" resolves to the array of all prices.
        "avg_price": { "$avg": "$price_information.hourly_rates.price" }
      },
      // The stored field is "some_item"; `"someItem": 1` would only include a
      // (non-existent) field called someItem, so rename it explicitly.
      "someItem": "$some_item",
      "timestamp": 1
    }
  }
]);
In previous versions of MongoDB, $avg is available in the $group stage only. So to include the other fields, use the $first operator in your grouping:
// Pre-3.2 variant: $avg is only available as a $group accumulator, so unwind the
// hourly rates, regroup per original document and carry the other fields with $first.
db.hourly.aggregate([
  { "$match": { "timestamp": "2016-05-28" } },
  { "$unwind": "$price_information.hourly_rates" },
  {
    "$group": {
      // Grouping by the document's own _id puts all of its unwound rows in one group.
      "_id": "$_id",
      "avg_price": { "$avg": "$price_information.hourly_rates.price" },
      // These fields are identical on every unwound row of a document,
      // so $first returns the original value without needing a $sort.
      "someItem": { "$first": "$some_item" },
      "timestamp": { "$first": "$timestamp" }
    }
  },
  {
    "$project": {
      // Nest the average to match the desired output shape.
      "price_information": { "avg_price": "$avg_price" },
      "someItem": 1,   // the comma here was missing, which made the pipeline a syntax error
      "timestamp": 1
    }
  }
]);
Note: Usage of the $first operator in a $group stage will largely depend on how the documents entering that stage are ordered, as well as on the group-by key. Because $first returns the value from the first document in each group of documents that share the same group-by key, the $group stage should logically follow a $sort stage so that the input documents are in a defined order. $first is only sensible to use when you know the order in which the data is being processed.
However, as the above is grouping by the main document's _id key, the $first operator when applied to non-denormalized fields (and not the flattened price_information array fields) will guarantee the original value in the result. Hence no need for a pre-sort stage to define the order since it won't be necessary in this case.

Mongodb Aggregation grouping with leave the field

After applying the aggregation
// Asker's starting point: homework documents ordered by student, then by ascending score.
db.grades.aggregate([
{$match: {'type': 'homework'}},
{$sort: {'student_id':1, 'score':1}}
])
got the result:
{
"result" : [
{
"_id" : ObjectId("50906d7fa3c412bb040eb579"),
"student_id" : 0,
"type" : "homework",
"score" : 14.8504576811645
},
{
"_id" : ObjectId("50906d7fa3c412bb040eb57a"),
"student_id" : 0,
"type" : "homework",
"score" : 63.98402553675503
},
...
How can I modify the query so that only the documents with the minimum score are kept, and the result still contains the _id field? For example, like this:
{
"_id" : ObjectId("50906d7fa3c412bb040eb579"),
"score" : 14.8504576811645
}
Thanks.
Is this a homework question from the education site? I can't remember, but this is fairly trivial.
// For every student, return the _id and score of their lowest-scoring homework document.
db.grades.aggregate([
  { "$match": { "type": "homework" } },
  // Lowest score first within each student, so $first below picks the minimum.
  { "$sort": { "student_id": 1, "score": 1 } },
  {
    "$group": {
      "_id": "$student_id",
      // Taking both fields with $first keeps the _id and the score from the same document.
      "doc": { "$first": "$_id" },
      "score": { "$first": "$score" }
    }
  },
  // $group output order is not guaranteed; restore student_id order.
  // (The closing quote after $sort was missing, which made the pipeline a syntax error.)
  { "$sort": { "_id": 1 } },
  // Reshape to { _id: <original document _id>, score }.
  {
    "$project": {
      "_id": "$doc",
      "score": 1
    }
  }
])
All this does is use $first to get the first result when grouping by student_id. By first it means exactly that, so this is only useful after sorting and is different from $min which would take the smallest value from the grouped results.
So if you got part of the way there, not only do you keep the first score, but you also do the same operation on the _id value as well.
The additional sort is only there so the results don't trip you up, because they are likely to appear in the reverse order of student_id. Finally there is just a small use of $project to get the document form that you want.

Selecting Distinct values from Array in MongoDB

I have a collection name Alpha_Num, It has following structure. I am trying to find out which Alphabet-Numerals pair will appear maximum number of times ?
If we just go with the data below, pair abcd-123 appears twice so as pair efgh-10001, but the second one is not a valid case for me as it appears in same document.
{
"_id" : 12345,
"Alphabet" : "abcd",
"Numerals" : [
"123",
"456",
"2345"
]
}
{
"_id" : 123456,
"Alphabet" : "efgh",
"Numerals" : [
"10001",
"10001",
"1002"
]
}
{
"_id" : 123456567,
"Alphabet" : "abcd",
"Numerals" : [
"123"
]
}
I tried to use aggregation frame work, something like below
// Asker's attempt: counts every unwound (Alphabet, Numeral) element, so a numeral
// repeated inside one document's array is counted once per repetition.
db.Alpha_Num.aggregate([
{"$unwind":"$Numerals"},
{"$group":
{"_id":{"Alpha":"$Alphabet","Num":"$Numerals"},
"count":{$sum:1}}
},
// Most frequent pair first.
{"$sort":{"count":-1}}
])
Problem in this query is it gives pair efgh-10001 twice.
Question : How to select distinct values from array "Numerals" in the above condition ?
Problem solved.
// Count in how many documents each (Alphabet, Numeral) pair occurs, counting a
// numeral at most once per document, and list the most frequent pairs first.
db.Alpha_Num.aggregate([
  { "$unwind": "$Numerals" },
  // Rebuild each document's numerals as a set, which removes repeats within a document.
  {
    "$group": {
      "_id": { "_id": "$_id", "Alpha": "$Alphabet" },
      "Num": { "$addToSet": "$Numerals" }
    }
  },
  // Flatten again: now one row per distinct numeral per document.
  { "$unwind": "$Num" },
  // Count the documents containing each pair (key spelled "Alpha"; it was mistyped "Alplha").
  {
    "$group": {
      "_id": { "Alpha": "$_id.Alpha", "Num": "$Num" },
      "count": { "$sum": 1 }
    }
  },
  // $group output is unordered; sort so the pair with the highest count comes first.
  { "$sort": { "count": -1 } }
])
Grouping using $addToSet and unwinding again did the trick. I got the answer from one of the 10gen online courses.