MongoDB, grouping inner Hash key value by another key value - mongodb

I have these 4 elements in my collection:
/* 1 */
{
"demographics": [
{
"key": "country",
"value": "ES"
},
{
"key": "city",
"value": "Sevilla"
},
{
"key": "region",
"value": "Andalucía"
}
]
}
/* 2 */
{
"demographics": [
{
"key": "city",
"value": "Cádiz"
},
{
"key": "country",
"value": "ES"
},
{
"key": "region",
"value": "Andalucía"
}
]
}
/* 3 */
{
"demographics": [
{
"key": "country",
"value": "GB"
},
{
"key": "region",
"value": "Greater London"
},
{
"key": "city",
"value": "London"
}
]
}
/* 4 */
{
"demographics": [
{
"key": "country",
"value": "ES"
},
{
"key": "region",
"value": "Andalucía"
},
{
"key": "city",
"value": "Sevilla"
}
]
}
I would like to group them by:
demographic.value when demographic.key = "country"
demographic.value when demographic.key = "region"
demographic.value when demographic.key = "city"
Having a result like this:
{ "values": ["ES", "Andalucía", "Sevilla"], "count": 2 }
{ "values": ["ES", "Andalucía", "Cádiz"], "count": 1 }
{ "values": ["GB", "Greater London", "London"], "count": 1 }
Attention: the order of the demographics array elements might not always be the same.
I have tried
db.getCollection('test').aggregate(
[
{ "$unwind": "$demographics" },
{
"$project" :{
"_id": 0,
"demographics.key": 1,
"demographics.value": 1
}
},
{
"$group" : {
"_id": {
"key": "$demographics.key",
"value": "$demographics.value"
},
"count": { "$sum": 1 }
}
},
{
"$group" : {
"_id": "$_id.key",
"values": { "$push": { "value": "$_id.value", "count": "$count" } }
}
}
]
)
This gives me this result:
/* 1 */
{
"_id": "country",
"values": [
{
"value": "GB",
"count": 1.0
},
{
"value": "ES",
"count": 3.0
}
]
}
/* 2 */
{
"_id": "region",
"values": [
{
"value": "Greater London",
"count": 1.0
},
{
"value": "Andalucía",
"count": 3.0
}
]
}
/* 3 */
{
"_id": "city",
"values": [
{
"value": "London",
"count": 1.0
},
{
"value": "Cádiz",
"count": 1.0
},
{
"value": "Sevilla",
"count": 2.0
}
]
}
But these are not the groups I am looking for.

You can try running the following pipeline:
db.test.aggregate([
{ "$unwind": "$demographics" },
{ "$sort": { "demographics.key": 1, "demographics.value": 1 } },
{
"$group": {
"_id": "$_id",
"values": { "$push": "$demographics.value" }
}
},
{
"$group": {
"_id": "$values",
"count": { "$sum": 1 }
}
},
{
"$project": {
"_id": 0, "values": "$_id", "count": 1
}
}
])
Sample Output
/* 1 */
{
"count" : 2,
"values" : [
"Sevilla",
"ES",
"Andalucía"
]
}
/* 2 */
{
"count" : 1,
"values" : [
"London",
"GB",
"Greater London"
]
}
/* 3 */
{
"count" : 1,
"values" : [
"Cádiz",
"ES",
"Andalucía"
]
}

Related

mongodb: How to build facets from an array of objects

What would be the best way to create facets from an array of objects, where each product has different attributes?
Example Data:
[
{
"key": 1,
"title": "Product A",
"attrs": [
{
"key": "size",
"value": "small"
},
{
"key": "color",
"value": "red"
}
]
},
{
"key": 2,
"title": "Product B",
"attrs": [
{
"key": "size",
"value": "large"
},
{
"key": "color",
"value": "blue"
}
]
},
{
"key": 3,
"title": "Product C",
"attrs": [
{
"key": "resolution",
"value": "8K"
},
{
"key": "refresh rate",
"value": "60 Hz"
}
]
}
]
The result I would like to get would be something like this:
[
{
"_id": {
"key": "size",
"values" : [
{"title": "small", "count": 1},
{"title": "large", "count": 1}
]
}
},
{
"_id": {
"key": "color",
"values" : [
{"title": "red", "count": 1},
{"title": "blue", "count": 1}
]
}
},
{
"_id": {
"key": "resolution",
"values" : [
{"title": "8K", "count": 1}
]
}
},
{
"_id": {
"key": "refresh rate",
"values" : [
{"title": "60 Hz", "count": 1}
]
}
}
]
I don't know if the result I put is possible, but I need to somehow build it, even if it's individually each facet for each type of attribute that a product can have
db.collection.aggregate([
{
"$unwind": "$attrs"
},
{
"$group": {
"_id": {
k: "$attrs.key",
v: "$attrs.value"
},
"count": { "$sum": 1 }
}
},
{
"$group": {
"_id": "$_id.k",
"values": {
"$push": {
"title": "$_id.v",
"count": "$count"
}
}
}
},
{
"$project": {
"_id": {
key: "$_id",
values: "$values"
}
}
}
])
mongoplayground

Get the $size (length) of a nested array and calculate the difference to a stored value on the parent object - using aggregate

Let's consider that I have the following documents (ignoring the _id):
[
{
"Id": "Store1",
"Info": {
"Location": "Store1 Street",
"PhoneNumber": 111
},
"MaxItemsPerShelf": 3,
"Shelf": [
{
"Id": "Shelf1",
"Items": [
{
"Id": "Item1",
"Name": "bananas"
},
{
"Id": "Item2",
"Name": "apples"
},
{
"Id": "Item3",
"Name": "oranges"
}
]
},
{
"Id": "Shelf2",
"Items": [
{
"Id": "Item4",
"Name": "cookies"
},
{
"Id": "Item5",
"Name": "chocolate"
}
]
},
{
"Id": "Shelf3",
"Items": []
}
]
},
{
"Id": "Store3",
"Info": {
"Location": "Store2 Street",
"PhoneNumber": 222
},
"MaxItemsPerShelf": 2,
"Shelf": [
{
"Id": "Shelf4",
"Items": [
{
"Id": "Item6",
"Name": "champoo"
},
{
"Id": "Item7",
"Name": "toothpaste"
}
]
},
{
"Id": "Shelf5",
"Items": [
{
"Id": "Item8",
"Name": "chicken"
}
]
}
]
}
]
Given a specific Shelf.Id I want to get the following result ( Shelf.Id = "Shelf2"):
[{
"Info": {
"Location": "Store1 Street",
"PhoneNumber": 111
},
"ItemsNumber": 2,
"ItemsRemaining": 1
}]
Therefore:
ItemsNumber is the $size of the matching shelf's Items array
and
ItemsRemaining is equal to MaxItemsPerShelf minus the $size of the matching shelf's Items array
also I want to copy the value of the Info to the aggregate output.
How can I accomplish this with aggregate? In my attempts, I couldn't find a way to iterate over the shelves and get the $size of each $Shelf.Items.
You can use the aggregation below:
db.collection.aggregate([
{ "$match": { "Shelf.Id": "Shelf2" }},
{ "$replaceRoot": {
"newRoot": {
"$let": {
"vars": {
"shelf": {
"$filter": {
"input": {
"$map": {
"input": "$Shelf",
"in": {
"Id": "$$this.Id",
"count": { "$size": "$$this.Items" }
}
}
},
"as": "ss",
"cond": { "$eq": ["$$ss.Id", "Shelf2"] }
}
}
},
"in": {
"Info": "$Info",
"ItemsNumber": { "$arrayElemAt": ["$$shelf.count", 0] },
"ItemsRemaining": {
"$subtract": [
"$MaxItemsPerShelf",
{ "$ifNull": [
{ "$arrayElemAt": ["$$shelf.count", 0] },
0
]}
]
}
}
}
}
}}
])

Find by biggest difference between elements on embedded object array

Given a list of products like this:
{
"_id" : ObjectId("5a594f8eff9da13c9d415a63"),
"productId" : "xxx",
"date" : "2018-09-13",
"prices" : [
{
"country" : "en",
"price" : 16.5,
"currency" : "EUR"
},
{
"country" : "es",
"price" : 17.78,
"currency" : "EUR"
},
{
"country" : "fr",
"price" : 18.08,
"currency" : "EUR"
},
{
"country" : "de",
"price" : 18.89,
"currency" : "EUR"
},
{
"country" : "it",
"price" : 27.49,
"currency" : "EUR"
}
]
}
Given a country code and a date, is there any way to find the products for that date and order them by the biggest difference in price relative to that country?
Thank you very much in advance
Assuming that
you want the biggest difference between the given country and any other country and
there are no duplicate product ids (if there are, the latest product will be used, thanks to this line "$last": "$prices"),
try this:
db.collection.aggregate([
{
"$match": {
"date": "2018-09-13" // replace with date variable
}
},
{
"$group": {
"_id": "$productId",
"prices": {
"$last": "$prices"
}
}
},
{
"$addFields": {
"pricesObj": {
"$map": {
"input": "$prices",
"in": {
"k": "$$this.country",
"v": "$$this.price"
}
}
}
}
},
{
"$addFields": {
"pricesObj": {
"$arrayToObject": "$pricesObj"
}
}
},
{
"$addFields": {
"reference": "$pricesObj.es" // replace with country variable
}
},
{
"$addFields": {
"differences": {
"$map": {
"input": "$prices",
"in": {
"country": "$$this.country",
"difference": {
"$abs": {
"$subtract": [
"$$this.price",
"$reference"
]
}
}
}
}
}
}
},
{
"$addFields": {
"biggestDifference": {
"$reduce": {
"input": "$differences",
"initialValue": {
difference: 0
},
"in": {
"$cond": [
{
"$gt": [
"$$this.difference",
"$$value.difference"
]
},
"$$this",
"$$value"
]
}
}
}
}
},
{
"$project": {
"_id": 1,
"biggestDifference": "$biggestDifference.difference"
}
},
{
"$sort": {
"biggestDifference": -1
}
}
])
I'm sure it could be expressed more concisely, but it works: https://mongoplayground.net/p/y67jhhFBB9l
The output looks like:
[
{
"_id": "xxy",
"biggestDifference": 12295.109999999999
},
{
"_id": "xxx",
"biggestDifference": 98.72
}
]
for this input:
[
{
"productId": "xxx",
"date": "2018-09-13",
"prices": [
{
"country": "en",
"price": 116.5,
"currency": "EUR"
},
{
"country": "es",
"price": 17.78,
"currency": "EUR"
},
{
"country": "fr",
"price": 18.08,
"currency": "EUR"
},
{
"country": "de",
"price": 18.89,
"currency": "EUR"
},
{
"country": "it",
"price": 27.49,
"currency": "EUR"
}
]
},
{
"productId": "xxy",
"date": "2018-09-13",
"prices": [
{
"country": "en",
"price": 16.5,
"currency": "EUR"
},
{
"country": "es",
"price": 17.78,
"currency": "EUR"
},
{
"country": "fr",
"price": 18.08,
"currency": "EUR"
},
{
"country": "de",
"price": 12312.89,
"currency": "EUR"
},
{
"country": "it",
"price": 997.49,
"currency": "EUR"
}
]
}
]
Thank you @jaksz,
Finally, I'm using this approach that works like a charm (because the smaller price is always in the first position of the array):
db.productPrices.aggregate(
[
{
"$match": {
"date": "2018-09-13" // replace with date variable
}
},
{
"$group": {
"_id": "$productId",
"prices": {
"$last": "$prices"
}
}
},
{
"$addFields": {
"pricesObj": {
"$map": {
"input": "$prices",
"in": {
"k": "$$this.country",
"v": "$$this.price"
}
}
}
}
},
{
"$addFields": {
"pricesObj": {
"$arrayToObject": "$pricesObj"
}
}
},
{
"$addFields": {
"reference": "$pricesObj.es" // replace with country variable
}
},
{
"$addFields": {
"cheapest": {
"$arrayElemAt": ["$prices", 0]
}
}
},
{
"$addFields": {
"difference": {
"$abs": {
"$subtract": ["$reference", "$cheapest.price"]
}
}
}
},
{
"$project": {
"_id": 1,
"prices": "$prices",
"difference": "$difference"
}
},
{
"$sort": {
"difference": -1
}
}
]).pretty()

complex elasticsearch query over rest

Using REST from classic ASP, I am able to retrieve data from an index when I perform a simple search such as:
Set xml = Server.CreateObject("Microsoft.XMLHTTP")
xml.Open "GET", "http://elastic:changeme@10.128.128.109:9200/myindex/_search?q='fred blogs',_source='path.real'", False
I now want to retrieve data using a much more complex query which, if I were using Kibana, would look something like this:
GET newsindex/_search
{
"query": {
"function_score": {
"query": { "bool" : {"should": [
{ "term": { "headline": "brown" } },
{ "term": { "bodytext": "brown" } },
{ "term": { "headline": "fox" } },
{ "term": { "bodytext": "fox" } }
]}},
"functions": [
{
"filter": {"match" : { "headline": "brown fox" }},
"weight": 2
},
{
"filter": {"match" : { "bodytext": "brown fox" }},
"weight": 1
},
{
"filter": {"match" : { "bodytext": "fox" }},
"weight": 3
},
{
"filter": {"match" : { "bodytext": "brown" }},
"weight": 3
},
{
"filter": {"match" : { "headline": "brown" }},
"weight": 4
},
{
"filter": {"match" : { "headline": "fox" }},
"weight": 4
}
],
"score_mode": "sum"
}
},"sort": [
{
"_score": {
"order": "asc"
}
}
],
"_source":[ "headline", "bodytext" ]
}
How can I pass this query using Rest? I'll be building queries like this on the fly to pass via rest. But I need to get the construct right.
By using curl command, you can do something like this:
curl -XGET "http://elastic:changeme@10.128.128.109:9200/myindex/_search" -H 'Content-Type: application/json' -d'
{
"query": {
"function_score": {
"query": {
"bool": {
"should": [
{
"term": {
"headline": "brown"
}
},
{
"term": {
"bodytext": "brown"
}
},
{
"term": {
"headline": "fox"
}
},
{
"term": {
"bodytext": "fox"
}
}
]
}
},
"functions": [
{
"filter": {
"match": {
"headline": "brown fox"
}
},
"weight": 2
},
{
"filter": {
"match": {
"bodytext": "brown fox"
}
},
"weight": 1
},
{
"filter": {
"match": {
"bodytext": "fox"
}
},
"weight": 3
},
{
"filter": {
"match": {
"bodytext": "brown"
}
},
"weight": 3
},
{
"filter": {
"match": {
"headline": "brown"
}
},
"weight": 4
},
{
"filter": {
"match": {
"headline": "fox"
}
},
"weight": 4
}
],
"score_mode": "sum"
}
},
"sort": [
{
"_score": {
"order": "asc"
}
}
],
"_source": [
"headline",
"bodytext"
]
}'
You can send your query in data by using curl. I'm sure the same can also be achieved by your Rest client. Just check how to send data using your Rest client.
This post might help. Replace the DataToSend variable in the example with your query and see if this works.

ElasticSearch group by multiple fields

The only close thing that I've found was: Multiple group-by in Elasticsearch
Basically I'm trying to get the ES equivalent of the following MySql query:
select gender, age_range, count(distinct profile_id) as count
FROM TABLE group by age_range, gender
The age and gender by themselves were easy to get:
{
"query": {
"match_all": {}
},
"facets": {
"ages": {
"terms": {
"field": "age_range",
"size": 20
}
},
"gender_by_age": {
"terms": {
"fields": [
"age_range",
"gender"
]
}
}
},
"size": 0
}
which gives:
{
"ages": {
"_type": "terms",
"missing": 0,
"total": 193961,
"other": 0,
"terms": [
{
"term": 0,
"count": 162643
},
{
"term": 3,
"count": 10683
},
{
"term": 4,
"count": 8931
},
{
"term": 5,
"count": 4690
},
{
"term": 6,
"count": 3647
},
{
"term": 2,
"count": 3247
},
{
"term": 1,
"count": 120
}
]
},
"total_gender": {
"_type": "terms",
"missing": 0,
"total": 193961,
"other": 0,
"terms": [
{
"term": 1,
"count": 94799
},
{
"term": 2,
"count": 62645
},
{
"term": 0,
"count": 36517
}
]
}
}
But now I need something that looks like this:
[breakdown_gender] => Array
(
[1] => Array
(
[0] => 264
[1] => 1
[2] => 6
[3] => 67
[4] => 72
[5] => 40
[6] => 23
)
[2] => Array
(
[0] => 153
[2] => 2
[3] => 21
[4] => 35
[5] => 22
[6] => 11
)
)
Please note that 0,1,2,3,4,5,6 are "mappings" for the age ranges so they actually mean something :) and not just numbers. e.g. Gender[1] (which is "male") breaks down into age range [0] (which is "under 18") with a count of 264.
Starting from version 1.0 of ElasticSearch, the new aggregations API allows grouping by multiple fields, using sub-aggregations. Suppose you want to group by fields field1, field2 and field3:
{
"aggs": {
"agg1": {
"terms": {
"field": "field1"
},
"aggs": {
"agg2": {
"terms": {
"field": "field2"
},
"aggs": {
"agg3": {
"terms": {
"field": "field3"
}
}
}
}
}
}
}
}
Of course this can go on for as many fields as you'd like.
Update:
For completeness, here is how the output of the above query looks. Also below is python code for generating the aggregation query and flattening the result into a list of dictionaries.
{
"aggregations": {
"agg1": {
"buckets": [{
"doc_count": <count>,
"key": <value of field1>,
"agg2": {
"buckets": [{
"doc_count": <count>,
"key": <value of field2>,
"agg3": {
"buckets": [{
"doc_count": <count>,
"key": <value of field3>
},
{
"doc_count": <count>,
"key": <value of field3>
}, ...
]
},
{
"doc_count": <count>,
"key": <value of field2>,
"agg3": {
"buckets": [{
"doc_count": <count>,
"key": <value of field3>
},
{
"doc_count": <count>,
"key": <value of field3>
}, ...
]
}, ...
]
},
{
"doc_count": <count>,
"key": <value of field1>,
"agg2": {
"buckets": [{
"doc_count": <count>,
"key": <value of field2>,
"agg3": {
"buckets": [{
"doc_count": <count>,
"key": <value of field3>
},
{
"doc_count": <count>,
"key": <value of field3>
}, ...
]
},
{
"doc_count": <count>,
"key": <value of field2>,
"agg3": {
"buckets": [{
"doc_count": <count>,
"key": <value of field3>
},
{
"doc_count": <count>,
"key": <value of field3>
}, ...
]
}, ...
]
}, ...
]
}
}
}
The following Python code performs the group-by given the list of fields. If you specify include_missing=True, it also includes combinations of values where some of the fields are missing (you don't need it if you have version 2.0 of Elasticsearch thanks to this).
def group_by(es, fields, include_missing):
    """Run a multi-field "group by" as nested Elasticsearch aggregations.

    Builds one ``terms`` aggregation per entry of ``fields``, each nested
    under the previous one, sends the request through ``es.search`` and
    flattens the response with ``get_docs_from_agg_result``.

    Args:
        es: Object exposing ``search(body=...)`` whose return value has an
            ``'aggregations'`` key (presumably an elasticsearch-py client;
            verify against the caller).
        fields: Non-empty sequence of field names, outermost grouping first.
        include_missing: When true, a ``missing`` aggregation named
            ``<field>_missing`` is added next to every ``terms`` aggregation
            so that documents lacking a field are still counted.

    Returns:
        Whatever ``get_docs_from_agg_result`` returns for the response.
    """
    current_level_terms = {'terms': {'field': fields[0]}}
    agg_spec = {fields[0]: current_level_terms}

    if include_missing:
        current_level_missing = {'missing': {'field': fields[0]}}
        agg_spec[fields[0] + '_missing'] = current_level_missing

    for field in fields[1:]:
        next_level_terms = {'terms': {'field': field}}
        current_level_terms['aggs'] = {
            field: next_level_terms,
        }

        if include_missing:
            next_level_missing = {'missing': {'field': field}}
            current_level_terms['aggs'][field + '_missing'] = next_level_missing
            # The "missing" branch of the previous level gets the same two
            # children as its "terms" branch. The dicts are shared on purpose:
            # when the next iteration attaches deeper levels to them, both
            # parents see the addition.
            current_level_missing['aggs'] = {
                field: next_level_terms,
                field + '_missing': next_level_missing,
            }
            current_level_missing = next_level_missing

        current_level_terms = next_level_terms

    agg_result = es.search(body={'aggs': agg_spec})['aggregations']
    return get_docs_from_agg_result(agg_result, fields, include_missing)
def get_docs_from_agg_result(agg_result, fields, include_missing):
    """Flatten a nested aggregation result into a list of flat records.

    Args:
        agg_result: Mapping that contains, under ``fields[0]``, a dict with
            a ``'buckets'`` list and, when ``include_missing`` is true, a
            ``fields[0] + '_missing'`` entry. Each bucket carries
            ``'doc_count'``, optionally ``'key'``, and (for all but the last
            field) the same structure for the next field.
        fields: Non-empty sequence of field names, outermost grouping first.
        include_missing: When true, the ``<field>_missing`` entry of every
            level is treated as one more bucket; it has no ``'key'``, so its
            records get ``None`` for that field.

    Returns:
        A list of dicts, one per combination of values at the innermost
        level with ``doc_count > 0``. Each dict maps every field name to the
        bucket key at that level and ``'doc_count'`` to the innermost count.
    """
    current_field = fields[0]
    # Work on a copy: appending the "missing" bucket to the list inside
    # ``agg_result`` would modify the caller's response, and a second call
    # on the same response would then see the bucket twice.
    buckets = list(agg_result[current_field]['buckets'])
    if include_missing:
        buckets.append(agg_result[current_field + '_missing'])

    if len(fields) == 1:
        return [
            {
                current_field: bucket.get('key'),
                'doc_count': bucket['doc_count'],
            }
            for bucket in buckets if bucket['doc_count'] > 0
        ]

    result = []
    for bucket in buckets:
        records = get_docs_from_agg_result(bucket, fields[1:], include_missing)
        value = bucket.get('key')
        for record in records:
            record[current_field] = value
        result.extend(records)
    return result
As you only have 2 fields a simple way is doing two queries with single facets. For Male:
{
"query" : {
"term" : { "gender" : "Male" }
},
"facets" : {
"age_range" : {
"terms" : {
"field" : "age_range"
}
}
}
}
And for female:
{
"query" : {
"term" : { "gender" : "Female" }
},
"facets" : {
"age_range" : {
"terms" : {
"field" : "age_range"
}
}
}
}
Or you can do it in a single query with a facet filter (see this link for further information)
{
"query" : {
"match_all": {}
},
"facets" : {
"age_range_male" : {
"terms" : {
"field" : "age_range"
},
"facet_filter":{
"term": {
"gender": "Male"
}
}
},
"age_range_female" : {
"terms" : {
"field" : "age_range"
},
"facet_filter":{
"term": {
"gender": "Female"
}
}
}
}
}
Update:
As facets are about to be removed, this is the solution with aggregations:
{
"query": {
"match_all": {}
},
"aggs": {
"male": {
"filter": {
"term": {
"gender": "Male"
}
},
"aggs": {
"age_range": {
"terms": {
"field": "age_range"
}
}
}
},
"female": {
"filter": {
"term": {
"gender": "Female"
}
},
"aggs": {
"age_range": {
"terms": {
"field": "age_range"
}
}
}
}
}
}
I know, it doesn't answer the question, but I found this page while looking for a way to do multi terms aggregation. Finally, found info about this functionality in the documentation. Maybe it will help somebody...
multi_terms aggregation:
"aggs": {
"lat_lng": {
"multi_terms": {
"terms": [{
"field": "lat"
},{
"field": "lng"
}]
}
}
}
The result will be something close to
...
{
"key" : [
"43.00861889999999",
"-78.8186202"
],
"key_as_string" : "43.00861889999999|-78.8186202",
"doc_count" : 6
},
...
I tried grouping profiles by organization yearly revenue, with each revenue bucket's count then further broken down by industry, using the following query.
Example:
{
"size": 0,
"aggs": {
"categories": {
"filter": {
"exists": {
"field": "organization_industries"
}
},
"aggs": {
"names": {
"terms": {
"field": "organization_revenue_in_thousands_int.keyword",
"size": 200,
"order": {
"_key": "desc"
}
},
"aggs": {
"industry_stats": {
"terms": {
"field": "organization_industries.keyword"
}
}
}
}
}
}
}
}
Output:
"aggregations": {
"categories": {
"doc_count": 195161605,
"names": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 19226983,
"buckets": [
{
"key": "99900",
"doc_count": 1742,
"industry_stats": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "internet",
"doc_count": 1605
},
{
"key": "investment management",
"doc_count": 81
},
{
"key": "biotechnology",
"doc_count": 54
},
{
"key": "computer & network security",
"doc_count": 2
}
]
}
},
{
"key": "998000",
"doc_count": 71,
"industry_stats": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "finance",
"doc_count": 48
},
{
"key": "information technology & services",
"doc_count": 23
}
]
}
}
]
}
}
}