Elasticsearch group by multiple fields

The closest thing I've found was: Multiple group-by in Elasticsearch
Basically, I'm trying to get the ES equivalent of the following MySQL query:
SELECT gender, age_range, COUNT(DISTINCT profile_id) AS count
FROM TABLE
GROUP BY age_range, gender
The age and gender by themselves were easy to get:
{
  "query": {
    "match_all": {}
  },
  "facets": {
    "ages": {
      "terms": {
        "field": "age_range",
        "size": 20
      }
    },
    "gender_by_age": {
      "terms": {
        "fields": [
          "age_range",
          "gender"
        ]
      }
    }
  },
  "size": 0
}
which gives:
{
  "ages": {
    "_type": "terms",
    "missing": 0,
    "total": 193961,
    "other": 0,
    "terms": [
      { "term": 0, "count": 162643 },
      { "term": 3, "count": 10683 },
      { "term": 4, "count": 8931 },
      { "term": 5, "count": 4690 },
      { "term": 6, "count": 3647 },
      { "term": 2, "count": 3247 },
      { "term": 1, "count": 120 }
    ]
  },
  "total_gender": {
    "_type": "terms",
    "missing": 0,
    "total": 193961,
    "other": 0,
    "terms": [
      { "term": 1, "count": 94799 },
      { "term": 2, "count": 62645 },
      { "term": 0, "count": 36517 }
    ]
  }
}
But now I need something that looks like this:
[breakdown_gender] => Array
(
    [1] => Array
        (
            [0] => 264
            [1] => 1
            [2] => 6
            [3] => 67
            [4] => 72
            [5] => 40
            [6] => 23
        )
    [2] => Array
        (
            [0] => 153
            [2] => 2
            [3] => 21
            [4] => 35
            [5] => 22
            [6] => 11
        )
)
Please note that 0, 1, 2, 3, 4, 5, 6 are "mappings" for the age ranges, so they actually mean something :) and are not just numbers. For example, gender [1] (which is "male") breaks down into age range [0] (which is "under 18") with a count of 264.

Starting with version 1.0 of Elasticsearch, the new aggregations API allows grouping by multiple fields using sub-aggregations. Suppose you want to group by fields field1, field2 and field3:
{
  "aggs": {
    "agg1": {
      "terms": { "field": "field1" },
      "aggs": {
        "agg2": {
          "terms": { "field": "field2" },
          "aggs": {
            "agg3": {
              "terms": { "field": "field3" }
            }
          }
        }
      }
    }
  }
}
Of course this can go on for as many fields as you'd like.
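Applied to the question's fields, a minimal sketch using the elasticsearch-py client could look like the following. The index name "profiles" is a placeholder, and the cardinality sub-aggregation (Elasticsearch's approximate equivalent of COUNT(DISTINCT ...)) is an assumption about how the original count(distinct profile_id) should map:

from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumes a locally running cluster

# Two-level grouping: gender -> age_range, with an approximate
# distinct count of profile_id inside each inner bucket.
body = {
    "size": 0,
    "aggs": {
        "genders": {
            "terms": {"field": "gender"},
            "aggs": {
                "ages": {
                    "terms": {"field": "age_range", "size": 20},
                    "aggs": {
                        "distinct_profiles": {
                            "cardinality": {"field": "profile_id"}
                        }
                    }
                }
            }
        }
    }
}

response = es.search(index="profiles", body=body)  # "profiles" is a placeholder
for g in response["aggregations"]["genders"]["buckets"]:
    for a in g["ages"]["buckets"]:
        print(g["key"], a["key"], a["distinct_profiles"]["value"])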
Update:
For completeness, here is how the output of the above query looks. Below is also Python code for generating the aggregation query and flattening the result into a list of dictionaries.
{
  "aggregations": {
    "agg1": {
      "buckets": [
        {
          "doc_count": <count>,
          "key": <value of field1>,
          "agg2": {
            "buckets": [
              {
                "doc_count": <count>,
                "key": <value of field2>,
                "agg3": {
                  "buckets": [
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    },
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    },
                    ...
                  ]
                }
              },
              {
                "doc_count": <count>,
                "key": <value of field2>,
                "agg3": {
                  "buckets": [ ... ]
                }
              },
              ...
            ]
          }
        },
        {
          "doc_count": <count>,
          "key": <value of field1>,
          "agg2": {
            "buckets": [ ... ]
          }
        },
        ...
      ]
    }
  }
}
The following Python code performs the group-by given the list of fields. If you specify include_missing=True, it also includes combinations of values where some of the fields are missing (you don't need this if you are on Elasticsearch 2.0 or later, thanks to the terms aggregation's support for missing values).
def group_by(es, fields, include_missing):
    current_level_terms = {'terms': {'field': fields[0]}}
    agg_spec = {fields[0]: current_level_terms}

    if include_missing:
        current_level_missing = {'missing': {'field': fields[0]}}
        agg_spec[fields[0] + '_missing'] = current_level_missing

    # Chain one terms (and optionally one missing) aggregation per field.
    for field in fields[1:]:
        next_level_terms = {'terms': {'field': field}}
        current_level_terms['aggs'] = {
            field: next_level_terms,
        }

        if include_missing:
            next_level_missing = {'missing': {'field': field}}
            current_level_terms['aggs'][field + '_missing'] = next_level_missing
            current_level_missing['aggs'] = {
                field: next_level_terms,
                field + '_missing': next_level_missing,
            }
            current_level_missing = next_level_missing

        current_level_terms = next_level_terms

    agg_result = es.search(body={'aggs': agg_spec})['aggregations']
    return get_docs_from_agg_result(agg_result, fields, include_missing)


def get_docs_from_agg_result(agg_result, fields, include_missing):
    """Recursively flatten nested buckets into a list of flat dicts."""
    current_field = fields[0]
    buckets = agg_result[current_field]['buckets']
    if include_missing:
        buckets.append(agg_result[current_field + '_missing'])

    if len(fields) == 1:
        return [
            {
                current_field: bucket.get('key'),
                'doc_count': bucket['doc_count'],
            }
            for bucket in buckets if bucket['doc_count'] > 0
        ]

    result = []
    for bucket in buckets:
        records = get_docs_from_agg_result(bucket, fields[1:], include_missing)
        value = bucket.get('key')
        for record in records:
            record[current_field] = value
        result.extend(records)
    return result
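For example, a hypothetical call for the question's two fields (assuming an elasticsearch-py client named es) would look like this; the flattened records carry one key per field plus the bucket's doc_count:

from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumes a locally running cluster
docs = group_by(es, ['gender', 'age_range'], include_missing=False)

# docs is a flat list of dicts, e.g. (values taken from the question):
# [{'gender': 1, 'age_range': 0, 'doc_count': 264},
#  {'gender': 1, 'age_range': 3, 'doc_count': 67},
#  ...]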

As you only have two fields, a simple way is to run two queries with single facets. For male:
{
  "query": {
    "term": { "gender": "Male" }
  },
  "facets": {
    "age_range": {
      "terms": { "field": "age_range" }
    }
  }
}
And for female:
{
  "query": {
    "term": { "gender": "Female" }
  },
  "facets": {
    "age_range": {
      "terms": { "field": "age_range" }
    }
  }
}
Or you can do it in a single query with a facet filter (see this link for further information):
{
  "query": {
    "match_all": {}
  },
  "facets": {
    "age_range_male": {
      "terms": { "field": "age_range" },
      "facet_filter": {
        "term": { "gender": "Male" }
      }
    },
    "age_range_female": {
      "terms": { "field": "age_range" },
      "facet_filter": {
        "term": { "gender": "Female" }
      }
    }
  }
}
Update:
As facets are about to be removed, this is the solution with aggregations:
{
  "query": {
    "match_all": {}
  },
  "aggs": {
    "male": {
      "filter": {
        "term": { "gender": "Male" }
      },
      "aggs": {
        "age_range": {
          "terms": { "field": "age_range" }
        }
      }
    },
    "female": {
      "filter": {
        "term": { "gender": "Female" }
      },
      "aggs": {
        "age_range": {
          "terms": { "field": "age_range" }
        }
      }
    }
  }
}
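Getting from that response to the breakdown_gender structure in the question is then a small client-side step. A minimal sketch, assuming resp holds the parsed JSON response of the aggregation query above:

# Reshape the response into {gender: {age_range: count}}, mirroring
# the breakdown_gender array from the question.
breakdown = {}
for gender in ("male", "female"):
    buckets = resp["aggregations"][gender]["age_range"]["buckets"]
    breakdown[gender] = {b["key"]: b["doc_count"] for b in buckets}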

I know it doesn't answer the question directly, but I found this page while looking for a way to do a multi-terms aggregation, and finally found info about this functionality in the documentation. Maybe it will help somebody...
multi_terms aggregation:
"aggs": {
"lat_lng": {
"multi_terms": {
"terms": [{
"field": "lat"
},{
"field": "lng"
}]
}
}
}
The result will be something close to:
...
{
  "key": [
    "43.00861889999999",
    "-78.8186202"
  ],
  "key_as_string": "43.00861889999999|-78.8186202",
  "doc_count": 6
},
...
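Since each bucket's key comes back as a list of the component values, flattening the result is straightforward. A sketch, assuming resp is the parsed search response:

# Map each compound key (as a tuple) to its document count.
counts = {
    tuple(bucket["key"]): bucket["doc_count"]
    for bucket in resp["aggregations"]["lat_lng"]["buckets"]
}
# e.g. counts[("43.00861889999999", "-78.8186202")] == 6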

I tried grouping profiles by organization yearly revenue, with the count then further distributed among industries, using the following query.
Example:
{
  "size": 0,
  "aggs": {
    "categories": {
      "filter": {
        "exists": { "field": "organization_industries" }
      },
      "aggs": {
        "names": {
          "terms": {
            "field": "organization_revenue_in_thousands_int.keyword",
            "size": 200,
            "order": { "_key": "desc" }
          },
          "aggs": {
            "industry_stats": {
              "terms": { "field": "organization_industries.keyword" }
            }
          }
        }
      }
    }
  }
}
Output:
"aggregations": {
"categories": {
"doc_count": 195161605,
"names": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 19226983,
"buckets": [
{
"key": "99900",
"doc_count": 1742,
"industry_stats": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "internet",
"doc_count": 1605
},
{
"key": "investment management",
"doc_count": 81
},
{
"key": "biotechnology",
"doc_count": 54
},
{
"key": "computer & network security",
"doc_count": 2
}
]
}
},
{
"key": "998000",
"doc_count": 71,
"industry_stats": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "finance",
"doc_count": 48
},
{
"key": "information technology & services",
"doc_count": 23
}
]
}
}
}
]
}
}
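To consume this, the nested buckets flatten naturally into (revenue, industry, count) rows. A sketch, assuming resp is the parsed search response:

# One row per (revenue bucket, industry bucket) pair.
rows = [
    (rev["key"], ind["key"], ind["doc_count"])
    for rev in resp["aggregations"]["categories"]["names"]["buckets"]
    for ind in rev["industry_stats"]["buckets"]
]
# e.g. ("99900", "internet", 1605)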

Related

how to update nested doc in mongo

I need to batch update cdi_tags keyed by md5 (push an item into cdi_tags) as follows:
db.getCollection("event").update({
    "_id": {
        $in: [ObjectId("6368f03e21b1ad246c84d67b"), ObjectId("6368f03f21b1ad246c84d982")]
    },
    "meta": {
        $elemMatch: {
            "key": "bags",
            "value"
        }
    }
}, {
    $addToSet: {
        "meta.$.value.$[t].cdi_tags": "test_tag"
    }
}, {
    arrayFilters: [{
        "t": {
            $in: ["cc09ab29db36f85e154d2c1ae9517f57", "b6b9c266f584191b6eb2d2659948a7a9"]
        }
    }],
    multi: true
})
but it does not work. My doc is as follows:
{
    "_id": ObjectId("6368f03f21b1ad246c84d982"),
    "event_key": "PLAA7-194710",
    "data_source": "EAP",
    "name": "EP33L-AA94710",
    "production": "CP",
    "meta": [
        {
            "key": "auto_note",
            "value": []
        },
        {
            "key": "bags",
            "value": {
                "cc09ab29db36f85e154d2c1ae9517f57": {
                    "name": "PLAA63952_event_ftp_pcar_event_20221107-194709_0.bag",
                    "profile": "msquare-prediction-ro",
                    "md5": "cc09ab29db36f85e154d2c1ae9517f57",
                    "cdi_tags": [
                        "FDI_V1_0_0",
                        "from_dpp",
                        "epl-no-sensor",
                        "with_f100",
                        "epl-fix_video",
                        "epl-fix_horizon"
                    ]
                },
                "361f5160cca3c3dec90cbbf93e3d7ae3": {
                    "name": "PLAA63952_event_ftp_pcar_event_20221107-194709_0.bag",
                    "profile": "msquare-prediction-ro",
                    "md5": "361f5160cca3c3dec90cbbf93e3d7ae3",
                    "cdi_tags": [
                        "FDI_V1_0_0",
                        "china",
                        "ftp_epcar_rawdata",
                        "from_dpp",
                        "trigger_type:system"
                    ]
                }
            }
        }
    ]
}
Thanks.
How can I batch update a nested doc in Mongo?

Elasticsearch: Class Cast Exception Scala API

I have been using ES 5.6 and the aggregation queries were working fine. Recently, we upgraded our ES to 7.1 and it has resulted in a ClassCastException for one of the queries. I'm posting the ES index mapping along with the Scala code and the ES query that results in the exception.
Mapping:
{
  "orgs": {
    "mappings": {
      "org": {
        "properties": {
          "people": {
            "type": "nested",
            "properties": {
              "email": { "type": "keyword" },
              "first_name": {
                "type": "text",
                "fields": {
                  "raw": { "type": "keyword" }
                }
              },
              "last_name": {
                "type": "text",
                "fields": {
                  "raw": { "type": "keyword" }
                }
              },
              "pcsi": { "type": "keyword" },
              "position": {
                "type": "text",
                "fields": {
                  "raw": { "type": "keyword" }
                }
              },
              "position_type": { "type": "keyword" },
              "source_guid": { "type": "keyword" },
              "source_lni": { "type": "keyword" },
              "suffix": { "type": "keyword" }
            }
          }
        }
      }
    }
  }
}
Scala Query:
baseQuery.aggs(nestedAggregation("people", OrganizationSchema.People)
.subAggregations(termsAgg("positiontype", "people.position_type")))
Elastic Query:
{"query":{"term":{"_id":{"value":"id"}}},"aggs":{"people":{"nested":{"path":"people"},"aggs":{"positiontype":{"terms":{"field":"people.position_type"}}}}}}
response:
{
  "took": 0,
  "timed_out": false,
  "_shards": {
    "total": 6,
    "successful": 6,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "people": {
      "doc_count": 52,
      "positiontype": {
        "doc_count_error_upper_bound": 0,
        "sum_other_doc_count": 0,
        "buckets": [
          { "key": "Board Member", "doc_count": 28 },
          { "key": "Executive", "doc_count": 22 },
          { "key": "Others", "doc_count": 2 }
        ]
      }
    }
  }
}
Scala code:
def getOrganizationPeopleFilters(client: ElasticClient, entityType: String, entityId: String,
                                 request: Option[PostFilterApiRequest], baseQuery: SearchRequest): IO[PostFilters] = {
  val q = baseQuery.aggs(nestedAggregation("people", OrganizationSchema.People)
    .subAggregations(termsAgg("positiontype", "people.position_type")))
  client.execute {
    q
  }.flatMap { res ⇒
    esToJsonOrganizationPeopleFilters(res.result)
  }
}
The ES query runs and aggregates correctly in Kibana. But when we try to flatMap the response in the Scala API code above, it results in a ClassCastException (java.lang.ClassCastException: scala.collection.immutable.Map$Map2 cannot be cast to java.lang.Integer).

How to filter unique value in REST API

I want to filter unique IPs regardless of whether they appear in DST_Local_IP or SRC_Local_IP.
This is my REST API query:
{
  "size": 0,
  "query": {
    "bool": {
      "should": [
        { "match": { "IPV4_DST_ADDR": "120.127.0.0/16" } },
        { "match": { "IPV4_SRC_ADDR": "120.127.0.0/16" } },
        {
          "range": {
            "LAST_SWITCHED": { "gte": 0 }
          }
        }
      ],
      "minimum_should_match": 2
    }
  },
  "aggs": {
    "DST_Local_IP": {
      "filter": {
        "bool": {
          "filter": {
            "match": { "IPV4_DST_ADDR": "120.127.0.0/16" }
          }
        }
      },
      "aggs": {
        "dst_local_ip": {
          "terms": {
            "field": "IPV4_DST_ADDR",
            "size": 10000
          }
        }
      }
    },
    "SRC_Local_IP": {
      "filter": {
        "bool": {
          "filter": {
            "match": { "IPV4_SRC_ADDR": "120.127.0.0/16" }
          }
        }
      },
      "aggs": {
        "src_local_ip": {
          "terms": {
            "field": "IPV4_SRC_ADDR",
            "size": 10000
          }
        }
      }
    }
  }
}
response:
"aggregations": {
"SRC_Local_IP": {
"doc_count": 48287688,
"src_local_ip": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "120.127.160.8",
"doc_count": 6890185
},
{
"key": "120.127.160.77",
"doc_count": 3791683
},
{
"key": "120.127.160.65",
"doc_count": 1646648
},
{
"key": "120.127.160.42",
"doc_count": 1058027
}
.
.
.
"DST_Local_IP": {
"doc_count": 36696216,
"dst_local_ip": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "120.127.160.8",
"doc_count": 2762815
},
{
"key": "120.127.160.99",
"doc_count": 1344110
}
I want the returned values to be distinct, because an IP in DST_Local_IP may also appear in SRC_Local_IP; I just want the unique IPs, regardless of whether they come from DST_Local_IP or SRC_Local_IP.
How can I do this? Could you give me some ideas? :)
Thank you in advance!

complex elasticsearch query over rest

Using REST from Classic ASP, I am able to retrieve data from an index when I perform a simple search such as:
Set xml = Server.CreateObject("Microsoft.XMLHTTP")
xml.Open "GET", "http://elastic:changeme@10.128.128.109:9200/myindex/_search?q='fred blogs',_source='path.real'", False
I now want to retrieve data from a much more complex query that, if I were using Kibana, would look something like this:
GET newsindex/_search
{
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "should": [
            { "term": { "headline": "brown" } },
            { "term": { "bodytext": "brown" } },
            { "term": { "headline": "fox" } },
            { "term": { "bodytext": "fox" } }
          ]
        }
      },
      "functions": [
        { "filter": { "match": { "headline": "brown fox" } }, "weight": 2 },
        { "filter": { "match": { "bodytext": "brown fox" } }, "weight": 1 },
        { "filter": { "match": { "bodytext": "fox" } }, "weight": 3 },
        { "filter": { "match": { "bodytext": "brown" } }, "weight": 3 },
        { "filter": { "match": { "headline": "brown" } }, "weight": 4 },
        { "filter": { "match": { "headline": "fox" } }, "weight": 4 }
      ],
      "score_mode": "sum"
    }
  },
  "sort": [
    { "_score": { "order": "asc" } }
  ],
  "_source": [ "headline", "bodytext" ]
}
How can I pass this query using REST? I'll be building queries like this on the fly, so I need to get the construct right.
Using the curl command, you can do something like this:
curl -XGET "http://elastic:changeme@10.128.128.109:9200/myindex/_search" -H 'Content-Type: application/json' -d'
{
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "should": [
            { "term": { "headline": "brown" } },
            { "term": { "bodytext": "brown" } },
            { "term": { "headline": "fox" } },
            { "term": { "bodytext": "fox" } }
          ]
        }
      },
      "functions": [
        { "filter": { "match": { "headline": "brown fox" } }, "weight": 2 },
        { "filter": { "match": { "bodytext": "brown fox" } }, "weight": 1 },
        { "filter": { "match": { "bodytext": "fox" } }, "weight": 3 },
        { "filter": { "match": { "bodytext": "brown" } }, "weight": 3 },
        { "filter": { "match": { "headline": "brown" } }, "weight": 4 },
        { "filter": { "match": { "headline": "fox" } }, "weight": 4 }
      ],
      "score_mode": "sum"
    }
  },
  "sort": [
    { "_score": { "order": "asc" } }
  ],
  "_source": [ "headline", "bodytext" ]
}'
You can send your query as data by using curl. I'm sure the same can also be achieved with your REST client; just check how it sends a request body.
This post might help. Replace the DataToSend variable in the example with your query and see if it works.
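For comparison, here is the same send-the-query-as-body pattern in Python with the requests library (a sketch; host and credentials are taken from the question, and the body is abbreviated to a match_all placeholder):

import requests

# Elasticsearch accepts a JSON body on GET _search requests.
query = {
    "query": {"match_all": {}},  # replace with the function_score query above
    "_source": ["headline", "bodytext"],
}
resp = requests.get(
    "http://10.128.128.109:9200/myindex/_search",
    auth=("elastic", "changeme"),
    json=query,
)
print(resp.json())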

MongoDB, grouping inner Hash key value by another key value

I have these 4 elements in my collection:
/* 1 */
{
    "demographics": [
        { "key": "country", "value": "ES" },
        { "key": "city", "value": "Sevilla" },
        { "key": "region", "value": "Andalucía" }
    ]
}
/* 2 */
{
    "demographics": [
        { "key": "city", "value": "Cádiz" },
        { "key": "country", "value": "ES" },
        { "key": "region", "value": "Andalucía" }
    ]
}
/* 3 */
{
    "demographics": [
        { "key": "country", "value": "GB" },
        { "key": "region", "value": "Greater London" },
        { "key": "city", "value": "London" }
    ]
}
/* 4 */
{
    "demographics": [
        { "key": "country", "value": "ES" },
        { "key": "region", "value": "Andalucía" },
        { "key": "city", "value": "Sevilla" }
    ]
}
I would like to group them by:
demographic.value when demographic.key = "country"
demographic.value when demographic.key = "region"
demographic.value when demographic.key = "city"
Having a result like this:
{ "values": ["ES", "Andalucía", "Sevilla"], "count": 2 }
{ "values": ["ES", "Andalucía", "Cádiz"], "count": 1 }
{ "values": ["GB", "Greater London", "London"], "count": 1 }
Attention: beware, the order of the elements in the demographics array might not always be the same.
I have tried
db.getCollection('test').aggregate([
    { "$unwind": "$demographics" },
    {
        "$project": {
            "_id": 0,
            "demographics.key": 1,
            "demographics.value": 1
        }
    },
    {
        "$group": {
            "_id": {
                "key": "$demographics.key",
                "value": "$demographics.value"
            },
            "count": { "$sum": 1 }
        }
    },
    {
        "$group": {
            "_id": "$_id.key",
            "values": { "$push": { "value": "$_id.value", "count": "$count" } }
        }
    }
])
This gives me this result:
/* 1 */
{
    "_id": "country",
    "values": [
        { "value": "GB", "count": 1.0 },
        { "value": "ES", "count": 3.0 }
    ]
}
/* 2 */
{
    "_id": "region",
    "values": [
        { "value": "Greater London", "count": 1.0 },
        { "value": "Andalucía", "count": 3.0 }
    ]
}
/* 3 */
{
    "_id": "city",
    "values": [
        { "value": "London", "count": 1.0 },
        { "value": "Cádiz", "count": 1.0 },
        { "value": "Sevilla", "count": 2.0 }
    ]
}
But these are not the groups I am looking for.
You can try running the following pipeline (sorting the unwound elements first makes the order of the pushed values deterministic):
db.test.aggregate([
    { "$unwind": "$demographics" },
    { "$sort": { "demographics.key": 1, "demographics.value": 1 } },
    {
        "$group": {
            "_id": "$_id",
            "values": { "$push": "$demographics.value" }
        }
    },
    {
        "$group": {
            "_id": "$values",
            "count": { "$sum": 1 }
        }
    },
    {
        "$project": {
            "_id": 0, "values": "$_id", "count": 1
        }
    }
])
Sample Output
/* 1 */
{
    "count": 2,
    "values": [ "Sevilla", "ES", "Andalucía" ]
}
/* 2 */
{
    "count": 1,
    "values": [ "London", "GB", "Greater London" ]
}
/* 3 */
{
    "count": 1,
    "values": [ "Cádiz", "ES", "Andalucía" ]
}
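The same pipeline can be run from Python with pymongo; a minimal sketch, assuming a local MongoDB and a placeholder database name mydb:

from pymongo import MongoClient

client = MongoClient()          # assumes a local MongoDB instance
coll = client["mydb"]["test"]   # "mydb" is a placeholder database name

pipeline = [
    {"$unwind": "$demographics"},
    # Sorting the unwound elements by key makes the order of the
    # pushed values deterministic (city, country, region).
    {"$sort": {"demographics.key": 1, "demographics.value": 1}},
    {"$group": {"_id": "$_id", "values": {"$push": "$demographics.value"}}},
    {"$group": {"_id": "$values", "count": {"$sum": 1}}},
    {"$project": {"_id": 0, "values": "$_id", "count": 1}},
]
for doc in coll.aggregate(pipeline):
    print(doc)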