Configure monitor query with limitation on aggregation - opensearch

I am trying to configure a monitor that looks at data logged by cron jobs.
I want to trigger an alert if a job stops logging data.
The query using SQL looks something like this:
POST _plugins/_sql/
{
  "query": "SELECT instance, job-id, count(*), max(@timestamp) as newest FROM job-statistics-* WHERE @timestamp > '2022-09-28 00:00:00.000' GROUP BY job-id, instance HAVING newest < '2022-09-28 08:45:00.000'"
}
Using explain, I converted this to a JSON query and made the timestamp dynamic:
{
  "from": 0,
  "size": 0,
  "timeout": "1m",
  "query": {
    "range": {
      "@timestamp": {
        "from": "now-1h",
        "to": null,
        "include_lower": false,
        "include_upper": true,
        "boost": 1
      }
    }
  },
  "sort": [
    {
      "_doc": {
        "order": "asc"
      }
    }
  ],
  "aggregations": {
    "composite_buckets": {
      "composite": {
        "size": 1000,
        "sources": [
          {
            "job-id": {
              "terms": {
                "field": "job-id.keyword",
                "missing_bucket": true,
                "missing_order": "first",
                "order": "asc"
              }
            }
          },
          {
            "instance": {
              "terms": {
                "field": "instance.keyword",
                "missing_bucket": true,
                "missing_order": "first",
                "order": "asc"
              }
            }
          }
        ]
      },
      "aggregations": {
        "count(*)": {
          "value_count": {
            "field": "_index"
          }
        },
        "max(@timestamp)": {
          "max": {
            "field": "@timestamp"
          }
        }
      }
    }
  }
}
In this query, the condition on the aggregation max(@timestamp) is missing.
In the explain response it is here:
"name": "FilterOperator",
"description": {
"conditions": """<(max(#timestamp), cast_to_timestamp("2022-09-28 08:45:00.000"))"""
},
Ideally, this condition should be max(@timestamp) < now-30m.
My question:
How can I integrate this into the query or the monitor?
Is there another way to do this?
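One direction I am considering is to move the HAVING check out of the query and into the monitor's trigger condition, iterating over the returned buckets in Painless. Below is a rough, untested sketch; the exact shape of ctx.results and whether ctx.periodEnd arrives as epoch millis depend on the alerting plugin version, so treat those as assumptions:
// Untested sketch of a query-level monitor trigger condition.
// Assumes the JSON query above runs as the monitor's extraction query and
// that the search response is exposed to the trigger as ctx.results[0].
def buckets = ctx.results[0].aggregations.composite_buckets.buckets;

// 30 minutes before the end of the monitor period, in epoch millis.
// Assumption: ctx.periodEnd is epoch millis; if your plugin version exposes
// it as a date string instead, parse it first.
long threshold = ctx.periodEnd - 30L * 60L * 1000L;

for (def bucket : buckets) {
  // 'max(@timestamp)' is the sub-aggregation defined in the query above.
  if (bucket['max(@timestamp)'].value < threshold) {
    return true; // at least one job has stopped logging
  }
}
return false;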
Thanks a lot
Marius

Related

Cloudant database search index

I have JSON documents in Cloudant such as:
{
  "createdAt": "2022-10-26T09:16:29.472Z",
  "user_id": "4499c1c2-7507-4707-b0e4-ec83e2d2f34d",
  "_id": "606a4d591031c14a8c48fcb4a9541ff0"
}
{
  "createdAt": "2022-10-24T11:15:24.269Z",
  "user_id": "c4bdcb54-3d0a-4b6a-a8a9-aa12e45345f3",
  "_id": "fb24a15d8fb7cdf12feadac08e7c05dc"
}
{
  "createdAt": "2022-10-24T11:08:24.269Z",
  "user_id": "06d67681-e2c4-4ed4-b40a-5a2c5e7e6ed9",
  "_id": "2d277ec3dd8c33da7642b72722aa93ed"
}
I have created a JSON index as:
{
  "type": "json",
  "partitioned": false,
  "def": {
    "fields": [
      {
        "createdAt": "asc"
      },
      {
        "user_id": "asc"
      }
    ]
  }
}
I have created a text index as:
{
  "type": "text",
  "partitioned": false,
  "def": {
    "default_analyzer": "keyword",
    "default_field": {},
    "selector": {},
    "fields": [
      {
        "_id": "string"
      },
      {
        "createdAt": "string"
      },
      {
        "user_id": "string"
      }
    ],
    "index_array_lengths": true
  }
}
I have created a Cloudant Query selector:
{
  "selector": {
    "$and": [
      {
        "createdAt": {
          "$exists": true
        }
      },
      {
        "user_id": {
          "$exists": true
        }
      }
    ]
  },
  "fields": [
    "createdAt",
    "user_id",
    "_id"
  ],
  "sort": [
    {
      "createdAt": "desc"
    }
  ],
  "limit": 10,
  "skip": 0
}
This query works fine inside the Cloudant environment.
My problem is with the search index.
I created this index function, which works:
function (doc) {
  index("specialsearch", doc._id);
  if (doc.createdAt) {
    index("createdAt", doc.createdAt, {"store": true});
  }
  if (doc.user_id) {
    index("user_id", doc.user_id, {"store": true});
  }
}
and get this result from the following URL:
// https://[user]-bluemix.cloudant.com/[database]/_design/attributes/_search/by_all?q=*:*&counts=["createdAt"]&limit=2
{
  "total_rows": 10,
  "bookmark": "xxx",
  "rows": [
    {
      "id": "fb24a15d8fb7cdf12feadac08e7c05dc",
      "order": [
        1.0,
        0
      ],
      "fields": {
        "createdAt": "2022-10-24T11:15:24.269Z",
        "user_id": "c4bdcb54-3d0a-4b6a-a8a9-aa12e45345f3"
      }
    },
    {
      "id": "dad431735986bbf41b1fa3b1cd30cd0f",
      "order": [
        1.0,
        0
      ],
      "fields": {
        "createdAt": "2022-10-24T11:07:02.138Z",
        "user_id": "76f03307-4497-4a19-a647-8097fa288e77"
      }
    },
    {
      "id": "2d277ec3dd8c33da7642b72722aa93ed",
      "order": [
        1.0,
        0
      ],
      "fields": {
        "createdAt": "2022-10-24T11:08:24.269Z",
        "user_id": "06d67681-e2c4-4ed4-b40a-5a2c5e7e6ed9"
      }
    }
  ]
}
but it doesn't return the ids sorted by date on the createdAt and user_id keys.
What I would like is an ordered list on the createdAt and user_id keys without having to specify a value; a wildcard-type search.
Where am I wrong?
I have read several posts and guides but I did not understand how to do it.
Thanks for your help.
You say you want to return a list of id, createdAt and user_id, sorted by createdAt and user_id. And that you want all the documents returned.
If that is the case, what you need to do is simply create a MapReduce view of your data that emits the createdAt and user_id fields in that order, i.e.:
function (doc) {
  emit([doc.createdAt, doc.user_id], 1);
}
You don't need to include the document id because that comes for free.
You can then query the view by visiting the URL:
https://<URL>/<database>/_design/<ddoc_name>/_view/<view_name>
You will get all the docs like this:
{"total_rows":3,"offset":0,"rows":[
{"id":"2d277ec3dd8c33da7642b72722aa93ed","key":["2022-10-24T11:08:24.269Z","06d67681-e2c4-4ed4-b40a-5a2c5e7e6ed9"],"value":1},
{"id":"fb24a15d8fb7cdf12feadac08e7c05dc","key":["2022-10-24T11:15:24.269Z","c4bdcb54-3d0a-4b6a-a8a9-aa12e45345f3"],"value":1},
{"id":"606a4d591031c14a8c48fcb4a9541ff0","key":["2022-10-26T09:16:29.472Z","4499c1c2-7507-4707-b0e4-ec83e2d2f34d"],"value":1}
]}
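If you want the newest documents first, the view can be read in reverse with the standard CouchDB descending parameter (startkey/endkey work too, if you need to restrict the range), for example:
https://<URL>/<database>/_design/<ddoc_name>/_view/<view_name>?descending=true&limit=10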

Elasticsearch: Class Cast Exception Scala API

I have been using ES 5.6 and the aggregation queries are working fine. Recently, we upgraded our ES to 7.1 and it has resulted in a ClassCastException for one of the queries. I'm posting the ES index mapping along with the Scala code and ES query that is resulting in the exception.
Mapping:
{
  "orgs": {
    "mappings": {
      "org": {
        "properties": {
          "people": {
            "type": "nested",
            "properties": {
              "email": {
                "type": "keyword"
              },
              "first_name": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                }
              },
              "last_name": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                }
              },
              "pcsi": {
                "type": "keyword"
              },
              "position": {
                "type": "text",
                "fields": {
                  "raw": {
                    "type": "keyword"
                  }
                }
              },
              "position_type": {
                "type": "keyword"
              },
              "source_guid": {
                "type": "keyword"
              },
              "source_lni": {
                "type": "keyword"
              },
              "suffix": {
                "type": "keyword"
              }
            }
          }
        }
      }
    }
  }
}
Scala Query:
baseQuery.aggs(nestedAggregation("people", OrganizationSchema.People)
  .subAggregations(termsAgg("positiontype", "people.position_type")))
Elastic Query:
{"query":{"term":{"_id":{"value":"id"}}},"aggs":{"people":{"nested":{"path":"people"},"aggs":{"positiontype":{"terms":{"field":"people.position_type"}}}}}}
response:
{
  "took": 0,
  "timed_out": false,
  "_shards": {
    "total": 6,
    "successful": 6,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "people": {
      "doc_count": 52,
      "positiontype": {
        "doc_count_error_upper_bound": 0,
        "sum_other_doc_count": 0,
        "buckets": [
          {
            "key": "Board Member",
            "doc_count": 28
          },
          {
            "key": "Executive",
            "doc_count": 22
          },
          {
            "key": "Others",
            "doc_count": 2
          }
        ]
      }
    }
  }
}
Scala code:
def getOrganizationPeopleFilters(client: ElasticClient, entityType: String, entityId: String, request: Option[PostFilterApiRequest], baseQuery: SearchRequest): IO[PostFilters] = {
  val q = baseQuery.aggs(nestedAggregation("people", OrganizationSchema.People)
    .subAggregations(termsAgg("positiontype", "people.position_type")))
  client.execute {
    q
  }.flatMap { res ⇒
    esToJsonOrganizationPeopleFilters(res.result)
  }
}
The ES query runs and aggregates correctly in Kibana. But when we flatMap the response in the Scala API code above, it results in a ClassCastException (java.lang.ClassCastException: scala.collection.immutable.Map$Map2 cannot be cast to java.lang.Integer).
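One detail worth checking, assuming the Scala client is elastic4s or a similar response-parsing wrapper: ES 7 changed hits.total from a plain integer to an object, so a client built against the 5.x/6.x response format finds a two-entry map where it expects an integer, which matches the Map$Map2 in the exception. By default a 7.x cluster returns:
"hits": {
  "total": {
    "value": 1,
    "relation": "eq"
  },
  "max_score": 0,
  "hits": []
}
If that is the cause, upgrading the client to a release that targets ES 7, or sending rest_total_hits_as_int=true with the request, should make the response parseable again.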

Multifield wildcard search in ElasticSearch

Consider this very basic T-SQL query:
select * from Users
where FirstName like '%dm0e776467@mail.com%'
   or LastName like '%dm0e776467@mail.com%'
   or Email like '%dm0e776467@mail.com%'
How can I write this in Lucene?
I have tried the following:
The query way (does not work at all, no results):
{
  "query": {
    "bool": {
      "should": [
        {
          "wildcard": {
            "firstName": "*dm0e776467@mail.com*"
          }
        },
        {
          "wildcard": {
            "lastName": "*dm0e776467@mail.com*"
          }
        },
        {
          "wildcard": {
            "email": "*dm0e776467@mail.com*"
          }
        }
      ]
    }
  }
}
The multi_match way (returns anything where mail.com is present):
{
  "query": {
    "multi_match": {
      "query": "dm0e776467@mail.com",
      "fields": [
        "firstName",
        "lastName",
        "email"
      ]
    }
  }
}
A third attempt (returns the expected result, but if I only insert "mail", then no results are returned):
{
  "query": {
    "query_string": {
      "query": "\"dm0e776467@mail.com\"",
      "fields": [
        "firstName",
        "lastName",
        "email"
      ],
      "default_operator": "or",
      "allow_leading_wildcard": true
    }
  }
}
It seems to me that there is no way to force Elasticsearch to treat the input string as ONE substring?
The standard (default) analyzer will tokenize this email as follows:
GET _analyze
{
  "text": "dm0e776467@mail.com",
  "analyzer": "standard"
}
yielding
{
  "tokens": [
    {
      "token": "dm0e776467",
      ...
    },
    {
      "token": "mail.com",
      ...
    }
  ]
}
This explains why the multi-match works with any *mail.com suffix and why the wildcards are failing.
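As an aside, if the fields also carry .keyword sub-fields (an assumption here; the default dynamic mapping creates them), a leading-and-trailing wildcard against those sub-fields emulates the SQL LIKE directly, at the cost of a slow scan over all unique values:
GET users/_search
{
  "query": {
    "bool": {
      "should": [
        { "wildcard": { "email.keyword": "*dm0e776467@mail.com*" } },
        { "wildcard": { "firstName.keyword": "*dm0e776467@mail.com*" } },
        { "wildcard": { "lastName.keyword": "*dm0e776467@mail.com*" } }
      ]
    }
  }
}
That said, a proper email analyzer scales better.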
I suggest the following modifications to your mapping, inspired by this answer:
PUT users
{
  "settings": {
    "analysis": {
      "filter": {
        "email": {
          "type": "pattern_capture",
          "preserve_original": true,
          "patterns": [
            "([^@]+)",
            "(\\p{L}+)",
            "(\\d+)",
            "@(.+)",
            "([^-@]+)"
          ]
        }
      },
      "analyzer": {
        "email": {
          "tokenizer": "uax_url_email",
          "filter": [
            "email",
            "lowercase",
            "unique"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "email": {
        "type": "text",
        "analyzer": "email"
      },
      "firstName": {
        "type": "text",
        "fields": {
          "as_email": {
            "type": "text",
            "analyzer": "email"
          }
        }
      },
      "lastName": {
        "type": "text",
        "fields": {
          "as_email": {
            "type": "text",
            "analyzer": "email"
          }
        }
      }
    }
  }
}
Note that I've used .as_email fields on your first- & lastName fields -- you may not want to force them to be mapped as emails by default.
Then after indexing a few samples:
POST _bulk
{"index":{"_index":"users","_type":"_doc"}}
{"firstName":"abc","lastName":"adm0e776467@mail.coms","email":"dm0e776467@mail.com"}
{"index":{"_index":"users","_type":"_doc"}}
{"firstName":"xyz","lastName":"opr","email":"dm0e776467@mail.com"}
{"index":{"_index":"users","_type":"_doc"}}
{"firstName":"zyx","lastName":"dm0e776467@mail.com","email":"qwe"}
{"index":{"_index":"users","_type":"_doc"}}
{"firstName":"abc","lastName":"efg","email":"ijk"}
the wildcards are working perfectly fine:
GET users/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "wildcard": {
            "email": "*dm0e776467@mail.com*"
          }
        },
        {
          "wildcard": {
            "lastName.as_email": "*dm0e776467@mail.com*"
          }
        },
        {
          "wildcard": {
            "firstName.as_email": "*dm0e776467@mail.com*"
          }
        }
      ]
    }
  }
}
Do check how this tokenizer works under the hood to prevent 'surprising' query results:
GET users/_analyze
{
  "text": "dm0e776467@mail.com",
  "field": "email"
}

Need JOLT spec file for transfer of complex JSON

I have a complex JSON object (I've simplified it for this example) that I cannot figure out the JOLT transform JSON for. Does anybody have any ideas of what the JOLT spec file should be?
Original JSON
[
  {
    "date": {
      "isoDate": "2019-03-22"
    },
    "application": {
      "name": "SiebelProject"
    },
    "applicationResults": [
      {
        "reference": {
          "name": "Number of Code Lines"
        },
        "result": {
          "value": 44501
        }
      },
      {
        "reference": {
          "name": "Transferability"
        },
        "result": {
          "grade": 3.1889542208002064
        }
      }
    ]
  },
  {
    "date": {
      "isoDate": "2019-03-21"
    },
    "application": {
      "name": "SiebelProject"
    },
    "applicationResults": [
      {
        "reference": {
          "name": "Number of Code Lines"
        },
        "result": {
          "value": 45000
        }
      },
      {
        "reference": {
          "name": "Transferability"
        },
        "result": {
          "grade": 3.8
        }
      }
    ]
  }
]
Desired JSON after transformation and sorting by "Name" ASC, "Date" DESC
[
  {
    "Name": "SiebelProject",
    "Date": "2019-03-22",
    "Number of Code Lines": 44501,
    "Transferability": 3.1889542208002064
  },
  {
    "Name": "SiebelProject",
    "Date": "2019-03-21",
    "Number of Code Lines": 45000,
    "Transferability": 3.8
  }
]
I couldn't find a way to do the sort (I'm not even sure you can sort descending in JOLT) but here's a spec to do the transform:
[
  {
    "operation": "shift",
    "spec": {
      "*": {
        "date": {
          "isoDate": "[#3].Date"
        },
        "application": {
          "name": "[#3].Name"
        },
        "applicationResults": {
          "*": {
            "reference": {
              "name": {
                "Number of Code Lines": {
                  "@(3,result.value)": "[#7].Number of Code Lines"
                },
                "Transferability": {
                  "@(3,result.grade)": "[#7].Transferability"
                }
              }
            }
          }
        }
      }
    }
  }
]
After that there are some tools (like jq I think) that could do the sort.
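For instance, assuming the transformed array is saved as transformed.json, jq can produce the "Name" ASC, "Date" DESC ordering; group_by sorts by the grouping key, so only the dates need reversing within each group (ISO dates sort correctly as strings):
jq 'group_by(.Name) | map(sort_by(.Date) | reverse) | add' transformed.json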

Elasticsearch: Date Aggregation Most Recent

I have a query that works. It aggregates data based on Id and finds the MOST RECENT object based on the created field. The problem I have is that I would like to find the SECOND MOST RECENT instead of the MOST RECENT. How would I go about this? I have been looking all through the docs and all I can find is range, which doesn't help me too much. Thank you :)
{
  "query": {
    "match": {
      "name": "Robert"
    }
  },
  "aggs": {
    "previous": {
      "terms": {
        "field": "Id",
        "order": { "timeCreated": "desc" }
      },
      "aggs": {
        "timeCreated": {
          "max": { "field": "created" }
        }
      }
    }
  }
}
Top_hits is what you are looking for. Use this:
{
  "query": {
    "match": {
      "name": "A"
    }
  },
  "aggs": {
    "previous": {
      "terms": {
        "field": "Id"
      },
      "aggs": {
        "latestRecords": {
          "top_hits": {
            "sort": {
              "created": {
                "order": "desc"
              }
            },
            "size": 2
          }
        }
      }
    }
  }
}
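With "size": 2 in top_hits, the second most recent document for each Id is simply the second entry in latestRecords.hits.hits of each bucket in the response.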