Multifield wildcard search in ElasticSearch - tsql

Consider this very basic T-SQL query:
select * from Users
where FirstName like '%dm0e776467#mail.com%'
or LastName like '%dm0e776467#mail.com%'
or Email like '%dm0e776467#mail.com%'
How can I write this in Lucene?
I have tried the following:
The query way (does not work at all, no results):
{
"query": {
"bool": {
"should": [
{
"wildcard": {
"firstName": "dm0e776467#mail.com"
}
},
{
"wildcard": {
"lastName": "dm0e776467#mail.com"
}
},
{
"wildcard": {
"email": "dm0e776467#mail.com"
}
}
]
}
}
}
The Multimatch way (returns anything where mail.com is present)
{
"query": {
"multi_match": {
"query": "dm0e776467#mail.com",
"fields": [
"firstName",
"lastName",
"email"
]
}
}
}
A third attempt (returns expected result, but if I only insert "mail", then no results are returned)
{
"query": {
"query_string": {
"query": "\"dm0e776467#mail.com\"",
"fields": [
"firstName",
"lastName",
"email"
],
"default_operator": "or",
"allow_leading_wildcard": true
}
}
}
It seems to me that there is no way to force Elasticsearch to treat the input string as ONE substring?

The standard (default) analyzer will tokenize this email as follows:
GET _analyze
{
"text": "dm0e776467#mail.com",
"analyzer": "standard"
}
yielding
{
"tokens" : [
{
"token" : "dm0e776467",
...
},
{
"token" : "mail.com",
...
}
]
}
This explains why the multi-match works with any *mail.com suffix and why the wildcards are failing.
I suggest the following modifications to your mapping, inspired by this answer:
PUT users
{
"settings": {
"analysis": {
"filter": {
"email": {
"type": "pattern_capture",
"preserve_original": true,
"patterns": [
"([^#]+)",
"(\\p{L}+)",
"(\\d+)",
"#(.+)",
"([^-#]+)"
]
}
},
"analyzer": {
"email": {
"tokenizer": "uax_url_email",
"filter": [
"email",
"lowercase",
"unique"
]
}
}
}
},
"mappings": {
"properties": {
"email": {
"type": "text",
"analyzer": "email"
},
"firstName": {
"type": "text",
"fields": {
"as_email": {
"type": "text",
"analyzer": "email"
}
}
},
"lastName": {
"type": "text",
"fields": {
"as_email": {
"type": "text",
"analyzer": "email"
}
}
}
}
}
}
Note that I've used .as_email fields on your first- & lastName fields -- you may not want to force them to be mapped as emails by default.
Then after indexing a few samples:
POST _bulk
{"index":{"_index":"users","_type":"_doc"}}
{"firstName":"abc","lastName":"adm0e776467#mail.coms","email":"dm0e776467#mail.com"}
{"index":{"_index":"users","_type":"_doc"}}
{"firstName":"xyz","lastName":"opr","email":"dm0e776467#mail.com"}
{"index":{"_index":"users","_type":"_doc"}}
{"firstName":"zyx","lastName":"dm0e776467#mail.com","email":"qwe"}
{"index":{"_index":"users","_type":"_doc"}}
{"firstName":"abc","lastName":"efg","email":"ijk"}
the wildcards are working perfectly fine:
GET users/_search
{
"query": {
"bool": {
"should": [
{
"wildcard": {
"email": "dm0e776467#mail.com"
}
},
{
"wildcard": {
"lastName.as_email": "dm0e776467#mail.com"
}
},
{
"wildcard": {
"firstName.as_email": "dm0e776467#mail.com"
}
}
]
}
}
}
Do check how this tokenizer works under the hood to prevent 'surprising' query results:
GET users/_analyze
{
"text": "dm0e776467#mail.com",
"field": "email"
}

Related

ElasticSearch Wildcard Tokenizer for Emails

Suppose there are five email addresses stored under field "email":
1. {"email": "john_1#gmail.com"}
2. {"email": "john_2#gmail.com"}
3. {"email": "john_3#outlook.com"}
4. {"email": "john_4#outlook.com"}
5. {"email": "john_5#yahoo.com"}
When I try to search with the full email address I get the proper result, whereas if I try to search with a partial email it gives me no result.
For example, if I try to search only joh or john_, I get no results. However, if I try to search john_1 I am able to get the result. How do I get the wildcard result in this case?
PUT /test
{
"settings": {
"analysis": {
"filter": {
"email": {
"type": "pattern_capture",
"preserve_original": 1,
"patterns": [
"([^#]+)",
"(\\p{L}+)",
"(\\d+)",
"#(.+)",
"([^-#]+)"
]
}
},
"analyzer": {
"email": {
"tokenizer": "uax_url_email",
"filter": [
"email",
"lowercase",
"unique"
]
}
}
}
},
"mappings": {
"emails": {
"properties": {
"email": {
"type": "string",
"analyzer": "email",
"search_analyzer": "standard",
"fields": {
"raw": {
"type": "keyword"
}
}
}
}
}
}
}
Try using Wildcard Query.
Example:
{
"query": {
"wildcard": {
"email": {
"value": "joh*"
}
}
}
}

Configure monitor query with limitation on aggregation

I am trying to configure a monitor that looks at data logged by cron jobs.
I want to trigger an alert if a job does stop to log data.
The query using SQL looks something like this:
POST _plugins/_sql/
{
"query" : "SELECT instance, job-id, count(*), max(#timestamp) as newest FROM job-statistics-* where #timestamp > '2022-09-28 00:00:00.000' group BY job-id, instance HAVING newest < '2022-09-28 08:45:00.000'"
}
Using explain I converted this to a JSON query and made the timestamp dynamic:
{
"from": 0,
"size": 0,
"timeout": "1m",
"query": {
"range": {
"#timestamp": {
"from": "now-1h",
"to": null,
"include_lower": false,
"include_upper": true,
"boost": 1
}
}
},
"sort": [
{
"_doc": {
"order": "asc"
}
}
],
"aggregations": {
"composite_buckets": {
"composite": {
"size": 1000,
"sources": [
{
"job-id": {
"terms": {
"field": "job-id.keyword",
"missing_bucket": true,
"missing_order": "first",
"order": "asc"
}
}
},
{
"instance": {
"terms": {
"field": "instance.keyword",
"missing_bucket": true,
"missing_order": "first",
"order": "asc"
}
}
}
]
},
"aggregations": {
"count(*)": {
"value_count": {
"field": "_index"
}
},
"max(#timestamp)": {
"max": {
"field": "#timestamp"
}
}
}
}
}
}
From this query, the limitation on the aggregation max(#timestamp) is missing.
In the explain response it is here:
"name": "FilterOperator",
"description": {
"conditions": """<(max(#timestamp), cast_to_timestamp("2022-09-28 08:45:00.000"))"""
},
Ideally, this should be max(#timestamp) < now-30m
My question:
How can I integrate this into the query or the monitor?
Is there another way to do this?
Thanks a lot
Marius

JOLT - filtering array based on object value

how can I do this?
This is the array....
Can you please help me?
Can you please give me the answer???? Thanks a lot
{
"results": {
"data": [
{
"name": "xx",
"typeRelationship": [
{
"relationship": "parent",
"type": {
"id": "yyyyy"
}
}
],
"id": "xxxxxxxx"
},
{
"name": "yy",
"typeRelationship": [
{
"relationshipType": "parent",
"type": {
"id": "CCCC"
}
},
{
"relationshipType": "child",
"service": {
"id": "DDDD"
}
},
{
"relationshipType": "child",
"service": {
"id": "xxxxxxxx"
}
}
],
"id": "yyyyy"
}
]
}}
expected:
This is expected:
{
"data" : [ {
"id" : "xxxx",
"href" : "xxxxxx",
"relation":"parent"
} ]
}
For some reason I need to type so it does let me update!!!
This works.
[
{
"operation": "shift",
"spec": {
"data": {
"*": {
"type": {
"id": {
"xxxx": {
"#3": "data[]"
}
}
}
}
}
}
}
]
Edit 1
The below spec moves all the values which have id=xxxxx to the data array.
[
{
"operation": "shift",
"spec": {
"data": {
"*": {
"type": {
"*": {
"id": {
"xxxx": {
"#(2)": "data[]",
"#(4,relation)": "data[&3].relation"
}
}
}
}
}
}
}
}
]
This totally works.
Thanks.
Can you please let me know what is 2? 3? 4?
Because my array is a bit different and I want to fix those numbers, but it does not work...
{
"results": {
"data": [
{
"name": "xx",
"typeRelationship": [
{
"relationship": "parent",
"type": {
"id": "yyyyy"
}
}
],
"id": "xxxxxxxx"
},
{
"name": "yy",
"typeRelationship": [
{
"relationshipType": "parent",
"type": {
"id": "CCCC"
}
},
{
"relationshipType": "child",
"service": {
"id": "DDDD"
}
},
{
"relationshipType": "child",
"service": {
"id": "xxxxxxxx"
}
}
],
"id": "yyyyy"
}
]
}
}
expected:
{
"rows" : [ {
"rowdata" : {
"relationshipType" : "child",
"Name" : "yy",
"id" : "yyyyy"
}
} ]
}

Cloud Firestore subcollection query via REST

I have collection x, each document of x has subcollection y. Each document of y has a time attribute. I can't figure out how to query just that subcollection via REST (I know this feature exists in the SDK). My query so far, which is obviously wrong:
{
"structuredQuery": {
"from": [
{
"collectionId": "x",
"allDescendants": true
}
],
"where": {
"compositeFilter": {
"op": "AND",
"filters": [
{
"fieldFilter": {
"field": {
"fieldPath": "y.time"
},
"op": "GREATER_THAN_OR_EQUAL",
"value": {
"integerValue": 1577836800000
}
}
},
{
"fieldFilter": {
"field": {
"fieldPath": "y.time"
},
"op": "LESS_THAN_OR_EQUAL",
"value": {
"integerValue": 1578355200000
}
}
}
]
}
}
}
}
Sending a POST to https://firestore.googleapis.com/v1/projects/PROJECT/databases/{default}/documents:runQuery, but I've also tried .../documents/x/ID/y:runQuery but that's obviously wrong too.
I believe you described a collection group query for collection group y. In the REST API, this is an allDescendants query on the path projects/PROJECT/databases/(default)/documents (known as the root document):
https://firestore.googleapis.com/v1/projects/PROJECT/databases/(default)/documents:runQuery
{
"structuredQuery": {
"from": [
{
"collectionId": "y",
"allDescendants": true
}
],
"where": {
"compositeFilter": {
"op": "AND",
"filters": [
{
"fieldFilter": {
"field": {
"fieldPath": "time"
},
"op": "GREATER_THAN_OR_EQUAL",
"value": {
"integerValue": 1577836800000
}
}
},
{
"fieldFilter": {
"field": {
"fieldPath": "time"
},
"op": "LESS_THAN_OR_EQUAL",
"value": {
"integerValue": 1578355200000
}
}
}
]
}
}
}
}
Declare the path to the subcollection in the URL:
https://firestore.googleapis.com/v1/projects/PROJECT/databases/(default)/documents/x/documentX:runQuery
Then set the collectionId in "from" to collection "y" and allDescendants to false:
{
"structuredQuery": {
"from": [
{
"collectionId": "y",
"allDescendants": false
}
],
"where": {
"compositeFilter": {
"op": "AND",
"filters": [
{
"fieldFilter": {
"field": {
"fieldPath": "time"
},
"op": "GREATER_THAN_OR_EQUAL",
"value": {
"integerValue": 1577836800000
}
}
},
{
"fieldFilter": {
"field": {
"fieldPath": "time"
},
"op": "LESS_THAN_OR_EQUAL",
"value": {
"integerValue": 1578355200000
}
}
}
]
}
}
}
}
Source: https://firebase.google.com/docs/firestore/reference/rest/v1/projects.databases.documents/runQuery#path-parameters

Compare and Combine Objects of same array

I want to iterate each object and map system.myStringValue with value.myStringValue
Can you provide spec for the same ?
I have provided the following input and output JSON. If anything else is required from my side, let me know. I have tried another spec but it is not working for me, as I want to combine email and phone in one object.
Input :
[
{
"telecom": [
{
"system": {
"myStringValue": "email"
},
"value": {
"myStringValue": "mobqa#tester.com"
}
},
{
"system": {
"myStringValue": "phone"
},
"value": {
"myStringValue": "123"
}
}
]
},
{
"telecom": [
{
"system": {
"myStringValue": "email"
},
"value": {
"myStringValue": "john.doe#tester.com"
}
}
]
},
{
"telecom": [
{
"system": {
"myStringValue": "email"
},
"value": {
"myStringValue": "Gayle55#tester.com"
}
}
]
}
]
Output:
{
"users": [
{
"email":"mobqa#tester.com",
"phone":"123"
},
{
"email":"john.doe#tester.com"
},
{
"email":"Gayle55#tester.com"
}
]
}
[
{
"operation": "shift",
"spec": {
"*": {
"telecom": {
"*": {
"value": {
"myStringValue": {
// # takes the value of 'value.myStringValue' and puts it into the 'users[&5]' occurrence
// #(3,system.myStringValue) goes up the levels and then goes to 'system.myStringValue',
// grabs its value and puts it as the key of the appropriate occurrence of 'users'
"#": "users[&5].#(3,system.myStringValue)"
}
}
}
}
}
}
}
]