Elasticsearch autocomplete or autosuggest by token - autocomplete

I want to create suggestions for completing a term based on tokens, similar to Google-style autocomplete but with only a single token or word.
I'd like to search across filenames, which will be tokenized. E.g. "BRAND_Connect_A1233.jpg" gets tokenized into "brand", "connect", "a1233" and "jpg".
Now I'd like to ask for suggestions for e.g. "Con".
The suggestion should deliver the complete matching tokens, not the full filename:
Connect
Contour
Concept
...
The suggestion for "A12" should be "A1234", "A1233", "A1244" ...
Example
Working with queries, facets and filters works fine.
First I created a mapping including a tokenizer and a filter:
curl -XPUT 'localhost:9200/files/?pretty=1' -d '
{
"settings" : {
"analysis" : {
"analyzer" : {
"filename_search" : {
"tokenizer" : "filename",
"filter" : ["lowercase"]
},
"filename_index" : {
"tokenizer" : "filename",
"filter" : ["lowercase","edge_ngram"]
}
},
"tokenizer" : {
"filename" : {
"pattern" : "[^[;_\\.\\/]\\d]+",
"type" : "pattern"
}
},
"filter" : {
"edge_ngram" : {
"side" : "front",
"max_gram" : 20,
"min_gram" : 2,
"type" : "edgeNGram"
}
}
}
},
"mappings" : {
"file" : {
"properties" : {
"filename" : {
"type" : "string",
"search_analyzer" : "filename_search",
"index_analyzer" : "filename_index"
}
}
}
}
}'
Both analyzers work pretty well:
curl -XGET 'localhost:9200/files/_analyze?pretty=1&text=BRAND_ConnectBlue_A1234.jpg&analyzer=filename_search'
curl -XGET 'localhost:9200/files/_analyze?pretty=1&text=BRAND_ConnectBlue_A1234.jpg&analyzer=filename_index'
Now I added some example data:
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "BRAND_ConnectBlue_A1234.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "BRAND_Connect_A1233.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "BRAND_ConceptSpace_A1244.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "COMPANY_Connect_A1222.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "COMPANY_Concept_A1233.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "DEALER_Connect_B1234_.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "DEALER_Contour21_B1233.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "DEALER_ConceptCube_B2233.jpg"}'
curl -X POST "localhost:9200/files/_refresh"
Various approaches to get the desired suggestions did not deliver the expected results. I tried naming the analyzers explicitly and tried various combinations of analyzers and wildcards.
curl -XGET 'localhost:9200/files/_suggest?pretty=true' -d '{
"text" : "con",
"simple_phrase" : {
"phrase" : {
"field" : "filename",
"size" : 15,
"real_word_error_likelihood" : 0.75,
"max_errors" : 0.1,
"gram_size" : 3
}
}
}'
curl -XGET 'localhost:9200/files/_suggest?pretty=true' -d '{
"my-suggestion" : {
"text" : "con",
"term" : {
"field" : "filename",
"analyzer": "filename_index"
}
}
}'

You need to add a special mapping to use the completion suggester, as documented in the official Elasticsearch docs. The phrase and term suggesters you tried are meant for did-you-mean style spelling correction, not prefix completion, which is why they don't return the matching tokens. I've modified your example to show how it works.
First create the index. Note the filename_suggest mapping.
curl -XPUT 'localhost:9200/files/?pretty=1' -d '
{
"settings" : {
"analysis" : {
"analyzer" : {
"filename_search" : {
"tokenizer" : "filename",
"filter" : ["lowercase"]
},
"filename_index" : {
"tokenizer" : "filename",
"filter" : ["lowercase","edge_ngram"]
}
},
"tokenizer" : {
"filename" : {
"pattern" : "[^[;_\\.\\/]\\d]+",
"type" : "pattern"
}
},
"filter" : {
"edge_ngram" : {
"side" : "front",
"max_gram" : 20,
"min_gram" : 2,
"type" : "edgeNGram"
}
}
}
},
"mappings" : {
"file" : {
"properties" : {
"filename" : {
"type" : "string",
"analyzer": "filename_index",
"search_analyzer" : "filename_search"
},
"filename_suggest": {
"type": "completion",
"analyzer": "simple",
"search_analyzer": "simple",
"payloads": true
}
}
}
}
}'
Add some data. Note how the filename_suggest has the input field, which contains the keywords to match on.
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "BRAND_ConnectBlue_A1234.jpg", "filename_suggest": { "input": ["BRAND", "ConnectBlue", "A1234", "jpg"], "payload": {} } }'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "BRAND_Connect_A1233.jpg", "filename_suggest": { "input": ["BRAND", "Connect", "A1233", "jpg"], "payload": {} } }'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "BRAND_ConceptSpace_A1244.jpg", "filename_suggest": { "input": ["BRAND", "ConceptSpace", "A1244", "jpg"], "payload": {} } }'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "COMPANY_Connect_A1222.jpg", "filename_suggest": { "input": ["COMPANY", "Connect", "A1222", "jpg"], "payload": {} } }'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "COMPANY_Concept_A1233.jpg", "filename_suggest": { "input": ["COMPANY", "Concept", "A1233", "jpg"], "payload": {} } }'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "DEALER_Connect_B1234_.jpg", "filename_suggest": { "input": ["DEALER", "Connect", "B1234", "jpg"], "payload": {} } }'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "DEALER_Contour21_B1233.jpg", "filename_suggest": { "input": ["DEALER", "Contour21", "B1233", "jpg"], "payload": {} }}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "DEALER_ConceptCube_B2233.jpg", "filename_suggest": { "input": ["DEALER", "ConceptCube", "B2233", "jpg"], "payload": {} }}'
curl -X POST "localhost:9200/files/_refresh"
Now perform the query:
curl -XPOST 'localhost:9200/files/_suggest?pretty=true' -d '{
"filename_suggest" : {
"text" : "con",
"completion": {
"field": "filename_suggest", "size": 10
}
}
}'
Results:
{
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"filename_suggest" : [ {
"text" : "con",
"offset" : 0,
"length" : 3,
"options" : [ {
"text" : "Connect",
"score" : 2.0,
"payload":{}
}, {
"text" : "Concept",
"score" : 1.0,
"payload":{}
}, {
"text" : "ConceptSpace",
"score" : 1.0,
"payload":{}
}, {
"text" : "ConnectBlue",
"score" : 1.0,
"payload":{}
}, {
"text" : "Contour21",
"score" : 1.0,
"payload":{}
} ]
} ]
}
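If you also need to know which file a suggestion came from, you can store the original filename in the payload when indexing; since the mapping above sets "payloads": true, it is returned alongside each option. A sketch for one document:
curl -X POST "localhost:9200/files/file" -d '{
    "filename" : "BRAND_ConnectBlue_A1234.jpg",
    "filename_suggest" : {
        "input" : ["BRAND", "ConnectBlue", "A1234", "jpg"],
        "payload" : { "filename" : "BRAND_ConnectBlue_A1234.jpg" }
    }
}'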

Related

MongoDB as sink connector not capturing data as expected - kafka?

I am currently using a MySQL database as a source connector with the config below. I want to monitor changes to a database and send them to MongoDB.
Here's my source connector config:
curl -i -X POST -H "Accept:application/json" -H "Content-Type:application/json" localhost:8083/connectors/ -d '''{
"name": "source_mysql_connector",
"config": {
"connector.class": "io.debezium.connector.mysql.MySqlConnector",
"tasks.max": "1",
"database.hostname": "host.docker.internal",
"database.port": "3306",
"database.user": "test",
"database.password": "$apr1$o7RbW.GvrPIY1",
"database.server.id": "8111999",
"database.server.name": "db_source",
"database.include.list": "example",
"database.history.kafka.bootstrap.servers": "broker:29092",
"database.history.kafka.topic": "schema-changes.example",
"database.allowPublicKeyRetrieval":"true",
"include.schema.changes": "true"
}
}'''
Here's my sink connector (MongoDB) config:
curl -i -X POST -H "Accept:application/json" -H "Content-Type:application/json" localhost:8083/connectors/ -d '''{
"name": "sink_mongodb_connector",
"config": {
"connector.class": "com.mongodb.kafka.connect.MongoSinkConnector",
"tasks.max":"1",
"topics":"db_source.example.employees",
"connection.uri":"mongodb://172.17.0.1:27017/example?w=1&journal=true",
"database":"example",
"collection":"employees",
"value.converter": "io.confluent.connect.avro.AvroConverter",
"value.converter.schema.registry.url": "http://schema-registry:8081"
}
}'''
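(For reference, the standard Kafka Connect REST API can confirm that both connectors and their tasks are actually RUNNING; these status endpoints are generic Connect tooling, not specific to this setup.)
curl -s localhost:8083/connectors/source_mysql_connector/status
curl -s localhost:8083/connectors/sink_mongodb_connector/status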
Using this I was able to establish the connection, capture the data changes, and store them in a MongoDB collection for a table called employees.
But the problem is that when I checked the collection in MongoDB, the documents were saved like this:
{ "_id" : ObjectId("60d0e6939e00e22f274ccac1"), "before" : null, "after" : { "id" : NumberLong(11), "name" : "Steve Shining", "team" : "DevOps", "birthday" : 11477 }, "source" : { "version" : "1.5.0.Final", "connector" : "mysql", "name" : "db_source", "ts_ms" : NumberLong("1624303251000"), "snapshot" : "false", "db" : "example", "sequence" : null, "table" : "employees", "server_id" : NumberLong(6030811), "gtid" : null, "file" : "mysql-bin.000003", "pos" : NumberLong(5445), "row" : 2, "thread" : null, "query" : null }, "op" : "c", "ts_ms" : NumberLong("1624303251190"), "transaction" : null }
{ "_id" : ObjectId("60d0e6939e00e22f274ccac2"), "before" : null, "after" : { "id" : NumberLong(12), "name" : "John", "team" : "Support", "birthday" : 6270 }, "source" : { "version" : "1.5.0.Final", "connector" : "mysql", "name" : "db_source", "ts_ms" : NumberLong("1624303251000"), "snapshot" : "false", "db" : "example", "sequence" : null, "table" : "employees", "server_id" : NumberLong(6030811), "gtid" : null, "file" : "mysql-bin.000003", "pos" : NumberLong(5445), "row" : 3, "thread" : null, "query" : null }, "op" : "c", "ts_ms" : NumberLong("1624303251190"), "transaction" : null }
But my MySQL database looks like this:
mysql> select * from employees;
+----+---------------+---------+------------+
| id | name          | team    | birthday   |
+----+---------------+---------+------------+
|  1 | Peter Smith   | DevOps  | 2003-07-21 |
| 11 | Steve Shining | DevOps  | 2001-06-04 |
| 12 | John          | Support | 1987-03-03 |
+----+---------------+---------+------------+
I want my collections to look like this,
{ "_id" : ObjectId("60d0e6939e00e22f274ccac2"), "name" : "John", "team" : "Support", "birthday" : "1987-03-03 "}
What am I doing wrong here? Even delete messages are stored in the collection like this; it is not able to identify the message at all. How do I fix it? Even the dates are not stored properly.
Updated:
curl -i -X POST -H "Accept:application/json" -H "Content-Type:application/json" localhost:8083/connectors/ -d '''{
"name": "sink_mongodb_connector",
"config": {
"connector.class": "com.mongodb.kafka.connect.MongoSinkConnector",
"tasks.max":"1",
"topics":"db_source.example.employees",
"connection.uri":"mongodb://172.17.0.1:27017/example?w=1&journal=true",
"database":"example",
"collection":"employees",
"value.converter": "io.confluent.connect.avro.AvroConverter",
"value.converter.schema.registry.url": "http://schema-registry:8081",
"transforms": "unwrap",
"transforms.unwrap.type": "io.debezium.transforms.ExtractNewRecordState",
"transforms.unwrap.drop.tombstones": "false",
"transforms.unwrap.delete.handling.mode": "rewrite"
}
}'''
The problem is not related to Mongo, but to the default Debezium event format.
What you see is the before and after state, plus additional CDC event metadata.
"not able to identify the message"
It is, though ... "after" : { "id" : NumberLong(12), "name" : "John", "team" : "Support", "birthday" : 6270 }
You need to extract/flatten the event so that you only get the "after" field:
https://debezium.io/documentation/reference/configuration/event-flattening.html
Regarding the birthday / date values, that seems to be a separate issue.
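The birthday values (6270 and 11477) are the number of days since the Unix epoch, which is how Debezium encodes MySQL DATE columns by default. One possible fix, sketched here and not verified against this exact setup, is to have the source connector emit Kafka Connect's built-in date/time logical types ("time.precision.mode": "connect"), which sinks that understand those logical types can map to real date values:
curl -i -X PUT -H "Content-Type:application/json" localhost:8083/connectors/source_mysql_connector/config -d '{
    "connector.class": "io.debezium.connector.mysql.MySqlConnector",
    "tasks.max": "1",
    "database.hostname": "host.docker.internal",
    "database.port": "3306",
    "database.user": "test",
    "database.password": "$apr1$o7RbW.GvrPIY1",
    "database.server.id": "8111999",
    "database.server.name": "db_source",
    "database.include.list": "example",
    "database.history.kafka.bootstrap.servers": "broker:29092",
    "database.history.kafka.topic": "schema-changes.example",
    "database.allowPublicKeyRetrieval": "true",
    "include.schema.changes": "true",
    "time.precision.mode": "connect"
}'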

How to do a range filter on Elasticsearch using the "REST request URI"?

I have this simple data on ES:
curl -XPUT localhost:9200/dt/art/1 -d '{ "age": 77 }'
curl -XPUT localhost:9200/dt/art/2 -d '{ "age": 19 }'
curl -XPUT localhost:9200/dt/art/3 -d '{ "age": 42 }'
curl -XPUT localhost:9200/dt/art/4 -d '{ "age": 33 }'
Is it possible to do a range filter on the "age" using the "REST request URI"?
For now, I only get this with the "REST request body":
$ curl "localhost:9200/dt/art/_search?pretty" -d'{
"query": {
"bool": { "must": { "match_all": {} },
"filter": {
"range": {
"age": { "gte": 20, "lte": 50}}}}}}'
{ "took" : 8, "timed_out" : false,
"_shards" : { "total" : 5, "successful" : 5, "skipped" : 0, "failed" : 0 },
"hits" : {
"total" : 2,
"max_score" : 1.0,
"hits" : [
{ "_index" : "dt", "_type" : "art", "_id" : "4", "_score" : 1.0,
"_source" : { "age" : 33 } },
{ "_index" : "dt", "_type" : "art", "_id" : "3", "_score" : 1.0,
"_source" : { "age" : 42 } }
]
}
}
$
Yes.
GET dt/art/_search?q=age:[19+TO+50]
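For completeness, the same thing as a curl call; the square brackets are URL-encoded here because curl (and some shells) treat literal [ and ] specially, and the bounds mirror the request-body query above:
curl "localhost:9200/dt/art/_search?q=age:%5B20+TO+50%5D&pretty"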
For dates you can use something like this:
_search?q=metrictype:os+datetime:[\"2016-08-30 10:00:00\"+TO+\"2016-08-30 10:30:00\"]+fields:datetime,cpu,disk

Elasticsearch now function problems

I'm having problems trying to get a query working with the "now" function. My current query looks something like this:
{
"query": {
"bool" : {
"must" : [
{ "match": { "originCountry" : "GB" }},
{ "match": { "destinationCity" : "MIL" }}
]
}
},
"filter" : {
"and": {
"filters": [
{
"exists": {"field": "dateBack"}
} ,
{
"script" : {"script" : "doc['originRegion'].value == doc['destinationRegion'].value"}
},
{
"range": {
"dateOut": {
"gte": "now"
}
}
}
]
}
}
}
That's not returning any results. However, if I change the range section to a string date like:
"range": {
"dateOut": {
"gte": "20150101"
}
}
It works perfectly. In my index mapping all date fields use the "basic_date" format (YYYYMMDD).
Could this be creating issues for the now function? Does anyone know how the now function works? Is it converting the "now" date to whatever date format the field being compared uses? I've been unable to find any useful documentation about this.
Thanks
Check your date mapping - it should be YYYYMMdd instead of YYYYMMDD (in the underlying Joda-Time patterns, dd is day of month while DD is day of year, so YYYYMMDD does not parse your dates the way you intend).
When I set up the mapping:
curl -XPOST http://localhost:9200/index/testnow/_mapping -d '
{"testnow": {
"properties": {
"dateOut": {"type": "date","format" : "YYYYMMdd"},
"dateBack": {"type": "date","format" : "YYYYMMdd"}
}}}'
and post in a couple of docs:
curl -XPOST http://localhost:9200/index/testnow/ -d '
{
"originCountry": "GB",
"destinationCity": "MIL",
"dateBack" : "20140212",
"originRegion" : "X",
"destinationRegion" : "X",
"dateOut" : "20140201"
}'
curl -XPOST http://localhost:9200/index/testnow/ -d '
{
"originCountry": "GB",
"destinationCity": "MIL",
"dateBack" : "20150212",
"originRegion" : "X",
"destinationRegion" : "X",
"dateOut" : "20150201"
}'
and run the query:
curl -XGET http://localhost:9200/index/testnow/_search -d '
{
"query" : {
"filtered" : {
"query": {
"bool" : {
"must" : [
{ "match": { "originCountry" : "GB" }},
{ "match": { "destinationCity" : "MIL" }}
]
}
},
"filter" : {
"and" : [
{"exists": {"field": "dateBack"}},
{"script" : {"script" : "doc[\"originRegion\"].value == doc[\"destinationRegion\"].value"}},
{"range": {"dateOut": {"gte": "now"}}}
]} }}}'
I get back a single document as expected:
{
"took" : 11,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.4142135,
"hits" : [ {
"_index" : "index",
"_type" : "testnow",
"_id" : "AUqgq8u4aqAGLvfmRnfz",
"_score" : 1.4142135,
"_source":
{
"originCountry": "GB",
"destinationCity": "MIL",
"dateBack" : "20150212",
"originRegion" : "X",
"destinationRegion" : "X",
"dateOut" : "20150201"
}
} ]
}
}
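As a side note, if you want the comparison anchored to the start of the current day rather than the current millisecond, Elasticsearch date math supports rounding, e.g. replacing the range filter above with:
{"range": {"dateOut": {"gte": "now/d"}}}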

Elasticsearch's keyword tokenizer and searching for emails does not really work

I have set up an index like this:
POST /testindex/ -d '
{
"settings": {
"analysis": {
"analyzer": {
"analyzer_keyword": {
"tokenizer": "keyword"
}
}
}
},
"mappings": {
"users": {
"properties": {
"email": {
"analyzer": "analyzer_keyword",
"type": "string"
}
}
}
}
}'
Now I have added some user documents to testindex, where each user contains an email address. If I want to search for a user document by specifying the email address like the following, it does not really work as expected:
GET /testindex/users/_search
{
"query" : {
"term" : { "email" : "hello#host.com" }
}
}
This query returns 0 results. But if I say "email": "hello" or "email": "host.com" it returns the exact document. But what is wrong with the #? How can I search by the complete email address?
The Elasticsearch documentation says:
A tokenizer of type keyword that emits the entire input as a single output. The entire input here is hello#host.com.
I also tried the uax_url_email tokenizer. That does not work either.
Seems to work fine for me:
curl -XDELETE "localhost:9200/testindex?pretty"
curl -XPOST "localhost:9200/testindex?pretty" -d '
{
"settings": {
"analysis": {
"analyzer": {
"analyzer_keyword": {
"tokenizer": "keyword"
}
}
}
},
"mappings": {
"users": {
"properties": {
"email": {
"analyzer": "analyzer_keyword",
"type": "string"
}
}
}
}
}'
curl -XPOST "localhost:9200/testindex/users?pretty&refresh" -d '{"email": "hello#host.com"}'
curl -XGET "localhost:9200/testindex/users/_search?pretty" -d '{
"query" : {
"term" : { "email" : "hello#host.com" }
}
}'
it returns:
{
"error" : "IndexMissingException[[testindex] missing]",
"status" : 404
}
{
"ok" : true,
"acknowledged" : true
}
{
"ok" : true,
"_index" : "testindex",
"_type" : "users",
"_id" : "GkPG9l83RGyeMyGM9x6ecQ",
"_version" : 1
}
{
"took" : 62,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 0.30685282,
"hits" : [ {
"_index" : "testindex",
"_type" : "users",
"_id" : "GkPG9l83RGyeMyGM9x6ecQ",
"_score" : 0.30685282, "_source" : {"email": "hello#host.com"}
} ]
}
}
on both 0.90.7 and current master. Did you try to delete the index before changing the mapping?
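If it still fails on your side, one way to check whether the custom analyzer was actually applied is to run it directly against the index (same 0.90.x-era _analyze API used elsewhere on this page; the body form avoids having to URL-encode the #). It should return hello#host.com as a single token:
curl -XGET "localhost:9200/testindex/_analyze?analyzer=analyzer_keyword&pretty" -d 'hello#host.com'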

Mapping in create index in Elasticsearch through MongoDB river is not taking effect

I am trying to index MongoDB in Elasticsearch using mongodb-river with the following command, but the document mapping is not taking effect. It is still using the default (standard) analyzer for the field text.
Mongodb-river
The documentation covers creating the index, but there is nothing on how to provide a custom mapping. This is what I tried. Is there any other documentation on how to specify custom analyzers etc. when using mongodb-river?
curl -XPUT "localhost:9200/_river/autocompleteindex/_meta" -d '
{
"type": "mongodb",
"mongodb": {
"host": "rahulg-dc",
"port": "27017",
"db": "qna",
"collection": "autocomplete_questions"
},
"index": {
"name": "autocompleteindex",
"type": "autocomplete_questions",
"analysis" : {
"analyzer" : {
"str_search_analyzer" : {
"tokenizer" : "keyword",
"filter" : ["lowercase"]
},
"str_index_analyzer" : {
"tokenizer" : "keyword",
"filter" : ["lowercase", "ngram"]
}
},
"filter" : {
"ngram" : {
"type" : "ngram",
"min_gram" : 2,
"max_gram" : 20
}
}
}
},
"autocompleteindex": {
"_boost" : {
"name" : "po",
"null_value" : 1.0
},
"properties": {
"po": {
"type": "double"
},
"text": {
"type": "string",
"boost": 3.0,
"search_analyzer" : "str_search_analyzer",
"index_analyzer" : "str_index_analyzer"
}
}
}
}'
The query returns proper results if I search by full words, but it does not return any substring matches. Also, the boost factor is not showing its effect.
What am I doing wrong?
You first have to create your index with your index settings (analyzer):
"analysis" : {
"analyzer" : {
"str_search_analyzer" : {
"tokenizer" : "keyword",
"filter" : ["lowercase"]
},
"str_index_analyzer" : {
"tokenizer" : "keyword",
"filter" : ["lowercase", "ngram"]
}
},
"filter" : {
"ngram" : {
"type" : "ngram",
"min_gram" : 2,
"max_gram" : 20
}
}
}
Then you can define a mapping for your type:
"autocomplete_questions": {
"_boost" : {
"name" : "po",
"null_value" : 1.0
},
"properties": {
"po": {
"type": "double"
},
"text": {
"type": "string",
"boost": 3.0,
"search_analyzer" : "str_search_analyzer",
"index_analyzer" : "str_index_analyzer"
}
}
}
And only then, you can create the river:
curl -XPUT "localhost:9200/_river/autocompleteindex/_meta" -d '
{
"type": "mongodb",
"mongodb": {
"host": "rahulg-dc",
"port": "27017",
"db": "qna",
"collection": "autocomplete_questions"
},
"index": {
"name": "autocompleteindex",
"type": "autocomplete_questions"} }
Does it help?