Highlighting part of word in elasticsearch - autocomplete

I have built an auto-suggester in Elasticsearch using an n-gram tokenizer. Now I want to highlight the character sequence the user entered in the auto-suggest list. For this purpose I used the highlighter available in Elasticsearch; my query is below, but in the output the complete term is highlighted. Where am I going wrong?
{
"query": {
"query_string": {
"query": "soft",
"default_field": "competency_display_name"
}
},
"highlight": {
"pre_tags": ["<b>"],
"post_tags": ["</b>"],
"fields": {
"competency_display_name": {}
}
}
}
and the result is
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "competency_auto_suggest",
"_type": "competency",
"_id": "4",
"_score": 1,
"_source": {
"review": null,
"competency_title": "Software Development",
"id": 4,
"competency_display_name": "Software Development"
},
"highlight": {
"competency_display_name": [
"<b>Software Development</b>"
]
}
}
]
}
}
mapping
"competency":{
"properties": {
"competency_display_name":{
"type":"string",
"index_analyzer": "index_ngram_analyzer",
"search_analyzer": "search_term_analyzer"
}
}
}
settings
"analysis": {
"filter": {
"ngram_tokenizer": {
"type": "nGram",
"min_gram": "1",
"max_gram": "15",
"token_chars": [ "letter", "digit" ]
}
},
"analyzer": {
"index_ngram_analyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": [ "ngram_tokenizer", "lowercase" ]
},
"search_term_analyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": "lowercase"
}
}
}
How can I highlight only "Soft" instead of the whole "Software Development"?

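You should use an nGram tokenizer instead of an nGram token filter for highlighting to work in this case. With the keyword tokenizer followed by an nGram filter, every gram keeps the offsets of the single original token, so the highlighter wraps the whole value.
The with_positions_offsets term vector setting is also needed; it makes highlighting faster.
Here are the working settings and mapping: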
"analysis": {
"tokenizer": {
"ngram_tokenizer": {
"type": "nGram",
"min_gram": "1",
"max_gram": "15",
"token_chars": [ "letter", "digit" ]
}
},
"analyzer": {
"index_ngram_analyzer": {
"type": "custom",
"tokenizer": "ngram_tokenizer",
"filter": [ "lowercase" ]
},
"search_term_analyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": "lowercase"
}
}
}
mapping
"competency":{
"properties": {
"competency_display_name":{
"type":"string",
"index_analyzer": "index_ngram_analyzer",
"search_analyzer": "search_term_analyzer",
"term_vector":"with_positions_offsets"
}
}
}

Related

how to find selected fields from sample data in nested array in mongodb

I have a sample collection of data:
[
{
data: [
{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters": {
"batter": [
{
"id": "1001",
"type": "Regular"
},
{
"id": "1002",
"type": "Chocolate"
},
{
"id": "1003",
"type": "Blueberry"
},
{
"id": "1004",
"type": "Devil's Food"
}
]
},
"topping": [
{
"id": "5001",
"type": "None"
},
{
"id": "5002",
"type": "Glazed"
},
{
"id": "5005",
"type": "Sugar"
},
{
"id": "5007",
"type": "Powdered Sugar"
},
{
"id": "5006",
"type": "Chocolate with Sprinkles"
},
{
"id": "5003",
"type": "Chocolate"
},
{
"id": "5004",
"type": "Maple"
}
]
},
{
"id": "0002",
"type": "donut",
"name": "Raised",
"ppu": 0.55,
"batters": {
"batter": [
{
"id": "1001",
"type": "Regular"
}
]
},
"topping": [
{
"id": "5001",
"type": "None"
},
{
"id": "5002",
"type": "Glazed"
},
{
"id": "5005",
"type": "Sugar"
},
{
"id": "5003",
"type": "Chocolate"
},
{
"id": "5004",
"type": "Maple"
}
]
},
{
"id": "0003",
"type": "donut",
"name": "Old Fashioned",
"ppu": 0.55,
"batters": {
"batter": [
{
"id": "1001",
"type": "Regular"
},
{
"id": "1002",
"type": "Chocolate"
}
]
},
"topping": [
{
"id": "5001",
"type": "None"
},
{
"id": "5002",
"type": "Glazed"
},
{
"id": "5003",
"type": "Chocolate"
},
{
"id": "5004",
"type": "Maple"
}
]
}
]
}
]
and I need only the data for a specific id, in this format:
[
{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55
}
]
aggregate
db.collection.aggregate({
"$unwind": "$data"
},
{
"$match": {
"data.id": "0001"
}
},
{
"$project": {
"_id": "$data.id",
"type": "$data.type",
"name": "$data.name",
"ppu": "$data.ppu"
}
})
mongoplayground

MongoDB aggregate and filter data

I am trying to filter some result data from MongoDB with Mongoose in JavaScript.
This is my JSON structure:
{
"name": "john",
"firstname": "doe",
"yearold": 22,
"recipes": [
{
"title": "cheesecake",
"data": [
{
"name": "egg",
"label": "Eggs for",
"value": 6,
"unit": "piece"
},
{
"name": "oil",
"label": "Specific oil",
"unit": "oz",
"value": 0.2
},
{
"name": "flour",
"label": "Wholemel flour",
"value": 450,
"unit": "gr"
}
]
},
{
"title": "cake",
"data": [
{
"name": "egg",
"label": "Eggs for",
"value": 6,
"unit": "piece"
},
{
"name": "flour",
"label": "Wholemel flour",
"value": 500,
"unit": "gr"
},
]
}
]
}
In some cases I need to return the JSON data with some values hidden. For example, I have a list that specifies all the values to hide:
hidekeys=["egg"];
and i would like to get this:
{
"name": "john",
"firstname": "doe",
"yearold": 22,
"recipes": [
{
"title": "cheesecake",
"data": [
{
"name": "egg",
"label": "Eggs for",
"value": #######,
"unit": "piece"
},
{
"name": "oil",
"label": "Specific oil",
"unit": "oz",
"value": 0.2
},
{
"name": "flour",
"label": "Wholemel flour",
"value": 450,
"unit": "gr"
}
]
},
{
"title": "cake",
"data": [
{
"name": "egg",
"label": "Eggs for",
"value": #######,
"unit": "piece"
},
{
"name": "flour",
"label": "Wholemel flour",
"value": 500,
"unit": "gr"
},
]
}
]
}
For each recipe, I need to hide an ingredient's value if its name is listed in hidekeys.
I tried something with $project and $cond, but it doesn't work.
Here's a quick way to achieve this using $map:
const hidekeys = ["egg"];
db.collection.aggregate([
{
$addFields: {
recipes: {
$map: {
input: "$recipes",
as: "recipe",
in: {
$mergeObjects: [
"$$recipe",
{
data: {
$map: {
input: "$$recipe.data",
as: "datum",
in: {
"$mergeObjects": [
"$$datum",
{
$cond: [
{
"$setIsSubset": [
[
"$$datum.name"
],
hidekeys
]
},
{
value: "#####"
},
{
value: "$$datum.value"
}
]
}
]
}
}
}
}
]
}
}
}
}
}
])
Mongo Playground

JOLT Spec for supporting the input json

My input JSON looks like the example below, but I am not sure how to transform the parameters of the inner array using JOLT. Any help is appreciated, as I am new to JOLT.
{
"pktId": 7603,
"seq": 1,
"vehicleNumber": "66079",
"rmdLocation": "1",
"rmdTime": "2019-01-07T11:27:05.745Z",
"position": {
"lat": 55.4911232,
"lng": -3.686831
},
"dataSource": 11,
"frames": [
{
"seq": 0,
"card": 8,
"channel": 6,
"value": 117
},
{
"seq": 1,
"card": 8,
"channel": 6,
"value": 120
}
]
}
Below is the spec file I have created, but it is not complete:
[
{
"operation": "shift",
"spec": {
"frames": {
"*": {
"seq": "parameters[&1].seq",
"card": "parameters[&1].card",
"channel": "parameters[&1].channel",
"value": "parameters[&1].value"
}
},
"rmdTime": "messageTime",
"vehicleNumber": "roadNumber"
}
},
{
"operation": "default",
"spec": {
"appId": "configMsgXX",
"customerId": "ABC",
"messageRev": 1,
"messageType": "customStatistics"
}
}
]
Expected output is as below
{
"appId": "configMsgXX",
"customerId": "ABC",
"deviceId": string1+roadNumber+string2",
"messageRev": 1,
"messageTime": 1543395341000,
"messageType": "customStatistics",
"parameters": [
{
"address": string1+string2,
"name": "EM2000VoltageMainGenerator",
"timestamp": 1543395341000,
"quality": "3",
"datatype": "INTEGER",
"value": 100,
"qualityReason": "Stale Data",
"category": "REAL"
}
],
"roadNumber": 66079
}
I am using this library https://github.com/bazaarvoice/jolt
[
{
"operation": "shift",
"spec": {
"frames": {
"*": {
"seq": "parameters[&1].seq",
"card": "parameters[&1].card",
"channel": "parameters[&1].channel",
"value": "parameters[&1].value",
"#(2,rmdTime)": "parameters[&1].timestamp"
}
},
"rmdTime": "messageTime",
"vehicleNumber": "roadNumber"
}
},
{
"operation": "modify-overwrite-beta",
"spec": {
"parameters": {
"*": {
"quality": "3",
"name": "",
"address": "",
"datatype": "INTEGER",
"qualityReason": "Stale Data",
"category": "REAL"
}
}
}
},
{
"operation": "default",
"spec": {
"appId": "configMsgXX",
"customerId": "ABC",
"messageRev": 1,
"messageType": "customStatistics"
}
}
]

Elasticsearch - query dates without a specified timezone

I have an index with the following mappings, using the standard format for a date. In the second record below, the time specified is actually a local time, but ES treats it as UTC.
Even though ES internally converts all parsed datetimes to UTC, it must obviously store the original string as well.
My question is whether (and how) it is possible to query all records for which the scheduleDT value doesn't have the timezone explicitly specified.
{
"curator_v3": {
"mappings": {
"published": {
"analyzer": "classic",
"numeric_detection": true,
"properties": {
"Id": {
"type": "string",
"index": "not_analyzed",
"include_in_all": false
},
"createDT": {
"type": "date",
"format": "dateOptionalTime",
"include_in_all": false
},
"scheduleDT": {
"type": "date",
"format": "dateOptionalTime",
"include_in_all": false
},
"title": {
"type": "string",
"fields": {
"english": {
"type": "string",
"analyzer": "english"
},
"raw": {
"type": "string",
"index": "not_analyzed"
},
"shingle": {
"type": "string",
"analyzer": "shingle"
},
"spanish": {
"type": "string",
"analyzer": "spanish"
}
},
"include_in_all": false
}
}
}
}
}
}
We use .NET as our client to ElasticSearch and haven't been consistent in specifying a timezone for the scheduleDT field.
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 12,
"successful": 12,
"failed": 0
},
"hits": {
"total": 32,
"max_score": null,
"hits": [
{
"_index": "curator_v3",
"_type": "published",
"_id": "29651227",
"_score": null,
"fields": {
"Id": [
"29651227"
],
"scheduleDT": [
"2015-11-21T22:17:51.0946798-06:00"
],
"title": [
"97 Year-Old Woman Cries Tears Of Joy After Finally Getting Her High School Diploma"
],
"createDT": [
"2015-11-21T22:13:32.3597142-06:00"
]
},
"sort": [
1448165871094
]
},
{
"_index": "curator_v3",
"_type": "published",
"_id": "210466413",
"_score": null,
"fields": {
"Id": [
"210466413"
],
"scheduleDT": [
"2015-11-22T12:00:00"
],
"title": [
"6 KC treats to bring to Thanksgiving"
],
"createDT": [
"2015-11-20T15:08:25.4282-06:00"
]
},
"sort": [
1448193600000
]
}
]
},
"aggregations": {
"ScheduleDT": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 27,
"buckets": [
{
"key": 1448165871094,
"key_as_string": "2015-11-22T04:17:51.094Z",
"doc_count": 1
},
{
"key": 1448193600000,
"key_as_string": "2015-11-22T12:00:00.000Z",
"doc_count": 4
}
]
}
}
}
You can do this by querying for documents whose scheduleDT value is less than 20 characters long (e.g. 2015-11-22T12:00:00). All date values with a specified time zone would be longer.
Something like this should do:
{
"query": {
"filtered": {
"filter": {
"script": {
"script": "doc.scheduleDT.value.size() < 20"
}
}
}
}
}
Note, however, that to make your queries easier to write you should always try to convert all your timestamps to UTC before indexing your documents.
Finally, make sure that you have dynamic scripting enabled in order to run the above query.
UPDATE
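Actually, the script above reads the field through doc, which exposes the indexed (parsed) date rather than the original string, so the length check does not behave as intended. If you use _source directly in the script it will work, because it returns the real value from the source as it was when the document was indexed: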
{
"query": {
"filtered": {
"filter": {
"script": {
"script": "_source.scheduleDT.size() < 20"
}
}
}
}
}

Elastic Search: Any way to make space-separated words in a comma-separated list regarded as one term?

I don't know if this is possible, but I'm trying to search by locations with an "exact search" option. There are a couple fields that get searched, with the most important one being the "location_raw" field:
"match": {
"location.location_raw": {
"type": "boolean",
"operator": "AND",
"query": "[location query]",
"analyzer": "standard"
}
}
The location_raw field is a location string with a comma between each place, such as "Sudbury, Middlesex, Massachusetts" or "Leamington, Warwickshire, England". If someone searches for "Sudbury, Middlesex" it gets passed in as
"query": "Sudbury Middlesex"
and both of those terms must exist in the location_raw field. This part works.
The problem is that when the location_raw field contains multi-word location, like New York or Saint George, these get returned when someone searches for "York" or "George." If I do an exact search for "George," I do not want to get results for "Saint George." Is there any way to make Elastic consider "Saint George" one term in the string "Saint George, Stamford, Lincoln, England"?
Here's one way to do it, but you have to query in csv too, or use a terms filter.
I used a pattern analyzer with a simple pattern: ", ". I set up a simple index with a single document:
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"analysis": {
"analyzer": {
"csv": {
"type": "pattern",
"pattern": ", ",
"lowercase": false
}
}
}
},
"mappings": {
"doc": {
"properties": {
"location": {
"type": "string",
"index_analyzer": "csv",
"search_analyzer": "standard",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
POST /test_index/_bulk
{"index":{"_index":"test_index","_type":"doc","_id":1}}
{"location":"Saint George, Stamford, Lincoln, England"}
I can see the terms generated with a simple terms aggregation:
POST /test_index/_search?search_type=count
{
"aggs": {
"location_terms": {
"terms": {
"field": "location"
}
}
}
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"location_terms": {
"buckets": [
{
"key": "England",
"doc_count": 1
},
{
"key": "Lincoln",
"doc_count": 1
},
{
"key": "Saint George",
"doc_count": 1
},
{
"key": "Stamford",
"doc_count": 1
}
]
}
}
}
And then if I query with the same csv syntax, the document isn't returned for "George, England":
POST /test_index/_search
{
"query": {
"match": {
"location": {
"type": "boolean",
"operator": "AND",
"query": "George, England",
"analyzer": "csv"
}
}
}
}
...
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
but it is returned for "Saint George, England":
POST /test_index/_search
{
"query": {
"match": {
"location": {
"type": "boolean",
"operator": "AND",
"query": "Saint George, England",
"analyzer": "csv"
}
}
}
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.2169777,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 0.2169777,
"_source": {
"location": "Saint George, Stamford, Lincoln, England"
}
}
]
}
}
This query is equivalent, and probably more performant:
POST /test_index/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"terms": {
"location": [
"Saint George",
"England"
],
"execution": "and"
}
}
}
}
}
Here's the code I used to test it:
http://sense.qbox.io/gist/234ea93accb7b20ad8fd33e62fe92f1d450a51ab