ElasticSearch river from Mongo messing up field mappings - mongodb

I'm using Mongo, Elastic Search and this river plugin: https://github.com/richardwilly98/elasticsearch-river-mongodb
I have successfully set everything up, in that the river keeps the ES data updated when Mongo is updated, but the river is copying all the properties from the Mongo documents into ES verbatim, whereas I only want a small subset of them. E.g. if a Mongo doc has 30 properties, all of them are getting put into ES instead of only the 5 that I want. I assume the issue is with the mappings, and I've followed several docs and another Stack Overflow thread (curl -X POST -d @mapping.json + mapping not created), but it still is not working for me. Here is what I'm doing:
I'm creating my index with:
# "-d @index.json" makes curl read the request body from the file index.json.
# (A "#" in that position would start a shell comment and leave -d without an argument.)
curl -XPOST "http://localhost:9200/mongoindex" -d @index.json
index.json:
{
  "settings" : {
    "number_of_shards" : 1,
    "analysis" : {
      "analyzer" : {
        "str_search_analyzer" : {
          "tokenizer" : "keyword",
          "filter" : ["lowercase"]
        },
        "str_index_analyzer" : {
          "tokenizer" : "keyword",
          "filter" : ["lowercase", "ngram"]
        }
      },
      "filter" : {
        "ngram" : {
          "type" : "ngram",
          "min_gram" : 2,
          "max_gram" : 20
        }
      }
    }
  }
}
Then running:
# "-d @mapping.json" makes curl read the request body from the file mapping.json.
# (A "#" in that position would start a shell comment and leave -d without an argument.)
curl -XPOST "http://localhost:9200/mongoindex/listing/_mapping" -d @mapping.json
With this data:
{
"listing":{
"properties":{
"_all": {
"enabled": false
},
"title": {
"type": "string",
"store": false,
"index": "not_analyzed"
},
"bathrooms": {
"type": "integer",
"store": true,
"index": "analyzed"
},
"bedrooms": {
"type": "integer",
"store": true,
"index": "analyzed"
},
"address": {
"type": "nested",
"include_in_parent": true,
"store": true,
"properties": {
"counrty": {
"type":"string"
},
"city": {
"type":"string"
},
"stateOrProvince": {
"type":"string"
},
"fullStreetAddress": {
"type":"string"
},
"postalCode": {
"type":"string"
}
}
},
"location": {
"type": "geo_point",
"full_name": "geometry.coordiantes",
"store": true
}
}
}
}
Then finally creating the river with:
# "-d @river.json" makes curl read the request body from the file river.json.
# (A "#" in that position would start a shell comment and leave -d without an argument.)
curl -XPUT "http://localhost:9200/_river/mongoindex/_meta" -d @river.json
river.json:
{
"type": "mongodb",
"mongodb": {
"db": "blueprint",
"collection": "Listing",
"options": {
"secondary_read_preference": true,
"drop_collection": true
}
},
"index": {
"name": "mongoindex",
"type": "listing"
}
}
After all that, the river works in that ES is populated, but it's a verbatim copy of Mongo right now. I need to modify the mappings, but they just are not taking effect. What am I missing?
This is what my mapping looks like after the river runs.... nothing like what I want it to look like.

I would set dynamic mapping to false:
The dynamic creation of mappings for unmapped types can be completely
disabled by setting index.mapper.dynamic to false.
http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/mapping-dynamic-mapping.html
Others have had similar issues to yours and it looks like the best solution so far has been to prevent the MongoDB River from dynamically mapping at all:
https://github.com/richardwilly98/elasticsearch-river-mongodb/issues/75

Turns out the issue was that the dynamic property was left out of the mappings config. It should be in 2 places, on the index.json as shown above, and in the mappings.json:
{
"listing":{
"_source": {
"enabled": false
},
"dynamic": false, // <--- Need to add this
"properties":{
"_all": {
"enabled": false
},
"title": {
"type": "string",
"store": false,
"index": "str_index_analyzer"
},
"bathrooms": {
"type": "integer",
"store": true,
"index": "analyzed"
},
"bedrooms": {
"type": "integer",
"store": true,
"index": "analyzed"
},
"address": {
"type": "nested",
"include_in_parent": true,
"store": true,
"properties": {
"counrty": {
"type":"string",
"index": "str_index_analyzer"
},
"city": {
"type":"string",
"index": "str_index_analyzer"
},
"stateOrProvince": {
"type":"string",
"index": "str_index_analyzer"
},
"fullStreetAddress": {
"type":"string",
"index": "str_index_analyzer"
},
"postalCode": {
"type":"string"
}
}
},
"location": {
"type": "geo_point",
"full_name": "geometry.coordiantes",
"store": true
}
}
}
}
As for the 902 docs vs 451: I think that is a bug in the ElasticSearch Head plugin I'm using to browse documents. The index doesn't have duplicates, but a couple of spots show 902 docs as a summary of sorts.

Related

Querying a map (<String, Object>) in JSON through MongoDB

How to query a map of type Map<String, List> in JSON form, in MongoDB?
Sample JSON:
{
"WIDTH": 810,
"HEIGHT": 465,
"MODULES": {
"23": {
"XNAME": "COMP1",
"PARAMS": {
"_Klockers": {
"TYPE": "text",
"VALUE": "Klocker#3"
},
"SUBSYS": {
"TYPE": "text",
"VALUE": "2"
},
"EP": {
"TYPE": "integer",
"VALUE": "2"
}
}
},
"24": {
"XNAME": "COMP2",
"PARAMS": {
"_Rockers": {
"TYPE": "text",
"VALUE": "Rocker#3"
},
"Driver": {
"TYPE": "binary",
"VALUE": 1
},
"EP": {
"TYPE": "long",
"VALUE": "233"
}
}
},
"25": {
"XNAME": "COMP3",
"PARAMS": {
"_Mockers": {
"TYPE": "text",
"VALUE": "Mocker#3"
},
"SYSMain": {
"TYPE": "text",
"VALUE": "2342"
},
"TLP": {
"TYPE": "double",
"VALUE": "2.3"
}
}
}
}
}
Basically I want to :
List all the "XNAME" field values of all keys in "MODULES".
Expected output : {"COMP1", "COMP2", "COMP3"}
List all the "TYPE" in "PARAMS" object within each key of "MODULES".
Expected output : {"text", "text", "integer", "text", "binary", "long", "text", "text", "double"}
I am new to MongoDB and any help or redirection is appreciated.
You can use this
db.collection.aggregate([
  // Stage 1: MODULES has dynamic keys, so turn it into an array of
  // { k, v } pairs that the following stages can iterate over.
  { "$project": { "modules": { "$objectToArray": "$MODULES" } } },

  // Stage 2: emit one document per module entry.
  { "$unwind": "$modules" },

  // Stage 3: keep the module's XNAME and convert its PARAMS object
  // (dynamic keys again) into an array of { k, v } pairs.
  {
    "$project": {
      "types": { "$objectToArray": "$modules.v.PARAMS" },
      "xname": "$modules.v.XNAME"
    }
  },

  // Stage 4: emit one document per parameter entry.
  { "$unwind": "$types" },

  // Stage 5: collapse everything into a single result holding the
  // distinct XNAME values and the distinct TYPE values.
  {
    "$group": {
      "_id": null,
      "xname": { "$addToSet": "$xname" },
      "types": { "$addToSet": "$types.v.TYPE" }
    }
  }
])

LoopBack 3.0: where filter not returning results from REST API

I have a LoopBack API with a single simple model like this:
{
"name": "Establishment",
"base": "PersistedModel",
"idInjection": true,
"options": {
"validateUpsert": true
},
"properties": {
"Distance": {
"type": "number"
},
"EstablishmentId": {
"type": "number"
},
"EstablishmentType": {
"type": "string"
},
"Location": {
"type": "string"
},
"MinCost": {
"type": "number"
},
"Name": {
"type": "string"
},
"Stars": {
"type": "number"
},
"UserRating": {
"type": "number"
},
"UserRatingTitle": {
"type": "string"
},
"UserRatingCount": {
"type": "number"
},
"ImageUrl": {
"type": "string"
},
"ThumbnailUrl": {
"type": "string"
}
},
"validations": [],
"relations": {},
"acls": [],
"methods": {}
}
A simple call to http://localhost:3000/api/Establishments returns all of the results, as expected; but a call to http://localhost:3000/api/Establishments?filter[where][distance][gt]=30 yields no results at all: an empty array.
There are lots of Establishments with a Distance greater than 30; and indeed using the where filter on other properties also results in an empty array. What could I be missing?
As I mentioned in the comment, the property name is case-sensitive; I verified this on my own app to be certain.
it should be :
http://localhost:3000/api/Establishments?filter[where][Distance][gt]=30
or you can try with this format :
http://localhost:3000/api/Establishments?filter={"where":{"Distance":{"gt":30}}}

Elasticsearch - query dates without a specified timezone

I have an index with the following mappings - standard format for a date. In the 2nd record below the time specified is actually a local time - but ES treats it as UTC.
Even though ES internally converts all parsed datetimes to UTC, it must obviously store the original string as well.
My question is whether (and how) it might be possible to query all records for which the scheduleDT value doesn't have the timezone explicitly specified.
{
"curator_v3": {
"mappings": {
"published": {
"analyzer": "classic",
"numeric_detection": true,
"properties": {
"Id": {
"type": "string",
"index": "not_analyzed",
"include_in_all": false
},
"createDT": {
"type": "date",
"format": "dateOptionalTime",
"include_in_all": false
},
"scheduleDT": {
"type": "date",
"format": "dateOptionalTime",
"include_in_all": false
},
"title": {
"type": "string",
"fields": {
"english": {
"type": "string",
"analyzer": "english"
},
"raw": {
"type": "string",
"index": "not_analyzed"
},
"shingle": {
"type": "string",
"analyzer": "shingle"
},
"spanish": {
"type": "string",
"analyzer": "spanish"
}
},
"include_in_all": false
}
}
}
}
}
}
We use .NET as our client to ElasticSearch and haven't been consistent in specifying a timezone for the scheduleDT field.
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 12,
"successful": 12,
"failed": 0
},
"hits": {
"total": 32,
"max_score": null,
"hits": [
{
"_index": "curator_v3",
"_type": "published",
"_id": "29651227",
"_score": null,
"fields": {
"Id": [
"29651227"
],
"scheduleDT": [
"2015-11-21T22:17:51.0946798-06:00"
],
"title": [
"97 Year-Old Woman Cries Tears Of Joy After Finally Getting Her High School Diploma"
],
"createDT": [
"2015-11-21T22:13:32.3597142-06:00"
]
},
"sort": [
1448165871094
]
},
{
"_index": "curator_v3",
"_type": "published",
"_id": "210466413",
"_score": null,
"fields": {
"Id": [
"210466413"
],
"scheduleDT": [
"2015-11-22T12:00:00"
],
"title": [
"6 KC treats to bring to Thanksgiving"
],
"createDT": [
"2015-11-20T15:08:25.4282-06:00"
]
},
"sort": [
1448193600000
]
}
]
},
"aggregations": {
"ScheduleDT": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 27,
"buckets": [
{
"key": 1448165871094,
"key_as_string": "2015-11-22T04:17:51.094Z",
"doc_count": 1
},
{
"key": 1448193600000,
"key_as_string": "2015-11-22T12:00:00.000Z",
"doc_count": 4
}
]
}
}
}
You can do this by querying for documents whose scheduleDT value is shorter than 20 characters (e.g. 2015-11-22T12:00:00). All the date values with a specified time zone would be longer.
Something like this should do:
{
"query": {
"filtered": {
"filter": {
"script": {
"script": "doc.scheduleDT.value.size() < 20"
}
}
}
}
}
Note, however, that in order to make your queries easier to create you should always try to convert all your timestamps in UTC before indexing your documents.
Finally, also make sure that you have dynamic scripting enabled in order to run the above query.
UPDATE
Actually, if you use the _source directly in the script it will work because it will return the real value from the source as it was when the document was indexed:
{
"query": {
"filtered": {
"filter": {
"script": {
"script": "_source.scheduleDT.size() < 20"
}
}
}
}
}

How can I use CloudKit web services to query based on a reference field?

I've got two CloudKit data objects that look somewhat like this:
Parent Object:
{
  "records": [
    {
      "recordName": "14102C0A-60F2-4457-AC1C-601BC628BF47-184-000000012D225C57",
      "recordType": "ParentObject",
      "fields": {
        "fsYear": {
          "value": "2015",
          "type": "STRING"
        },
        "displayOrder": {
          "value": 2015221153856287200,
          "type": "INT64"
        },
        "fjpFSGuidForReference": {
          "value": "14102C0A-60F2-4457-AC1C-601BC628BF47-184-000000012D225C57",
          "type": "STRING"
        },
        "fsDateSearch": {
          "value": "2015221153856287158",
          "type": "STRING"
        }
      },
      "recordChangeTag": "id4w7ivn",
      "created": {
        "timestamp": 1439149087571,
        "userRecordName": "_0d26968032e31bbc72c213037b6cb35d",
        "deviceID": "A19CD995FDA3093781096AF5D818033A241D65C1BFC3D32EC6C5D6B3B4A9AA6B"
      },
      "modified": {
        "timestamp": 1439149087571,
        "userRecordName": "_0d26968032e31bbc72c213037b6cb35d",
        "deviceID": "A19CD995FDA3093781096AF5D818033A241D65C1BFC3D32EC6C5D6B3B4A9AA6B"
      }
    }
  ],
  "total": 1
}
Child Object:
{
  "records": [
    {
      "recordName": "2015221153856287168",
      "recordType": "ChildObject",
      "fields": {
        "District": {
          "value": "002",
          "type": "STRING"
        },
        "ZipCode": {
          "value": "12345",
          "type": "STRING"
        },
        "InspecReference": {
          "value": {
            "recordName": "14102C0A-60F2-4457-AC1C-601BC628BF47-184-000000012D225C57",
            "action": "NONE",
            "zoneID": {
              "zoneName": "_defaultZone"
            }
          },
          "type": "REFERENCE"
        }
      },
      "recordChangeTag": "id4w7lew",
      "created": {
        "timestamp": 1439149090856,
        "userRecordName": "_0d26968032e31bbc72c213037b6cb35d",
        "deviceID": "A19CD995FDA3093781096AF5D818033A241D65C1BFC3D32EC6C5D6B3B4A9AA6B"
      },
      "modified": {
        "timestamp": 1439149090856,
        "userRecordName": "_0d26968032e31bbc72c213037b6cb35d",
        "deviceID": "A19CD995FDA3093781096AF5D818033A241D65C1BFC3D32EC6C5D6B3B4A9AA6B"
      }
    }
  ],
  "total": 1
}
I'm trying to write a query to directly access the CloudKit web service and return the Child Object based on the reference of the parent object.
My test JSON looks something like this:
{"query":{"recordType":"ChildObject","filterBy":{"fieldName":"InspecReference","fieldValue":{ "value" : "14102C0A-60F2-4457-AC1C-601BC628BF47-184-000000012D225C57", "type" : "string" },"comparator":"EQUALS"}},"zoneID":{"zoneName":"_defaultZone"}}
However, I'm getting the following error from CloudKit:
{"uuid":"33db91f3-b768-4a68-9056-216ecc033e9e","serverErrorCode":"BAD_REQUEST","reason":"BadRequestException:
Unexpected input"}
I'm guessing I have the Record Field Dictionary in the query wrong. However, the documentation isn't clear on what this should look like on a reference object.
You have to re-create the actual object of the reference. In this particular case, the JSON looks like this:
{
"query": {
"recordType": "ChildObject",
"filterBy": {
"fieldName": "InspecReference",
"fieldValue": {
"value": {
"recordName": "14102C0A-60F2-4457-AC1C-601BC628BF47-184-000000012D225C57",
"action": "NONE"
},
"type": "REFERENCE"
},
"comparator": "EQUALS"
}
},
"zoneID": {
"zoneName": "_defaultZone"
}
}

ElasticSearch autocomplete returning 0 hits

I am trying to build an autocomplete feature for our database running on MongoDB. We need to provide autocomplete which lets users complete their queries by offering suggestions while they are typing in the search box.
I have a collection of articles from various sources, which has the following fields:
{
"title" : "Its the title of a random article",
"cont" : { "paragraphs" : [ .... ] },
and so on..
}
I went through a video by Clinton Gormley. From 37:00 through 42:00 minute, Gormley describes an autocomplete using edgeNGram. Also, I referred to this question to recognize that both are almost the same things, just the mappings differ.
So based on these experiences, I built almost identical settings and mappings and then restored the articles collection to ensure that it is indexed by ElasticSearch.
The indexing scheme is as follows:
POST /title_autocomplete/title
{
"settings": {
"analysis": {
"filter": {
"autocomplete": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 50
}
},
"analyzer": {
"title" : {
"type" : "standard",
"stopwords":[]
},
"autocomplete": {
"type" : "autocomplete",
"tokenizer": "standard",
"filter": ["lowercase", "autocomplete"]
}
}
}
},
"mappings": {
"title": {
"type": "multi_field",
"fields" : {
"title" : {
"type": "string",
"analyzer": "title"
},
"autocomplete" : {
"type": "string",
"index_analyzer": "autocomplete",
"search_analyzer" : "title"
}
}
}
}
}
But when I run the search query, I am unable to get any hits!
GET /title_autocomplete/title/_search
{
"query": {
"bool" : {
"must" : {
"match" : {
"title.autocomplete" : "Its the titl"
}
},
"should" : {
"match" : {
"title" : "Its the titl"
}
}
}
}
}
Can anybody please explain what's wrong with the mapping query or settings? I have been reading the ElasticSearch docs for over 7 days now but seem to get no further than full-text searches!
ElasticSearch version : 0.90.10
MongoDB version : v2.4.9
using _river
Ubuntu 12.04 64bit
UPDATE
I realised that mapping is screwed after applying previous settings:
GET /title_autocomplete/_mapping
{
"title_autocomplete": {
"title": {
"properties": {
"analysis": {
"properties": {
"analyzer": {
"properties": {
"autocomplete": {
"properties": {
"filter": {
"type": "string"
},
"tokenizer": {
"type": "string"
},
"type": {
"type": "string"
}
}
},
"title": {
"properties": {
"type": {
"type": "string"
}
}
}
}
},
"filter": {
"properties": {
"autocomplete": {
"properties": {
"max_gram": {
"type": "long"
},
"min_gram": {
"type": "long"
},
"type": {
"type": "string"
}
}
}
}
}
}
},
"content": {
... paras and all ...
}
"title": {
"type": "string"
},
"url": {
"type": "string"
}
}
}
}
}
Analyzers and filters are actually mapped into the document after the settings are applied whereas original title field is not affected at all! Is this normal??
I guess this explains why the query is not matching. There is no title.autocomplete field or title.title field at all.
So how should I proceed now?
For those facing this problem, it's better to delete the index and start again instead of wasting time with the _river, just as DrTech pointed out in the comment.
This saves time but is not a solution. (Therefore not marking it as answer.)
The key is to set up the mappings and index before you initiate the river.
We had an existing setup with a MongoDB river and an index called coresearch that we wanted to add autocomplete capability to. This is the set of commands we used to delete the existing index and river and start again.
Stack is:
ElasticSearch 1.1.1
MongoDB 2.4.9
ElasticSearchMapperAttachments v2.0.0
ElasticSearchRiverMongoDb/2.0.0
Ubuntu 12.04.2 LTS
curl -XDELETE "localhost:9200/_river/node"
curl -XDELETE "localhost:9200/coresearch"
curl -XPUT "localhost:9200/coresearch" -d '
{
"settings": {
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 20
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"autocomplete_filter"
]
}
}
}
}
}'
curl -XPUT "localhost:9200/coresearch/_mapping/users" -d '{
"users": {
"properties": {
"firstname": {
"type": "string",
"search_analyzer": "standard",
"index_analyzer": "autocomplete"
},
"lastname": {
"type": "string",
"search_analyzer": "standard",
"index_analyzer": "autocomplete"
},
"username": {
"type": "string",
"search_analyzer": "standard",
"index_analyzer": "autocomplete"
},
"email": {
"type": "string",
"search_analyzer": "standard",
"index_analyzer": "autocomplete"
}
}
}
}'
# Register the MongoDB river for the "users" database.
# All river options must live in ONE "options" object: when a JSON object
# repeats a key, parsers typically keep only the last occurrence, so a second
# "options" block would silently discard "exclude_fields".
curl -XPUT "localhost:9200/_river/node/_meta" -d '
{
  "type": "mongodb",
  "mongodb": {
    "servers": [
      { "host": "127.0.0.1", "port": 27017 }
    ],
    "options": {
      "exclude_fields": ["time"],
      "import_all_collections": true
    },
    "db": "users",
    "gridfs": false
  },
  "index": {
    "name": "coresearch",
    "type": "documents"
  }
}'