Elasticsearch indexing of Twitter bounding box not recognized as a geo_shape - rest

I'm trying to create an Elasticsearch mapping for Twitter's Place bounding_box array, and I can't get Elasticsearch to index it as a geo shape. In my app I will be getting the raw JSON from Twitter4j; however, Twitter does not close the bounding_box polygon (the first coordinate is not repeated as the last one), so for the purpose of this test I edited the JSON and closed it. I'm using Elastic Cloud (ES v5) with the REST API and then visualizing with Kibana.
Here is the mapping I'm trying to use. I've tried several variations with and without a "properties" block and none of them work. With this mapping I can PUT the mapping successfully, but when I POST the document, Kibana shows the bounding_box field as an unknown type.
The Point coordinates field is indexed as a geo_point just fine; it's the bounding box that isn't.
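For reference, the raw bounding_box that Twitter4j hands back has only the four corner points, while a geo_shape polygon ring must end with a repeat of its first coordinate, which is the edit I made by hand. Using the same coordinates as in the document further down, this:
"bounding_box": {
  "type": "polygon",
  "coordinates": [[[0.773779, 51.96971], [0.773779, 51.976437], [0.781794, 51.976437], [0.781794, 51.96971]]]
}
becomes this after closing the ring:
"bounding_box": {
  "type": "polygon",
  "coordinates": [[[0.773779, 51.96971], [0.773779, 51.976437], [0.781794, 51.976437], [0.781794, 51.96971], [0.773779, 51.96971]]]
}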
Here is my mapping:
PUT /testgeo
{
"mappings": {
"tweet": {
"_all": {
"enabled": false
},
"properties": {
"created_at": {
"type": "date",
"format": "EEE MMM dd HH:mm:ss Z YYYY||strict_date_optional_time||epoch_millis"
},
"coordinates": {
"properties": {
"coordinates": {
"type": "geo_point",
"ignore_malformed": true
}
}
},
"place": {
"properties": {
"bounding_box": {
"type": "geo_shape",
"tree": "quadtree",
"precision": "1m"
}
}
}
}
}
}
}
Here is a snippet of the document I am trying to POST (NOTE: I manually added the fifth coordinate pair to close the polygon ring).
POST /testgeo/tweet/1
{
...
"coordinates": {
"type": "point",
"coordinates": [
0.78055556,
51.97222222
]
},
"place": {
"id": "0c31a1a5b970086e",
"url": "https:\/\/api.twitter.com\/1.1\/geo\/id\/0c31a1a5b970086e.json",
"place_type": "city",
"name": "Bures",
"full_name": "Bures, England",
"country_code": "GB",
"country": "United Kingdom",
"bounding_box": {
"type": "polygon",
"coordinates": [
[
[
0.773779,
51.96971
],
[
0.773779,
51.976437
],
[
0.781794,
51.976437
],
[
0.781794,
51.96971
],
[
0.773779,
51.96971
]
]
]
},
"attributes": {
}
},
If anyone can identify the reason for this and correct it, I would be most appreciative.
NOTE 1: I tried using the mapping and document examples from Elastic's geo_shape documentation page, and Kibana again showed the location field as an unknown type.
PUT /testgeo
{
"mappings": {
"tweet": {
"_all": {
"enabled": false
},
"properties": {
"location": {
"type": "geo_shape",
"tree": "quadtree",
"precision": "1m"
}
}
}
}
}
POST /testgeo/tweet/1
{
"location" : {
"type" : "polygon",
"coordinates" : [
[ [100.0, 0.0], [101.0, 0.0], [101.0, 1.0], [100.0, 1.0], [100.0, 0.0] ]
]
}
}

Turns out that Kibana simply does not reflect the type for geo_shape fields. When doing a geo query, however, Elasticsearch returns correct results.
For example:
GET /testgeo/tweet/_search
{
"query": {
"bool": {
"must": {
"match_all": {}
},
"filter": {
"geo_shape": {
"place.bounding_box": {
"shape": {
"type": "polygon",
"coordinates": [
[
[
0.773779,
51.96971
],
[
0.773779,
51.976437
],
[
0.781794,
51.976437
],
[
0.781794,
51.96971
],
[
0.773779,
51.96971
]
]
]
},
"relation": "within"
}
}
}
}
}
}

Even though you seem to have found a solution to your problem, I just wanted to say that there is now a fix for this issue: the coerce option in the geo_shape mapping, like so:
"properties": {
"bounding_box": {
"type": "geo_shape",
"tree": "quadtree",
"precision": "1m",
"coerce": true
}
}
Also see:
https://github.com/elastic/elasticsearch/pull/11161
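As a sketch of how that would look end to end (assuming the same testgeo index and tweet type as in the question), with coerce enabled the unclosed four-point ring that Twitter returns should index as-is, since Elasticsearch closes the ring for you while parsing the shape:

PUT /testgeo
{
  "mappings": {
    "tweet": {
      "properties": {
        "place": {
          "properties": {
            "bounding_box": {
              "type": "geo_shape",
              "tree": "quadtree",
              "precision": "1m",
              "coerce": true
            }
          }
        }
      }
    }
  }
}

POST /testgeo/tweet/1
{
  "place": {
    "bounding_box": {
      "type": "polygon",
      "coordinates": [
        [
          [0.773779, 51.96971],
          [0.773779, 51.976437],
          [0.781794, 51.976437],
          [0.781794, 51.96971]
        ]
      ]
    }
  }
}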

Related

MongoDB - strange inconsistency in geowithin polygon search

Using MongoDB, I'm stuck with a strange problem: the polygon's position on the map doesn't match the polygon points found by a $geoWithin query.
Based on the UI view, this point shouldn't be inside the polygon, yet it is found by the Mongo query:
-88.35311589225228, 50.46582815393761
I checked different versions of MongoDB, drivers, etc., but that didn't help. Any ideas what the issue could be, or is it just a difference in how the polygon is interpreted?
DB Query:
db.device.find({
'location': {
$geoWithin: {
$geometry: {
type: 'Polygon',
coordinates: [[[-94.43847656250001, 52.855864177853995], [-87.89062500000001, 45.9511496866914], [-79.93652343750001, 46.55886030311719], [-94.43847656250001, 52.855864177853995]]] }
}
}
}
);
Visual representation:
https://geojson.io/#map=5.06/50.09/-88.52
{
"type": "FeatureCollection",
"features": [
{
"type": "Feature",
"properties": {},
"geometry": {
"type": "Point",
"coordinates": [
-88.35311589225228, 50.46582815393761
]
}
},
{
"type": "Feature",
"properties": {},
"geometry": {
"type": "Polygon",
"coordinates": [
[
[
-94.43847656250001,
52.855864177853995
],
[
-87.89062500000001,
45.9511496866914
],
[
-79.93652343750001,
46.55886030311719
],
[
-94.43847656250001,
52.855864177853995
]
]
]
}
}
]
}

Can I Use MongoDB Geospatial Query to Find Closest LineString to Point?

I have a collection of GeoJSON LineString objects and need to determine which is closest to a point. I don't have much experience with MongoDB, but I have used $geoNear to find the closest points. Is there a way to adapt this to work with a collection of LineStrings?
Example collection:
{
"_id": ObjectId("5ee3e2deee404124a8ba4382"),
"geoJSON": {
"type": "Feature",
"geometry": {
"type": "LineString",
"coordinates": [
[
-85.5,
31.0
],
[
-85.6,
31.0
]
]
}
}
}
{
"_id": ObjectId("5ee3e2deee404124a8ba4383"),
"geoJSON": {
"type": "Feature",
"geometry": {
"type": "LineString",
"coordinates": [
[
-85.55,
31.5
],
[
-85.6,
31.5
]
]
}
}
}
{
"_id": ObjectId("5ee3e2deee404124a8ba4384"),
"geoJSON": {
"type": "Feature",
"geometry": {
"type": "LineString",
"coordinates": [
[
-85.5,
32.0
],
[
-85.6,
32.0
]
]
}
}
}
I'd like to search this collection to determine which line is closest to the point [-85.55, 31.77]. This should return the third line (the blue line in the plot below). Is there a way to do this efficiently in MongoDB?
[Plot of the three LineStrings and the query point]
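One way this might work - a minimal sketch, assuming a hypothetical collection named lines and a 2dsphere index on the geoJSON.geometry field; $geoNear measures the distance to the nearest point of each indexed geometry, so LineStrings are supported the same way Points are:

// 2dsphere indexes cover LineString geometries as well as Points
db.lines.createIndex({ "geoJSON.geometry": "2dsphere" })

// Documents come back sorted by distance; $limit keeps only the nearest line
db.lines.aggregate([
  {
    $geoNear: {
      near: { type: "Point", coordinates: [-85.55, 31.77] },
      distanceField: "distanceMeters",
      key: "geoJSON.geometry",   // names the indexed field (MongoDB 4.0+); omit on older versions
      spherical: true
    }
  },
  { $limit: 1 }
])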

MongoDB geointersect fail to find big polygons

I'm using Mongo to store some geolocalized data, with the goal of retrieving it using $geoIntersects. Specifically, I have this document stored in my db:
{
"loc": {
"geometry": {
"type": "Polygon",
"coordinates": [
[
[
-179.875,
-89.875
],
[
179.875,
-89.875
],
[
179.875,
89.875
],
[
-179.875,
89.875
],
[
-179.875,
-89.875
]
]
]
},
"crs": {
"type": "name",
"properties": {
"name": "urn:x-mongodb:crs:strictwinding:EPSG:4326"
}
},
"type": "Feature",
"properties": {}
},
"_id": "576af8e31d41c87fa1f1d04f"
}
Which, as you may notice, covers almost the entire world. Now, no matter what coordinates I use in my $geoIntersects query, the document is never returned... Does anyone know why?
An example of the query I'm using could be:
[
{
"loc.geometry": {
"$geoIntersects": {
"$geometry": {
"type": "Polygon",
"coordinates": [
[
[
13.4307861328125,
41.599013054830216
],
[
13.9801025390625,
41.599013054830216
],
[
13.9801025390625,
41.80407814427234
],
[
13.4307861328125,
41.80407814427234
],
[
13.4307861328125,
41.599013054830216
]
]
]
}
}
}
},
{
"loc": 1
}
]
Of course the coordinates of the query are contained inside the polygon in the saved document (I mean... it's as big as the world!), but for some reason it doesn't find any match... I'm kinda lost.
Just looking into this now: if the polygon is solid and bigger than a hemisphere of the earth, Mongo returns everything outside of that area. Check out the big polygon and CRS sections in the $geoWithin documentation.
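For reference, the pattern that documentation describes is to attach the custom strict-winding CRS to the query's $geometry when the query polygon is larger than a hemisphere. A sketch using the near-whole-world ring from the question flipped onto the query side (the collection name is hypothetical, and whether this addresses the stored-document case above is a separate question):

// Big-polygon query form from the MongoDB docs: the $geometry carries the
// strict-winding CRS and its single ring must be wound counter-clockwise.
db.mycollection.find({
  "loc.geometry": {
    $geoIntersects: {
      $geometry: {
        type: "Polygon",
        coordinates: [[[-179.875, -89.875], [179.875, -89.875],
                       [179.875, 89.875], [-179.875, 89.875],
                       [-179.875, -89.875]]],
        crs: {
          type: "name",
          properties: { name: "urn:x-mongodb:crs:strictwinding:EPSG:4326" }
        }
      }
    }
  }
})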

Getting Lifetime Values from Google Analytics API

Google Analytics API documentation shows that, for fetching lifetime values, date ranges should not be specified. But when I make such a request (without a date range), it returns an empty dimensions and metrics result. When I use a date range, it returns dimension and metric values for that date range.
The following is an excerpt from the API documentation :
Date ranges should not be specified for cohorts or Lifetime value
requests.
For example, if I make the request without date range, as follows:
{
"reportRequests": [
{
"viewId": "XXXXXXXXX",
"dimensions": [
{
"name": "ga:date"
},
{
"name": "ga:eventLabel"
}
],
"metrics": [
{
"expression": "ga:totalEvents"
}
]
}
]
}
I get the following response:
{
"reports": [
{
"columnHeader": {
"dimensions": [
"ga:date",
"ga:eventLabel"
],
"metricHeader": {
"metricHeaderEntries": [
{
"name": "ga:totalEvents",
"type": "INTEGER"
}
]
}
},
"data": {
"totals": [
{
"values": [
"0"
]
}
]
}
}
]
}
However, if I include the date range,
{
"reportRequests": [
{
"viewId": "XXXXXXXX",
"dimensions": [
{
"name": "ga:date"
},
{
"name": "ga:eventLabel"
}
],
"metrics": [
{
"expression": "ga:totalEvents"
}
],
"dateRanges": [
{
"startDate": "2016-01-01",
"endDate": "2016-04-30"
}
]
}
]
}
I get the following response:
{
"reports": [
{
"columnHeader": {
"dimensions": [
"ga:date",
"ga:eventLabel"
],
"metricHeader": {
"metricHeaderEntries": [
{
"name": "ga:totalEvents",
"type": "INTEGER"
}
]
}
},
"data": {
"rows": [
{
"dimensions": [
"20160412",
"http://mytestblog.com/"
],
"metrics": [
{
"values": [
"1"
]
}
]
},
{
"dimensions": [
"20160412",
"http://mytestblog.com/2016/04/first-post.html"
],
"metrics": [
{
"values": [
"3"
]
}
]
},
{
"dimensions": [
"20160419",
"http://mytestblog.com/"
],
"metrics": [
{
"values": [
"4"
]
}
]
},
{
"dimensions": [
"20160419",
"http://mytestblog.com/2016/04/fourth.html"
],
"metrics": [
{
"values": [
"13"
]
}
]
}
],
"totals": [
{
"values": [
"21"
]
}
],
"rowCount": 4,
"minimums": [
{
"values": [
"1"
]
}
],
"maximums": [
{
"values": [
"13"
]
}
]
}
}
]
}
Why is it that, even though the documentation says otherwise, I have to specify a date range in the ReportRequest to get the values? Am I misunderstanding the meaning of lifetime values here?
The reportRequest object should have either a value for dateRanges or a definition for cohortGroup. When you omit both, the request assumes the default values of a startDate of 7daysAgo and an endDate of yesterday.
The correct interpretation of the docs is that the reportRequest should not have a dateRange defined for cohort and LTV requests. But in order to make a cohort or lifetime value request, you must add a cohort definition. For lifetime value requests, each cohort definition should have a specific dateRange, in addition to the lifetimeValue field being set to true:
POST https://analyticsreporting.googleapis.com/v4/reports:batchGet
{
"reportRequests": [
{
"viewId": "XXXX",
"dimensions": [
{"name": "ga:cohort" },
{"name": "ga:cohortNthWeek" }],
"metrics": [
{"expression": "ga:cohortTotalUsersWithLifetimeCriteria"},
{"expression": "ga:cohortRevenuePerUser"}
],
"cohortGroup": {
"cohorts": [{
"name": "cohort 1",
"type": "FIRST_VISIT_DATE",
"dateRange": {
"startDate": "2015-08-01",
"endDate": "2015-09-01"
}
},
{
"name": "cohort 2",
"type": "FIRST_VISIT_DATE",
"dateRange": {
"startDate": "2015-07-01",
"end_date": "2015-08-01"
}
}],
"lifetimeValue": True
}
}]
}

Elasticsearch - query dates without a specified timezone

I have an index with the following mappings - the standard format for a date. In the second record below, the time specified is actually a local time, but ES treats it as UTC.
Even though ES internally converts all parsed datetimes to UTC, it must obviously store the original string as well.
My question is whether (and how) it might be possible to query all records for which the scheduleDT value doesn't have the timezone explicitly specified.
{
"curator_v3": {
"mappings": {
"published": {
"analyzer": "classic",
"numeric_detection": true,
"properties": {
"Id": {
"type": "string",
"index": "not_analyzed",
"include_in_all": false
},
"createDT": {
"type": "date",
"format": "dateOptionalTime",
"include_in_all": false
},
"scheduleDT": {
"type": "date",
"format": "dateOptionalTime",
"include_in_all": false
},
"title": {
"type": "string",
"fields": {
"english": {
"type": "string",
"analyzer": "english"
},
"raw": {
"type": "string",
"index": "not_analyzed"
},
"shingle": {
"type": "string",
"analyzer": "shingle"
},
"spanish": {
"type": "string",
"analyzer": "spanish"
}
},
"include_in_all": false
}
}
}
}
}
}
We use .NET as our client to ElasticSearch and haven't been consistent in specifying a timezone for the scheduleDT field.
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 12,
"successful": 12,
"failed": 0
},
"hits": {
"total": 32,
"max_score": null,
"hits": [
{
"_index": "curator_v3",
"_type": "published",
"_id": "29651227",
"_score": null,
"fields": {
"Id": [
"29651227"
],
"scheduleDT": [
"2015-11-21T22:17:51.0946798-06:00"
],
"title": [
"97 Year-Old Woman Cries Tears Of Joy After Finally Getting Her High School Diploma"
],
"createDT": [
"2015-11-21T22:13:32.3597142-06:00"
]
},
"sort": [
1448165871094
]
},
{
"_index": "curator_v3",
"_type": "published",
"_id": "210466413",
"_score": null,
"fields": {
"Id": [
"210466413"
],
"scheduleDT": [
"2015-11-22T12:00:00"
],
"title": [
"6 KC treats to bring to Thanksgiving"
],
"createDT": [
"2015-11-20T15:08:25.4282-06:00"
]
},
"sort": [
1448193600000
]
}
]
},
"aggregations": {
"ScheduleDT": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 27,
"buckets": [
{
"key": 1448165871094,
"key_as_string": "2015-11-22T04:17:51.094Z",
"doc_count": 1
},
{
"key": 1448193600000,
"key_as_string": "2015-11-22T12:00:00.000Z",
"doc_count": 4
}
]
}
}
}
You can do this by querying for documents whose scheduleDT value is shorter than 20 characters (e.g. 2015-11-22T12:00:00). All date values with an explicit time zone would be longer.
Something like this should do:
{
"query": {
"filtered": {
"filter": {
"script": {
"script": "doc.scheduleDT.value.size() < 20"
}
}
}
}
}
Note, however, that in order to make your queries easier to create, you should always try to convert all your timestamps to UTC before indexing your documents.
Finally, also make sure that you have dynamic scripting enabled in order to run the above query.
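For reference, dynamic (inline) scripting is enabled in elasticsearch.yml, and the exact setting name depends on the ES version, so treat this as a sketch rather than a definitive config:

# elasticsearch.yml
# ES 1.x:
script.disable_dynamic: false
# ES 2.x:
script.inline: true
script.indexed: true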
UPDATE
Actually, if you use the _source directly in the script it will work because it will return the real value from the source as it was when the document was indexed:
{
"query": {
"filtered": {
"filter": {
"script": {
"script": "_source.scheduleDT.size() < 20"
}
}
}
}
}