Jolt to convert a JSON array into a single element

Shown below are 2 records from the JSON document (I will have more than 2):
[{
"abc": [{
"value": 44.636914,
"testName": "1 Month",
"date": "2020-02-14"
},
{
"value": -0.117243,
"testName": "1 Week",
"date": "2020-01-21"
}],
"currency": "CAD",
"date2": "2020-01-14",
"id": "W38450"
}
{
"abc": [{
"value": -0.873175,
"testName": "1 Month",
"date": "2020-02-14"
},
{
"value": 0.005599,
"testName": "1 Week",
"date": "2020-01-21"
},
{
"value": 0.001231,
"testName": "11 Week",
"date": "2020-02-21"
}],
"currency": "CAD",
"date2": "2020-01-14",
"id": "W38209"
}]
I would like to use Jolt to convert the array in "abc" into a single string value.
Desired output:
[{
"abc": "[{'value': 44.636914,'testName': '1 Month','date': '2020-02-14'},{'value': -0.117243,'testName': '1 Week','date': '2020-01-21'}]",
"currency": "CAD",
"date2": "2020-01-14",
"id": "W38450"
}
{
"abc": "[{'value': -0.873175,'testName': '1 Month','date': '2020-02-14'},{'value': 0.005599,'testName': '1 Week','date': '2020-01-21'},{'value': 0.001231,'testName': '11 Week','date': '2020-02-21'}]",
"currency": "CAD",
"date2": "2020-01-14",
"id": "W38209"
}]
Note that the array size of "abc" is not constant, and I think it is also necessary to change the double quotes (") to single quotes (').
Please advise a solution using Jolt.

Here is a sample of the structure. What is left to do is to combine the pieces with a concat function.
[
{
"operation": "shift",
"spec": {
"*": {
"abc": {
"*": {
"*": {
"$": "[&4].toConcat[&2]",
"#": "[&4].toConcat[&2]"
}
}
},
"currency": "[&1].currency",
"date2": "[&1].date2",
"id": "[&1].id"
}
}
},
{
"operation": "modify-default-beta",
"spec": {
"*": {
"abc": "=join(',',#(1,toConcat))"
}
}
}
]
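As a rough, untested sketch of that remaining step: appended to the chain above, one more modify-overwrite-beta using =concat could wrap the joined abc string in the square brackets shown in the desired output, and a remove operation could drop the helper toConcat field (the @ lookup syntax here is my assumption, not taken from the question):
{
"operation": "modify-overwrite-beta",
"spec": {
"*": {
"abc": "=concat('[', @(1,abc), ']')"
}
}
},
{
"operation": "remove",
"spec": {
"*": {
"toConcat": ""
}
}
}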

Related

Group results in Jolt

I am trying to group some data into arrays based on a JSON field.
[
{
"Type": "Cash",
"Value1": "668131AA3",
"ParentId": "87984E01",
"CashId": "6688E582",
"StockId": null
},
{
"Type": "Stock",
"Value1": "668131AA3",
"ParentId": "87984E01",
"CashId": "1FD714A9",
"StockId": "1FD714A9"
},
{
"Type": "Stock",
"Value1": "668131AA3",
"ParentId": "87984E01",
"CashId": "0635B045",
"StockId": "0635B045"
},
{
"Type": "Cash",
"Value1": "668131AA3",
"ParentId": "87984E01",
"CashId": "47E65472",
"StockId": null
}
]
And the desired output is:
{
"Value1": "668131AA3",
"ParentId": "87984E01",
"CashPayouts" : [
{"CashId": "6688E582"},
{"CashId": "47E65472"}
],
"StockPayouts" : [
{"StockId": "1FD714A9","CashId": "1FD714A9"},
{"StockId": "0635B045","CashId": "0635B045"}
]
}
Basically, if Type=Cash, the CashId field should be grouped into a CashPayouts array. For Type=Stock, both fields should be grouped into a StockPayouts array.
Is it possible to do this transformation in Jolt? If so, how?
Thanks,
Juan
This should be the complete example:
[
{
"operation": "shift",
"spec": {
"*": {
"Value1": "Value1",
"ParentId": "ParentId",
"Type": {
"Stock": {
"#(2)": {
"CashId": "t.StockPayouts[&4].CashId",
"StockId": "t.StockPayouts[&4].StockId"
}
},
"Cash": {
"#(2)": {
"CashId": "t.CashPayouts[&4].CashId"
}
}
}
}
}
},
{
"operation": "modify-overwrite-beta",
"spec": {
"*": "=recursivelySquashNulls",
"Value1": "=firstElement(#(1,Value1))",
"ParentId": "=firstElement(#(1,ParentId))"
}
},
{
"operation": "shift",
"spec": {
"Value1": "Value1",
"ParentId": "ParentId",
"t": ""
}
}
]

JOLT Spec for supporting the input json

My input JSON looks like the below, but I am not sure how to do the internal array-related parameter transformation using JOLT. Any help is appreciated, as I am new to JOLT.
{
"pktId": 7603,
"seq": 1,
"vehicleNumber": "66079",
"rmdLocation": "1",
"rmdTime": "2019-01-07T11:27:05.745Z",
"position": {
"lat": 55.4911232,
"lng": -3.686831
},
"dataSource": 11,
"frames": [
{
"seq": 0,
"card": 8,
"channel": 6,
"value": 117
},
{
"seq": 1,
"card": 8,
"channel": 6,
"value": 120
}
]
}
Below is the spec file I have created, but it is not complete:
[
{
"operation": "shift",
"spec": {
"frames": {
"*": {
"seq": "parameters[&1].seq",
"card": "parameters[&1].card",
"channel": "parameters[&1].channel",
"value": "parameters[&1].value"
}
},
"rmdTime": "messageTime",
"vehicleNumber": "roadNumber"
}
},
{
"operation": "default",
"spec": {
"appId": "configMsgXX",
"customerId": "ABC",
"messageRev": 1,
"messageType": "customStatistics"
}
}
]
The expected output is as below:
{
"appId": "configMsgXX",
"customerId": "ABC",
"deviceId": string1+roadNumber+string2",
"messageRev": 1,
"messageTime": 1543395341000,
"messageType": "customStatistics",
"parameters": [
{
"address": string1+string2,
"name": "EM2000VoltageMainGenerator",
"timestamp": 1543395341000,
"quality": "3",
"datatype": "INTEGER",
"value": 100,
"qualityReason": "Stale Data",
"category": "REAL"
}
],
"roadNumber": 66079
}
I am using this library https://github.com/bazaarvoice/jolt
[
{
"operation": "shift",
"spec": {
"frames": {
"*": {
"seq": "parameters[&1].seq",
"card": "parameters[&1].card",
"channel": "parameters[&1].channel",
"value": "parameters[&1].value",
"#(2,rmdTime)": "parameters[&1].timestamp"
}
},
"rmdTime": "messageTime",
"vehicleNumber": "roadNumber"
}
},
{
"operation": "modify-overwrite-beta",
"spec": {
"parameters": {
"*": {
"quality": "3",
"name": "",
"address": "",
"datatype": "INTEGER",
"qualityReason": "Stale Data",
"category": "REAL"
}
}
}
},
{
"operation": "default",
"spec": {
"appId": "configMsgXX",
"customerId": "ABC",
"messageRev": 1,
"messageType": "customStatistics"
}
}
]
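The expected output also contains a deviceId built by concatenating fixed strings with the roadNumber. If those strings are known constants, one more modify step using =concat might produce it; this is only a sketch, and 'string1'/'string2' below are just the placeholders from the expected output, not real values:
{
"operation": "modify-overwrite-beta",
"spec": {
"deviceId": "=concat('string1', @(1,roadNumber), 'string2')"
}
}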

Elasticsearch - query dates without a specified timezone

I have an index with the following mappings - a standard format for a date. In the 2nd record below, the time specified is actually a local time, but ES treats it as UTC.
Even though ES internally converts all parsed datetimes to UTC, it must obviously store the original string as well.
My question is whether (and how) it might be possible to query all records for which the scheduleDT value doesn't have the timezone explicitly specified.
{
"curator_v3": {
"mappings": {
"published": {
"analyzer": "classic",
"numeric_detection": true,
"properties": {
"Id": {
"type": "string",
"index": "not_analyzed",
"include_in_all": false
},
"createDT": {
"type": "date",
"format": "dateOptionalTime",
"include_in_all": false
},
"scheduleDT": {
"type": "date",
"format": "dateOptionalTime",
"include_in_all": false
},
"title": {
"type": "string",
"fields": {
"english": {
"type": "string",
"analyzer": "english"
},
"raw": {
"type": "string",
"index": "not_analyzed"
},
"shingle": {
"type": "string",
"analyzer": "shingle"
},
"spanish": {
"type": "string",
"analyzer": "spanish"
}
},
"include_in_all": false
}
}
}
}
}
}
We use .NET as our client to ElasticSearch and haven't been consistent in specifying a timezone for the scheduleDT field.
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 12,
"successful": 12,
"failed": 0
},
"hits": {
"total": 32,
"max_score": null,
"hits": [
{
"_index": "curator_v3",
"_type": "published",
"_id": "29651227",
"_score": null,
"fields": {
"Id": [
"29651227"
],
"scheduleDT": [
"2015-11-21T22:17:51.0946798-06:00"
],
"title": [
"97 Year-Old Woman Cries Tears Of Joy After Finally Getting Her High School Diploma"
],
"createDT": [
"2015-11-21T22:13:32.3597142-06:00"
]
},
"sort": [
1448165871094
]
},
{
"_index": "curator_v3",
"_type": "published",
"_id": "210466413",
"_score": null,
"fields": {
"Id": [
"210466413"
],
"scheduleDT": [
"2015-11-22T12:00:00"
],
"title": [
"6 KC treats to bring to Thanksgiving"
],
"createDT": [
"2015-11-20T15:08:25.4282-06:00"
]
},
"sort": [
1448193600000
]
}
]
},
"aggregations": {
"ScheduleDT": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 27,
"buckets": [
{
"key": 1448165871094,
"key_as_string": "2015-11-22T04:17:51.094Z",
"doc_count": 1
},
{
"key": 1448193600000,
"key_as_string": "2015-11-22T12:00:00.000Z",
"doc_count": 4
}
]
}
}
}
You can do this by querying for documents whose scheduleDT value is shorter than 20 characters (e.g. 2015-11-22T12:00:00); all the date values with an explicitly specified time zone would be longer.
Something like this should do:
{
"query": {
"filtered": {
"filter": {
"script": {
"script": "doc.scheduleDT.value.size() < 20"
}
}
}
}
}
Note, however, that in order to make your queries easier to create, you should always try to convert all your timestamps to UTC before indexing your documents.
Finally, also make sure that you have dynamic scripting enabled in order to run the above query.
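On the 1.x releases where the filtered query above applies, that usually means setting the following in elasticsearch.yml and restarting the node (the exact flag name varies by version, so treat this as an assumption to verify against your release):
script.disable_dynamic: false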
UPDATE
Actually, if you use the _source directly in the script it will work because it will return the real value from the source as it was when the document was indexed:
{
"query": {
"filtered": {
"filter": {
"script": {
"script": "_source.scheduleDT.size() < 20"
}
}
}
}
}

Elastic Search: Any way to make space-separated words in a comma-separated list regarded as one term?

I don't know if this is possible, but I'm trying to search by locations with an "exact search" option. There are a couple of fields that get searched, with the most important one being the "location_raw" field:
"match": {
"location.location_raw": {
"type": "boolean",
"operator": "AND",
"query": "[location query]",
"analyzer": "standard"
}
}
The location_raw field is a location string with a comma between each place, such as "Sudbury, Middlesex, Massachusetts" or "Leamington, Warwickshire, England". If someone searches for "Sudbury, Middlesex" it gets passed in as
"query": "Sudbury Middlesex"
and both of those terms must exist in the location_raw field. This part works.
The problem is that when the location_raw field contains a multi-word location, like New York or Saint George, it gets returned when someone searches for "York" or "George." If I do an exact search for "George," I do not want to get results for "Saint George." Is there any way to make Elastic consider "Saint George" one term in the string "Saint George, Stamford, Lincoln, England"?
Here's one way to do it, but you have to query in csv too, or use a terms filter.
I used a pattern analyzer with a simple pattern: ", ". I set up a simple index with a single document:
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"analysis": {
"analyzer": {
"csv": {
"type": "pattern",
"pattern": ", ",
"lowercase": false
}
}
}
},
"mappings": {
"doc": {
"properties": {
"location": {
"type": "string",
"index_analyzer": "csv",
"search_analyzer": "standard",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
POST /test_index/_bulk
{"index":{"_index":"test_index","_type":"doc","_id":1}}
{"location":"Saint George, Stamford, Lincoln, England"}
I can see the terms generated with a simple terms aggregation:
POST /test_index/_search?search_type=count
{
"aggs": {
"location_terms": {
"terms": {
"field": "location"
}
}
}
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"location_terms": {
"buckets": [
{
"key": "England",
"doc_count": 1
},
{
"key": "Lincoln",
"doc_count": 1
},
{
"key": "Saint George",
"doc_count": 1
},
{
"key": "Stamford",
"doc_count": 1
}
]
}
}
}
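As a quicker sanity check that doesn't require an aggregation, the _analyze API run against the same index should show the tokens the csv analyzer produces (same index and analyzer as defined above; untested sketch):
POST /test_index/_analyze?analyzer=csv
Saint George, Stamford, Lincoln, England
This should return "Saint George", "Stamford", "Lincoln", and "England" as separate tokens.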
And then if I query with the same csv syntax, the document isn't returned for "George, England":
POST /test_index/_search
{
"query": {
"match": {
"location": {
"type": "boolean",
"operator": "AND",
"query": "George, England",
"analyzer": "csv"
}
}
}
}
...
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
but it is returned for "Saint George, England":
POST /test_index/_search
{
"query": {
"match": {
"location": {
"type": "boolean",
"operator": "AND",
"query": "Saint George, England",
"analyzer": "csv"
}
}
}
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.2169777,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 0.2169777,
"_source": {
"location": "Saint George, Stamford, Lincoln, England"
}
}
]
}
}
This query is equivalent, and probably more performant:
POST /test_index/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"terms": {
"location": [
"Saint George",
"England"
],
"execution": "and"
}
}
}
}
}
Here's the code I used to test it:
http://sense.qbox.io/gist/234ea93accb7b20ad8fd33e62fe92f1d450a51ab

Get the first document using $in with mongodb

How can I get the first element using $in in Mongo?
If I have a list like ['car', 'house', 'cat', 'dog'], and a collection which contains many documents with these elements, I'd like to find the first document which contains cat, the first which contains dog, etc.
I've tried to use limit(), but it gives me only one document, which can be either car, or dog, or cat, etc.
Is there a way to combine a limit with $in?
Thanks
EDIT:
Example of the data I have:
{
"_id": {
"$oid": "51d53ace9e674607e837d62d"
},
"sensors": [{
"name": "os-hostname",
"value": "yahourt"
}, {
"name": "os-domain-name",
"value": ""
}, {
"name": "os-platform",
"value": "Win32NT"
}, {
"name": "os-fullname",
"value": "Microsoft Windows XP Professional"
}, {
"name": "os-version",
"value": "5.1.2600.131072"
}],
"type": "os",
"serial": "2_os_os-hostname_yahourt"
} {
"_id": {
"$oid": "51d53ace9e674607e837d62e"
},
"sensors": [{
"name": "cpu-id",
"value": "_Total"
}, {
"name": "cpu-usage",
"value": 37.2257042
}],
"type": "cpu",
"serial": "2_cpu_cpu-id_total"
} {
"_id": {
"$oid": "51d53ace9e674607e837d62f"
},
"sensors": [{
"name": "cpu-id",
"value": "0"
}, {
"name": "cpu-usage",
"value": 48.90282
}],
"type": "cpu",
"serial": "2_cpu_cpu-id_0"
} {
"_id": {
"$oid": "51d53ace9e674607e837d630"
},
"sensors": [{
"name": "cpu-id",
"value": "1"
}, {
"name": "cpu-usage",
"value": 25.54859
}],
"type": "cpu",
"serial": "2_cpu_cpu-id_1"
} {
"_id": {
"$oid": "51d53ace9e674607e837d631"
},
"sensors": [{
"name": "volume-name",
"value": "C:"
}, {
"name": "volume-label",
"value": ""
}, {
"name": "volume-total-size",
"value": "52427898880"
}, {
"name": "volume-total-free-space",
"value": "20305170432"
}, {
"name": "volume-percent-free-space",
"value": "38"
}, {
"name": "volume-reads-per-second",
"value": 0.0
}, {
"name": "volume-writes-per-second",
"value": 9.324152
}, {
"name": "volume-read-bytes-per-second",
"value": 0.0
}, {
"name": "volume-write-bytes-per-second",
"value": 194141.6
}, {
"name": "volume-queue-length",
"value": 0.0
}],
"type": "disk",
"serial": "2_disk_volume-name_c"
}
You cannot add a limit to $in but you could cheat by using the aggregation framework:
db.collection.aggregate([
{$match:{serial:{$in:[list_of_serials]}}},
{$sort:{_id:-1}},
{$group:{_id:'$serial',type:{$first:'$type'},sensors:{$first:'$sensors'},id:{$first:'$_id'}}}
]);
This would get a list of the first match found for each serial.
Edit
The updated pipeline above will get the last inserted document for each serial, according to the _id, because of the descending sort.
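If you want the first inserted document per serial rather than the last, the same pipeline with the sort flipped to ascending should do it (an untested variation of the answer above; list_of_serials stays a placeholder):
db.collection.aggregate([
{$match:{serial:{$in:[list_of_serials]}}},
{$sort:{_id:1}},
{$group:{_id:'$serial',type:{$first:'$type'},sensors:{$first:'$sensors'},id:{$first:'$_id'}}}
]);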