How to handle a nested array in Druid - druid

My json is as below:
{
"id":11966121,
"employer_id":175,
"account_attributes":[
{
"id":155387028,
"is_active":false,
"created_at":"2018-06-06T02:12:25.243Z",
"updated_at":"2021-03-15T17:38:04.598Z"
},
{
"id":155387062,
"is_active":true,
"created_at":"2018-06-06T02:12:25.243Z",
"updated_at":"2021-03-15T17:38:04.598Z"
}
],
"created_at":"2017-12-13T18:31:04.000Z",
"updated_at":"2021-03-14T23:50:43.180Z"
}
I want to parse the message and have a table with the account_attributes flattened.
Given the sample payload, the output should have two rows:
id       | account_attributes_id | is_active | created_at               | updated_at
11966121 | 155387028             | false     | 2018-06-06T02:12:25.243Z | 2021-03-15T17:38:04.598Z
11966121 | 155387062             | true      | 2018-06-06T02:12:25.243Z | 2021-03-15T17:38:04.598Z
Is this possible?
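For what it's worth, an ingestion-time flattenSpec alone won't produce this, because flattening emits one row per input record; on recent Druid versions the array can instead be exploded at query time with UNNEST. A hedged sketch, not a drop-in solution: the datasource name "employers" is hypothetical, and it assumes Druid 26+ with account_attributes ingested as a nested JSON (COMPLEX<json>) column:
-- Sketch only: hypothetical datasource "employers"; requires UNNEST and
-- nested JSON column support (Druid 26+).
SELECT
  "id",
  JSON_VALUE(attr, '$.id')         AS account_attributes_id,
  JSON_VALUE(attr, '$.is_active')  AS is_active,
  JSON_VALUE(attr, '$.created_at') AS created_at,
  JSON_VALUE(attr, '$.updated_at') AS updated_at
FROM "employers"
CROSS JOIN UNNEST(JSON_QUERY_ARRAY("account_attributes", '$')) AS u(attr)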

Related

Sort by a JSON element at a nested level for jsonb data - postgresql

I have the below table in PostgreSQL, which stores JSON data in a jsonb column.
CREATE TABLE "Trial" (
id SERIAL PRIMARY KEY,
data jsonb
);
Below is the sample JSON structure:
{
"id": "000000007001593061",
"core": {
"groupCode": "DVL",
"productType": "ZDPS",
"productGroup": "005001000"
},
"plants": [
{
"core": {
"mrpGroup": "ZMTS",
"mrpTypeDesc": "MRP",
"supLeadTime": 777
},
"storageLocation": [
{
"core": {
"storageLocation": "H050"
}
},
{
"core": {
"storageLocation": "H990"
}
},
{
"core": {
"storageLocation": "HM35"
}
}
]
}
],
"discriminator": "Material"
}
These are the scripts to insert the JSON data:
INSERT INTO "Trial"(data)
VALUES(CAST('{"id":"000000007001593061","core":{"groupCode":"DVL","productType":"ZDPS","productGroup":"005001000"},"plants":[{"core":{"mrpGroup":"ZMTS","mrpTypeDesc":"MRP","supLeadTime":777},"storageLocation":[{"core":{"storageLocation":"H050"}},{"core":{"storageLocation":"H990"}},{"core":{"storageLocation":"HM35"}}]}],"discriminator":"Material"}' AS JSON))
INSERT INTO "Trial"(data)
VALUES(CAST('{"id":"000000000104107816","core":{"groupCode":"ELC","productType":"ZDPS","productGroup":"005001000"},"plants":[{"core":{"mrpGroup":"ZCOM","mrpTypeDesc":"MRP","supLeadTime":28},"storageLocation":[{"core":{"storageLocation":"H050"}},{"core":{"storageLocation":"H990"}}]}],"discriminator":"Material"}' AS JSON))
INSERT INTO "Trial"(data)
VALUES(CAST('{"id":"000000000104107818","core":{"groupCode":"DVK","productType":"ZDPS","productGroup":"005001000"},"plants":[{"core":{"mrpGroup":"ZMTL","mrpTypeDesc":"MRP","supLeadTime":28},"storageLocation":[{"core":{"storageLocation":"H050"}},{"core":{"storageLocation":"H990"}}]}]}' AS JSON))
Sorting at the first level works:
select id,data->'core'->'groupCode'
from "Trial"
order by data->'core'->'groupCode' desc
But when I try to sort at a nested level with the script below, it doesn't work. I'm sure the script is wrong, but I don't know what the problem is.
select id,data->'plants'
from sap."Trial"
order by data->'plants'->'core'->'mrpGroup' desc
I need assistance writing an ORDER BY on a nested level of JSONB data.
The query below works for me:
SELECT id, data FROM "Trial" ORDER BY jsonb_path_query_array(data, '$.plants[*].core[*].mrpGroup') desc limit 100
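The reason the nested ORDER BY fails silently is that data->'plants' is a JSON array, so chaining ->'core' onto it yields NULL for every row, whereas jsonb_path_query_array descends into the array via [*]. A quick side-by-side check against the "Trial" table above:
SELECT id,
       data->'plants'->'core'->'mrpGroup' AS naive,                          -- always NULL: 'plants' is an array
       jsonb_path_query_array(data, '$.plants[*].core.mrpGroup') AS via_path -- e.g. ["ZMTS"]
FROM "Trial";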

How to Construct a Nested JSON Message on an Output Topic in ksqlDB

From one of the source systems I received the event payloads below.
I created STREAM1 for this JSON payload.
Event JSON 1
{
"event": {
"header": {
"name":"abc",
"version":"1.0",
"producer":"123",
"channel":"lab",
"countryCode":"US"
},
"body":{"customerIdentifiers":[
{"customerIdentifier":"1234","customerIdType":"cc"},
{"customerIdentifier":"234","customerIdType":"id"}
],
"accountIdentifiers":[
{"accountIdentifier":"123","accountIdType":"no"},
{"accountIdentifier":"Primary","accountIdType":"da"}
],
"eventDetails":{
"offeramount":"40000",
"apr":"2.6%",
"minpayment":"400",
"status":"Approved"
}
}
}
}
Event JSON 2
{
"event": {
"header": {
"name":"abc",
"version":"1.0",
"producer":"123",
"channel":"lab",
"countryCode":"US"
},
"body":{"customerIdentifiers":[
{"customerIdentifier":"1234","customerIdType":"cc"},
{"customerIdentifier":"234","customerIdType":"id"}
],
"accountIdentifiers":[
{"accountIdentifier":"123","accountIdType":"no"},
{"accountIdentifier":"Primary","accountIdType":"da"}
],
"eventDetails":{
"offeramount":"70000",
"apr":"3.6%",
"minpayment":"600",
"status":"Rejected"
}
}
}
}
I have created an aggregation table on the above STREAM1:
CREATE TABLE EVENT_TABLE AS
SELECT
avg(minpayment) as Avg_MinPayment,
avg(apr) AS Avg_APr,
avg(offeramount) AS Avgofferamount ,
status
FROM STREAM1
GROUP BY status
EMIT CHANGES;
Status   | Avg_MinPayment | Avg_APr | Avgofferamount
---------+----------------+---------+---------------
Approved | 400            | 2.6%    | 40000
Rejected | 600            | 3.6%    | 70000
I got the above result from the KTable, and the KTable topic JSON looks like this:
Aggregate JSON1
PRINT 'EVENT_TABLE';
{
"Status" : "Approved",
"Avg_Minpayment" : "400",
"Avg_APr" : "2.6%",
"offeramount" : "40000"
}
Aggregate JSON2
{
"Status" : "Rejected",
"Avg_Minpayment" : "600",
"Avg_APr" : "3.6%",
"offeramount" : "70000"
}
But I have to construct and publish the final target JSON on the output topic in the format below: I have to add the header and body to aggregate JSON1 and aggregate JSON2.
{
"event":{
"header":{
"name":"abc",
"version":"1.0",
"producer":"123",
"channel":"lab",
"countryCode":"US"
},
"body":{
"Key":[
{"Status":"approved","Avg_Minpayment":"400","Avg_APr":"2.6%","offeramount":"40000"},
{"Status":"rejected","Avg_Minpayment":"600","Avg_APr":"3.6%","offeramount":"70000"}
]
}
}
}
It's not terribly clear what you're trying to achieve, given that your example SQL won't produce the example output from the example input. In fact, your example SQL would fail with unknown-column errors.
Something like the following would generate your example output:
CREATE TABLE EVENT_TABLE AS
SELECT
status,
avg(eventDetails->minpayment) as Avg_MinPayment,
avg(eventDetails->apr) AS Avg_APr,
avg(eventDetails->offeramount) AS Avgofferamount
FROM STREAM1
GROUP BY status
EMIT CHANGES;
Next, your example output...
Status   | Avg_MinPayment | Avg_APr | Avgofferamount
---------+----------------+---------+---------------
Approved | 400            | 2.6%    | 40000
Rejected | 600            | 3.6%    | 70000
...is outputting one row per status. Yet, the output you say you want to achieve ...
{
"event":{
"header":{
"name":"abc",
"version":"1.0",
"producer":"123",
"channel":"lab",
"countryCode":"US"
},
"body":{
"Key":[
{"Status":"approved","Avg_Minpayment":"400","Avg_APr":"2.6%","offeramount":"40000"},
{"Status":"rejected","Avg_Minpayment":"600","Avg_APr":"3.6%","offeramount":"70000"}
]
}
}
}
...contains both statuses, i.e. it's combining both of your example input messages into a single output.
If I'm understanding you correctly, and you do indeed want to output the above JSON, then:
You would first need to include the event information. But which event information? If you know they're always going to be the same, then you can use:
CREATE TABLE EVENT_TABLE AS
SELECT
status,
latest_by_offset(event) as event,
avg(eventDetails->minpayment) as Avg_MinPayment,
avg(eventDetails->apr) AS Avg_APr,
avg(eventDetails->offeramount) AS Avgofferamount
FROM STREAM1
GROUP BY status
EMIT CHANGES;
The latest_by_offset aggregate function will capture the event information from the last message it saw. Though I'm not convinced this is what you want. Could you not be getting other rejected and accepted messages with different event information? If it is the event information that identifies which messages should be grouped together, then something like this might give you something close to what you want:
CREATE TABLE EVENT_TABLE AS
SELECT
event,
collect_list(eventDetails) as body
FROM STREAM1
GROUP BY event
EMIT CHANGES;
If this is close, then you may want to use the STRUCT constructor and AS_VALUE function to restructure your output. For example:
CREATE TABLE EVENT_TABLE AS
SELECT
event as key,
AS_VALUE(event) as event,
STRUCT(
keys := collect_list(eventDetails)
) as body
FROM STREAM1
GROUP BY event
EMIT CHANGES;
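To sanity-check the restructured output, a push query against the last EVENT_TABLE definition above will stream each (re)aggregated row as it is produced:
-- Watch the table change as new events arrive on STREAM1.
SELECT * FROM EVENT_TABLE EMIT CHANGES;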

Postgres find in jsonb nested array

I have a case where my data is in nested jsonb arrays; to find a value I have to chain multiple JSONB_ARRAY_ELEMENTS calls, which is costly and takes a lot of nested code.
The JSON has countries nested inside continents, and cities nested inside countries.
I need to access a city value.
Is there a way to make this query simpler and faster?
I tried to solve it using JSON_EXTRACT_PATH, but to get into an array I need the indexes.
WITH mydata AS (
SELECT '
{
"continents":[
{
"name":"America",
"area":43316000,
"countries":[
{
"country_name":"Canada",
"capital":"Toronto",
"cities":[
{
"city_name":"Ontario",
"population":2393933
},
{
"city_name":"Quebec",
"population":12332
}
]
},
{
"country_name":"Brazil",
"capital":"Brasilia",
"cities":[
{
"city_name":"Sao Paolo",
"population":34534534
},
{
"city_name":"Rio",
"population":445345
}
]
}
]
},
{
"name":"Europa",
"area":10530751,
"countries":[
{
"country_name":"Switzerland",
"capital":"Zurich",
"cities":[
{
"city_name":"Ginebra",
"population":4564565
},
{
"city_name":"Basilea",
"population":4564533
}
]
},
{
"country_name":"Norway",
"capital":"Oslo",
"cities":[
{
"city_name":"Oslo",
"population":3243534
},
{
"city_name":"Steinkjer",
"population":4565465
}
]
}
]
}
]
}
'::JSONB AS data_column
)
SELECT cit.city->>'city_name' AS city,
(cit.city->>'population')::INTEGER AS population
FROM (SELECT JSONB_ARRAY_ELEMENTS(coun.country->'cities') AS city
FROM (SELECT JSONB_ARRAY_ELEMENTS(cont.continent->'countries') AS country
FROM (SELECT JSONB_ARRAY_ELEMENTS(data_column->'continents') AS continent
FROM mydata
) AS cont
WHERE cont.continent #> '{"name":"Europa"}'
) AS coun
WHERE coun.country #> '{"country_name" : "Norway"}'
) AS cit
WHERE cit.city #> '{"city_name": "Oslo"}'
See my nested queries? They look ugly. I can get the answer using JSONB_EXTRACT_PATH(data_column->'continents', '1', 'countries', '1', 'cities', '0', 'population'), but I had to hardcode the array indexes.
Hope you can help me out.
Thanks.
You don't need any nesting; you can use lateral queries:
SELECT
city->>'city_name' AS city,
(city->>'population')::INTEGER AS population
FROM
mydata,
JSONB_ARRAY_ELEMENTS(data_column->'continents') AS continent,
JSONB_ARRAY_ELEMENTS(continent->'countries') AS country,
JSONB_ARRAY_ELEMENTS(country->'cities') AS city
WHERE continent ->> 'name' = 'Europa'
AND country ->> 'country_name' = 'Norway'
AND city ->> 'city_name' = 'Oslo';
However, since you mentioned paths and having to specify indices in there, this is actually the perfect use case for Postgres 12 JSON paths:
SELECT jsonb_path_query(data_column, '$.continents[*] ? (@.name == "Europa").countries[*] ? (@.country_name == "Norway").cities[*] ? (@.city_name == "Oslo")') FROM mydata
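One small extension of that path (my addition, not part of the original answer): appending .population returns just the scalar instead of the whole city object:
SELECT jsonb_path_query(
         data_column,
         '$.continents[*] ? (@.name == "Europa")
            .countries[*] ? (@.country_name == "Norway")
            .cities[*]    ? (@.city_name == "Oslo")
            .population')
FROM mydata;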

In MongoDB, how to clone a column in a collection?

Is there a way to add a new column in a collection which is clone of an existing column in the same collection?
PersonTable
_id | Name
1 | John
Result
_id | Name | Name(cloned)
1 | John | John
Hopefully without a foreach loop.
You can use a bulkWrite operation:
const persons = await PersionTable.find({})
const updateTable = await PersionTable.bulkWrite(
  persons.map((person) => ({
    updateOne: {
      filter: { _id: person._id },
      // $set only the cloned field instead of rewriting the whole document
      update: { $set: { clonedName: person.name } }
    }
  }))
)
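As a design note: bulkWrite round-trips every document through the client. On MongoDB 4.2+, an update with an aggregation pipeline can clone the field entirely server-side in one command (a sketch, assuming the collection is named persons):
// "$name" on the right-hand side refers to each document's own name field.
db.persons.updateMany({}, [ { $set: { clonedName: "$name" } } ])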

Cross-venue visitor reporting approach in Location Based Service system

I'm looking for an approach to produce a cross-venue visitor report for my client. He wants an HTTP API that returns the total unique count of customers who visited more than one shop within a day range (the API must respond in 1-2 seconds).
A sample of the raw data (millions of records in reality):
--------------------------
DAY | CUSTOMER | VENUE
--------------------------
1 | cust_1 | A
2 | cust_2 | A
3 | cust_1 | B
3 | cust_2 | A
4 | cust_1 | C
5 | cust_3 | C
6 | cust_3 | A
Now, I want to calculate the cross-visitor report. IMO the steps would be as follows:
Step 1: aggregate raw data from day 1 to 6
--------------------------
CUSTOMER | VENUES VISITED
--------------------------
cust_1   | [A, B, C]
cust_2   | [A]
cust_3   | [A, C]
Step 2: produce the final result
Total unique cross-customers: 2 (cust_1 and cust_3)
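In SQL terms, the whole report boils down to counting customers with more than one distinct venue, as in this illustrative sketch against a hypothetical visits(visit_day, customer, venue) table:
-- Steps 1 and 2 combined: customers visiting more than one distinct venue in range.
SELECT COUNT(*) AS cross_customers
FROM (
  SELECT customer
  FROM visits
  WHERE visit_day BETWEEN 1 AND 6
  GROUP BY customer
  HAVING COUNT(DISTINCT venue) > 1
) t;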
I've tried some solutions:
First I used MongoDB to store the data, then Flask to write an API on top of MongoDB's utilities: aggregation, addToSet, group, count... but the API's response time was unacceptable.
Then I switched to Elasticsearch, hoping its aggregation command sets would help, but they do not support a pipeline "group" command on the output of the first "terms" aggregation.
After that, I read about Redis Sets and Sorted Sets, but they couldn't help.
Could you please give me a clue to solve my problem?
Thanks in advance!
You can easily do this with Elasticsearch by leveraging one date_histogram aggregation to bucket by day, two terms aggregations (first bucketing by customer and then by venue), and then selecting only the customers who visited more than one venue on any given day using the bucket_selector pipeline aggregation. It looks like this:
POST /sales/_search
{
"size": 0,
"aggs": {
"by_day": {
"date_histogram": {
"field": "date",
"interval": "day"
},
"aggs": {
"customers": {
"terms": {
"field": "customer.keyword"
},
"aggs": {
"venues": {
"terms": {
"field": "venue.keyword"
}
},
"cross_selector": {
"bucket_selector": {
"buckets_path": {
"venues_count": "venues._bucket_count"
},
"script": {
"source": "params.venues_count > 1"
}
}
}
}
}
}
}
}
}
In the result set, you'll get customers 1 and 3 as expected.
UPDATE:
Another approach involves using a scripted_metric aggregation in order to implement the logic yourself. It's a bit more complicated and might not perform well depending on the number of documents and hardware you have, but the following algorithm would yield the response 2 exactly as you expect:
POST sales/_search
{
"size":0,
"aggs": {
"unique": {
"scripted_metric": {
"init_script": "params._agg.visits = new HashMap()",
"map_script": "def cust = doc['customer.keyword'].value; def venue = doc['venue.keyword'].value; def venues = params._agg.visits.get(cust); if (venues == null) { venues = new HashSet(); } venues.add(venue); params._agg.visits.put(cust, venues)",
"combine_script": "def merged = new HashMap(); for (v in params._agg.visits.entrySet()) { def cust = merged.get(v.key); if (cust == null) { merged.put(v.key, v.value) } else { cust.addAll(v.value); } } return merged",
"reduce_script": "def merged = new HashMap(); for (agg in params._aggs) { for (v in agg.entrySet()) {def cust = merged.get(v.key); if (cust == null) {merged.put(v.key, v.value)} else {cust.addAll(v.value); }}} def unique = 0; for (m in merged.entrySet()) { if (m.value.size() > 1) unique++;} return unique"
}
}
}
}
Response:
{
"took": 1413,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 7,
"max_score": 0,
"hits": []
},
"aggregations": {
"unique": {
"value": 2
}
}
}