I am trying to extract the following avro record
{
"StateName": "Alabama",
"Capital": "Montgomery",
"Counties": [{
"CountyName": "Baldwin",
"CountyPopulation": 200000,
"Cities": [{
"CityName": "Daphne",
"CityPopulation": 20000
},
{
"CityName": "Foley",
"CityPopulation": 14000
}
]
}, {
"CountyName": "Calhoun",
"CountyPopulation": 100000,
"Cities": [{
"CityName": "Anniston",
"CityPopulation": 23000
},
{
"CityName": "Glencoe",
"CityPopulation": 5000
}
]
}]
}
and turn it into new individual records, one per county, like this (each county is extracted and combined with the state-level fields):
{
"StateName": "Alabama",
"Capital": "Montgomery",
"CountyName": "Baldwin",
"CountyPopulation": 200000,
"Cities": [{
"CityName": "Daphne",
"CityPopulation": 20000
},
{
"CityName": "Foley",
"CityPopulation": 14000
}
]
}
I am trying to extract the records using json4s, taking https://nmatpt.com/blog/2017/01/29/json4s-custom-serializer/ as a reference.
// Top-level string fields. toString is called on the value itself rather than
// casting to Utf8 first, so the code also works when the field is already a String.
val StateName = avroRecord.get("StateName").toString
val Capital = avroRecord.get("Capital").toString
// The nested "Counties" value is rendered to text and parsed as JSON below.
val CountyArray = avroRecord.get("Counties").toString
val jsonData = parse(CountyArray, useBigDecimalForDouble = true)
// Normalise the parsed value into a list of counties:
//   - a JSON array  -> every element is extracted
//   - a JSON object -> treated as a single county
//   - anything else -> no counties
val CountyList: List[CountyArrayRecord] = jsonData match {
  case JArray(_) =>
    jsonData.extract[List[CountyArrayRecord]]
  case JObject(_) =>
    List(jsonData.extract[CountyArrayRecord])
  case _ =>
    // The original had a bare `List()` inside the JObject case, which made that
    // case always return an empty list and left other JSON shapes unmatched.
    List()
}
Custom serializer
implicit val formats: Formats = Serialization.formats(NoTypeHints) + new TestSerializer

/**
 * json4s custom serializer that reads one county JSON object into a
 * `CountyArrayRecord`.
 *
 * `CustomSerializer` expects a pair of partial functions: the first one turns a
 * `JValue` into the target type, the second one turns a value back into a
 * `JValue`. Only the read direction is needed here, so the write side is an
 * empty partial function (json4s then falls back to its default handling).
 */
class TestSerializer extends CustomSerializer[CountyArrayRecord](format => (
  {
    case jsonObj: JObject =>
      val countyName = (jsonObj \ "CountyName").extract[String]
      val countyPopulation = (jsonObj \ "CountyPopulation").extract[Int]
      // NOTE(review): GenericRecord is an Avro interface; json4s extraction presumably
      // cannot instantiate it by reflection. If this line fails at runtime, extract
      // the cities into a plain case class (CityName, CityPopulation) instead.
      val cities = (jsonObj \ "Cities").extract[List[GenericRecord]]
      CountyArrayRecord(countyName, countyPopulation, cities)
  },
  // Write direction is intentionally not customised.
  PartialFunction.empty[Any, JValue]
))
Once extracted, I am trying to create a list of new records using avro4s, taking https://github.com/sksamuel/avro4s#avro-records as a reference.
// The avro4s record format is derived from the case class only, so it is built
// once here instead of once per county inside the map.
val finalCountyFormat = RecordFormat[FinalCountyRecord]
// One output record per county, each carrying the state-level fields.
val returnList = CountyList.map { county =>
  finalCountyFormat.to(
    FinalCountyRecord(StateName, Capital, county.CountyName, county.CountyPopulation, county.Cities)
  )
}
returnList
But this does not seem to work, since each county contains another list (Cities) inside it.
I am processing a DataFrame and converting it into a Dataset[Event] using the Event case class. However, there are nested IDs, so I need to emit multiple events per input event by flattening the nested deviceId/osId pairs.
I am able to return one Event per Kafka event, but I am not sure how to emit several events from one.
Kafka incoming Event:
{
"partition": 1,
"key": "34768_20220203_MFETP501",
"offset": 1841543,
"createTime": 1646041475348,
"topic": "topic_int",
"publishTime": 1646041475344,
"errorCode": 0,
"userActions": {
"productId": "3MFETP501",
"createdDate": "2022-02-26T11:19:35.786Z",
"events": [
{
"GUID": "dbb1-f38b-f7f0-44af-90da-80179412f89c",
"eventDate": "2022-02-26T11:19:35.786Z",
"familyId": 2010,
"productTypeId": 1004678,
"serialID": "890479804",
"productName": "MFE Total Protection 2021 Family Pack",
"features": {
"mapping": [
{
"deviceId": 999795,
"osId": [
100
]
},
{
"deviceId": 987875,
"osId": [
101
]
}
]
}
}
]
}
}
The expected output case classes for Event
Event("3MFETP501","1004678","2010","3MFETP501:890479804","MFE Total Protection 2021 Family Pack","999795_100", Map("targetId"->"999795_100") )
Event("3MFETP501","1004678","2010","3MFETP501:890479804","MFE Total Protection 2021 Family Pack","987875_101", Map("targetId"->"987875_101") )
/**
 * Output record produced for each flattened device/OS combination.
 *
 * NOTE(review): the expected-output examples pass the product type id in the
 * second position and the family id in the third, which is the reverse of the
 * field order declared here — confirm which order is intended.
 */
case class Event(
productId: String,
familyId: String,
productTypeId: String,
key: String,
productName: String,
// presumably "<deviceId>_<osId>", judging by the expected output — confirm
deviceOS:String,
var featureMap: mutable.Map[String, String])
// Attempt to turn each input row into Event instances. This is question code:
// it contains placeholder expressions and does not compile as shown.
val finalDataset:Dataset[Event] = inputDataFrame.flatMap(
row=> {
// NOTE(review): in the sample event, productId is nested under userActions, not at the top level of the row.
val productId = row.getAs[String]("productId")
val userActions = row.getAs[Row]("userActions")
val userEvents:mutable.Seq[Row] = userActions.getAs[mutable.WrappedArray[Row]]("events")
// NOTE(review): declared as Seq[Row], but the lambda below produces Event values.
val processedEvents:mutable.Seq[Row]= userEvents.map(
// NOTE(review): a multi-statement lambda body needs braces.
event=>
val productTypeId = event.getAs[Int]("productTypeId")
// NOTE(review): read as String here but compared with the Int literal 2010 below.
val familyId = event.getAs[String]("familyId")
// NOTE(review): `activity` is not defined in this snippet — presumably `event` was meant.
// In the sample, "features" is an object holding a "mapping" array, not an array itself.
val features = activity.getAs[mutable.WrappedArray[Row]]("features")
// NOTE(review): the sample JSON spells this field "serialID".
val serialId = activity.getAs[String]("serialId")
val key = productId+":"+serialId
// NOTE(review): second definition of `features` in the same scope.
val features = mutable.Map[String, String]().withDefaultValue(null)
val device_os_list=List("999795_100","987875_101")
//Feature Map is for every device_os ( example "targetId"->"999795_100") for 999795_100
if (familyId == 2010 )
{
// Placeholder: the flattening of deviceId/osId is not implemented here.
val a: Option[List[String]] = flatten the deviceId,osId ..
a.get.map(i=>{
val key: String = methodToCombinedeviceIdAndosId
val featureMapping: mutable.Map[String, String] = getfeatureMapForInvidualKey
Event(productId,productTypeId,familyId,key,productName,device_os,feature) ---> This is returning **List[Event]**
})
}
else{
// NOTE(review): this branch yields a single Event while the branch above yields a List[Event];
// both branches must produce the same collection type for the result to be flattened.
Event(productId,productTypeId,familyId,key,productName,device_os,feature) --> This is returning **Event**. THIS WORKS
}
)
}
)
I did not implement it exactly the same way, but I think the logic will be clear enough to apply to your case.
I created a JSON file named kafka.json and put the following content (your event) into it:
[{
"partition": 1,
"key": "34768_20220203_MFETP501",
"offset": 1841543,
"createTime": 1646041475348,
"topic": "topic_int",
"publishTime": 1646041475344,
"errorCode": 0,
"userActions": {
"productId": "3MFETP501",
"createdDate": "2022-02-26T11:19:35.786Z",
"events": [
{
"GUID": "dbb1-f38b-f7f0-44af-90da-80179412f89c",
"eventDate": "2022-02-26T11:19:35.786Z",
"familyId": 2010,
"productTypeId": 1004678,
"serialID": "890479804",
"productName": "MFE Total Protection 2021 Family Pack",
"features": {
"mapping": [
{
"deviceId": 999795,
"osId": [
100
]
},
{
"deviceId": 987875,
"osId": [
101
]
}
]
}
}
]
}
}]
Please find below first solution that is based on flatMap and for loop.
/** Flattened output record: one instance per (event, deviceId, osId) combination. */
case class Event(
    productId: String,
    familyId: String,
    productTypeId: String,
    key: String,
    productName: String,
    deviceOS: String,
    featureMap: Map[String, String])

import org.apache.spark.sql.{Dataset, Row, SparkSession}
import scala.collection.mutable

val spark = SparkSession.builder
  .appName("StructuredStreaming")
  .master("local[*]")
  .getOrCreate()

private val inputDataFrame =
  spark.read.option("multiline", "true").format("json").load("/absolute_path_to_kafka.json")

import spark.implicits._

// Each input row fans out three times: events -> mappings -> osIds.
val finalDataset: Dataset[Event] = inputDataFrame.flatMap { row =>
  val userActions = row.getAs[Row]("userActions")
  val productId = userActions.getAs[String]("productId")

  userActions.getAs[mutable.WrappedArray[Row]]("events").flatMap { event =>
    // Event-level fields shared by every Event emitted for this event.
    val familyId = event.getAs[Int]("familyId").toString
    val productTypeId = event.getAs[Int]("productTypeId").toString
    val serialId = event.getAs[String]("serialID")
    val productName = event.getAs[String]("productName")
    val key = s"$productId:$serialId"
    val mappings = event.getAs[Row]("features").getAs[mutable.WrappedArray[Row]]("mapping")

    mappings.flatMap { mappingRow =>
      val deviceId = mappingRow.getAs[Long]("deviceId")
      mappingRow.getAs[mutable.WrappedArray[Long]]("osId").map { osId =>
        val deviceOs = s"${deviceId}_$osId"
        Event(productId, familyId, productTypeId, key, productName, deviceOs, Map("target" -> deviceOs))
      }
    }
  }
}

finalDataset.foreach(event => println(event))
// Event(3MFETP501,2010,1004678,3MFETP501:890479804,MFE Total Protection 2021 Family Pack,999795_100,Map(target -> 999795_100))
// Event(3MFETP501,2010,1004678,3MFETP501:890479804,MFE Total Protection 2021 Family Pack,987875_101,Map(target -> 987875_101))
Also, you can solve this task using select, withColumn, explode, concat functions.
/** Flattened output record: one instance per (event, deviceId, osId) combination. */
case class Event(
    productId: String,
    familyId: String,
    productTypeId: String,
    key: String,
    productName: String,
    deviceOS: String,
    featureMap: Map[String, String])

import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.functions.{col, explode, concat, lit, map}

val spark = SparkSession.builder
  .appName("StructuredStreaming")
  .master("local[*]")
  .getOrCreate()

private val inputDataFrame =
  spark.read.option("multiline", "true").format("json").load("/absolute_path_to_kafka.json")

// Step 1: one row per entry of userActions.events.
val perEvent = inputDataFrame.select(
  col("userActions.productId").as("productId"),
  explode(col("userActions.events")).as("event"))

// Step 2: pull out the event-level fields and fan out over features.mapping.
val perMapping = perEvent.select(
  col("productId"),
  col("event.familyId").as("familyId"),
  col("event.productTypeId").as("productTypeId"),
  col("event.serialID").as("serialID"),
  col("event.productName").as("productName"),
  explode(col("event.features.mapping")).as("features"))

// Step 3: fan out over the osId array of each mapping entry.
val perOsId = perMapping.select(
  col("productId"),
  col("familyId"),
  col("productTypeId"),
  col("serialID"),
  col("productName"),
  col("features.deviceId").as("deviceId"),
  explode(col("features.osId")).as("osId"))

// Step 4: derive the composite columns that Event expects.
val transformedDataFrame = perOsId
  .withColumn("key", concat(col("productId"), lit(":"), col("serialID")))
  .withColumn("deviceOS", concat(col("deviceId"), lit("_"), col("osId")))
  .withColumn("featureMap", map(lit("target"), col("deviceOS")))

import spark.implicits._

private val result: Dataset[Event] = transformedDataFrame.as[Event]
result.foreach(event => println(event))
// Event(3MFETP501,2010,1004678,3MFETP501:890479804,MFE Total Protection 2021 Family Pack,999795_100,Map(target -> 999795_100))
// Event(3MFETP501,2010,1004678,3MFETP501:890479804,MFE Total Protection 2021 Family Pack,987875_101,Map(target -> 987875_101))
I added an option to customize the response based on the value of one of the fields. Here I replaced the for-comprehension with map/flatMap, so you can return one or several events depending on the type. I also changed the JSON a little to show more examples in the result.
New json:
[{
"partition": 1,
"key": "34768_20220203_MFETP501",
"offset": 1841543,
"createTime": 1646041475348,
"topic": "topic_int",
"publishTime": 1646041475344,
"errorCode": 0,
"userActions": {
"productId": "3MFETP501",
"createdDate": "2022-02-26T11:19:35.786Z",
"events": [
{
"GUID": "dbb1-f38b-f7f0-44af-90da-80179412f89c",
"eventDate": "2022-02-26T11:19:35.786Z",
"familyId": 2010,
"productTypeId": 1004678,
"serialID": "890479804",
"productName": "MFE Total Protection 2021 Family Pack",
"features": {
"mapping": [
{
"deviceId": 999795,
"osId": [
100,
110
]
},
{
"deviceId": 987875,
"osId": [
101
]
}
]
}
},
{
"GUID": "1111-2222-f7f0-44af-90da-80179412f89c",
"eventDate": "2022-03-26T11:19:35.786Z",
"familyId": 2011,
"productTypeId": 1004679,
"serialID": "890479805",
"productName": "Product name",
"features": {
"mapping": [
{
"deviceId": 999796,
"osId": [
103
]
},
{
"deviceId": 987877,
"osId": [
104
]
}
]
}
}
]
}
}]
Please find code below:
/** Flattened output record emitted for each input event (one or several per event). */
case class Event(
    productId: String,
    familyId: String,
    productTypeId: String,
    key: String,
    productName: String,
    deviceOS: String,
    featureMap: Map[String, String])

// Row and mutable are both used below, so they must be imported as well.
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import scala.collection.mutable

val spark = SparkSession
  .builder
  .appName("StructuredStreaming")
  .master("local[*]")
  .getOrCreate()

private val inputDataFrame =
  spark.read.option("multiline", "true").format("json").load("/absolute_path_to_kafka.json")

import spark.implicits._

// Every branch of the inner lambda returns a collection of Event, so flatMap can
// flatten "many events" and "exactly one event" uniformly.
// The leftover for-comprehension that preceded the flatMap was removed: its
// result was computed and then discarded.
val finalDataset: Dataset[Event] = inputDataFrame.flatMap(
  row => {
    val userActions = row.getAs[Row]("userActions")
    val productId = userActions.getAs[String]("productId")
    val userEvents = userActions.getAs[mutable.WrappedArray[Row]]("events")

    userEvents.flatMap(event => {
      val productTypeId = event.getAs[Int]("productTypeId").toString
      val serialId = event.getAs[String]("serialID")
      val productName = event.getAs[String]("productName")
      val key = s"$productId:$serialId"
      val familyId = event.getAs[Long]("familyId")

      if (familyId == 2010) {
        // Family 2010: one Event per (deviceId, osId) pair.
        val features = event.getAs[Row]("features")
        val mappings = features.getAs[mutable.WrappedArray[Row]]("mapping")
        mappings.flatMap(mappingRow => {
          val deviceId = mappingRow.getAs[Long]("deviceId")
          val osIds = mappingRow.getAs[mutable.WrappedArray[Long]]("osId")
          osIds.map(osId => {
            val deviceOs = s"${deviceId}_$osId"
            Event(productId, familyId.toString, productTypeId, key, productName, deviceOs, Map("target" -> deviceOs))
          })
        })
      } else {
        // Any other family: a single Event, wrapped in a Seq so both branches
        // have a collection type.
        Seq(Event(productId, familyId.toString, productTypeId, key, productName, "default_defice_os", Map("target" -> "default_defice_os")))
      }
    })
  }
)
finalDataset.foreach(e => println(e))
// Event(3MFETP501,2010,1004678,3MFETP501:890479804,MFE Total Protection 2021 Family Pack,999795_100,Map(target -> 999795_100))
// Event(3MFETP501,2010,1004678,3MFETP501:890479804,MFE Total Protection 2021 Family Pack,999795_110,Map(target -> 999795_110))
// Event(3MFETP501,2010,1004678,3MFETP501:890479804,MFE Total Protection 2021 Family Pack,987875_101,Map(target -> 987875_101))
// Event(3MFETP501,2011,1004679,3MFETP501:890479805,Product name,default_defice_os,Map(target -> default_defice_os))
Since this runs for each Row of the DataFrame, returning the Event case class converts the result into a Dataset. The issue is that for one condition I get a List[Event], while for every other case I get only a single Event.
FYI :This is not an answer. But my further attempt to solve.
// Fragment of the question code (pseudo-code; contains placeholders and does not compile as shown).
if (familyId == 2010 )
{
// Placeholder: the flattening of deviceId/osId is not implemented here.
val a: Option[List[String]] = flatten the deviceId,osId ..
a.get.map(i=>{
val key: String = methodToCombinedeviceIdAndosId
val featureMapping: mutable.Map[String, String] = getfeatureMapForInvidualKey
Event(productId,productTypeId,familyId,key,productName,device_os,feature) ---> This is returning List[Event]
})
}
else{
// NOTE(review): this branch yields one Event while the branch above yields a List[Event];
// the two branches need a common collection type.
Event(productId,productTypeId,familyId,key,productName,device_os,feature) --> This is returning Event
}
I need to rewrite repository database operations that use .block() so that they work without .block() -> the request should just be put on the stream, not collected into memory.
When the result is a simple entity or just a list, it works fine.
But when I return a combined DTO I get this:
{
"resultsLength": 1,
"results": {
"scanAvailable": true
}
}
Mapped dto to response
/**
 * Response DTO combining the total number of matches with the result list.
 */
data class CustomObjectResultDto(
    val resultsLength: Long?,
    // NOTE(review): this property holds a Mono rather than the resolved list, so the
    // serializer sees the publisher object itself — presumably the source of the
    // {"scanAvailable": true} output shown in the question.
    @JsonInclude
    val results: Mono<MutableList<CustomObjectDto>>
)
JsonFormat:
Want:
{
"resultsLength": 1,
"results": {
"id": "61953fb7e85fe0605c00bdd1",
"cardRequestDate": "2029-02-05T23:00:00Z",
"userName": "XYZ",
"applicationType": {
"name": "plastic",
"label": "plastic_label"
},
"status": {
"name": "accepted",
"label": "accepted_label"
},
"statusChangeDate": "2019-02-10T23:00:00Z",
"virtualCardNumber": 157426399
}
}
Get:
{
"resultsLength": 1,
"results": {
"scanAvailable": true
}
}
Repository:
/**
 * Runs a sorted, paged query built from the given filter and returns all matching
 * documents collected into a single list.
 *
 * The query is built from a context that carries only the partner id and the
 * correlation id of [ctx].
 */
override fun listing(ctx: context, filterParams: FilterParams, filterConfig: FilterConfig):
    Mono<MutableList<CustomObject>> {
    val direction = Sort.Direction.fromString(filterConfig.order)
    val page = PageRequest.of(
        filterConfig.pageIndex,
        filterConfig.pageLimit,
        Sort.by(direction, filterConfig.orderBy)
    )
    val partnerScope = context(
        partnerId = ctx.partnerId,
        correlationId = ctx.correlationId
    )
    val query = createQuery(partnerScope, filterParams)
        .collation(Collation.of(DEFAULT_COLLATION))
        .with(page)
    return reactiveMongoTemplate
        .find(query, CustomObject::class.java)
        .collectList()
}
Service:
/**
 * Loads the matching documents, maps each one to its item DTO and returns them
 * together with the total count.
 *
 * The mapped list is handed to the DTO as a still-unresolved Mono.
 */
override fun listing(
    ctx: context,
    filterConfig: FilterConfig,
    filterParams: FilterParams
): CustomObjectResultDto {
    val result = repository.listing(ctx, filterParams, filterConfig)
        // The element type is CustomObject (the original spelled it "CostumObject").
        .flatMapIterable { list: List<CustomObject> ->
            list.map { it.toItemDto() }
        }.collectList()
    val count = repository.count(ctx, filterParams)
    return CustomObjectResultDto(count, result)
}
Controller:
...
): MutableHttpResponse<CustomObjectResultDto> {
val ctx = context(partnerId, userId, correlationId)
...log...
val filterParams = FilterParams(userName, applicationType, status, wifiStatus, cardNumber)
val filterConfig = FilterConfig(pageLimit, pageIndex, PAGE_ORDER_BY_VALUES.getValue(orderBy), order)
return HttpResponse.ok(service.listing(ctx, filterConfig, filterParams))
}
I solved
/**
 * Loads the matching documents, maps each one to its item DTO and emits a single
 * result DTO holding the count and the mapped list.
 */
override fun listing(
    ctx: context,
    filterConfig: FilterConfig,
    filterParams: FilterParams
): Mono<CustomObjectResultDto> {
    val count = repository.count(ctx, filterParams)
    val itemDtos = repository.listing(ctx, filterParams, filterConfig)
        .flatMapIterable { documents: List<CustomObject> -> documents.map { it.toItemDto() } }
        .collectList()
    // Build the DTO only once the list has been resolved.
    return itemDtos.map { CustomObjectResultDto(count, it) }
}
I have a hot observable of a sequence of items, each of which has a key that identifies a specific sub-stream. I want to map those M streams into N streams with N < M (group them into N buckets). For each bucket, each time an element arrives, I want to apply a function to the latest element of each underlying sequence of that group. I know both the N and the M groups in advance.
In the following sample, we have a sequence of quotes for four fruits. I want to map those streams into two, by the type of fruit (Apple or Pear). For each group I want to collect the last known quote of each fruit.
/// <summary>A single quote from the input stream.</summary>
class Input {
/// <summary>Identifies the individual product (the sub-stream key).</summary>
public string ProductID {get;set;}
/// <summary>The product type used to group products into buckets.</summary>
public string ProductType {get;set;}
/// <summary>The quoted price.</summary>
public int Price {get;set;}
}
/// <summary>The state emitted for one product type.</summary>
class Output {
/// <summary>The product type this output belongs to.</summary>
public string ProductType {get;set;}
/// <summary>The inputs collected for this product type.</summary>
public Input[] Underlining {get;set;}
}
// Sample data: twelve quotes for four products across two product types,
// exposed as an observable sequence.
var obs = new List<Input> {
new Input { ProductID = "Stark", ProductType = "Apple", Price = 21 },
new Input { ProductID = "Jonagold", ProductType = "Apple", Price = 12 },
new Input { ProductID = "Williams", ProductType = "Pear", Price = 33 },
new Input { ProductID = "Beth", ProductType = "Pear", Price = 22 },
new Input { ProductID = "Stark", ProductType = "Apple", Price = 43 },
new Input { ProductID = "Williams", ProductType = "Pear", Price = 55 },
new Input { ProductID = "Beth", ProductType = "Pear", Price = 66 },
new Input { ProductID = "Jonagold", ProductType = "Apple", Price = 77 },
new Input { ProductID = "Jonagold", ProductType = "Apple", Price = 25 },
new Input { ProductID = "Williams", ProductType = "Pear", Price = 77 },
new Input { ProductID = "Beth", ProductType = "Pear", Price = 13 },
new Input { ProductID = "Stark", ProductType = "Apple", Price = 21 },
}.ToObservable();
// Placeholder for the query being asked about; this line is pseudo-code and does not compile.
IObservable<Output> result = obs.GroupBy ... Select ... Concat ... ; // I'm a bit loss here
result.Dump();
Expected result:
{ ProductType = "Apple", Underlining = [{ ProductID = "Stark", Price = 21 }] }
{ ProductType = "Apple", Underlining = [{ ProductID = "Stark", Price = 21 }, { ProductID = "Jonagold", Price = 12 }] }
{ ProductType = "Pear", Underlining = [{ ProductID = "Williams", Price = 33 }] }
{ ProductType = "Pear", Underlining = [{ ProductID = "Williams", Price = 33 }, { ProductID = "Beth", Price = 22 }] }
{ ProductType = "Apple", Underlining = [{ ProductID = "Stark", Price = **43** }, { ProductID = "Jonagold", Price = 12 }] }
{ ProductType = "Pear", Underlining = [{ ProductID = "Williams", Price = **55** }, { ProductID = "Beth", Price = 22 }] }
{ ProductType = "Pear", Underlining = [{ ProductID = "Williams", Price = 55 }, { ProductID = "Beth", Price = **66** }] }
{ ProductType = "Apple", Underlining = [{ ProductID = "Stark", Price = 43 }, { ProductID = "Jonagold", Price = **77** }] }
{ ProductType = "Apple", Underlining = [{ ProductID = "Stark", Price = 43 }, { ProductID = "Jonagold", Price = **25** }] }
{ ProductType = "Pear", Underlining = [{ ProductID = "Williams", Price = **77** }, { ProductID = "Beth", Price = 66 }] }
{ ProductType = "Pear", Underlining = [{ ProductID = "Williams", Price = 77 }, { ProductID = "Beth", Price = **13** }] }
{ ProductType = "Apple", Underlining = [{ ProductID = "Stark", Price = **21** }, { ProductID = "Jonagold", Price = 25 }] }
I think this is what you want:
// For every product type, keep a running dictionary of the latest input per
// product id and emit a snapshot of its values each time an input arrives.
// SelectMany projects each group to its inner sequence and merges the results.
var outputs =
    obs
        .GroupBy(input => input.ProductType)
        .SelectMany(group =>
            group
                .Scan(
                    new Dictionary<string, Input>(),
                    (latestById, input) =>
                    {
                        latestById[input.ProductID] = input;
                        return latestById;
                    })
                .Select(latestById => new Output
                {
                    ProductType = group.Key,
                    Underlining = latestById.Values.ToArray(),
                }));
I used outputs.Select(x => $"{{ ProductType = \"{x.ProductType}\", Underlining = [{String.Join(", ", x.Underlining.Select(y => $"{{ ProductID = \"{y.ProductID}\", Price = {y.Price} }}"))}] }}") to get the following output to test it:
{ ProductType = "Apple", Underlining = [{ ProductID = "Stark", Price = 21 }] }
{ ProductType = "Apple", Underlining = [{ ProductID = "Stark", Price = 21 }, { ProductID = "Jonagold", Price = 12 }] }
{ ProductType = "Pear", Underlining = [{ ProductID = "Williams", Price = 33 }] }
{ ProductType = "Pear", Underlining = [{ ProductID = "Williams", Price = 33 }, { ProductID = "Beth", Price = 22 }] }
{ ProductType = "Apple", Underlining = [{ ProductID = "Stark", Price = 43 }, { ProductID = "Jonagold", Price = 12 }] }
{ ProductType = "Pear", Underlining = [{ ProductID = "Williams", Price = 55 }, { ProductID = "Beth", Price = 22 }] }
{ ProductType = "Pear", Underlining = [{ ProductID = "Williams", Price = 55 }, { ProductID = "Beth", Price = 66 }] }
{ ProductType = "Apple", Underlining = [{ ProductID = "Stark", Price = 43 }, { ProductID = "Jonagold", Price = 77 }] }
{ ProductType = "Apple", Underlining = [{ ProductID = "Stark", Price = 43 }, { ProductID = "Jonagold", Price = 25 }] }
{ ProductType = "Pear", Underlining = [{ ProductID = "Williams", Price = 77 }, { ProductID = "Beth", Price = 66 }] }
{ ProductType = "Pear", Underlining = [{ ProductID = "Williams", Price = 77 }, { ProductID = "Beth", Price = 13 }] }
{ ProductType = "Apple", Underlining = [{ ProductID = "Stark", Price = 21 }, { ProductID = "Jonagold", Price = 25 }] }