Deserializing structured stream from kafka with Spark - scala

I try to read a kafka topic from Spark 3.0.2 with the scala code below.
Here are my imports:
import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient}
import io.confluent.kafka.serializers.AbstractKafkaAvroDeserializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.commons.cli.CommandLine
import org.apache.spark.sql._
import org.apache.spark.sql.avro.SchemaConverters
import org.apache.spark.sql.streaming.OutputMode
Functions to retreive schema from a Schema Registry service and to convert it into spark format :
var schemaRegistryClient: SchemaRegistryClient = _
var kafkaAvroDeserializer: AvroDeserializer = _
def lookupTopicSchema(topic: String): String = {
schemaRegistryClient.getLatestSchemaMetadata(topic).getSchema
}
def avroSchemaToSparkSchema(avroSchema: String): SchemaConverters.SchemaType = {
SchemaConverters.toSqlType(new Schema.Parser().parse(avroSchema))
}
Definition of class to deserialize data from the kafka topic :
object DeserializerWrapper {
val deserializer: AvroDeserializer = kafkaAvroDeserializer
}
class AvroDeserializer extends AbstractKafkaAvroDeserializer {
def this(client: SchemaRegistryClient) {
this()
this.schemaRegistry = client
}
override def deserialize(bytes: Array[Byte]): String = {
val value = super.deserialize(bytes)
Option(value) match {
case Some(Array()) | None => null
case Some(a) =>
val genericRecord = a.asInstanceOf[GenericRecord]
genericRecord.toString
}
}
}
The main function to execute the job :
def main(): Unit = {
val spark = SparkSession.builder.getOrCreate
val bootstrapServers = "kafka1:9192"
val topic = "sample_topic"
val shemaRegistryName = "avro_schema_A"
val schemaRegistryUrl = "http://myHost:8001"
schemaRegistryClient = new CachedSchemaRegistryClient(schemaRegistryUrl, 128)
val kafkaAvroDeserializer = new AvroDeserializer(schemaRegistryClient)
spark.udf.register("deserialize", (bytes: Array[Byte]) => DeserializerWrapper.deserializer.deserialize(bytes))
val kafkaDataFrame = (spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", bootstrapServers)
.option("subscribe", topic)
.option("kafka.group.id", "group_id")
.option("startingOffsets", "latest")
.load()
)
val valueDataFrame = kafkaDataFrame.selectExpr("""deserialize(value) AS message""")
import org.apache.spark.sql.functions._
val dfValueSchema = {
val rawSchema = lookupTopicSchema(shemaRegistryName)
avroSchemaToSparkSchema(rawSchema)
}
val formattedDataFrame = (valueDataFrame
.select(from_json(col("message"), dfValueSchema.dataType).alias("parsed_value"))
.select("parsed_value.*")
)
(formattedDataFrame
.writeStream
.format("console")
.outputMode(OutputMode.Append())
.option("truncate", false)
.start()
.awaitTermination()
)
}
When I execute the main function, the job crashes with the error :
-------------------------------------------
Batch: 0
-------------------------------------------
+-----+-----+-----+-----+-----+-----+-----+
|col-A|col-B|col-C|col-D|col-E|col-F|col-G|
+-----+-----+-----+-----+-----+-----+-----+
+-----+-----+-----+-----+-----+-----+-----+
21/07/23 15:41:31 ERROR TaskSetManager: Task 0 in stage 3.0 failed 4 times; aborting job
21/07/23 15:41:31 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite#227770fb is aborting.
21/07/23 15:41:31 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite#227770fb aborted.
21/07/23 15:41:31 ERROR MicroBatchExecution: Query [id = 5c76f495-b2c8-46e7-90b0-dd715c8466f3, runId = c38ccde6-3c98-488c-bfd9-a71a17296503] terminated with error
org.apache.spark.SparkException: Writing job aborted.
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:413)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:361)
at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.writeWithV2(WriteToDataSourceV2Exec.scala:322)
at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.run(WriteToDataSourceV2Exec.scala:329)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:39)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:39)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:45)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3627)
at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:2940)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:2940)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:586)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$15(MicroBatchExecution.scala:581)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:352)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:350)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:69)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:581)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:223)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:352)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:350)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:69)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:191)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:185)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:334)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:245)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 3.0 failed 4 times, most recent failure: Lost task 0.3 in stage 3.0 (TID 13, 172.20.0.4, executor 1): org.apache.spark.SparkException: Failed to execute user defined function($read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$Lambda$1119/1813496428: (binary) => string)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1130)
at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:457)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.subExpr_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$7(WriteToDataSourceV2Exec.scala:441)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:477)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:385)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:462)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:465)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
at $line55.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.$anonfun$res3$1(<console>:41)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.$anonfun$f$2(ScalaUDF.scala:157)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1127)
... 17 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:382)
... 37 more
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$Lambda$1119/1813496428: (binary) => string)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1130)
at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:457)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.subExpr_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$7(WriteToDataSourceV2Exec.scala:441)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:477)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:385)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:462)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:465)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
at $line55.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.$anonfun$res3$1(<console>:41)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.$anonfun$f$2(ScalaUDF.scala:157)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1127)
... 17 more
It seems that the udf throws a NullPointerException, but I explicity implemented a pattern matching to avoid it.
Can anybody please help on this ?
Thanks a lot.

I finally come over the deserialization process using the ABRiS library. It is way to easy to make connection with the Schema registry and do Ser/Deser operations of the kafka stream.
#OneCricketeer I really appreciate your help on this.
For information, the connection to the SR can be made simply as :
import za.co.absa.abris.config.AbrisConfig
val abrisConfig = AbrisConfig
.fromConfluentAvro
.downloadReaderSchemaById(123)
.usingSchemaRegistry("http://schema-registry.com:8081")
Then, the conversion process is made with a single line of code :
import za.co.absa.abris.avro.functions.from_avro
val outputDf = inputStreamDf.select(from_avro(col("value"), abrisConfig) as "data")

Related

How to execute 'spark.sql' inside an UDF?

When I am trying to execute 'sparl.sql' inside an UDF, I am getting java.lang.NullPointerException. Is there any way, how can I execute that?
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{Column, DataFrame, SparkSession}
// Define the UDF
def myUdf(spark: SparkSession): UserDefinedFunction = udf((col1: String, col2: String) => {
// Execute the SQL query
val result = spark.sql(s"SELECT 'Hello World!' as text")
// Return the result as a string
result.toString()
})
// Use the UDF in a DataFrame transformation
def transform(df: DataFrame, col1: Column, col2: Column): DataFrame = {
df.withColumn("result", myUdf(spark)(col1, col2))
}
val res = transform(df, col("salary"), col("gender"))
res.show()
Above code is throwing below exception
22/12/07 11:10:32 ERROR Executor: Exception in task 0.0 in stage 1679.0 (TID 19329)
org.apache.spark.SparkException: Failed to execute user defined function($read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$$$82b5b23cea489b2712a1db46c77e458$$$$w$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$Lambda$4802/591483562: (string, string) => string)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:154)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:152)
at org.apache.spark.sql.SparkSession.$anonfun$sql$2(SparkSession.scala:616)
at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:616)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:613)
I am afraid this won't work.
From a technical standpoint, UDFs in general run on an executor, and Spark session can only be accessed and used to schedule further work, on a driver. The null pointer exception you can is most likely an attempt to obtain some pieces of the Spark session that are not available on the executor.
From a semantic standpoint, if this were permitted, then processing of each row would create a new query potentially processing lots of rows. Imagine a dataframe with 10M records, and creating 10M queries. That would not be feasible to implement.

spark stateful streaming with checkpoint + kafka producer

How can I integrate Kafka producer with spark stateful streaming which uses checkpoint along with StreamingContext.getOrCreate.
I read this post: How to write spark streaming DF to Kafka topic and implemented the method mentioned in this post: Spark and Kafka integration patterns
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import java.util.Properties
class KafkaSink(createProducer: () => KafkaProducer[String, String]) extends Serializable {
lazy val producer = createProducer()
def send(topic: String, value: String): Unit = producer.send(new ProducerRecord(topic, value))
}
object KafkaSink {
def apply(): KafkaSink = {
val f = () => {
val kafkaProducerProps: Properties = {
val props = new Properties()
props.put("bootstrap.servers", "127.0.0.1:9092")
props.setProperty("batch.size", "8192");
props.put("key.serializer", classOf[StringSerializer].getName)
props.put("value.serializer", classOf[StringSerializer].getName)
props.setProperty("request.timeout.ms", "60000")
props
}
val producer = new KafkaProducer[String, String](kafkaProducerProps)
producer
}
new KafkaSink(f)
}
}
and
package webmetric
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.{ByteArraySerializer, StringDeserializer, StringSerializer}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, State, StateSpec, StreamingContext}
import java.util.Properties
import scala.concurrent.Future;
object RecoverableJsonProcess {
def createContext(checkpointDirectory: String)
: StreamingContext = {
// If you do not see this printed, that means the StreamingContext has been loaded
// from the new checkpoint
println("Creating new context")
val sparkConf = new SparkConf().setAppName("RecoverableNetworkWordCount").setMaster("local[2]")
// Create the context with a 1 second batch size
val ssc = new StreamingContext(sparkConf, Seconds(4))
ssc.checkpoint(checkpointDirectory)
...
...
val globalKafkaSink = ssc.sparkContext.broadcast(KafkaSink())
val mappingFunc = ...
val stateDstream = xitemPairs.mapWithState(
StateSpec.function(mappingFunc))
stateDstream.foreachRDD { rdd =>
rdd.foreach { message =>
globalKafkaSink.value.send("mytopic",message.toString())
}
}
stateDstream.print()
ssc
}
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
System.setProperty("hadoop.home.dir", "H:\\work\\spark\\")
val checkpointDirectory = "H:/work/spark/chk2"
val ssc = StreamingContext.getOrCreate(checkpointDirectory,
() => createContext(checkpointDirectory))
ssc.start()
ssc.awaitTermination()
}
}
I did it and it works in first run which create a new context. But when try to get context from checkpoint directory after running it rises the following error when using Kafka producer:
22/04/20 09:34:44 ERROR Executor: Exception in task 0.0 in stage 99.0 (TID 35)
java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to webmetric.MySparkKafkaProducer
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8(RecoverableJsonProcess.scala:92)
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8$adapted(RecoverableJsonProcess.scala:90)
at scala.collection.AbstractIterator.foreach(Iterator.scala:932)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2(RDD.scala:1012)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2$adapted(RDD.scala:1012)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
22/04/20 09:34:44 WARN TaskSetManager: Lost task 0.0 in stage 99.0 (TID 35) (hajibaba.PC executor driver): java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to webmetric.MySparkKafkaProducer
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8(RecoverableJsonProcess.scala:92)
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8$adapted(RecoverableJsonProcess.scala:90)
at scala.collection.AbstractIterator.foreach(Iterator.scala:932)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2(RDD.scala:1012)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2$adapted(RDD.scala:1012)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
22/04/20 09:34:44 ERROR TaskSetManager: Task 0 in stage 99.0 failed 1 times; aborting job
22/04/20 09:34:44 ERROR JobScheduler: Error running job streaming job 1650431044000 ms.0
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 99.0 failed 1 times, most recent failure: Lost task 0.0 in stage 99.0 (TID 35) (hajibaba.PC executor driver): java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to webmetric.MySparkKafkaProducer
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8(RecoverableJsonProcess.scala:92)
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8$adapted(RecoverableJsonProcess.scala:90)
at scala.collection.AbstractIterator.foreach(Iterator.scala:932)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2(RDD.scala:1012)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2$adapted(RDD.scala:1012)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2251)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2200)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2199)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2199)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2438)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2380)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2369)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2242)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2267)
at org.apache.spark.rdd.RDD.$anonfun$foreach$1(RDD.scala:1012)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
at org.apache.spark.rdd.RDD.foreach(RDD.scala:1010)
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$7(RecoverableJsonProcess.scala:90)
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$7$adapted(RecoverableJsonProcess.scala:89)
at org.apache.spark.streaming.dstream.DStream.$anonfun$foreachRDD$2(DStream.scala:629)
at org.apache.spark.streaming.dstream.DStream.$anonfun$foreachRDD$2$adapted(DStream.scala:629)
at org.apache.spark.streaming.dstream.ForEachDStream.$anonfun$generateJob$2(ForEachDStream.scala:51)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:417)
at org.apache.spark.streaming.dstream.ForEachDStream.$anonfun$generateJob$1(ForEachDStream.scala:51)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12)
at scala.util.Try$.apply(Try.scala:209)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.$anonfun$run$1(JobScheduler.scala:256)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to webmetric.MySparkKafkaProducer
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8(RecoverableJsonProcess.scala:92)
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8$adapted(RecoverableJsonProcess.scala:90)
at scala.collection.AbstractIterator.foreach(Iterator.scala:932)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2(RDD.scala:1012)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2$adapted(RDD.scala:1012)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
... 3 more
I dug around this a little bit and found out that streaming doesn't support broadcast. Take a look at this issue on official spark issues page. If I understood correctly, broadcast variables will cause exceptions when recovering from checkpoint, since the actual data is lost in executor side, and recovery from driver side is not possible. Now there are some workarounds for this, I found this one which looks promising, in the mentioned example, it uses an accumulator, but there should be similar implementation for broadcast.

Read a large zstd file with Spark (Scala)

I'm trying to load a zstd-compressed JSON file with the archive size of 16.4 GB using Spark 3.1.1 with Scala 2.12.10. See Sample file for reference.
For reference, my PC has 32 GB of RAM. The zstd decompressor I'm using is from the native libraries via
LD_LIBRARY_PATH=/opt/hadoop/lib/native
My configuration:
trait SparkProdContext {
private val master = "local[*]"
private val appName = "testing"
private val conf: SparkConf = new SparkConf()
.setMaster(master)
.setAppName(appName)
.set("spark.driver.allowMultipleContexts", "false")
.set("spark.ui.enabled", "false")
val ss: SparkSession = SparkSession.builder().config(conf).getOrCreate()
val sc: SparkContext = ss.sparkContext
val sqlContext: SQLContext = ss.sqlContext
}
My code:
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.catalyst.ScalaReflection
import ss.implicits._
case class Comment (
author: String,
body: String,
score: BigInt,
subreddit_id: String,
subreddit: String,
id: String,
parent_id: String,
link_id: String,
retrieved_on: BigInt,
created_utc: BigInt,
permalink: String
)
val schema = ScalaReflection.schemaFor[Comment].dataType.asInstanceOf[StructType]
val comments = ss.read.schema(schema).json("/home/user/Downloads/RC_2020-03.zst").as[Comment]
Upon running the code I'm getting the following error.
22/01/06 23:59:44 INFO CodecPool: Got brand-new decompressor [.zst]
22/01/06 23:59:44 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.InternalError: Frame requires too much memory for decoding
at org.apache.hadoop.io.compress.zstd.ZStandardDecompressor.inflateBytesDirect(Native Method)
at org.apache.hadoop.io.compress.zstd.ZStandardDecompressor.decompress(ZStandardDecompressor.java:181)
at org.apache.hadoop.io.compress.DecompressorStream.decompress(DecompressorStream.java:111)
at org.apache.hadoop.io.compress.DecompressorStream.read(DecompressorStream.java:105)
at java.base/java.io.InputStream.read(InputStream.java:205)
at org.apache.hadoop.util.LineReader.fillBuffer(LineReader.java:182)
at org.apache.hadoop.util.LineReader.readDefaultLine(LineReader.java:218)
at org.apache.hadoop.util.LineReader.readLine(LineReader.java:176)
at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.skipUtfByteOrderMark(LineRecordReader.java:152)
at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.nextKeyValue(LineRecordReader.java:192)
at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:37)
at org.apache.spark.sql.execution.datasources.HadoopFileLinesReader.hasNext(HadoopFileLinesReader.scala:69)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:173)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:132)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
Any ideas appreciated!

How do I fix a driver error coming from writing a Dataset to redshift using scala spark?

def streamDSToRedshiftOutputIO[TIMELINE <: TimeLine : Encoder](tableName: String)(batchIntervalSeconds: Int)(dataset: Dataset[TIMELINE]): Unit = {
dataset
.writeStream
.foreachBatch((batchDS: Dataset[TIMELINE], batchId: Long) => {
FuncUtils.writeBatchDSToRedShift(tableName)(batchDS);
})
.option("checkpointLocation", "/tmp/ck/job_event")
.outputMode(outputMode = "append")
.trigger(Trigger.ProcessingTime(batchIntervalSeconds * 1000))
.start()
.awaitTermination()
}
def writeBatchDSToRedShift[A: Encoder](tableName: String)(batchDS: Dataset[A]): Unit = {
batchDS.write
.format(source = "jdbc")
.option("url", Env.REDSHIFT_URL)
.option("driver", Constants.redshiftDriver)
.option("dbtable", s"${Env.REDSHIFT_SCHEMA}.${tableName}")
.option("user", Env.REDSHIFT_USERNAME)
.option("password", Env.REDSHIFT_PASSWORD)
.mode(saveMode = SaveMode.Append)
.save()
}
With the code above I get the error message during runtime:
java.sql.SQLFeatureNotSupportedException:
[Amazon][JDBC](10220) Driver does not support this optional feature.
Full trace:
19/08/16 04:01:05 ERROR Client: Application diagnostics message: User class threw exception: org.apache.spark.sql.streaming.StreamingQueryException: Job aborted due to stage failure: Task 5 in stage 2.0 failed 4 times, most recent failure: Lost task 5.3 in stage 2.0 (TID 22, ip-10-50-1-157.us-west-2.compute.internal, executor 2): java.sql.SQLFeatureNotSupportedException: [Amazon][JDBC](10220) Driver does not support this optional feature.
at com.amazon.exceptions.ExceptionConverter.toSQLException(Unknown Source)
at com.amazon.jdbc.common.SPreparedStatement.checkTypeSupported(Unknown Source)
at com.amazon.jdbc.common.SPreparedStatement.setNull(Unknown Source)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:658)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:834)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:834)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Additionally this is the actual Case class thats contained in the dataset being written:
case class JobEventTimeline(
job_id: String,
job_type: Option[String],
in_planning: Option[Boolean],
team_id: String,
actor_id: String,
admin_actor_id: Option[String],
session_id: String,
client_session_id: Option[String],
client_created_at: Long,
seq_id: Long,
is_side_effect: Option[Boolean],
op_action: String,
step_id: Option[String],
job_base_step_id: Option[String],
field_id: Option[String],
server_received_at: Option[Long],
// "Enriched" data. Data is pulled in from other sources during stream processing
team_domain: Option[String] = None,
team_name: Option[String] = None,
team_is_test: Option[Boolean] = None,
actor_email: Option[String] = None,
actor_name: Option[String] = None
) extends TimeLine

Registering UDF in SparkStreaming Application

I have a Spark streaming application that uses SparkSQL written in Scala that attempts to register a udf after getting an RDD. I get the error below. Is it not possible to register udfs in a SparkStreaming app?
Here is the code snippet that throws the error:
sessionStream.foreachRDD((rdd: RDD[(String)], time: Time) => {
val sqlcc = SqlContextSingleton.getInstance(rdd.sparkContext)
sqlcc.udf.register("getUUID", () => java.util.UUID.randomUUID().toString)
...
}
Here is the error throw when I attempt to register the function:
Exception in thread "pool-6-thread-6" java.lang.NoSuchMethodError: scala.reflect.api.JavaUniverse.runtimeMirror(Ljava/lang/ClassLoader;)Lscala/reflect/api/JavaUniverse$JavaMirror;
at com.ignitionone.datapipeline.ClusterApp$$anonfun$CreateCheckpointStreamContext$1.apply(ClusterApp.scala:173)
at com.ignitionone.datapipeline.ClusterApp$$anonfun$CreateCheckpointStreamContext$1.apply(ClusterApp.scala:164)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:42)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:32)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:176)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:176)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:176)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:175)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
sessionStream.foreachRDD((rdd: RDD[Event], time: Time) => {
val f = (t: Long) => t - t % 60000
val sqlContext = SQLContext.getOrCreate(rdd.sparkContext)
import sqlContext.implicits._
val df = rdd.toDF()
val per_min = udf(f)
val grouped = df.groupBy(per_min(df("created_at")) as "created_at",
df("blah"),
df("status")
).agg(sum("price") as "price",sum("payout") as "payout", sum("counter") as "counter")
...
}
is working fine by me