How can I integrate Kafka producer with spark stateful streaming which uses checkpoint along with StreamingContext.getOrCreate.
I read this post: How to write spark streaming DF to Kafka topic and implemented the method mentioned in this post: Spark and Kafka integration patterns
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import java.util.Properties
class KafkaSink(createProducer: () => KafkaProducer[String, String]) extends Serializable {
lazy val producer = createProducer()
def send(topic: String, value: String): Unit = producer.send(new ProducerRecord(topic, value))
}
object KafkaSink {
def apply(): KafkaSink = {
val f = () => {
val kafkaProducerProps: Properties = {
val props = new Properties()
props.put("bootstrap.servers", "127.0.0.1:9092")
props.setProperty("batch.size", "8192");
props.put("key.serializer", classOf[StringSerializer].getName)
props.put("value.serializer", classOf[StringSerializer].getName)
props.setProperty("request.timeout.ms", "60000")
props
}
val producer = new KafkaProducer[String, String](kafkaProducerProps)
producer
}
new KafkaSink(f)
}
}
and
package webmetric
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.{ByteArraySerializer, StringDeserializer, StringSerializer}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, State, StateSpec, StreamingContext}
import java.util.Properties
import scala.concurrent.Future;
object RecoverableJsonProcess {
def createContext(checkpointDirectory: String)
: StreamingContext = {
// If you do not see this printed, that means the StreamingContext has been loaded
// from the new checkpoint
println("Creating new context")
val sparkConf = new SparkConf().setAppName("RecoverableNetworkWordCount").setMaster("local[2]")
// Create the context with a 1 second batch size
val ssc = new StreamingContext(sparkConf, Seconds(4))
ssc.checkpoint(checkpointDirectory)
...
...
val globalKafkaSink = ssc.sparkContext.broadcast(KafkaSink())
val mappingFunc = ...
val stateDstream = xitemPairs.mapWithState(
StateSpec.function(mappingFunc))
stateDstream.foreachRDD { rdd =>
rdd.foreach { message =>
globalKafkaSink.value.send("mytopic",message.toString())
}
}
stateDstream.print()
ssc
}
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
System.setProperty("hadoop.home.dir", "H:\\work\\spark\\")
val checkpointDirectory = "H:/work/spark/chk2"
val ssc = StreamingContext.getOrCreate(checkpointDirectory,
() => createContext(checkpointDirectory))
ssc.start()
ssc.awaitTermination()
}
}
I did it and it works in first run which create a new context. But when try to get context from checkpoint directory after running it rises the following error when using Kafka producer:
22/04/20 09:34:44 ERROR Executor: Exception in task 0.0 in stage 99.0 (TID 35)
java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to webmetric.MySparkKafkaProducer
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8(RecoverableJsonProcess.scala:92)
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8$adapted(RecoverableJsonProcess.scala:90)
at scala.collection.AbstractIterator.foreach(Iterator.scala:932)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2(RDD.scala:1012)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2$adapted(RDD.scala:1012)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
22/04/20 09:34:44 WARN TaskSetManager: Lost task 0.0 in stage 99.0 (TID 35) (hajibaba.PC executor driver): java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to webmetric.MySparkKafkaProducer
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8(RecoverableJsonProcess.scala:92)
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8$adapted(RecoverableJsonProcess.scala:90)
at scala.collection.AbstractIterator.foreach(Iterator.scala:932)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2(RDD.scala:1012)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2$adapted(RDD.scala:1012)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
22/04/20 09:34:44 ERROR TaskSetManager: Task 0 in stage 99.0 failed 1 times; aborting job
22/04/20 09:34:44 ERROR JobScheduler: Error running job streaming job 1650431044000 ms.0
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 99.0 failed 1 times, most recent failure: Lost task 0.0 in stage 99.0 (TID 35) (hajibaba.PC executor driver): java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to webmetric.MySparkKafkaProducer
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8(RecoverableJsonProcess.scala:92)
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8$adapted(RecoverableJsonProcess.scala:90)
at scala.collection.AbstractIterator.foreach(Iterator.scala:932)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2(RDD.scala:1012)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2$adapted(RDD.scala:1012)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2251)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2200)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2199)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2199)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2438)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2380)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2369)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2242)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2267)
at org.apache.spark.rdd.RDD.$anonfun$foreach$1(RDD.scala:1012)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
at org.apache.spark.rdd.RDD.foreach(RDD.scala:1010)
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$7(RecoverableJsonProcess.scala:90)
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$7$adapted(RecoverableJsonProcess.scala:89)
at org.apache.spark.streaming.dstream.DStream.$anonfun$foreachRDD$2(DStream.scala:629)
at org.apache.spark.streaming.dstream.DStream.$anonfun$foreachRDD$2$adapted(DStream.scala:629)
at org.apache.spark.streaming.dstream.ForEachDStream.$anonfun$generateJob$2(ForEachDStream.scala:51)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:417)
at org.apache.spark.streaming.dstream.ForEachDStream.$anonfun$generateJob$1(ForEachDStream.scala:51)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12)
at scala.util.Try$.apply(Try.scala:209)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.$anonfun$run$1(JobScheduler.scala:256)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to webmetric.MySparkKafkaProducer
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8(RecoverableJsonProcess.scala:92)
at webmetric.RecoverableJsonProcess$.$anonfun$createContext$8$adapted(RecoverableJsonProcess.scala:90)
at scala.collection.AbstractIterator.foreach(Iterator.scala:932)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2(RDD.scala:1012)
at org.apache.spark.rdd.RDD.$anonfun$foreach$2$adapted(RDD.scala:1012)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
... 3 more
I dug around this a little bit and found out that streaming doesn't support broadcast. Take a look at this issue on official spark issues page. If I understood correctly, broadcast variables will cause exceptions when recovering from checkpoint, since the actual data is lost in executor side, and recovery from driver side is not possible. Now there are some workarounds for this, I found this one which looks promising, in the mentioned example, it uses an accumulator, but there should be similar implementation for broadcast.
Related
When I am trying to execute 'sparl.sql' inside an UDF, I am getting java.lang.NullPointerException. Is there any way, how can I execute that?
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{Column, DataFrame, SparkSession}
// Define the UDF
def myUdf(spark: SparkSession): UserDefinedFunction = udf((col1: String, col2: String) => {
// Execute the SQL query
val result = spark.sql(s"SELECT 'Hello World!' as text")
// Return the result as a string
result.toString()
})
// Use the UDF in a DataFrame transformation
def transform(df: DataFrame, col1: Column, col2: Column): DataFrame = {
df.withColumn("result", myUdf(spark)(col1, col2))
}
val res = transform(df, col("salary"), col("gender"))
res.show()
Above code is throwing below exception
22/12/07 11:10:32 ERROR Executor: Exception in task 0.0 in stage 1679.0 (TID 19329)
org.apache.spark.SparkException: Failed to execute user defined function($read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$$$82b5b23cea489b2712a1db46c77e458$$$$w$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$Lambda$4802/591483562: (string, string) => string)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:154)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:152)
at org.apache.spark.sql.SparkSession.$anonfun$sql$2(SparkSession.scala:616)
at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:616)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:613)
I am afraid this won't work.
From a technical standpoint, UDFs in general run on an executor, and Spark session can only be accessed and used to schedule further work, on a driver. The null pointer exception you can is most likely an attempt to obtain some pieces of the Spark session that are not available on the executor.
From a semantic standpoint, if this were permitted, then processing of each row would create a new query potentially processing lots of rows. Imagine a dataframe with 10M records, and creating 10M queries. That would not be feasible to implement.
I try to read a kafka topic from Spark 3.0.2 with the scala code below.
Here are my imports:
import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient}
import io.confluent.kafka.serializers.AbstractKafkaAvroDeserializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.commons.cli.CommandLine
import org.apache.spark.sql._
import org.apache.spark.sql.avro.SchemaConverters
import org.apache.spark.sql.streaming.OutputMode
Functions to retreive schema from a Schema Registry service and to convert it into spark format :
var schemaRegistryClient: SchemaRegistryClient = _
var kafkaAvroDeserializer: AvroDeserializer = _
def lookupTopicSchema(topic: String): String = {
schemaRegistryClient.getLatestSchemaMetadata(topic).getSchema
}
def avroSchemaToSparkSchema(avroSchema: String): SchemaConverters.SchemaType = {
SchemaConverters.toSqlType(new Schema.Parser().parse(avroSchema))
}
Definition of class to deserialize data from the kafka topic :
object DeserializerWrapper {
val deserializer: AvroDeserializer = kafkaAvroDeserializer
}
class AvroDeserializer extends AbstractKafkaAvroDeserializer {
def this(client: SchemaRegistryClient) {
this()
this.schemaRegistry = client
}
override def deserialize(bytes: Array[Byte]): String = {
val value = super.deserialize(bytes)
Option(value) match {
case Some(Array()) | None => null
case Some(a) =>
val genericRecord = a.asInstanceOf[GenericRecord]
genericRecord.toString
}
}
}
The main function to execute the job :
def main(): Unit = {
val spark = SparkSession.builder.getOrCreate
val bootstrapServers = "kafka1:9192"
val topic = "sample_topic"
val shemaRegistryName = "avro_schema_A"
val schemaRegistryUrl = "http://myHost:8001"
schemaRegistryClient = new CachedSchemaRegistryClient(schemaRegistryUrl, 128)
val kafkaAvroDeserializer = new AvroDeserializer(schemaRegistryClient)
spark.udf.register("deserialize", (bytes: Array[Byte]) => DeserializerWrapper.deserializer.deserialize(bytes))
val kafkaDataFrame = (spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", bootstrapServers)
.option("subscribe", topic)
.option("kafka.group.id", "group_id")
.option("startingOffsets", "latest")
.load()
)
val valueDataFrame = kafkaDataFrame.selectExpr("""deserialize(value) AS message""")
import org.apache.spark.sql.functions._
val dfValueSchema = {
val rawSchema = lookupTopicSchema(shemaRegistryName)
avroSchemaToSparkSchema(rawSchema)
}
val formattedDataFrame = (valueDataFrame
.select(from_json(col("message"), dfValueSchema.dataType).alias("parsed_value"))
.select("parsed_value.*")
)
(formattedDataFrame
.writeStream
.format("console")
.outputMode(OutputMode.Append())
.option("truncate", false)
.start()
.awaitTermination()
)
}
When I execute the main function, the job crashes with the error :
-------------------------------------------
Batch: 0
-------------------------------------------
+-----+-----+-----+-----+-----+-----+-----+
|col-A|col-B|col-C|col-D|col-E|col-F|col-G|
+-----+-----+-----+-----+-----+-----+-----+
+-----+-----+-----+-----+-----+-----+-----+
21/07/23 15:41:31 ERROR TaskSetManager: Task 0 in stage 3.0 failed 4 times; aborting job
21/07/23 15:41:31 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite#227770fb is aborting.
21/07/23 15:41:31 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite#227770fb aborted.
21/07/23 15:41:31 ERROR MicroBatchExecution: Query [id = 5c76f495-b2c8-46e7-90b0-dd715c8466f3, runId = c38ccde6-3c98-488c-bfd9-a71a17296503] terminated with error
org.apache.spark.SparkException: Writing job aborted.
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:413)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:361)
at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.writeWithV2(WriteToDataSourceV2Exec.scala:322)
at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.run(WriteToDataSourceV2Exec.scala:329)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:39)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:39)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:45)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3627)
at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:2940)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:2940)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:586)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$15(MicroBatchExecution.scala:581)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:352)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:350)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:69)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:581)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:223)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:352)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:350)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:69)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:191)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:185)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:334)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:245)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 3.0 failed 4 times, most recent failure: Lost task 0.3 in stage 3.0 (TID 13, 172.20.0.4, executor 1): org.apache.spark.SparkException: Failed to execute user defined function($read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$Lambda$1119/1813496428: (binary) => string)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1130)
at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:457)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.subExpr_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$7(WriteToDataSourceV2Exec.scala:441)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:477)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:385)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:462)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:465)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
at $line55.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.$anonfun$res3$1(<console>:41)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.$anonfun$f$2(ScalaUDF.scala:157)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1127)
... 17 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:382)
... 37 more
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$Lambda$1119/1813496428: (binary) => string)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1130)
at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:457)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.subExpr_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$7(WriteToDataSourceV2Exec.scala:441)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:477)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:385)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:462)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:465)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
at $line55.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.$anonfun$res3$1(<console>:41)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.$anonfun$f$2(ScalaUDF.scala:157)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1127)
... 17 more
It seems that the udf throws a NullPointerException, but I explicity implemented a pattern matching to avoid it.
Can anybody please help on this ?
Thanks a lot.
I finally come over the deserialization process using the ABRiS library. It is way to easy to make connection with the Schema registry and do Ser/Deser operations of the kafka stream.
#OneCricketeer I really appreciate your help on this.
For information, the connection to the SR can be made simply as :
import za.co.absa.abris.config.AbrisConfig
val abrisConfig = AbrisConfig
.fromConfluentAvro
.downloadReaderSchemaById(123)
.usingSchemaRegistry("http://schema-registry.com:8081")
Then, the conversion process is made with a single line of code :
import za.co.absa.abris.avro.functions.from_avro
val outputDf = inputStreamDf.select(from_avro(col("value"), abrisConfig) as "data")
I am unable to save structured streaming data from Kafka into MongoDB. This is the first time i am implementing Kafka-Spark Structured Streaming data to MongoDB sink. I have followed this article https://learningfromdata.blog/2017/04/16/real-time-data-ingestion-with-apache-spark-structured-streaming-implementation/
It suggests to build a MongoForeachWriter and a Helper class along with the structured streaming program. However, following so i havent been successful in seeing the data in MongoDB collection. Could someone see and correct where i am going wrong???
Error:
Error:
Current State: ACTIVE
Thread State: RUNNABLE
Logical Plan:
Project [cast(value#8 as string) AS value#21]
+- StreamingExecutionRelation KafkaSource[Subscribe[TOPIC_WITH_COMP_P2_R2, TOPIC_WITH_COMP_P2_R2.DIT, TOPIC_WITHOUT_COMP_P2_R2.DIT]], [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:295)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:189)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 1.0 failed 1 times, most recent failure: Lost task 2.0 in stage 1.0 (TID 8, localhost, executor driver): java.lang.ClassCastException: java.lang.String cannot be cast to [B
at example_new.MongoDBForeachWriter.process(MongoDBForeachWriter.scala:42)
at example_new.MongoDBForeachWriter.process(MongoDBForeachWriter.scala:15)
at org.apache.spark.sql.execution.streaming.ForeachSink$$anonfun$addBatch$1.apply(ForeachSink.scala:53)
at org.apache.spark.sql.execution.streaming.ForeachSink$$anonfun$addBatch$1.apply(ForeachSink.scala:49)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:929)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:929)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:929)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:927)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:927)
at org.apache.spark.sql.execution.streaming.ForeachSink.addBatch(ForeachSink.scala:49)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3$$anonfun$apply$16.apply(MicroBatchExecution.scala:477)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3.apply(MicroBatchExecution.scala:475)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:474)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:133)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:117)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:279)
... 1 more
Caused by: java.lang.ClassCastException: java.lang.String cannot be cast to [B
at example_new.MongoDBForeachWriter.process(MongoDBForeachWriter.scala:42)
at example_new.MongoDBForeachWriter.process(MongoDBForeachWriter.scala:15)
at org.apache.spark.sql.execution.streaming.ForeachSink$$anonfun$addBatch$1.apply(ForeachSink.scala:53)
at org.apache.spark.sql.execution.streaming.ForeachSink$$anonfun$addBatch$1.apply(ForeachSink.scala:49)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:929)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:929)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
2019-04-22 19:40:26 INFO SparkContext:54 - Invoking stop() from shutdown hook
2019-04-22 19:40:26 INFO AbstractConnector:318 - Stopped Spark#907f2b7{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
2019-04-22 19:40:26 INFO SparkUI:54 - Stopped Spark web UI at http://W10BZVGSQ2.aus.amer.dell.com:4040
2019-04-22 19:40:26 INFO MapOutputTrackerMasterEndpoint:54 - MapOutputTrackerMasterEndpoint stopped!
2019-04-22 19:40:26 INFO MemoryStore:54 - MemoryStore cleared
2019-04-22 19:40:26 INFO BlockManager:54 - BlockManager stopped
2019-04-22 19:40:26 INFO BlockManagerMaster:54 - BlockManagerMaster stopped
2019-04-22 19:40:26 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:54 - OutputCommitCoordinator stopped!
2019-04-22 19:40:26 WARN SparkEnv:87 - Exception while deleting Spark temp dir: C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10\userFiles-ee595b18-8c75-41be-b20e-f8c30628c765
java.io.IOException: Failed to delete: C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10\userFiles-ee595b18-8c75-41be-b20e-f8c30628c765
at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1070)
at org.apache.spark.SparkEnv.stop(SparkEnv.scala:103)
at org.apache.spark.SparkContext$$anonfun$stop$11.apply$mcV$sp(SparkContext.scala:1940)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1357)
at org.apache.spark.SparkContext.stop(SparkContext.scala:1939)
at org.apache.spark.SparkContext$$anonfun$2.apply$mcV$sp(SparkContext.scala:572)
at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1988)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
2019-04-22 19:40:26 INFO SparkContext:54 - Successfully stopped SparkContext
2019-04-22 19:40:26 INFO ShutdownHookManager:54 - Shutdown hook called
2019-04-22 19:40:26 INFO ShutdownHookManager:54 - Deleting directory C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10
2019-04-22 19:40:26 ERROR ShutdownHookManager:91 - Exception while deleting Spark temp dir: C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10
java.io.IOException: Failed to delete: C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10
at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1070)
at org.apache.spark.util.ShutdownHookManager$$anonfun$1$$anonfun$apply$mcV$sp$3.apply(ShutdownHookManager.scala:65)
at org.apache.spark.util.ShutdownHookManager$$anonfun$1$$anonfun$apply$mcV$sp$3.apply(ShutdownHookManager.scala:62)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at org.apache.spark.util.ShutdownHookManager$$anonfun$1.apply$mcV$sp(ShutdownHookManager.scala:62)
at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1988)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
2019-04-22 19:40:26 INFO ShutdownHookManager:54 - Deleting directory C:\Users\raheem_mohammed\AppData\Local\Temp\spark-992d4d7e-ea11-4295-9368-c4038b26f895
2019-04-22 19:40:26 INFO ShutdownHookManager:54 - Deleting directory C:\Users\raheem_mohammed\AppData\Local\Temp\temporaryReader-4d362eeb-6ee5-4a48-9da9-3792a22ec1ca
2019-04-22 19:40:26 INFO ShutdownHookManager:54 - Deleting directory C:\Users\raheem_mohammed\AppData\Local\Temp\temporary-add2fc32-1623-4784-8df1-f5cb0a1dd9fc
2019-04-22 19:40:26 INFO ShutdownHookManager:54 - Deleting directory C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10\userFiles-ee595b18-8c75-41be-b20e-f8c30628c765
2019-04-22 19:40:26 ERROR ShutdownHookManager:91 - Exception while deleting Spark temp dir: C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10\userFiles-ee595b18-8c75-41be-b20e-f8c30628c765
java.io.IOException: Failed to delete: C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10\userFiles-ee595b18-8c75-41be-b20e-f8c30628c765
at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1070)
at org.apache.spark.util.ShutdownHookManager$$anonfun$1$$anonfun$apply$mcV$sp$3.apply(ShutdownHookManager.scala:65)
at org.apache.spark.util.ShutdownHookManager$$anonfun$1$$anonfun$apply$mcV$sp$3.apply(ShutdownHookManager.scala:62)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at org.apache.spark.util.ShutdownHookManager$$anonfun$1.apply$mcV$sp(ShutdownHookManager.scala:62)
at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1988)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
Created MongoForeachWriter.scala, Helper.scala and the StructuredStreamingProgram.scala
package example
import java.util.concurrent.TimeUnit
import scala.concurrent.Await
import scala.concurrent.duration.Duration
import org.mongodb.scala._
object Helpers {
implicit class DocumentObservable[C](val observable: Observable[Document]) extends ImplicitObservable[Document] {
override val converter: (Document) => String = (doc) => doc.toJson
}
implicit class GenericObservable[C](val observable: Observable[C]) extends ImplicitObservable[C] {
override val converter: (C) => String = (doc) => doc.toString
}
trait ImplicitObservable[C] {
val observable: Observable[C]
val converter: (C) => String
def results(): Seq[C] = Await.result(observable.toFuture(), Duration(10, TimeUnit.SECONDS))
def headResult() = Await.result(observable.head(), Duration(10, TimeUnit.SECONDS))
def printResults(initial: String = ""): Unit = {
if (initial.length > 0) print(initial)
results().foreach(res => println(converter(res)))
}
def printHeadResult(initial: String = ""): Unit = println(s"${initial}${converter(headResult())}")
}
}
package example
import java.util.Calendar
import org.apache.spark.util.LongAccumulator
import org.apache.spark.sql.Row
import org.apache.spark.sql.ForeachWriter
import org.mongodb.scala._
import org.mongodb.scala.bson.collection.mutable.Document
import org.mongodb.scala.bson._
import example.Helpers._
import scala.util.Try
class MongoDBForeachWriter(p_uri: String,
p_dbName: String,
p_collectionName: String,
p_messageCountAccum: LongAccumulator) extends ForeachWriter[Row] {
val mongodbURI = p_uri
val dbName = p_dbName
val collectionName = p_collectionName
val messageCountAccum = p_messageCountAccum
var mongoClient: MongoClient = null
var db: MongoDatabase = null
var collection: MongoCollection[Document] = null
def ensureMongoDBConnection(): Unit = {
if (mongoClient == null) {
mongoClient = MongoClient(mongodbURI)
db = mongoClient.getDatabase(dbName)
collection = db.getCollection(collectionName)
}
}
override def open(partitionId: Long, version: Long): Boolean = {
true
}
override def process(record: Row): Unit = {
val valueStr = new String(record.getAs[Array[Byte]]("value"))
val doc: Document = Document(valueStr)
doc += ("log_time" -> Calendar.getInstance().getTime())
// lazy opening of MongoDB connection
ensureMongoDBConnection()
val result = collection.insertOne(doc)
// tracks how many records I have processed
if (messageCountAccum != null)
messageCountAccum.add(1)
}
override def close(errorOrNull: Throwable): Unit = {
if(mongoClient != null) {
Try {
mongoClient.close()
}
}
}
}
package example
import org.apache.spark.sql.functions.{col, _}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.util.LongAccumulator
import example.Helpers._
import java.util.Calendar
object StructuredStreamingProgram {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder()
.master("local[*]")
.appName("OSB_Streaming_Model")
.getOrCreate()
import spark.implicits._
val df = spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "10.160.172.45:9092, 10.160.172.46:9092, 10.160.172.100:9092")
.option("subscribe", "TOPIC_WITH_COMP_P2_R2, TOPIC_WITH_COMP_P2_R2.DIT, TOPIC_WITHOUT_COMP_P2_R2.DIT")
.load()
val dfs = df.selectExpr("CAST(value AS STRING)")
// sends to MongoDB once every 20 seconds
val mongodb_uri = "mongodb://dstk8sdev06.us.dell.com:27018"
val mdb_name = "HANZO_MDB"
val mdb_collection = "Spark"
val CountAccum: LongAccumulator = spark.sparkContext.longAccumulator("mongostreamcount")
val structuredStreamForeachWriter: MongoDBForeachWriter = new MongoDBForeachWriter(mongodb_uri,mdb_name,mdb_collection,CountAccum)
val query = dfs.writeStream
.foreach(structuredStreamForeachWriter)
.trigger(Trigger.ProcessingTime("20 seconds"))
.start()
while (!spark.streams.awaitAnyTermination(60000)) {
println(Calendar.getInstance().getTime()+" :: mongoEventsCount = "+CountAccum.value)
}
}
}
I need to save the structured streaming data onto Mongo collection
According to the error, you already have a string, (you already did df.selectExpr("CAST(value AS STRING)")), so you should try getting the Row event as a String, and not an Array[Byte]
Start by changing
val valueStr = new String(record.getAs[Array[Byte]]("value"))
to
val valueStr = record.getAs[String]("value")
I understand you may already have a cluster to run Spark code, but I would suggest still looking into the Kafka Connect Mongo Sink Connector so that you don't have to write and maintain your own Mongo writer in Spark code.
Or, you can write Spark datasets to mongo directly as well
I am trying to get data from Twitter through streaming.
I am getting data in twt varibale.
val ssc = new StreamingContext(sc, Seconds(60))
val tweets = TwitterUtils.createStream(ssc, None, Array("#hadoop", "#bigdata", "#spark", "#hortonworks", "#HDP"))
//tweets.saveAsObjectFiles("/models/Twitter_files_", ".txt")
case class Tweet(createdAt:Long, text:String, screenName:String)
val twt = tweets.window(Seconds(60))
//twt.foreach(status => println(status.text())
import sqlContext.implicits._
val temp = twt.map(status=>
Tweet(status.getCreatedAt().getTime()/1000,status.getText(), status.getUser().getScreenName())
).foreachRDD(rdd=>
rdd.toDF().registerTempTable("tweets")
)
twt.print
ssc.start()
here is the error :
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2032)
at org.apache.spark.streaming.dstream.DStream$$anonfun$map$1.apply(DStream.scala:528)
at org.apache.spark.streaming.dstream.DStream$$anonfun$map$1.apply(DStream.scala:528)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.SparkContext.withScope(SparkContext.scala:709)
at org.apache.spark.streaming.StreamingContext.withScope(StreamingContext.scala:266)
Caused by: java.io.NotSerializableException: org.apache.spark.streaming.StreamingContext
Your Tweet class is not Serializable, so extend that.
It's a common Spark issue, and the stack tells you exactly what is trying to serialize since Spark 1.3, I believe
I have been trying to parse data from Dstream got from spark stream(TCP) and send it to elastic search. I am getting an error org.elasticsearch.hadoop.rest.EsHadoopInvalidRequest: Found unrecoverable error [127.0.0.1:9200] returned Bad Request(400) - failed to parse; Bailing out..
The following is my code:
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.SparkContext
import org.apache.spark.serializer.KryoSerializer;
import org.apache.spark.SparkContext._
import org.elasticsearch.spark._
import org.elasticsearch.spark.rdd.EsSpark
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.spark.TaskContext
import org.elasticsearch.common.transport.InetSocketTransportAddress;
object Test {
case class createRdd(Message: String, user: String)
def main(args:Array[String]) {
val mapper=new ObjectMapper()
val SparkConf = new SparkConf().setAppName("NetworkWordCount").setMaster("local[*]")
SparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
SparkConf.set("es.nodes","localhost:9200")
SparkConf.set("es.index.auto.create", "true")
// Create a local StreamingContext with batch interval of 10 second
val ssc = new StreamingContext(SparkConf, Seconds(10))
/* Create a DStream that will connect to hostname and port, like localhost 9999. As stated earlier, DStream will get created from StreamContext, which in return is created from SparkContext. */
val lines = ssc.socketTextStream("localhost",9998)
// Using this DStream (lines) we will perform transformation or output operation.
val words = lines.map(_.split(" "))
words.foreachRDD(_.saveToEs("spark/test"))
ssc.start() // Start the computation
ssc.awaitTermination() // Wait for the computation to terminate
}
}
The following is the error:
16/10/17 11:02:30 INFO Executor: Running task 0.0 in stage 1.0 (TID 1)
16/10/17 11:02:30 INFO BlockManager: Found block input-0-1476682349200 locally
16/10/17 11:02:30 INFO Version: Elasticsearch Hadoop v5.0.0.BUILD.SNAPSHOT [4282a0194a]
16/10/17 11:02:30 INFO EsRDDWriter: Writing to [spark/test]
16/10/17 11:02:30 ERROR TaskContextImpl: Error in TaskCompletionListener
org.elasticsearch.hadoop.rest.EsHadoopInvalidRequest: Found unrecoverable error [127.0.0.1:9200] returned Bad Request(400) - failed to parse; Bailing out..
at org.elasticsearch.hadoop.rest.RestClient.processBulkResponse(RestClient.java:250)
at org.elasticsearch.hadoop.rest.RestClient.bulk(RestClient.java:202)
at org.elasticsearch.hadoop.rest.RestRepository.tryFlush(RestRepository.java:220)
at org.elasticsearch.hadoop.rest.RestRepository.flush(RestRepository.java:242)
at org.elasticsearch.hadoop.rest.RestRepository.close(RestRepository.java:267)
at org.elasticsearch.hadoop.rest.RestService$PartitionWriter.close(RestService.java:120)
at org.elasticsearch.spark.rdd.EsRDDWriter$$anonfun$write$1.apply(EsRDDWriter.scala:42)
at org.elasticsearch.spark.rdd.EsRDDWriter$$anonfun$write$1.apply(EsRDDWriter.scala:42)
at org.apache.spark.TaskContext$$anon$1.onTaskCompletion(TaskContext.scala:123)
at org.apache.spark.TaskContextImpl$$anonfun$markTaskCompleted$1.apply(TaskContextImpl.scala:97)
at org.apache.spark.TaskContextImpl$$anonfun$markTaskCompleted$1.apply(TaskContextImpl.scala:95)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:95)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
I am coding on scala. I am unable to find the reason for the error. Please help me out with the exception.
Thank you.