JMSConnection serialization fail - scala

Currently I'm building an application that reads messages (transactions in json) in a Kafka Topic and sends to IBM MQ at production. I'm having some trouble with serialization in the JMS classes and kinda lost on how to fix it.
My code is:
object DispatcherMqApp extends Serializable {
private val logger = LoggerFactory.getLogger(this.getClass)
val config = ConfigFactory.load()
def inicialize(transactionType: String) = {
val spark = new SparkConf()
.setAppName("Dispatcher MQ Categorization")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.set("spark.streaming.stopGracefullyOnShutDown", "true")
logger.debug(s"Loading configuration at ${printConfig(config).head} =>\n${printConfig(config)(1)}")
val kafkaConfig = KafkaConfig.buildFromConfiguration(config, "dispatcher-mq")
val streamCtx = new StreamingContext(spark, Seconds(kafkaConfig.streamingInterval))
sys.ShutdownHookThread {
logger.warn("Stopping the application ...")
streamCtx.stop(stopSparkContext = true, stopGracefully = true)
logger.warn("Application Finish with Success !!!")
}
val topic = config.getString(s"conf.dispatcher-mq.consumer-topic.$transactionType")
logger.info(s"Topic: $topic")
val zkdir = s"${kafkaConfig.zookeeperBaseDir}$transactionType-$topic"
val kafkaManager = new KafkaManager(kafkaConfig)
val stream = kafkaManager.createStreaming(streamCtx, kafkaConfig.offset, topic, zkdir)
val kafkaSink = streamCtx.sparkContext.broadcast(kafkaManager.createProducer())
val mqConfig = MQConfig(config.getString("conf.mq.name"),
config.getString("conf.mq.host"),
config.getInt("conf.mq.port"),
config.getString("conf.mq.channel"),
config.getString("conf.mq.queue-manager"),
config.getInt("conf.mq.retries"),
config.getString("conf.mq.app-name"),
Try(config.getString("conf.mq.user")).toOption,
Try(config.getString("conf.mq.password")).toOption,
config.getString("conf.dispatcher-mq.send.category_file"))
val queueConn = new MQService(mqConfig)
(stream, queueConn, streamCtx, kafkaSink, zkdir)
}
def main(args: Array[String]): Unit = {
val transactionType = args.head
if (transactionType=="account" | transactionType=="credit") {
val (messages, queueConn, sc, kafkaSink, zkdir) = inicialize(transactionType)
val fieldsType = config.getString(s"conf.dispatcher-mq.send.fields.$transactionType")
val source = config.getString("conf.dispatcher-mq.parameters.source")
val mqVersion = config.getString(s"conf.dispatcher-mq.parameters.version.$transactionType")
val topicError = config.getString("conf.kafka.topic_error")
messages.foreachRDD(rdd => {
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
rdd.map(_._2).filter(_.toUpperCase.contains("BYCATEGORIZER"))
.foreach(message => {
val msg:Option[TextMessage] = try {
Some(queueConn.createOutputMq(message, fieldsType, source, mqVersion))
} catch {
case ex: Exception =>
logger.error(s"[ERROR] input: [[$message]]\n$ex")
val errorReport = ErrorReport("GENERAL", "DISPATCHER-MQ", transactionType.toString, ex.getMessage, None, Option(ex.toString))
ErrorReportService.sendError(errorReport, topicError, kafkaSink.value)
None
}
if(msg.nonEmpty) queueConn.submit(msg.get)
})
logger.info(s"Save Offset in $zkdir...\n${offsetRanges.toList.to}")
ZookeeperConn.saveOffsets(zkdir, offsetRanges)
})
sc.start()
sc.awaitTermination()
} else
logger.error(s"${args.head} is not a valid argument. ( account or credit ) !!! ")
}
I'm having error on serialization the JMSConnection which is called hidden in the createOutputMq method. The error is:
20/09/04 17:21:00 ERROR JobScheduler: Error running job streaming job 1599250860000 ms.0
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2054)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:918)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:917)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:323)
at org.apache.spark.rdd.RDD.foreach(RDD.scala:917)
at br.com.HIDDEN.dispatcher.DispatcherMqApp$$anonfun$main$1.apply(DispatcherMqApp.scala:80)
at br.com.HIDDEN.dispatcher.DispatcherMqApp$$anonfun$main$1.apply(DispatcherMqApp.scala:76)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:227)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:227)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:227)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:226)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.NotSerializableException: com.ibm.msg.client.jms.JmsConnection
Serialization stack:
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
... 30 more
20/09/04 17:21:00 ERROR ApplicationMaster: User class threw exception: org.apache.spark.SparkException: Task not serializable
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2054)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:918)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:917)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:323)
at org.apache.spark.rdd.RDD.foreach(RDD.scala:917)
at br.com.HIDDEN.dispatcher.DispatcherMqApp$$anonfun$main$1.apply(DispatcherMqApp.scala:80)
at br.com.HIDDEN.dispatcher.DispatcherMqApp$$anonfun$main$1.apply(DispatcherMqApp.scala:76)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:227)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:227)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:227)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:226)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.NotSerializableException: com.ibm.msg.client.jms.JmsConnection
Serialization stack:
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
... 30 more
Anybody got some idea on how to fix it? The line shown in error message (76 and 80) are my messages.foreachRDD(rdd => { and .foreach(message => { respectively.
Thanks in advance

Related

spark streaming - NoClassDefFoundError: Could not initialize class

Application Summary:-
Use spark streaming in databricks (scala) to receive messages from event hub and fetch files from s3
To get past serialization errors the below package(com.databricks.s3get) was defined and imported. The writeStream Method is as follows:-
import com.databricks.s3get._
val streamwriter = df
.writeStream
.queryName("stream")
.option("checkpointLocation", "dbfs:/mnt/test/checkpoint")
.trigger(Trigger.ProcessingTime("3 seconds"))
.outputMode("append")
.foreachBatch(
(batchDF: DataFrame, batchId: Long) => Try {
batchDF.persist()
val dataframe = batchDF.select("object", "batch")
writeS3Object.getS3Object(dataframe)
batchDF.unpersist()
} match {
case Success(_) =>
case Failure(e) => {
throw(e)
}
}
)
Package cell:-
package com.databricks.s3get
//import all the necessary libraries
class fetchS3Object extends Serializable {
//whatever is inside the "object fetchS3Object" is inside here ..ditto!
}
object fetchS3Object {
//build client to connect to s3
val AccessKey = dbutils.secrets.get(scope = "My_Scope", key = "AccessKey-ID")
val SecretKey = dbutils.secrets.get(scope = "My_Scope", key = "AccessKey-Secret")
val creds = new BasicAWSCredentials(AccessKey, SecretKey)
val clientRegion: Regions = Regions.US_EAST_1
private val s3Client: AmazonS3 = AmazonS3ClientBuilder.standard()
.withRegion(clientRegion)
.withCredentials(new AWSStaticCredentialsProvider(creds))
.build()
def getS3Object(df: DataFrame) {
// execute foreach function to get s3 objects
df.foreach(x => writeS3Object(x.getString(0), x.getString(1)))
}
def writeFunction(input: String, fileWrite: String) {
//write s3 object content to file in datalake
var fileWriter: FileWriter = null
var bw: BufferedWriter = null
val file = new File(fileWrite)
fileWriter = new FileWriter(file)
bw = new BufferedWriter(fileWriter)
bw.write(input)
bw.close()
fileWriter.close()
}
def getS3ObjectContentAsStream(bucketName: String, objectKey: String) ={
//function to get s3 object content
val contentStream = s3Client.getObject(bucketName, objectKey)
contentStream.getObjectContent
}
def writeS3Object(s3ObjectName: String, batchname: String) {
val s3FileName = s"${s3ObjectName}_.json"
val inputS3Stream = getS3ObjectContentAsStream("myBucket",s3FileName)
val inputS3String = IOUtils.toString(inputS3Stream, "UTF-8")
val filePath = s"/dbfs/mnt/test/${batchname}/${s3FileName}"
writeFunction(inputS3String, filePath)
}
}
However on execution this error is thrown:-
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 161.0 failed 4 times, most recent failure: Lost task 0.5 in stage 161.0 (TID 3327) (10.21.132.64 executor 144): java.lang.NoClassDefFoundError: Could not initialize class com.databricks.s3get.fetchS3Object$
at java.io.ObjectStreamClass.hasStaticInitializer(Native Method)
at java.io.ObjectStreamClass.computeDefaultSUID(ObjectStreamClass.java:1955)
at java.io.ObjectStreamClass.access$100(ObjectStreamClass.java:79)
at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:275)
at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:273)
at java.security.AccessController.doPrivileged(Native Method)
at java.io.ObjectStreamClass.getSerialVersionUID(ObjectStreamClass.java:272)
at java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:694)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:2003)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1850)
at java.io.ObjectInputStream.readClass(ObjectInputStream.java:1813)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1638)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
at java.io.ObjectInputStream.readArray(ObjectInputStream.java:2093)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1655)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
at java.io.ObjectInputStream.readArray(ObjectInputStream.java:2093)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1655)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:503)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:461)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:115)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$2(ResultTask.scala:65)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$1(ResultTask.scala:65)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:55)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:150)
at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:119)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.Task.run(Task.scala:91)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:813)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1605)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:816)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:672)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2783)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2730)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2724)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2724)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1260)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1260)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1260)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2991)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2932)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2920)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1033)
at org.apache.spark.SparkContext.runJobInternal(SparkContext.scala:2474)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2457)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2495)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2514)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2539)
at org.apache.spark.rdd.RDD.$anonfun$foreach$1(RDD.scala:1017)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:125)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:419)
at org.apache.spark.rdd.RDD.foreach(RDD.scala:1015)
at org.apache.spark.sql.Dataset.$anonfun$foreach$1(Dataset.scala:2920)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.Dataset.$anonfun$withNewRDDExecutionId$1(Dataset.scala:3804)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:126)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:269)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:104)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:852)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:219)
at org.apache.spark.sql.Dataset.withNewRDDExecutionId(Dataset.scala:3802)
at org.apache.spark.sql.Dataset.foreach(Dataset.scala:2920)
at com.databricks.s3get.fetchS3Object$.getS3Object(<notebook>:83)
at $lineab81ab0d385746ac861c3adb69ddaf77157.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.$anonfun$tripsummarywriter$3(command-4241228593676676:36)
at $lineab81ab0d385746ac861c3adb69ddaf7745.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.$anonfun$retry$1(command-4241228593676670:4)
at scala.util.Try$.apply(Try.scala:213)
at $lineab81ab0d385746ac861c3adb69ddaf7745.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.retry(command-4241228593676670:4)
at $lineab81ab0d385746ac861c3adb69ddaf7745.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.retry(command-4241228593676670:37)
at $lineab81ab0d385746ac861c3adb69ddaf77157.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.$anonfun$tripsummarywriter$2(command-4241228593676676:10)
at scala.util.Try$.apply(Try.scala:213)
at $lineab81ab0d385746ac861c3adb69ddaf77157.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.$anonfun$tripsummarywriter$1(command-4241228593676676:10)
at $lineab81ab0d385746ac861c3adb69ddaf77157.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.$anonfun$tripsummarywriter$1$adapted(command-4241228593676676:8)
at org.apache.spark.sql.execution.streaming.sources.ForeachBatchSink.addBatch(ForeachBatchSink.scala:38)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$17(MicroBatchExecution.scala:608)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:126)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:269)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:104)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:852)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:219)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:606)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:293)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:291)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:73)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:606)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$4(MicroBatchExecution.scala:244)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.withSchemaEvolution(MicroBatchExecution.scala:649)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:241)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:293)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:291)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:73)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:210)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:204)
at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:366)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:852)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:341)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:268)
Caused by: java.lang.NoClassDefFoundError: Could not initialize class com.databricks.s3get.fetchS3Object$
at java.io.ObjectStreamClass.hasStaticInitializer(Native Method)
at java.io.ObjectStreamClass.computeDefaultSUID(ObjectStreamClass.java:1955)
at java.io.ObjectStreamClass.access$100(ObjectStreamClass.java:79)
at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:275)
at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:273)
at java.security.AccessController.doPrivileged(Native Method)
at java.io.ObjectStreamClass.getSerialVersionUID(ObjectStreamClass.java:272)
at java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:694)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:2003)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1850)
at java.io.ObjectInputStream.readClass(ObjectInputStream.java:1813)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1638)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
at java.io.ObjectInputStream.readArray(ObjectInputStream.java:2093)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1655)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
at java.io.ObjectInputStream.readArray(ObjectInputStream.java:2093)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1655)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2405)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2329)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2187)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:503)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:461)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:115)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$2(ResultTask.scala:65)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$1(ResultTask.scala:65)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:55)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:150)
at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:119)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.Task.run(Task.scala:91)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:813)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1605)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:816)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:672)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
How can this be resolved?
Update without using classes and objects:-
The dataframe consists of the two columns (s3ObjectName, batchName) with tens of thousands of rows
What is trying to be achieved:-
Retrieve objects from an S3 bucket in parallel using details from each row in the microbatch from the writeStream process using foreachPartition() or foreach() functions
// s3 connection details
def provides3Client(): AmazonS3 = {
val AccessKey = dbutils.secrets.get(scope = "My_Scope", key = "AccessKey-ID")
val SecretKey = dbutils.secrets.get(scope = "My_Scope", key = "AccessKey-Secret")
val creds = new BasicAWSCredentials(AccessKey, SecretKey)
val clientRegion: Regions = Regions.US_EAST_1
AmazonS3ClientBuilder.standard()
.withRegion(clientRegion)
.withCredentials(new AWSStaticCredentialsProvider(creds))
.build()
}
dataframe.foreachPartition(partition => {
//Initialize s3 connection for each partition
val client: AmazonS3 = provides3Client()
partition.foreach(row => {
val s3ObjectName = row.getString(0)
val batchname = row.getString(1)
val inputS3Stream = client.getObject("s3bucketname", objectKey).getObjectContent
val inputS3String = IOUtils.toString(inputS3Stream, "UTF-8")
val filePath = s"/dbfs/mnt/pp-telematics-working-5/Sensors/s3response/test/${batchname}/${s3ObjectName}"
var fileWriter: FileWriter = null
var bw: BufferedWriter = null
val file = new File(filePath)
fileWriter = new FileWriter(file)
bw = new BufferedWriter(fileWriter)
bw.write(inputS3String)
bw.close()
fileWriter.close()
})
})
The above process gives me Error: value foreach is not a member of Object

spark streaming serialization error

I am running into serialization error in spark-streaming application. Below is my driver code:
package com.test
import org.apache.spark._
import org.apache.spark.streaming._
import org.json.JSONObject;
import java.io.Serializable
object SparkFiller extends Serializable{
def main(args: Array[String]): Unit ={
val sparkConf = new
SparkConf().setAppName("SparkFiller").setMaster("local[*]")
// println("test")
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.registerKryoClasses(Array(classOf[firehoseToDocumentDB]))
sparkConf.registerKryoClasses(Array(classOf[PushToDocumentDB]))
var TimeStamp_Start = 1493836050
val TimeStamp_Final = 1493836056
var timeStamp_temp = TimeStamp_Start - 5;
// val send_timestamps = new firehoseToDocumentDB(TimeStamp_Start,TimeStamp_Final);
// send_timestamps.onStart();
val ssc = new StreamingContext(sparkConf, Seconds(5))
val lines = ssc.receiverStream(
new firehoseToDocumentDB(TimeStamp_Start.toString(),TimeStamp_Final.toString()))
// val timestamp_stream = ssc.receiverStream(new firehoseToDocumentDB(TimeStamp_Start.toString(),TimeStamp_Final.toString()))
lines.foreachRDD(rdd => {
rdd.foreachPartition(part => {
val dbsender = new PushToDocumentDB();
part.foreach(msg =>{
var jsonobject = new JSONObject(part)
var temp_pitr = jsonobject.getString("pitr")
println(temp_pitr)
if ( TimeStamp_Final >= temp_pitr.toLong) {
ssc.stop()
}
dbsender.PushFirehoseMessagesToDocumentDb(msg)
})
// dbsender.close()
})
})
println("line",line)))
println("ankush")
ssc.start()
ssc.awaitTermination()
}
}
When I add the below lines to the code
var jsonobject = new JSONObject(part)
var temp_pitr = jsonobject.getString("pitr")
println(temp_pitr)
if ( TimeStamp_Final >= temp_pitr.toLong) {
ssc.stop()
}
I get an error:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2055)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:919)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:918)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:918)
at com.boeing.SparkFiller$$anonfun$main$1.apply(SparkFiller.scala:26)
at com.boeing.SparkFiller$$anonfun$main$1.apply(SparkFiller.scala:25)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:223)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.NotSerializableException: org.apache.spark.streaming.StreamingContext
Serialization stack:
- object not serializable (class:
org.apache.spark.streaming.StreamingContext, value:
org.apache.spark.streaming.StreamingContext#780e1bb5)
- field (class: com.boeing.SparkFiller$$anonfun$main$1, name: ssc$1, type:
class org.apache.spark.streaming.StreamingContext)
- object (class com.boeing.SparkFiller$$anonfun$main$1, <function1>)
- field (class: com.boeing.SparkFiller$$anonfun$main$1$$anonfun$apply$1,
name: $outer, type: class com.boeing.SparkFiller$$anonfun$main$1)
- object (class com.boeing.SparkFiller$$anonfun$main$1$$anonfun$apply$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
... 30 more
Process finished with exit code 1
If I remove those lines of code it is working good.
The issue is because of using the ssc.stop() in the rdd. Is there any otherway that I can call a shutdown hook from the rdd if it satisfies the condition.
Issue is because of using the ssc.stop() in the rdd.
You are right! Any of the Spark contexts are not serializable and cannot be used inside any of the tasks.
is there any otherway that I can call a shutdown hook from the rdd if it satisfies the condition.
In order to control the lifecycle of your streaming application, you should consider overriding a listener and stop the context based on your condition. I have done enough research and found out that this the only feasible solution.
Please refer to my answer to this post to understand how to stop the streaming application based on certain condition.

The usage of serializable object: Caused by: java.io.NotSerializableException

I follow this tutorial and other similar tutorials on Task serialization, but my code fails with the Task serialization error. I don't understand why does it happen. I am setting the variable topicOutputMessages outside of foreachRDD and then I am reading it within foreachPartition. Also I create KafkaProducer INSIDE foreachPartition. So, what is the problem here? Cannot really get the point.
al topicsSet = topicInputMessages.split(",").toSet
val kafkaParams = Map[String, String]("metadata.broker.list" -> metadataBrokerList_InputQueue)
val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet).map(_._2)
messages.foreachRDD(rdd => {
rdd.foreachPartition{iter =>
UtilsDM.setMetadataBrokerList(metadataBrokerList)
UtilsDM.setOutputTopic(topicOutputMessages)
val producer = UtilsDM.createProducer
iter.foreach { msg =>
val record = new ProducerRecord[String, String](UtilsDM.getOutputTopic(), msg)
producer.send(record)
}
producer.close()
}
})
EDIT:
object UtilsDM extends Serializable {
var topicOutputMessages: String = ""
var metadataBrokerList: String = ""
var producer: KafkaProducer[String, String] = null
def setOutputTopic(t: String): Unit = {
topicOutputMessages = t
}
def setMetadataBrokerList(m: String): Unit = {
metadataBrokerList = m
}
def createProducer: KafkaProducer[String, String] = {
val kafkaProps = new Properties()
kafkaProps.put("bootstrap.servers", metadataBrokerList)
// This is mandatory, even though we don't send key
kafkaProps.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
kafkaProps.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
kafkaProps.put("acks", "1")
// how many times to retry when produce request fails?
kafkaProps.put("retries", "3")
// This is an upper limit of how many messages Kafka Producer will attempt to batch before sending (bytes)
kafkaProps.put("batch.size", "5")
// How long will the producer wait before sending in order to allow more messages to get accumulated in the same batch
kafkaProps.put("linger.ms", "5")
new KafkaProducer[String, String](kafkaProps)
}
}
Full stacktrace:
16/11/21 13:47:30 ERROR JobScheduler: Error running job streaming job 1479732450000 ms.0
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2055)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:919)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:918)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:918)
at org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1.apply(KafkaDecisionsConsumer.scala:103)
at org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1.apply(KafkaDecisionsConsumer.scala:93)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:223)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.NotSerializableException: org.test.consumer.kafka.KafkaDecisionsConsumer
Serialization stack:
- object not serializable (class: org.test.consumer.kafka.KafkaDecisionsConsumer, value: org.test.consumer.kafka.KafkaDecisionsConsumer#4a0ee025)
- field (class: org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1, name: $outer, type: class org.test.consumer.kafka.KafkaDecisionsConsumer)
- object (class org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1, <function1>)
- field (class: org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1$$anonfun$apply$1, name: $outer, type: class org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1)
- object (class org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1$$anonfun$apply$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
... 30 more
16/11/21 13:47:30 ERROR ApplicationMaster: User class threw exception: org.apache.spark.SparkException: Task not serializable
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2055)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:919)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:918)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:918)
at org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1.apply(KafkaDecisionsConsumer.scala:103)
at org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1.apply(KafkaDecisionsConsumer.scala:93)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:223)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.NotSerializableException: org.test.consumer.kafka.KafkaDecisionsConsumer
Serialization stack:
- object not serializable (class: org.test.consumer.kafka.KafkaDecisionsConsumer, value: org.test.consumer.kafka.KafkaDecisionsConsumer#4a0ee025)
- field (class: org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1, name: $outer, type: class org.test.consumer.kafka.KafkaDecisionsConsumer)
- object (class org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1, <function1>)
- field (class: org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1$$anonfun$apply$1, name: $outer, type: class org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1)
- object (class org.test.consumer.kafka.KafkaDecisionsConsumer$$anonfun$run$1$$anonfun$apply$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
... 30 more
The serialization issue lies in how Spark deals with closure serialization (which you can read in detail in this answer: How spark handles object )
In the failing code, referencing metadataBrokerList and topicOutputMessages here:
rdd.foreachPartition{iter =>
UtilsDM.setMetadataBrokerList(metadataBrokerList)
UtilsDM.setOutputTopic(topicOutputMessages)
creates a reference to the outer object where these variables are created, and forces the closure cleaner in Spark to included in the "cleaned" closure. outer then includes sparkContext and streamingContext in the closure, which are not serializable and hence the serialization exception.
In the second attempt (in the workaround posted as an answer), these links are broken as the variables are now contained in the help object and the closure can be "cut clean" from the outer context.
I'd think that adding #transient to these variables is not necessary within the UtilsDM object, given that the values are serializable. Be aware that singleton objects are recreated in each executor. Therefore the value of mutable variables changed in the driver will not be available in the executors, often leading to NullPointerException if not handled properly.
There's a serialization trick that would help in the original scenario:
Copy referenced variables within the closure. e.g.
rdd.foreachPartition{iter =>
val innerMDBL = metadataBrokerList
val innerTOM = topicOutputMessages
UtilsDM.setMetadataBrokerList(innerMDBL)
UtilsDM.setOutputTopic(innerTOM)
That way, the values are copied at compile time and there's also no link with outer.
To deal with executor-bound objects (like non-serializable connections or even local caches) I prefer to use a instance factory approach, like explained in this answer: Redis on Spark:Task not serializable
I think the problem lies in your UtilsDM class. It is being captured by closure and Spark attempts to serialize the code to ship it to executors.
Try to make UtilsDM serializable or create it within the foreachRDD function.
This is not the answer to my question, but it is the option that works.Maybe someone could elaborate it in the final answer? The disadvantage of this solution is that metadataBrokerList and topicOutputMessages should be fixed from the code of UtilsTest using #transient lazy val topicOutputMessages and #transient lazy val metadataBrokerList, but ideally I would like to be able to pass these parameters as input parameters:
object TestRunner {
var zkQuorum: String = ""
var metadataBrokerList: String = ""
var group: String = ""
val topicInputMessages: String = ""
def main(args: Array[String]) {
if (args.length < 14) {
System.err.println("Usage: TestRunner <zkQuorum> <metadataBrokerList> " +
"<group> <topicInputMessages>")
System.exit(1)
}
val Array(zkQuorum,metadataBrokerList,group,topicInputMessages) = args
setParameters(zkQuorum,metadataBrokerList,group,topicInputMessages)
run(kafka_num_threads.toInt)
}
def setParameters(mi: String,
mo: String,
g: String,t: String) {
zkQuorum = mi
metadataBrokerList = mo
group = g
topicInputMessages = t
}
def run(kafkaNumThreads: Int) = {
val conf = new SparkConf()
.setAppName("TEST")
val sc = new SparkContext(conf)
sc.setCheckpointDir("~/checkpointDir")
val ssc = new StreamingContext(sc, Seconds(5))
val topicMessagesMap = topicInputMessages.split(",").map((_, 1)).toMap
val messages = KafkaUtils.createStream(ssc, zkQuorum, group, topicMessagesMap).map(_._2)
messages.foreachRDD(rdd => {
rdd.foreachPartition{iter =>
val producer = UtilsTest.createProducer
iter.foreach { msg =>
val record = new ProducerRecord[String, String](UtilsTest.getOutputTopic(), msg)
producer.send(record)
}
producer.close()
}
})
ssc.start()
ssc.awaitTermination()
}
}
object UtilsDM extends Serializable {
#transient lazy val topicOutputMessages: String = "myTestTopic"
#transient lazy val metadataBrokerList: String = "172.12.34.233:9092"
var producer: KafkaProducer[String, String] = null
def createProducer: KafkaProducer[String, String] = {
val kafkaProps = new Properties()
kafkaProps.put("bootstrap.servers", metadataBrokerList)
// This is mandatory, even though we don't send key
kafkaProps.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
kafkaProps.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
kafkaProps.put("acks", "1")
// how many times to retry when produce request fails?
kafkaProps.put("retries", "3")
// This is an upper limit of how many messages Kafka Producer will attempt to batch before sending (bytes)
kafkaProps.put("batch.size", "5")
// How long will the producer wait before sending in order to allow more messages to get accumulated in the same batch
kafkaProps.put("linger.ms", "5")
new KafkaProducer[String, String](kafkaProps)
}
}

Error Running SparkApp local or YARN

At the moment we got the problem, that submitting a spark app with --master local[*] and --master yarn leads to different behaviours. A local submitted application runs fine, starting it on yarn leads - after the first 5 to 7 stages - to the following error:
WARN TaskSetManager: Lost task 1.0 in stage 7.0 (TID 210, quickstart.cloudera): java.io.IOException: org.apache.spark.SparkException: Failed to get broadcast_8_piece0 of broadcast_8
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1177)
at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:165)
at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:88)
at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Failed to get broadcast_8_piece0 of broadcast_8
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply$mcVI$sp(TorrentBroadcast.scala:137)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.apache.spark.broadcast.TorrentBroadcast.org$apache$spark$broadcast$TorrentBroadcast$$readBlocks(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$readBroadcastBlock$1.apply(TorrentBroadcast.scala:175)
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1174)
... 11 more
[Stage 7:> (0 + 1) / 2]16/07/22 07:34:53 ERROR TaskSetManager: Task 1 in stage 7.0 failed 4 times; aborting job
16/07/22 07:34:53 ERROR InsertIntoHadoopFsRelation: Aborting job.
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 7.0 failed 4 times, most recent failure: Lost task 1.3 in stage 7.0 (TID 214, quickstart.cloudera): java.io.IOException: org.apache.spark.SparkException: Failed to get broadcast_8_piece0 of broadcast_8
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1177)
at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:165)
at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:88)
at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Failed to get broadcast_8_piece0 of broadcast_8
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply$mcVI$sp(TorrentBroadcast.scala:137)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.apache.spark.broadcast.TorrentBroadcast.org$apache$spark$broadcast$TorrentBroadcast$$readBlocks(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$readBroadcastBlock$1.apply(TorrentBroadcast.scala:175)
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1174)
... 11 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1294)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1282)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1281)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1281)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1507)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1469)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1.apply$mcV$sp(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1.apply(InsertIntoHadoopFsRelation.scala:108)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1.apply(InsertIntoHadoopFsRelation.scala:108)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation.run(InsertIntoHadoopFsRelation.scala:108)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult$lzycompute(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.doExecute(commands.scala:69)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:140)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:138)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:138)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd$lzycompute(SQLContext.scala:933)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd(SQLContext.scala:933)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:197)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:146)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:137)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:325)
at dataCreator.DataSetGenerator$$anonfun$createVP$1.apply(DataSetGenerator.scala:160)
at dataCreator.DataSetGenerator$$anonfun$createVP$1.apply(DataSetGenerator.scala:144)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
at dataCreator.DataSetGenerator$.createVP(DataSetGenerator.scala:144)
at dataCreator.DataSetGenerator$.generateDataSet(DataSetGenerator.scala:78)
at runDriver$.main(runDriver.scala:14)
at runDriver.main(runDriver.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:672)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.IOException: org.apache.spark.SparkException: Failed to get broadcast_8_piece0 of broadcast_8
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1177)
at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:165)
at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:88)
at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Failed to get broadcast_8_piece0 of broadcast_8
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply$mcVI$sp(TorrentBroadcast.scala:137)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.apache.spark.broadcast.TorrentBroadcast.org$apache$spark$broadcast$TorrentBroadcast$$readBlocks(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$readBroadcastBlock$1.apply(TorrentBroadcast.scala:175)
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1174)
... 11 more
16/07/22 07:34:53 ERROR DefaultWriterContainer: Job job_201607220734_0000 aborted.
Exception in thread "main" org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1.apply$mcV$sp(InsertIntoHadoopFsRelation.scala:156)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1.apply(InsertIntoHadoopFsRelation.scala:108)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1.apply(InsertIntoHadoopFsRelation.scala:108)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation.run(InsertIntoHadoopFsRelation.scala:108)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult$lzycompute(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.doExecute(commands.scala:69)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:140)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:138)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:138)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd$lzycompute(SQLContext.scala:933)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd(SQLContext.scala:933)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:197)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:146)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:137)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:325)
at dataCreator.DataSetGenerator$$anonfun$createVP$1.apply(DataSetGenerator.scala:160)
at dataCreator.DataSetGenerator$$anonfun$createVP$1.apply(DataSetGenerator.scala:144)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
at dataCreator.DataSetGenerator$.createVP(DataSetGenerator.scala:144)
at dataCreator.DataSetGenerator$.generateDataSet(DataSetGenerator.scala:78)
at runDriver$.main(runDriver.scala:14)
at runDriver.main(runDriver.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:672)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 7.0 failed 4 times, most recent failure: Lost task 1.3 in stage 7.0 (TID 214, quickstart.cloudera): java.io.IOException: org.apache.spark.SparkException: Failed to get broadcast_8_piece0 of broadcast_8
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1177)
at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:165)
at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:88)
at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Failed to get broadcast_8_piece0 of broadcast_8
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply$mcVI$sp(TorrentBroadcast.scala:137)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.apache.spark.broadcast.TorrentBroadcast.org$apache$spark$broadcast$TorrentBroadcast$$readBlocks(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$readBroadcastBlock$1.apply(TorrentBroadcast.scala:175)
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1174)
... 11 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1294)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1282)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1281)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1281)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1507)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1469)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1.apply$mcV$sp(InsertIntoHadoopFsRelation.scala:150)
... 34 more
Caused by: java.io.IOException: org.apache.spark.SparkException: Failed to get broadcast_8_piece0 of broadcast_8
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1177)
at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:165)
at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:88)
at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Failed to get broadcast_8_piece0 of broadcast_8
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply$mcVI$sp(TorrentBroadcast.scala:137)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.apache.spark.broadcast.TorrentBroadcast.org$apache$spark$broadcast$TorrentBroadcast$$readBlocks(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$readBroadcastBlock$1.apply(TorrentBroadcast.scala:175)
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1174)
... 11 more
We guess this should be the interesting part of the code. When the method handleTypes isn't called. The app runs without the error as well.
/**
* Generates VP table for each unique predicate in input RDF dataset.
* All tables have to be cached, since they are used for generation of ExtVP
* tables.
*/
private def createVP() = {
// create directory for all vp tables
Helper.removeDirInHDFS(Settings.vpDir)
Helper.createDirInHDFS(Settings.vpDir)
StatisticWriter.initNewStatisticFile("VP")
// create and cache vpTables for all predicates in input RDF dataset
for (predicate <- _uPredicates){
var vpTable = _sqlContext.sql("select sub, obj "
+ "from triples where pred='"+predicate+"'")
val cleanPredicate = Helper.getPartName(predicate)
// --> without this call no error occurs <--
vpTable = handleTypes(vpTable, predicate)
vpTable.registerTempTable(cleanPredicate)
_sqlContext.cacheTable(cleanPredicate)
_vpTableSizes(predicate) = vpTable.count()
//vpTable.saveAsParquetFile(Settings.vpDir + cleanPredicate + ".parquet")
vpTable.write.parquet(Settings.vpDir + cleanPredicate + ".parquet")
// print statistic line
StatisticWriter.incSavedTables()
StatisticWriter.addTableStatistic("<" + predicate + ">",
-1,
_vpTableSizes(predicate))
writeLoadScript(Settings.vpDir + cleanPredicate + ".parquet", cleanPredicate, "", vpTable)
}
StatisticWriter.closeStatisticFile()
}
private def handleTypes(vTable: DataFrame, predIn: String) = {
var pred = predIn
//println("'" + pred + "'")
if(pred.startsWith("<") && pred.endsWith(">")) {
pred = pred.substring(1, pred.length() - 1)
}
var newSchema = StructType( StructField("sub", StringType, false) :: StructField("obj", StringType, false) :: Nil)
var predType = ""
// type check of object
if(integerPred.contains(pred)) {
newSchema = StructType( StructField("sub", StringType, false) :: StructField("obj", IntegerType, true) :: Nil)
predType = "int"
}else if (doublePred.contains(pred)) {
newSchema = StructType( StructField("sub", StringType, false) :: StructField("obj", DoubleType, false) :: Nil)
predType = "double"
}else if (datePred.contains(pred)) {
newSchema = StructType( StructField("sub", StringType, false) :: StructField("obj", DateType, false) :: Nil)
predType = "date"
}else if (dateTimePred.contains(pred)) {
newSchema = StructType( StructField("sub", StringType, false) :: StructField("obj", TimestampType, false) :: Nil)
predType = "timestamp"
}
var newRdd = vTable.rdd
newRdd = newRdd.map( r => extractObj(r, predType))
var newDF = _sqlContext.createDataFrame(newRdd, newSchema)
newDF
}
private def extractObj(r: Row, predType: String) = {
var pattern = new Regex("(?<=\").*?(?=\")")
var obj = r.getString(1)
var result = obj
if(obj.contains("^^")) {
result = pattern.findFirstIn(obj).get
if(predType.equals("timestamp")){
result = result.replace("T", " ")
}
}
var result2 = Row(r.getString(0), result)
if(predType.equals("int")){
val ret = result.toInt
result2 = Row(r.getString(0), ret)
}else if(predType.equals("double")){
val ret = result.toDouble
result2 = Row(r.getString(0), ret)
}else if(predType.equals("date")){
val ret = getDate(result)
result2 = Row(r.getString(0), ret)
}else if(predType.equals("timestamp")){
val ret = getTimestamp(result)
result2 = Row(r.getString(0), ret)
}
result2
}
def getDate(x:Any) :java.sql.Date = {
val format = new SimpleDateFormat("yyyy-MM-dd")
if (x.toString() == "")
return null
else {
val d = format.parse(x.toString());
val t = new Date(d.getDate());
return t
}
}
def getTimestamp(x:Any) :java.sql.Timestamp = {
val format = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss")
if (x.toString() == "")
return null
else {
val d = format.parse(x.toString());
val t = new Timestamp(d.getTime());
return t
}
}
def writeLoadScript(path: String, tableName: String, relType: String, table: DataFrame) = {
var relationType = relType.toUpperCase
var columnList = table.schema.toSeq
var subType = columnList(0).dataType.simpleString
var objType = columnList(1).dataType.simpleString
val fw = new java.io.FileWriter(_loadScriptName, true)
try {
if(tableName == "triples") {
fw.write("DROP TABLE IF EXISTS triples;\n")
fw.write("CREATE EXTERNAL TABLE triples (sub STRING, pred STRING, obj STRING)\n")
fw.write("STORED AS PARQUET LOCATION \'${hiveconf:prepath}" + path + "\';\n\n")
}else {
fw.write("DROP TABLE IF EXISTS " + relationType + _delimiter + tableName + ";\n")
fw.write("CREATE EXTERNAL TABLE " + relationType + _delimiter + tableName + " (sub " + subType + ", obj " + objType +")\n")
fw.write("STORED AS PARQUET LOCATION \'${hiveconf:prepath}" + path + "\';\n\n")
}
}
finally fw.close()
}
def initLoadScript() = {
val fw = new java.io.FileWriter(_loadScriptName, false)
try {
fw.write("-- Hive 1.2.0 or later is needed! \n")
}
finally fw.close()
}
We run this on Spark 1.6.1 and Hadoop 2.7.1 as well as on the Cloudera Quickstart VM with version 1.5.0-cdh5.5.2 and 2.6.0-cdh5.5.2. If you have any suggestions where to look at, let us know!
Thanks
It seems like the error is related to this open spark issue:
https://issues.apache.org/jira/browse/SPARK-5594
As recommended in this post of the issue, we serialized the function extractObj, the code now looks like that:
object FunctionSerializable extends Serializable{
def extractObj(r: Row, predType: String): Row = {
var pattern = new Regex("(?<=\").*?(?=\")")
var obj = r.getString(1)
var result = obj
if(obj.contains("^^")) {
result = pattern.findFirstIn(obj).get
if(predType.equals("timestamp")){
result = result.replace("T", " ")
}
}
var result2 = Row(r.getString(0), result)
if(predType.equals("int")){
val ret = result.toInt
result2 = Row(r.getString(0), ret)
}else if(predType.equals("double")){
val ret = result.toDouble
result2 = Row(r.getString(0), ret)
}else if(predType.equals("date")){
val ret = getDate(result)
result2 = Row(r.getString(0), ret)
}else if(predType.equals("timestamp")){
val ret = getTimestamp(result)
result2 = Row(r.getString(0), ret)
}
result2
}
def getDate(x:Any) :java.sql.Date = {
val format = new SimpleDateFormat("yyyy-MM-dd")
if (x.toString() == "")
return null
else {
val d = format.parse(x.toString());
val t = new Date(d.getDate());
return t
}
}
def getTimestamp(x:Any) :java.sql.Timestamp = {
val format = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss")
if (x.toString() == "")
return null
else {
val d = format.parse(x.toString());
val t = new Timestamp(d.getTime());
return t
}
}
}
The call in handleTypes looks like that:
newRdd = newRdd.map( r => FunctionSerializable.extractObj(r, predType))
And everything is working fine... :-)

hBaseRDD.collect() giving an error

I am wotking with spark and hbase. I used HBaseTest.scala.
rdd.count() is giving accurate result. But when I try to do rdd.collect() following error came:
java.io.NotSerializableException: org.apache.hadoop.hbase.io.ImmutableBytesWritable
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1183)
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1547)
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1508)
at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177)
at java.io.ObjectOutputStream.writeArray(ObjectOutputStream.java:1377)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1173)
at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:42)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:73)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:210)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Not able to figure out the issue. I want to print some rows of hbase table.
I had the same problem and found the solution here.
Here is a code snippet
type HBaseRow = java.util.NavigableMap[Array[Byte], java.util.NavigableMap[Array[Byte], java.util.NavigableMap[java.lang.Long, Array[Byte]]]]
type CFTimeseriesRow = Map[Array[Byte], Map[Array[Byte], Map[Long, Array[Byte]]]]
type CFTimeseriesRowStr = scala.collection.immutable.Map[String, scala.collection.immutable.Map[String, scala.collection.immutable.Map[Long, String]]]
import scala.collection.JavaConverters._
def rowToStrMap(navMap: CFTimeseriesRow): CFTimeseriesRowStr = navMap.map(cf =>
(Bytes.toString(cf._1), cf._2.map(col =>
(Bytes.toString(col._1), col._2.map(elem => (elem._1, Bytes.toString(elem._2)))))))
def navMapToMap(navMap: HBaseRow): CFTimeseriesRow =
navMap.asScala.toMap.map(cf =>
(cf._1, cf._2.asScala.toMap.map(col =>
(col._1, col._2.asScala.toMap.map(elem => (elem._1.toLong, elem._2))))))
#transient val conf = HBaseConfiguration.create()
conf.set(TableInputFormat.INPUT_TABLE, tableName)
val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
classOf[org.apache.hadoop.hbase.client.Result])
.map(kv => (kv._1.get(), navMapToMap(kv._2.getMap)))
.map(kv => (Bytes.toString(kv._1), rowToStrMap(kv._2))).take(10).foreach(println)