spark java.io.NotSerializableException: org.apache.spark.SparkContext - scala

I'm trying to check whether a record exists, based on messages received from Kafka, using Spark Streaming. When I run the RunReadLogByKafka object, a NotSerializableException for SparkContext is thrown. I googled it, but I still don't know how to fix it. Could anyone suggest how to rewrite it? Thanks in advance.
package com.test.spark.hbase
import java.sql.{DriverManager, PreparedStatement, Connection}
import java.text.SimpleDateFormat
import com.powercn.spark.LogRow
import com.powercn.spark.SparkReadHBaseTest.{SensorStatsRow, SensorRow}
import kafka.serializer.{DefaultDecoder, StringDecoder}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Result, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.sql.{SQLContext, Row}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
/** One decoded HBase row: its row key and the string stored in the `data:content` cell. */
case class LogRow(rowkey: String, content: String)

object LogRow {
  /**
   * Builds a [[LogRow]] from an HBase `Result`.
   *
   * @param result a single row returned by an HBase scan
   * @return the row key and the `data:content` cell, both decoded with `Bytes.toString`
   */
  def parseLogRow(result: Result): LogRow = {
    val family = Bytes.toBytes("data")
    val qualifier = Bytes.toBytes("content")
    LogRow(
      rowkey = Bytes.toString(result.getRow()),
      content = Bytes.toString(result.getValue(family, qualifier))
    )
  }
}
/**
 * Reads the HBase table `log` through Spark and prints its contents via Spark SQL.
 *
 * The constructor's `SparkContext` is used inside [[func]], so it is kept as a field of the
 * instance. Although the class is declared `Serializable`, serializing an instance therefore
 * fails with `NotSerializableException: org.apache.spark.SparkContext`. Instances can only be
 * used on the driver; they must not be captured by closures that Spark ships to executors.
 *
 * @param sct the driver's SparkContext, used to build the HBase-backed RDD
 */
class ReadLogByKafka(sct: SparkContext) extends Serializable {

  /**
   * Scans the `log` table, prints the row counts, the schema and a sample of rows.
   *
   * Any `Exception` raised while querying is caught and its stack trace printed; nothing is
   * rethrown.
   *
   * @param records the received message; the body does not currently consult it
   */
  implicit def func(records: String): Unit = {
    // NOTE: @transient only matters for fields; on these method-local vals it has no effect.
    @transient val conf = HBaseConfiguration.create()
    conf.set(TableInputFormat.INPUT_TABLE, "log")
    @transient val sc = sct
    @transient val sqlContext = SQLContext.getOrCreate(sc)
    import sqlContext.implicits._
    try {
      // query info table
      val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])
      println(hBaseRDD.count())
      // transform (ImmutableBytesWritable, Result) tuples into an RDD of Results
      val resultRDD = hBaseRDD.map(tuple => tuple._2)
      println(resultRDD.count())
      val logRDD = resultRDD.map(LogRow.parseLogRow)
      val logDF = logRDD.toDF()
      logDF.printSchema()
      logDF.show()
      // register the DataFrame as a temp table
      logDF.registerTempTable("LogRow")
      val logAdviseDF = sqlContext.sql("SELECT rowkey, content as content FROM LogRow ")
      logAdviseDF.printSchema()
      logAdviseDF.take(5).foreach(println)
    } catch {
      case e: Exception => e.printStackTrace()
    }
  }
}
package com.test.spark.hbase
import kafka.serializer.{DefaultDecoder, StringDecoder}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.io.Text
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Streaming entry point: consumes the Kafka topic `log` in 2-second batches and passes every
 * message value to `ReadLogByKafka.func`.
 */
object RunReadLogByKafka extends Serializable {
  def main(args: Array[String]): Unit = {
    val broker = "192.168.13.111:9092"
    val topic = "log"
    // NOTE: @transient only matters for fields; on these method-local vals it has no effect.
    @transient val sparkConf = new SparkConf().setAppName("RunReadLogByKafka")
    @transient val streamingContext = new StreamingContext(sparkConf, Seconds(2))
    @transient val sc = streamingContext.sparkContext
    // NOTE(review): Kafka's consumer setting is normally spelled "auto.offset.reset";
    // confirm that the "kafka."-prefixed key below is actually honoured.
    val kafkaConf = Map("metadata.broker.list" -> broker,
      "group.id" -> "group1",
      "zookeeper.connection.timeout.ms" -> "3000",
      "kafka.auto.offset.reset" -> "smallest")
    // Define which topics to read from
    val topics = Set(topic)
    // Keep only the message value of each (key, value) pair.
    val messages = KafkaUtils.createDirectStream[Array[Byte], String, DefaultDecoder, StringDecoder](
      streamingContext, kafkaConf, topics).map(_._2)
    messages.foreachRDD(rdd => {
      val readLogByKafka = new ReadLogByKafka(sc)
      // parse every message, it will throw NotSerializableException:
      // rdd.foreach serializes its closure for the executors, and that closure captures
      // readLogByKafka, which holds the (non-serializable) SparkContext in its `sct` field.
      rdd.foreach(readLogByKafka.func)
    })
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2021)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:889)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:888)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.RDD.foreach(RDD.scala:888)
at com.test.spark.hbase.RunReadLogByKafka$$anonfun$main$1.apply(RunReadLogByKafka.scala:38)
at com.test.spark.hbase.RunReadLogByKafka$$anonfun$main$1.apply(RunReadLogByKafka.scala:35)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:631)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:631)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:42)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:40)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:40)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:399)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:40)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:34)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:207)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:207)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:207)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:206)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.NotSerializableException: org.apache.spark.SparkContext
Serialization stack:
- object not serializable (class: org.apache.spark.SparkContext, value: org.apache.spark.SparkContext@5207878f)
- field (class: com.test.spark.hbase.ReadLogByKafka, name: sct, type: class org.apache.spark.SparkContext)
- object (class com.test.spark.hbase.ReadLogByKafka, com.test.spark.hbase.ReadLogByKafka@60212100)
- field (class: com.test.spark.hbase.RunReadLogByKafka$$anonfun$main$1$$anonfun$apply$1, name: readLogByKafka$1, type: class com.test.spark.hbase.ReadLogByKafka)
- object (class com.test.spark.hbase.RunReadLogByKafka$$anonfun$main$1$$anonfun$apply$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:84)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
... 30 more

The object you get as an argument in foreachRDD is a standard Spark RDD, so you have to obey exactly the same rules as usual, including no nested actions or transformations and no access to the SparkContext. It is not exactly clear what you are trying to achieve (it doesn't look like ReadLogByKafka.func is doing anything useful), but I guess you're looking for some kind of join.

Related

How to create a PolygonRDD from H3 boundary?

I'm using Apache Spark with Apache Sedona (previously called GeoSpark), and I'm trying to do the following:
Take a DataFrame containing latitude and longitude in each row (it comes from an arbitrary source, it neither is a PointRDD nor comes from a specific file format) and transform it into a DataFrame with the H3 index of each point.
Take that DataFrame and create a PolygonRDD containing the H3 cell boundaries of each distinct H3 index.
This is what I have so far:
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.sedona.core.spatialRDD.PolygonRDD
import org.apache.sedona.sql.utils.SedonaSQLRegistrator
import org.apache.sedona.viz.core.Serde.SedonaVizKryoRegistrator
import org.apache.sedona.viz.sql.utils.SedonaVizRegistrator
import org.locationtech.jts.geom.{Polygon, GeometryFactory, Coordinate}
import com.uber.h3core.H3Core
import com.uber.h3core.util.GeoCoord
// Question listing: computes the H3 index (resolution 7) of a few sample points and tries to
// build one polygon per distinct cell. As written it fails with "Task not serializable"
// because the rdd.map closure below references the driver-side H3Core instance.
object Main {
def main(args: Array[String]) {
// Local Spark session configured with Kryo and the Sedona-Viz Kryo registrator.
val sparkSession: SparkSession = SparkSession
.builder()
.config("spark.serializer", classOf[KryoSerializer].getName)
.config("spark.kryo.registrator", classOf[SedonaVizKryoRegistrator].getName)
.master("local[*]")
.appName("Sedona-Analysis")
.getOrCreate()
import sparkSession.implicits._
SedonaSQLRegistrator.registerAll(sparkSession)
SedonaVizRegistrator.registerAll(sparkSession)
// Sample (latitude, longitude) pairs.
val df = Seq(
(-8.01681, -34.92618),
(-25.59306, -49.39895),
(-7.17897, -34.86518),
(-20.24521, -42.14273),
(-20.24628, -42.14785),
(-27.01641, -50.94109),
(-19.72987, -47.94319)
).toDF("latitude", "longitude")
// Both helpers are local vals created on the driver; the closures below reference them.
val core: H3Core = H3Core.newInstance()
val geoFactory = new GeometryFactory()
// UDF mapping (lat, lng, resolution) to an H3 index; it also references `core`.
val geoToH3 = udf((lat: Double, lng: Double, res: Int) => core.geoToH3(lat, lng, res))
val trdd = df
.select(geoToH3($"latitude", $"longitude", lit(7)).as("h3index"))
.distinct()
.rdd
// This lambda uses `core` and `geoFactory`, so Spark has to serialize both with the task;
// H3Core is not Serializable, which is what the reported exception says.
.map(row => {
val h3 = row.getAs[Long](0)
val lboundary = core.h3ToGeoBoundary(h3)
// Copy the java.util.List of boundary vertices into an array.
val aboundary = lboundary.toArray(Array.ofDim[GeoCoord](lboundary.size))
// NOTE(review): the coordinate ring is not closed here; the follow-up version of this
// program appends the first point again before calling createPolygon.
val poly = geoFactory.createPolygon(
aboundary.map((c: GeoCoord) => new Coordinate(c.lat, c.lng))
)
// Remember which H3 cell the polygon belongs to.
poly.setUserData(h3)
poly
})
val polyRDD = new PolygonRDD(trdd)
polyRDD.rawSpatialRDD.foreach(println)
sparkSession.stop()
}
}
However, after running sbt assembly and submitting the output jar to spark-submit, I get this error:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:416)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:406)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2362)
at org.apache.spark.rdd.RDD.$anonfun$map$1(RDD.scala:396)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
at org.apache.spark.rdd.RDD.map(RDD.scala:395)
at Main$.main(Main.scala:44)
at Main.main(Main.scala)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:928)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1007)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1016)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.NotSerializableException: com.uber.h3core.H3Core
Serialization stack:
- object not serializable (class: com.uber.h3core.H3Core, value: com.uber.h3core.H3Core@3407ded1)
- element of array (index: 0)
- array (class [Ljava.lang.Object;, size 2)
- field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
- object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class Main$, functionalInterfaceMethod=scala/Function1.apply:(Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic Main$.$anonfun$main$2:(Lcom/uber/h3core/H3Core;Lorg/locationtech/jts/geom/GeometryFactory;Lorg/apache/spark/sql/Row;)Lorg/locationtech/jts/geom/Polygon;, instantiatedMethodType=(Lorg/apache/spark/sql/Row;)Lorg/locationtech/jts/geom/Polygon;, numCaptured=2])
- writeReplace data (class: java.lang.invoke.SerializedLambda)
- object (class Main$$$Lambda$1710/0x0000000840d7f040, Main$$$Lambda$1710/0x0000000840d7f040@4853f592)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:41)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:413)
... 22 more
What is the proper way to achieve what I'm trying to do?
So, basically just adding the Serializable trait to an object containing the H3Core was enough. Also, I had to adjust the Coordinate array to begin and end with the same point.
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.sedona.core.spatialRDD.PolygonRDD
import org.apache.sedona.sql.utils.SedonaSQLRegistrator
import org.apache.sedona.viz.core.Serde.SedonaVizKryoRegistrator
import org.apache.sedona.viz.sql.utils.SedonaVizRegistrator
import org.locationtech.jts.geom.{Polygon, GeometryFactory, Coordinate}
import com.uber.h3core.H3Core
import com.uber.h3core.util.GeoCoord
// Holder for the H3 and JTS helpers. The Spark closures in Main refer to H3.core and
// H3.geoFactory through this object instead of referencing local vals of main.
object H3 extends Serializable {
// Entry point to the H3 API (geoToH3, h3ToGeoBoundary).
val core = H3Core.newInstance()
// Factory used to build the JTS polygons.
val geoFactory = new GeometryFactory()
}
/**
 * Demo driver: indexes a few sample points with H3 at resolution 7 and prints the boundary
 * polygon of every distinct cell. H3 and JTS helpers are reached through the `H3` object.
 */
object Main {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .config("spark.serializer", classOf[KryoSerializer].getName)
      .config("spark.kryo.registrator", classOf[SedonaVizKryoRegistrator].getName)
      .master("local[*]")
      .appName("Sedona-Analysis")
      .getOrCreate()
    import spark.implicits._

    SedonaSQLRegistrator.registerAll(spark)
    SedonaVizRegistrator.registerAll(spark)

    // Sample (latitude, longitude) pairs.
    val points = Seq(
      (-8.01681, -34.92618),
      (-25.59306, -49.39895),
      (-7.17897, -34.86518),
      (-20.24521, -42.14273),
      (-20.24628, -42.14785),
      (-27.01641, -50.94109),
      (-19.72987, -47.94319)
    ).toDF("latitude", "longitude")

    val toH3Index = udf { (lat: Double, lng: Double, res: Int) =>
      H3.core.geoToH3(lat, lng, res)
    }

    // One row per distinct H3 cell.
    val cellIndexes = points
      .select(toH3Index($"latitude", $"longitude", lit(7)).as("h3index"))
      .distinct()

    val cellPolygons = cellIndexes.rdd.map { row =>
      val cell = row.getAs[Long](0)
      val vertices = H3.core.h3ToGeoBoundary(cell)
      val openRing = vertices
        .toArray(Array.ofDim[GeoCoord](vertices.size))
        .map((c: GeoCoord) => new Coordinate(c.lat, c.lng))
      // Repeat the first vertex at the end so the ring is closed.
      val closedRing = openRing :+ openRing(0)
      val polygon = H3.geoFactory.createPolygon(closedRing)
      // Tag the polygon with the cell it was built from.
      polygon.setUserData(cell)
      polygon
    }

    val polyRDD = new PolygonRDD(cellPolygons)
    polyRDD.rawSpatialRDD.foreach(println)
    spark.stop()
  }
}

Unable to solve the error: java.io.NotSerializableException: org.apache.avro.Schema$RecordSchema

I am trying to read data from a table through SparkSession, and publish it to a Kafka topic. Using below piece of code for the same:
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord}
import org.apache.avro.specific.SpecificDatumWriter
import org.apache.avro.io._
import org.apache.kafka.clients.CommonClientConfigs
import org.apache.kafka.clients.producer._
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.kafka.common.serialization.ByteArraySerializer
import java.io.{ByteArrayOutputStream, StringWriter}
// Question listing: reads rows with Spark SQL and tries to publish each one to Kafka as an
// Avro record. It fails with NotSerializableException: org.apache.avro.Schema$RecordSchema
// because the rdd.map closure references the `schema` value created on the driver.
// NOTE(review): Properties, Source and SparkSession are used below but are not among the
// imports shown with this listing.
object Producer extends Serializable {
def main(args: Array[String]): Unit = {
// Kafka producer settings: String keys, byte-array values.
val props = new Properties()
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName)
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName)
// Avro schema parsed on the driver; it is referenced inside the closure below.
val lines= Source.fromFile("file")
val schema = new Schema.Parser().parse(lines)
val spark = new SparkSession.Builder().enableHiveSupport() getOrCreate()
import spark.implicits._
val df = spark.sql("select * from table")
// NOTE(review): no action is invoked on the mapped RDD in this listing.
df.rdd.map{
value => {
// A producer is created, flushed and closed for every single row.
val prod = new KafkaProducer[String, Array[Byte]](props)
// Copy columns 1..4 of the row into an Avro record.
val records = new GenericData.Record(schema)
records.put("col1",value.getString(1))
records.put("col2",value.getString(2))
records.put("col3",value.getString(3))
records.put("col4",value.getString(4))
// Binary-encode the record into an in-memory buffer.
val writer = new SpecificDatumWriter[GenericRecord](schema)
val out = new ByteArrayOutputStream()
val encoder: BinaryEncoder = EncoderFactory.get().binaryEncoder(out, null)
writer.write(records, encoder)
encoder.flush()
out.close()
val serializedBytes: Array[Byte] = out.toByteArray()
// NOTE(review): `col1` is not defined anywhere in this listing.
val record = new ProducerRecord("topic",col1.toString , serializedBytes)
val data = prod.send(record)
prod.flush()
prod.close() }
}
spark.close()
}
}
And, below error is thrown when I execute it:
Caused by: java.io.NotSerializableException:
org.apache.avro.Schema$RecordSchema Serialization stack:
- object not serializable (class: org.apache.avro.Schema$RecordSchema, value:
{"type":"record","name":"data","namespace":"com.data.record","fields":[{"name":"col1","type":"string"},{"name":"col2","type":"string"},{"name":"col3","type":"string"},{"name":"col4","type":"string"}]})
field (class: scala.runtime.ObjectRef, name: elem, type: class java.lang.Object)
object (class scala.runtime.ObjectRef,
{"type":"record","name":"data","namespace":"com.data.record","fields":[{"name":"col1","type":"string"},{"name":"col2","type":"string"},{"name":"col3","type":"string"},{"name":"col4","type":"string"}]})
- field (class: com.kafka.driver.KafkaProducer.Producer$$anonfun$main$1, name: schema$1, type: class scala.runtime.ObjectRef)
However, it runs fine when I bring the dataset to the driver using df.rdd.collect.foreach. Instead, I need to publish the messages at cluster level, which is why I am using rdd.map. I'm not sure what exactly I am missing here that is causing this error. Any help towards resolving this would be highly appreciated, thanks!
I figured out that the Schema and KafkaProducer objects need to be created on the executors. To do that, I modified the above code as follows:
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord}
import org.apache.avro.specific.SpecificDatumWriter
import org.apache.avro.io._
import org.apache.kafka.clients.CommonClientConfigs
import org.apache.kafka.clients.producer._
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.kafka.common.serialization.ByteArraySerializer
import java.io.{ByteArrayOutputStream, StringWriter}
/**
 * Publishes every row of `table` to the Kafka topic `topic` as a binary Avro record.
 *
 * The Avro schema and the Kafka producer are created inside `foreachPartition`, i.e. once per
 * partition on the executors, so neither of these non-serializable objects has to be shipped
 * from the driver.
 */
object Producer extends Serializable {
  def main(args: Array[String]): Unit = {
    // Kafka producer settings: String keys, byte-array values.
    // NOTE(review): no bootstrap.servers is set in this listing; presumably it is supplied
    // elsewhere. Confirm before running.
    val props = new Properties()
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName)

    val spark = new SparkSession.Builder().enableHiveSupport().getOrCreate()
    import spark.implicits._
    val df = spark.sql("select * from table")

    df.foreachPartition {
      rows => {
        // Schema.Parser.parse needs the schema text (not a Source), so read the file fully
        // and always release the file handle.
        val schemaSource = Source.fromFile("file")
        val schema =
          try new Schema.Parser().parse(schemaSource.mkString)
          finally schemaSource.close()

        // The datum writer only depends on the schema, so one per partition is enough.
        val writer = new SpecificDatumWriter[GenericRecord](schema)
        val prod = new KafkaProducer[String, Array[Byte]](props)
        try {
          rows.foreach { value =>
            // The original keyed the message by an undefined `col1`; the value stored under
            // "col1" is used as the key here.
            val key = value.getString(1)
            val record = new GenericData.Record(schema)
            record.put("col1", key)
            record.put("col2", value.getString(2))
            record.put("col3", value.getString(3))
            record.put("col4", value.getString(4))

            // Binary-encode the record into an in-memory buffer.
            val out = new ByteArrayOutputStream()
            val encoder: BinaryEncoder = EncoderFactory.get().binaryEncoder(out, null)
            writer.write(record, encoder)
            encoder.flush()

            prod.send(new ProducerRecord("topic", key, out.toByteArray()))
          }
          prod.flush()
        } finally {
          // Close the producer even when encoding or sending a row fails.
          prod.close()
        }
      }
    }
    spark.close()
  }
}

Unable to serialize SparkContext in foreachRDD

I am trying to save streaming data from Kafka to Cassandra. I am able to read and parse the data, but when I call the lines below to save the data I get a Task not serializable exception. My class extends Serializable, so I am not sure why I am seeing this error. I didn't get much help even after googling for 3 hours; can somebody give me any pointers?
val collection = sc.parallelize(Seq((obj.id, obj.data)))
collection.saveToCassandra("testKS", "testTable ", SomeColumns("id", "data"))
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SaveMode
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka.KafkaUtils
import com.datastax.spark.connector._
import kafka.serializer.StringDecoder
import org.apache.spark.rdd.RDD
import com.datastax.spark.connector.SomeColumns
import java.util.Formatter.DateTime
// Question listing: reads (key, message) pairs from Kafka, parses each message as JSON and
// tries to save it to Cassandra. It fails with "Task not serializable" because the function
// passed to rdd.foreachPartition references the SparkContext `sc`.
object StreamProcessor extends Serializable {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setMaster("local[2]").setAppName("StreamProcessor")
.set("spark.cassandra.connection.host", "127.0.0.1")
val sc = new SparkContext(sparkConf)
// 2-second micro-batches.
val ssc = new StreamingContext(sc, Seconds(2))
// NOTE(review): SQLContext is not among the imports shown, and sqlContext is not used below.
val sqlContext = new SQLContext(sc)
val kafkaParams = Map("metadata.broker.list" -> "localhost:9092")
// Topic names come from the command-line arguments.
val topics = args.toSet
val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
ssc, kafkaParams, topics)
stream.foreachRDD { rdd =>
if (!rdd.isEmpty()) {
// NOTE(review): this try has no catch or finally clause.
try {
// The function below is shipped to the executors, so everything it references must be
// serializable.
rdd.foreachPartition { iter =>
iter.foreach {
case (key, msg) =>
val obj = msgParseMaster(msg)
// `sc` is used inside the executor-side function here; SparkContext is not
// serializable, which is the reported failure.
val collection = sc.parallelize(Seq((obj.id, obj.data)))
// NOTE(review): the table name literal has a trailing space; confirm it is intended.
collection.saveToCassandra("testKS", "testTable ", SomeColumns("id", "data"))
}
}
}
}
}
ssc.start()
ssc.awaitTermination()
}
import org.json4s._
import org.json4s.native.JsonMethods._
// Parsed message payload.
// NOTE(review): lower-case `serializable`; presumably `Serializable` was intended. Confirm.
case class wordCount(id: Long, data: String) extends serializable
// json4s formats used by `extract` below.
implicit val formats = DefaultFormats
// Parses a JSON message and extracts it into a wordCount.
def msgParseMaster(msg: String): wordCount = {
val m = parse(msg).extract[wordCount]
return m
}
}
I am getting
org.apache.spark.SparkException: Task not serializable
below is the full log
16/08/06 10:24:52 ERROR JobScheduler: Error running job streaming job 1470504292000 ms.0
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2055)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:919)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:918)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:918)
at
SparkContext isn't serializable, you can't use it inside foreachRDD, and from the use of your graph you don't need it. Instead, you can simply map over each RDD, parse out the relevant data and save that new RDD to cassandra:
// Turn every Kafka message into an (id, data) pair, then write each non-empty batch
// straight to Cassandra; no SparkContext is referenced inside the closures.
stream
  .map { case (_, payload) =>
    val parsed = msgParseMaster(payload)
    (parsed.id, parsed.data)
  }
  .foreachRDD { batch =>
    if (!batch.isEmpty) {
      batch.saveToCassandra("testKS", "testTable", SomeColumns("id", "data"))
    }
  }
You can not call sc.parallelize within a function passed to foreachPartition - that function would have to be serialized and sent to each executor, and SparkContext is (intentionally) not serializable (it should only reside within the Driver application, not the executor).

Exception in thread "main" scala.ScalaReflectionException

This is my code for joining two dataframes
package org.test.rddjoins
import org.apache.spark.SparkConf
import org.apache.spark.SparkConf
import org.apache.spark._
import org.apache.spark.rdd.RDD
/** Joins people's scores and ages, read from two comma-separated text files, via Spark SQL. */
object rdd {
  case class Score(name: String, score: Int)
  case class Age(name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("rdd").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext._

    // Each input line is "<name>,<number>".
    val scoreRows = sc.textFile("scores.txt").map { line =>
      val fields = line.split(",")
      Score(fields(0), fields(1).trim.toInt)
    }
    val ageRows = sc.textFile("ages.txt").map { line =>
      val fields = line.split(",")
      Age(fields(0), fields(1).trim.toInt)
    }

    // Expose both RDDs to SQL under the table names used in the query.
    scoreRows.registerAsTable("scores")
    ageRows.registerAsTable("ages")

    // The query text is kept flush-left so the string passed to sql() is unchanged.
    val query = """
SELECT a.name, a.age, s.score
FROM ages a JOIN scores s
ON a.name = s.name"""
    sqlContext.sql(query).collect().foreach(println)
  }
}
I am getting the following error while running it:
Exception in thread "main" scala.ScalaReflectionException: class org.apache.spark.sql.catalyst.ScalaReflection in JavaMirror with primordial classloader with boot classpath [C:\Users\Owner\Downloads\Compressed\eclipse\plugins\org.scala-lang.scala-library_2.11.8.v20160304-115712-1706a37eb8.jar;C:\Users\Owner\Downloads\Compressed\eclipse\plugins\org.scala-lang.scala-reflect_2.11.8.v20160304-115712-1706a37eb8.jar;C:\Program Files\Java\jdk1.8.0_77\jre\lib\resources.jar;C:\Program Files\Java\jdk1.8.0_77\jre\lib\rt.jar;C:\Program Files\Java\jdk1.8.0_77\jre\lib\sunrsasign.jar;C:\Program Files\Java\jdk1.8.0_77\jre\lib\jsse.jar;C:\Program Files\Java\jdk1.8.0_77\jre\lib\jce.jar;C:\Program Files\Java\jdk1.8.0_77\jre\lib\charsets.jar;C:\Program Files\Java\jdk1.8.0_77\jre\lib\jfr.jar;C:\Program Files\Java\jdk1.8.0_77\jre\classes] not found.
at scala.reflect.internal.Mirrors$RootsBase.staticClass(Mirrors.scala:123)
at scala.reflect.internal.Mirrors$RootsBase.staticClass(Mirrors.scala:22)
at org.apache.spark.sql.catalyst.ScalaReflection$$typecreator1$1.apply(ScalaReflection.scala:115)
at scala.reflect.api.TypeTags$WeakTypeTagImpl.tpe$lzycompute(TypeTags.scala:232)
at scala.reflect.api.TypeTags$WeakTypeTagImpl.tpe(TypeTags.scala:232)
at scala.reflect.api.TypeTags$class.typeOf(TypeTags.scala:341)
at scala.reflect.api.Universe.typeOf(Universe.scala:61)
at org.apache.spark.sql.catalyst.ScalaReflection$class.schemaFor(ScalaReflection.scala:115)
at org.apache.spark.sql.catalyst.ScalaReflection$.schemaFor(ScalaReflection.scala:33)
at org.apache.spark.sql.catalyst.ScalaReflection$class.schemaFor(ScalaReflection.scala:100)
at org.apache.spark.sql.catalyst.ScalaReflection$.schemaFor(ScalaReflection.scala:33)
at org.apache.spark.sql.catalyst.ScalaReflection$class.attributesFor(ScalaReflection.scala:94)
at org.apache.spark.sql.catalyst.ScalaReflection$.attributesFor(ScalaReflection.scala:33)
at org.apache.spark.sql.SQLContext.createSchemaRDD(SQLContext.scala:111)
at org.test.rddjoins.rdd$.main(rdd.scala:27)
Help!!!
The Apache Spark library is missing from the classpath.
The exception says that one of the Spark-related classes was not found on the classpath.
You should modify your classpath to add the required jars.

Spark Streaming into HBase with filtering logic

I have been trying to understand how spark streaming and hbase connect, but have not been successful. What I am trying to do is given a spark stream, process that stream and store the results in an hbase table. So far this is what I have:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.storage.StorageLevel
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{HBaseAdmin,HTable,Put,Get}
import org.apache.hadoop.hbase.util.Bytes
/**
 * Writes one cell to the HBase table "table": the row key, the column qualifier and the cell
 * value are all `row(0)`, stored in column family "cf".
 *
 * A table handle is opened on every call and is always closed again, even if the put fails.
 *
 * @param row the comma-split fields of one input line; only the first element is used
 */
def blah(row: Array[String]): Unit = {
  // HBaseConfiguration.create() replaces the deprecated public constructor.
  val hConf = HBaseConfiguration.create()
  val hTable = new HTable(hConf, "table")
  try {
    val key = Bytes.toBytes(row(0))
    val thePut = new Put(key)
    thePut.add(Bytes.toBytes("cf"), key, key)
    hTable.put(thePut)
  } finally {
    // Release the table's resources instead of leaking one handle per record.
    hTable.close()
  }
}
// Streaming context with 1-second batches, built on `sc` (not defined in this listing; the
// question says the code is run in spark-shell).
val ssc = new StreamingContext(sc, Seconds(1))
// Text lines read from a socket on localhost:9999.
val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
// Split every line on commas.
val words = lines.map(_.split(","))
// Pass every record of every batch to blah.
// NOTE(review): this is the job that fails with NotSerializableException: StreamingContext,
// presumably because in spark-shell the closure over `blah` drags in the enclosing shell
// scope, which also holds `ssc`. Confirm.
val store = words.foreachRDD(rdd => rdd.foreach(blah))
// Start receiving and processing data.
ssc.start()
I am currently running the above code in spark-shell. I am not sure what I am doing wrong.
I get the following error in the shell:
14/09/03 16:21:03 ERROR scheduler.JobScheduler: Error running job streaming job 1409786463000 ms.0
org.apache.spark.SparkException: Job aborted due to stage failure: Task not serializable: java.io.NotSerializableException: org.apache.spark.streaming.StreamingContext
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1033)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1017)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1015)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1015)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitMissingTasks(DAGScheduler.scala:770)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitStage(DAGScheduler.scala:713)
at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1176)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
I also double checked the hbase table, just in case, and nothing new is written in there.
I am running nc -lk 9999 on another terminal to feed in data into the spark-shell for testing.
With help from users on the spark user group, I was able to figure out how to get this to work. It looks like I needed to wrap my streaming, mapping and foreach call around a serializable object:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.storage.StorageLevel
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{HBaseAdmin,HTable,Put,Get}
import org.apache.hadoop.hbase.util.Bytes
/** Holds the per-record HBase write so that it can be called from Spark closures. */
object Blaher {
  /**
   * Writes one cell to the HBase table "table": the row key, the column qualifier and the
   * cell value are all `row(0)`, stored in column family "cf".
   *
   * A table handle is opened on every call and is always closed again, even if the put fails.
   *
   * @param row the comma-split fields of one input line; only the first element is used
   */
  def blah(row: Array[String]): Unit = {
    // HBaseConfiguration.create() replaces the deprecated public constructor.
    val hConf = HBaseConfiguration.create()
    val hTable = new HTable(hConf, "table")
    try {
      val key = Bytes.toBytes(row(0))
      val thePut = new Put(key)
      thePut.add(Bytes.toBytes("cf"), key, key)
      hTable.put(thePut)
    } finally {
      // Release the table's resources instead of leaking one handle per record.
      hTable.close()
    }
  }
}
// Serializable wrapper around the streaming pipeline; the per-record write lives in Blaher.
object TheMain extends Serializable {
  // Builds the socket stream, splits each line on commas, writes every record through
  // Blaher.blah, then starts the streaming context.
  def run(): Unit = {
    val streamingCtx = new StreamingContext(sc, Seconds(1))
    val store = streamingCtx
      .socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
      .map(_.split(","))
      .foreachRDD(rdd => rdd.foreach(Blaher.blah))
    streamingCtx.start()
  }
}
TheMain.run()
Seems to be a typical antipattern.
See "Design Patterns for using foreachRDD" chapter at http://spark.apache.org/docs/latest/streaming-programming-guide.html for correct pattern.