solr add document error - scala

I am trying to load CSV file to solr doc, i am trying using scala. I am new to scala. For case class structure, if i pass one set of values it works fine. But if i want to want all read values from CSV, it gives an error. I am not sure how to do it in scala, any help greatly appreciated.
object BasicParseCsv {
case class Person(id: String, name: String,age: String, addr: String )
val schema = ArrayBuffer[Person]()
def main(args: Array[String]) {
val master = args(0)
val inputFile = args(1)
val outputFile = args(2)
val sc = new SparkContext(master, "BasicParseCsv", System.getenv("SPARK_HOME"))
val params = new ModifiableSolrParams
val Solr = new HttpSolrServer("http://localhost:8983/solr/person1")
//Preparing the Solr document
val doc = new SolrInputDocument()
val input = sc.textFile(inputFile)
val result = input.map{ line =>
val reader = new CSVReader(new StringReader(line));
reader.readNext();
}
def getSolrDocument(person: Person): SolrInputDocument = {
val document = new SolrInputDocument()
document.addField("id",person.id)
document.addField("name", person.name)
document.addField("age",person.age)
document.addField("addr", person.addr)
document
}
def send(persons:List[Person]){
persons.foreach(person=>Solr.add(getSolrDocument(person)))
Solr.commit()
}
val people = result.map(x => Person(x(0), x(1),x(2),x(3)))
val book1 = new Person("101","xxx","20","abcd")
send(List(book1))
people.map(person => send(List(Person(person.id, person.name, person.age,person.addr))))
System.out.println("Documents added")
}
}
people.map(person => send(List(Person(person.id, person.name, person.age,person.addr)))) ==> gives error
val book1 = new Person("101","xxx","20","abcd") ==> works fine
Update : I get below error
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2067)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:324)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:323)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.map(RDD.scala:323)
at BasicParseCsv$.main(BasicParseCsv.scala:90)
at BasicParseCsv.main(BasicParseCsv.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)
Caused by: java.io.NotSerializableException: org.apache.http.impl.client.SystemDefaultHttpClient
Serialization stack:
- object not serializable (class: org.apache.http.impl.client.SystemDefaultHttpClient, value: org.apache.http.impl.client.SystemDefaultHttpClient#1dbd580)
- field (class: org.apache.solr.client.solrj.impl.HttpSolrServer, name: httpClient, type: interface org.apache.http.client.HttpClient)
- object (class org.apache.solr.client.solrj.impl.HttpSolrServer, org.apache.solr.client.solrj.impl.HttpSolrServer#17e0827)
- field (class: BasicParseCsv$$anonfun$main$1, name: Solr$1, type: class org.apache.solr.client.solrj.impl.HttpSolrServer)

Related

Scala error: Exception in thread "main" org.apache.spark.SparkException: Task not serializable

I got not serializable error when running this code:
import org.apache.spark.{SparkContext, SparkConf}
import scala.collection.mutable.ArrayBuffer
object Task1 {
def findHighestRatingUsers(movieRating: String): (String) = {
val tokens = movieRating.split(",", -1)
val movieTitle = tokens(0)
val ratings = tokens.slice(1, tokens.size)
val maxRating = ratings.max
var userIds = ArrayBuffer[Int]()
for(i <- 0 until ratings.length){
if (ratings(i) == maxRating) {
userIds += (i+1)
}
}
return movieTitle + "," + userIds.mkString(",")
return movieTitle
}
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Task 1")
val sc = new SparkContext(conf)
val Lines = sc.textFile(args(0))
val TitleAndMaxUserIds = Lines.map(findHighestRatingUsers)
.saveAsTextFile(args(1))
}
}
The error occurs at line:
val TitleAndMaxUserIds =Lines.map(findHighestRatingUsers)
.saveAsTextFile(args(1))
I believe this is due to something in function 'findHighestRatingUsers'. Could somebody explain why and how to fix it?
More info in the exception is like:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:416)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:406)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2362)
at org.apache.spark.rdd.RDD.$anonfun$map$1(RDD.scala:396)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
at org.apache.spark.rdd.RDD.map(RDD.scala:395)
at Task1$.main(Task1.scala:63)
at Task1.main(Task1.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:928)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1007)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1016)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.NotSerializableException: Task1$
Serialization stack:
- object not serializable (class: Task1$, value: Task1$#3c770db4)
- element of array (index: 0)
- array (class [Ljava.lang.Object;, size 1)
- field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
- object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class Task1$, functionalInterfaceMethod=scala/Function1.apply:(Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic Task1$.$anonfun$main$1:(LTask1$;Ljava/lang/String;)Ljava/lang/String;, instantiatedMethodType=(Ljava/lang/String;)Ljava/lang/String;, numCaptured=1])
- writeReplace data (class: java.lang.invoke.SerializedLambda)
- object (class Task1$$$Lambda$1023/20408451, Task1$$$Lambda$1023/20408451#4f59a516)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:41)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:413)
... 22 more
I checked this post
Difference between object and class in Scala and tried to use object to enclose the function:
import org.apache.spark.{SparkContext, SparkConf}
import scala.collection.mutable.ArrayBuffer
object Function{
def _findHighestRatingUsers(movieRating: String): (String) = {
val tokens = movieRating.split(",", -1)
val movieTitle = tokens(0)
val ratings = tokens.slice(1, tokens.size)
val maxRating = ratings.max
var userIds = ArrayBuffer[Int]()
for(i <- 0 until ratings.length){
if (ratings(i) == maxRating) {
userIds += (i+1)
}
}
return movieTitle + "," + userIds.mkString(",")
}
}
object Task1 {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Task 1")
val sc = new SparkContext(conf)
val textFile = sc.textFile(args(0))
val output = textFile.map(Function._findHighestRatingUsers)
.saveAsTextFile(args(1))
}
}
But still got exception With a huge amount of errors...
This time I tried to put object Function in the object task1 like this:
import org.apache.spark.{SparkContext, SparkConf}
import scala.collection.mutable.ArrayBuffer
object Task1 {
object Function{
def _findHighestRatingUsers(movieRating: String): (String) = {
val tokens = movieRating.split(",", -1)
val movieTitle = tokens(0)
val ratings = tokens.slice(1, tokens.size)
val maxRating = ratings.max
var userIds = ArrayBuffer[Int]()
for(i <- 0 until ratings.length){
if (ratings(i) == maxRating) {
userIds += (i+1)
}
}
return movieTitle + "," + userIds.mkString(",")
}
}
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Task 1")
val sc = new SparkContext(conf)
val textFile = sc.textFile(args(0))
val output = textFile.map(Function._findHighestRatingUsers)
.saveAsTextFile(args(1))
}
}
And problem solved. But I still don't know why the nested object solves this problem. Could somebody explain it?
And further more, I have several points not sure:
What is main function in scala? Is it the entrance of program?
Why we use an object to describe main function?
Could somebody give a common structure of a Scala program containing function, class or some basic components?
First thing is that I would recommend that you should get familiar by reading documentation both with Scala and Spark as your questions highlight that you are just starting working with it.
I'll give you some insights for your original question about "Task not serializable" (but not answering it precisely though) and let you open other questions for the questions you added later in your post otherwise this answer will be a mess.
As you probably know, Spark allows distributed computation. To do so, one thing Spark does is take the code you write, serialize it and send it to some executors somewhere to actually run it. The key part here is that your code must be serializable.
The error you got is telling you that Spark cannot serialize your code.
Now, how to make it serializable? This is where it can becomes challenging and even though Spark tries to help you by providing a "serialization stack", sometimes the info it provides are not that helpful.
In your case (1st example of code), findHighestRatingUsers must be serialized but to do so it has to serialize the whole object Task1 which is not serializable.
Why is Task1 not serializable? I'll admit I'm not really sure but I would bet on the main method, though I'd expected your 2nd example to be serializable then.
You can read more about this on various documentation or blog posts on the web. For instance: https://medium.com/swlh/spark-serialization-errors-e0eebcf0f6e6

SparkException: Task not serializable on class: org.apache.avro.generic.GenericDatumReader

I have input in json format with two fields, (size : BigInteger and data : String). Here data contains ZStd compressed Avro records. The task is to decode these records. I am using Spark-avro for this. But getting, Task not serializable exception.
Sample Data
{
"data": "7z776qOPevPJF5/0Dv9Rzx/1/i8gJJiQD5MTDGdbeNKKT"
"size" : 231
}
Code
import java.util.Base64
import com.github.luben.zstd.Zstd
import org.apache.avro.Schema
import com.twitter.bijection.Injection
import org.apache.avro.generic.GenericRecord
import com.twitter.bijection.avro.GenericAvroCodecs
import com.databricks.spark.avro.SchemaConverters
import org.apache.spark.sql.types.StructType
import com.databricks.spark.avro.SchemaConverters._
def decode2(input:String,size:Int,avroBijection:Injection[GenericRecord, Array[Byte]], sqlType:StructType): GenericRecord = {
val compressedGenericRecordBytes = Base64.getDecoder.decode(input)
val genericRecordBytes = Zstd.decompress(compressedGenericRecordBytes,size)
avroBijection.invert(genericRecordBytes).get
}
val myRdd = spark.read.format("json").load("/path").rdd
val rows = myRdd.mapPartitions{
lazy val schema = new Schema.Parser().parse(schemaStr)
lazy val avroBijection: Injection[GenericRecord, Array[Byte]] = GenericAvroCodecs.toBinary(schema)
lazy val sqlType = SchemaConverters.toSqlType(schema).dataType.asInstanceOf[StructType]
(iterator) => {
val myList = iterator.toList
myList.map{ x => {
val size = x(1).asInstanceOf[Long].intValue
val data = x(0).asInstanceOf [String]
decode2(data, size, avroBijection,sqlType)
}
}.iterator
}
}
Exception
files: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[987] at rdd at <console>:346
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2287)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1.apply(RDD.scala:794)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1.apply(RDD.scala:793)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.mapPartitions(RDD.scala:793)
... 112 elided
Caused by: java.io.NotSerializableException: org.apache.avro.generic.GenericDatumReader
Serialization stack:
- object not serializable (class: org.apache.avro.generic.GenericDatumReader, value: org.apache.avro.generic.GenericDatumReader#4937cd88)
- field (class: com.twitter.bijection.avro.BinaryAvroCodec, name: reader, type: interface org.apache.avro.io.DatumReader)
- object (class com.twitter.bijection.avro.BinaryAvroCodec, com.twitter.bijection.avro.BinaryAvroCodec#6945439c)
- field (class: $$$$79b2515edf74bd80cfc9d8ac1ba563c6$$$$iw, name: avroBijection, type: interface com.twitter.bijection.Injection)
Already tried SO posts
Spark: java.io.NotSerializableException: org.apache.avro.Schema$RecordSchema
Following this post I have update the decode2 method to take schemaStr as input and convert to schema and SqlType within method. No change in exception
Use schema to convert AVRO messages with Spark to DataFrame
Used the code provided in the post to create object Injection and then use it. This one also didn't help.
have you tried
val rows = myRdd.mapPartitions{
(iterator) => {
val myList = iterator.toList
myList.map{ x => {
lazy val schema = new Schema.Parser().parse(schemaStr)
lazy val avroBijection: Injection[GenericRecord, Array[Byte]] = GenericAvroCodecs.toBinary(schema)
lazy val sqlType = SchemaConverters.toSqlType(schema).dataType.asInstanceOf[StructType]
val size = x(1).asInstanceOf[Long].intValue
val data = x(0).asInstanceOf [String]
decode2(data, size, avroBijection,sqlType)
}
}.iterator
}

Task not serializable after adding it to ForEachPartition

I am receiving a task not serializable exception in spark when attempting to implement an Apache pulsar Sink in spark structured streaming.
I have already attempted to extrapolate the PulsarConfig to a separate class and call this within the .foreachPartition lambda function which I normally do for JDBC connections and other systems I integrate into spark structured streaming like shown below:
PulsarSink Class
class PulsarSink(
sqlContext: SQLContext,
parameters: Map[String, String],
partitionColumns: Seq[String],
outputMode: OutputMode) extends Sink{
override def addBatch(batchId: Long, data: DataFrame): Unit = {
data.toJSON.foreachPartition( partition => {
val pulsarConfig = new PulsarConfig(parameters).client
val producer = pulsarConfig.newProducer(Schema.STRING)
.topic(parameters.get("topic").get)
.compressionType(CompressionType.LZ4)
.sendTimeout(0, TimeUnit.SECONDS)
.create
partition.foreach(rec => producer.send(rec))
producer.flush()
})
}
PulsarConfig Class
class PulsarConfig(parameters: Map[String, String]) {
def client(): PulsarClient = {
import scala.collection.JavaConverters._
if(!parameters.get("tlscert").isEmpty && !parameters.get("tlskey").isEmpty) {
val tlsAuthMap = Map("tlsCertFile" -> parameters.get("tlscert").get,
"tlsKeyFile" -> parameters.get("tlskey").get).asJava
val tlsAuth: Authentication = AuthenticationFactory.create(classOf[AuthenticationTls].getName, tlsAuthMap)
PulsarClient.builder
.serviceUrl(parameters.get("broker").get)
.tlsTrustCertsFilePath(parameters.get("tlscert").get)
.authentication(tlsAuth)
.enableTlsHostnameVerification(false)
.allowTlsInsecureConnection(true)
.build
}
else{
PulsarClient.builder
.serviceUrl(parameters.get("broker").get)
.enableTlsHostnameVerification(false)
.allowTlsInsecureConnection(true)
.build
}
}
}
The error message I receive is the following:
ERROR StreamExecution: Query [id = 12c715c2-2d62-4523-a37a-4555995ccb74, runId = d409c0db-7078-4654-b0ce-96e46dfb322c] terminated with error
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:340)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:330)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:156)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2294)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:925)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:924)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:924)
at org.apache.spark.sql.Dataset$$anonfun$foreachPartition$1.apply$mcV$sp(Dataset.scala:2341)
at org.apache.spark.sql.Dataset$$anonfun$foreachPartition$1.apply(Dataset.scala:2341)
at org.apache.spark.sql.Dataset$$anonfun$foreachPartition$1.apply(Dataset.scala:2341)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2828)
at org.apache.spark.sql.Dataset.foreachPartition(Dataset.scala:2340)
at org.apache.spark.datamediation.impl.sink.PulsarSink.addBatch(PulsarSink.scala:20)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$1.apply$mcV$sp(StreamExecution.scala:666)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$1.apply(StreamExecution.scala:666)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$1.apply(StreamExecution.scala:666)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:279)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch(StreamExecution.scala:665)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(StreamExecution.scala:306)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$apply$mcZ$sp$1.apply(StreamExecution.scala:294)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$apply$mcZ$sp$1.apply(StreamExecution.scala:294)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:279)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1.apply$mcZ$sp(StreamExecution.scala:294)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches(StreamExecution.scala:290)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:206)
Caused by: java.io.NotSerializableException: org.apache.spark.datamediation.impl.sink.PulsarSink
Serialization stack:
- object not serializable (class: org.apache.spark.datamediation.impl.sink.PulsarSink, value: org.apache.spark.datamediation.impl.sink.PulsarSink#38813f43)
- field (class: org.apache.spark.datamediation.impl.sink.PulsarSink$$anonfun$addBatch$1, name: $outer, type: class org.apache.spark.datamediation.impl.sink.PulsarSink)
- object (class org.apache.spark.datamediation.impl.sink.PulsarSink$$anonfun$addBatch$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:337)
... 31 more
Values used in "foreachPartition" can be reassigned from class level to function variables:
override def addBatch(batchId: Long, data: DataFrame): Unit = {
val parametersLocal = parameters
data.toJSON.foreachPartition( partition => {
val pulsarConfig = new PulsarConfig(parametersLocal).client

Why does custom DefaultSource give java.io.NotSerializableException?

this is my first post on SO and my apology if the improper format is being used.
I'm working with Apache Spark to create a new source (via DefaultSource), BaseRelations, etc... and run into a problem with serialization that I would like to understand better. Consider below a class that extends BaseRelation and implements the scan builder.
class RootTableScan(path: String, treeName: String)(#transient val sqlContext: SQLContext) extends BaseRelation with PrunedFilteredScan{
private val att: core.SRType =
{
val reader = new RootFileReader(new java.io.File(Seq(path) head))
val tmp =
if (treeName==null)
buildATT(findTree(reader.getTopDir), arrangeStreamers(reader), null)
else
buildATT(reader.getKey(treeName).getObject.asInstanceOf[TTree],
arrangeStreamers(reader), null)
tmp
}
// define the schema from the AST
def schema: StructType = {
val s = buildSparkSchema(att)
s
}
// builds a scan
def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
// parallelize over all the files
val r = sqlContext.sparkContext.parallelize(Seq(path), 1).
flatMap({fileName =>
val reader = new RootFileReader(new java.io.File(fileName))
// get the TTree
/* PROBLEM !!! */
val rootTree =
// findTree(reader)
if (treeName == null) findTree(reader)
else reader.getKey(treeName).getObject.asInstanceOf[TTree]
new RootTreeIterator(rootTree, arrangeStreamers(reader),
requiredColumns, filters)
})
println("Done building Scan")
r
}
}
}
PROBLEM identifies where the issue happens. treeName is a val that gets injected into the class thru the constructor. The lambda that uses it is supposed to be executed on the slave and I do need to send the treeName - serialize it. I would like to understand why exactly the code snippet below causes this NotSerializableException. I know for sure that without treeName in it, it works just fine
val rootTree =
// findTree(reader)
if (treeName == null) findTree(reader)
else reader.getKey(treeName).getObject.asInstanceOf[TTree]
Below is the Stack trace
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2056)
at org.apache.spark.rdd.RDD$$anonfun$flatMap$1.apply(RDD.scala:375)
at org.apache.spark.rdd.RDD$$anonfun$flatMap$1.apply(RDD.scala:374)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
at org.apache.spark.rdd.RDD.flatMap(RDD.scala:374)
at org.dianahep.sparkroot.package$RootTableScan.buildScan(sparkroot.scala:95)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$anonfun$8.apply(DataSourceStrategy.scala:260)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$anonfun$8.apply(DataSourceStrategy.scala:260)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$anonfun$pruneFilterProject$1.apply(DataSourceStrategy.scala:303)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$anonfun$pruneFilterProject$1.apply(DataSourceStrategy.scala:302)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProjectRaw(DataSourceStrategy.scala:379)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProject(DataSourceStrategy.scala:298)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.apply(DataSourceStrategy.scala:256)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:60)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:60)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:61)
at org.apache.spark.sql.execution.SparkPlanner.plan(SparkPlanner.scala:47)
at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1$$anonfun$apply$1.applyOrElse(SparkPlanner.scala:51)
at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1$$anonfun$apply$1.applyOrElse(SparkPlanner.scala:48)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:300)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:298)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:298)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:321)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:179)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:319)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:298)
at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1.apply(SparkPlanner.scala:48)
at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1.apply(SparkPlanner.scala:48)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:78)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:76)
at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:83)
at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:83)
at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2572)
at org.apache.spark.sql.Dataset.head(Dataset.scala:1934)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2149)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:239)
at org.apache.spark.sql.Dataset.show(Dataset.scala:526)
at org.apache.spark.sql.Dataset.show(Dataset.scala:486)
at org.apache.spark.sql.Dataset.show(Dataset.scala:495)
... 50 elided
Caused by: java.io.NotSerializableException: org.dianahep.sparkroot.package$RootTableScan
Serialization stack:
- object not serializable (class: org.dianahep.sparkroot.package$RootTableScan, value: org.dianahep.sparkroot.package$RootTableScan#6421e9e7)
- field (class: org.dianahep.sparkroot.package$RootTableScan$$anonfun$1, name: $outer, type: class org.dianahep.sparkroot.package$RootTableScan)
- object (class org.dianahep.sparkroot.package$RootTableScan$$anonfun$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
From the stack I think I can deduce that it tries to serialize my lambda and can not. this lambda should be a closure as we have a val in there that is defined outside of the lambda scope. But I don't understand why this can not be serialized.
Any help would be really appreciated!!!
Thanks a lot!
Any time a scala closure references a class variable, like treeName, then the JVM serializes the parent class along with the closure. Your class RootTableScan is not serializable, though! The solution is to create a local string variable:
// builds a scan
def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
val localTreeName = treeName // this is safe to serialize
// parallelize over all the files
val r = sqlContext.sparkContext.parallelize(Seq(path), 1).
flatMap({fileName =>
val reader = new RootFileReader(new java.io.File(fileName))
// get the TTree
/* PROBLEM !!! */
val rootTree =
// findTree(reader)
if (localTreeName == null) findTree(reader)
else reader.getKey(localTreeName).getObject.asInstanceOf[TTree]
new RootTreeIterator(rootTree, arrangeStreamers(reader),
requiredColumns, filters)
})

Spark: Solrj exception on serialization

I am trying to import some csv file to Solr using Spark as "pipe", in this way:
trait SparkConnector {
def connectionWrapper[A](f: SparkContext => A): A = {
val sc: SparkContext = getSparkContext()
val res: A = f(sc)
sc.stop
res
}
}
object SolrIndexBuilder extends SparkConnector {
val solr = new ConcurrentUpdateSolrClient ("http://some-solr-url", 10000, 5)
def run(implicit fileSystem: FileSystem) = connectionWrapper[Unit] {
sparkContext =>
def build(): Unit = {
def toSolrDocument(person: Map[String, String], fieldNames: Array[String]): SolrInputDocument = {
val doc = new SolrInputDocument()
//add the values to the solr doc
doc
}
def rddToMapWithHeaderUnchecked(rdd: RDD[String], header: Array[String]): RDD[Map[String, String]] =
rdd.map { line =>
val splits = new CSVParser().parseLine(line)
header.zip(splits).toMap
}
val indexCsv = sparkContext.textFile("/somewhere/filename.csv")
val fieldNames = Vector("field1", "field2", "field3", ...)
val indexRows = rddToMapWithHeaderUnchecked(indexCsv, fieldNames)
indexRows.map(row => toSolrDocument(row, fieldNames)).foreach { doc => solr.add(doc) }
if (!indexRows.isEmpty()) {
solr.blockUntilFinished()
solr.commit()
}
}
build()
}
}
Running the code on a cluster (Spark-mode) I get the following error:
Caused by: java.io.NotSerializableException: org.apache.http.impl.client.SystemDefaultHttpClient
Serialization stack:
- object not serializable (class: org.apache.http.impl.client.SystemDefaultHttpClient, value: org.apache.http.impl.client.SystemDefaultHttpClient#3bdd79c7)
- field (class: org.apache.solr.client.solrj.impl.HttpSolrClient, name: httpClient, type: interface org.apache.http.client.HttpClient)
- object (class org.apache.solr.client.solrj.impl.HttpSolrClient, org.apache.solr.client.solrj.impl.HttpSolrClient#5d2c576a)
- field (class: org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient, name: client, type: class org.apache.solr.client.solrj.impl.HttpSolrClient)
- object (class org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient, org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient#5da7593d)
- field (class: com.oneninetwo.andromeda.solr.SolrIndexBuilder$$anonfun$run$1$$anonfun$build$1$2, name: solr$1, type: class org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient)
- object (class com.oneninetwo.andromeda.solr.SolrIndexBuilder$$anonfun$run$1$$anonfun$build$1$2, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
... 26 more
which clearly states that I cannot serialize classes related to Solrj (and relative dependencies).
I tried different thing but I am cannot find a way to make it work. Any help?
UPDATE
This is one solution I have found:
indexRows.map(row => toSolrDocument(row, fieldNames)).foreachPartition{ partition =>
val solr = new ConcurrentUpdateSolrClient(...)
partition.foreach { doc =>
solr.add(doc)
}
solr.commit()