Spark Hadoop Failed to get broadcast - scala

I'm running a spark-submit job and getting a "Failed to get broadcast_58_piece0..." error. I'm really not sure what I'm doing wrong. Am I overusing UDFs? Is the function too complicated?
To summarize my objective: I'm parsing text out of PDFs, which are stored as base64-encoded strings in JSON objects. I'm using Apache Tika to get the text and trying to make heavy use of DataFrames to keep things simple.
I had written a version that ran the text extraction through Tika as a function outside of "main", operating on the data as an RDD, and that worked flawlessly. When I try to bring the extraction into main as a UDF on DataFrames, though, it breaks in various ways. Before I got here, I was actually trying to write the final DataFrame as:
valid.toJSON.saveAsTextFile(hdfs_dir)
This was giving me all sorts of "File/Path already exists" headaches.
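For what it's worth, those "already exists" failures come from saveAsTextFile refusing to write into a directory that already exists; the DataFrame writer used at the end of the code below sidesteps that with an explicit save mode, along the lines of:
valid.write.mode(SaveMode.Overwrite).format("json").save(hdfs_dir)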
Current code:
object Driver {
def main(args: Array[String]):Unit = {
val hdfs_dir = args(0)
val spark_conf = new SparkConf().setAppName("Spark Tika HDFS")
val sc = new SparkContext(spark_conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
// load json data into dataframe
val df = sqlContext.read.json("hdfs://hadoophost.com:8888/user/spark/data/in/*")
val extractInfo: (Array[Byte] => String) = (fp: Array[Byte]) => {
val parser:Parser = new AutoDetectParser()
val handler:BodyContentHandler = new BodyContentHandler(Integer.MAX_VALUE)
val config:TesseractOCRConfig = new TesseractOCRConfig()
val pdfConfig:PDFParserConfig = new PDFParserConfig()
val inputstream:InputStream = new ByteArrayInputStream(fp)
val metadata:Metadata = new Metadata()
val parseContext:ParseContext = new ParseContext()
parseContext.set(classOf[TesseractOCRConfig], config)
parseContext.set(classOf[PDFParserConfig], pdfConfig)
parseContext.set(classOf[Parser], parser)
parser.parse(inputstream, handler, metadata, parseContext)
handler.toString
}
val extract_udf = udf(extractInfo)
val df2 = df.withColumn("unbased_media", unbase64($"media_file")).drop("media_file")
val dfRenamed = df2.withColumn("media_corpus", extract_udf(col("unbased_media"))).drop("unbased_media")
val depuncter: (String => String) = (corpus: String) => {
val r = corpus.replaceAll("""[\p{Punct}]""", "")
val s = r.replaceAll("""[0-9]""", "")
s
}
val depuncter_udf = udf(depuncter)
val withoutPunct = dfRenamed.withColumn("sentence", depuncter_udf(col("media_corpus")))
val model = sc.objectFile[org.apache.spark.ml.PipelineModel]("hdfs://hadoophost.com:8888/user/spark/hawkeye-nb-ml-v2.0").first()
val with_predictions = model.transform(withoutPunct)
val fullNameChecker: ((String, String, String, String, String) => String) = (fname: String, mname: String, lname: String, sfx: String, text: String) =>{
val newtext = text.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val new_fname = fname.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val new_mname = mname.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val new_lname = lname.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val new_sfx = sfx.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val name_full = new_fname.concat(new_mname).concat(new_lname).concat(new_sfx)
val c = name_full.r.findAllIn(newtext).length
c match {
case 0 => "N"
case _ => "Y"
}
}
val fullNameChecker_udf = udf(fullNameChecker)
val stringChecker: ((String, String) => String) = (term: String, text: String) => {
val termLower = term.replaceAll("""[\p{Punct}]""", "").toLowerCase
val textLower = text.replaceAll("""[\p{Punct}]""", "").toLowerCase
val c = termLower.r.findAllIn(textLower).length
c match {
case 0 => "N"
case _ => "Y"
}
}
val stringChecker_udf = udf(stringChecker)
val stringChecker2: ((String, String) => String) = (term: String, text: String) => {
val termLower = term takeRight 4
val textLower = text
val c = termLower.r.findAllIn(textLower).length
c match {
case 0 => "N"
case _ => "Y"
}
}
val stringChecker2_udf = udf(stringChecker2)
val valids = with_predictions.withColumn("fname_valid", stringChecker_udf(col("first_name"), col("media_corpus")))
.withColumn("lname_valid", stringChecker_udf(col("last_name"), col("media_corpus")))
.withColumn("fname2_valid", stringChecker_udf(col("first_name_2"), col("media_corpus")))
.withColumn("lname2_valid", stringChecker_udf(col("last_name_2"), col("media_corpus")))
.withColumn("camt_valid", stringChecker_udf(col("chargeoff_amount"), col("media_corpus")))
.withColumn("ocan_valid", stringChecker2_udf(col("original_creditor_account_nbr"), col("media_corpus")))
.withColumn("dpan_valid", stringChecker2_udf(col("debt_provider_account_nbr"), col("media_corpus")))
.withColumn("full_name_valid", fullNameChecker_udf(col("first_name"), col("middle_name"), col("last_name"), col("suffix"), col("media_corpus")))
.withColumn("full_name_2_valid", fullNameChecker_udf(col("first_name_2"), col("middle_name_2"), col("last_name_2"), col("suffix_2"), col("media_corpus")))
valids.write.mode(SaveMode.Overwrite).format("json").save(hdfs_dir)
}
}
Full stack trace starting with error:
16/06/14 15:02:01 WARN TaskSetManager: Lost task 0.0 in stage 4.0 (TID 53, hdpd11n05.squaretwofinancial.com): org.apache.spark.SparkException: Task failed while writing rows.
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:272)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Caused by: java.io.IOException: org.apache.spark.SparkException: Failed to get broadcast_58_piece0 of broadcast_58
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1222)
at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:165)
at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:88)
at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
at org.apache.spark.ml.feature.CountVectorizerModel$$anonfun$9$$anonfun$apply$7.apply(CountVectorizer.scala:222)
at org.apache.spark.ml.feature.CountVectorizerModel$$anonfun$9$$anonfun$apply$7.apply(CountVectorizer.scala:221)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:34)
at org.apache.spark.ml.feature.CountVectorizerModel$$anonfun$9.apply(CountVectorizer.scala:221)
at org.apache.spark.ml.feature.CountVectorizerModel$$anonfun$9.apply(CountVectorizer.scala:218)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.evalExpr43$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.execution.Project$$anonfun$1$$anonfun$apply$1.apply(basicOperators.scala:51)
at org.apache.spark.sql.execution.Project$$anonfun$1$$anonfun$apply$1.apply(basicOperators.scala:49)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:263)
... 8 more
Caused by: org.apache.spark.SparkException: Failed to get broadcast_58_piece0 of broadcast_58
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply$mcVI$sp(TorrentBroadcast.scala:137)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.apache.spark.broadcast.TorrentBroadcast.org$apache$spark$broadcast$TorrentBroadcast$$readBlocks(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$readBroadcastBlock$1.apply(TorrentBroadcast.scala:175)
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1219)
... 25 more

I encountered a similar error.
It turned out to be caused by the broadcast usage in CountVectorizerModel. Here is the detailed cause in my case:
When model.transform() is called, the vocabulary is broadcast and saved implicitly on the model as a private var attribute (broadcastDict). Therefore, if the CountVectorizerModel is saved after calling model.transform(), that broadcastDict is saved along with it. Unfortunately, a broadcast object in Spark is context-sensitive, meaning it is tied to the SparkContext that created it. If that CountVectorizerModel is later loaded in a different SparkContext, it will fail to find the previously saved broadcastDict.
So the fix is either to avoid calling model.transform() before saving the model, or to clone the model with model.copy().
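A minimal sketch of the second workaround, with illustrative names (cv is a fitted CountVectorizer, trainingDF is the training data, and the save path is made up):
import org.apache.spark.ml.param.ParamMap
val fitted = cv.fit(trainingDF)      // CountVectorizerModel
fitted.transform(trainingDF)         // this call populates the internal broadcast
// Option 1: save before ever calling transform()
// Option 2: save a fresh copy, which starts with no broadcast state attached
val clean = fitted.copy(ParamMap.empty)
clean.save("hdfs://.../cv-model")    // or whatever persistence mechanism you use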

For anyone coming across this: it turned out the model I was loading was malformed. I found out by using spark-shell in yarn-client mode and stepping through the code. Loading the model was fine, but running it against the DataFrame (model.transform) threw errors about not finding a metadata directory.
I went back and found a good model, ran against that, and it worked fine. The code above is actually sound.
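For reference, the check amounted to reusing the same loading line from the job above in spark-shell (yarn-client mode) and calling transform by hand:
// spark-shell --master yarn-client
val model = sc.objectFile[org.apache.spark.ml.PipelineModel](
  "hdfs://hadoophost.com:8888/user/spark/hawkeye-nb-ml-v2.0").first()
// loading the malformed model succeeded, but transforming the prepared DataFrame
// (withoutPunct, built the same way as in the job) is what threw the
// "metadata directory not found" errors
model.transform(withoutPunct)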

Related

How to fix 'No symbol could be loaded from org.apache.hbase.classification.InterfaceAudience'?

I'm trying to prepare a DataFrame to be stored in HFile format on HBase using Apache Spark. I'm using Spark 2.1.0, Scala 2.11, and HBase 1.1.2.
Here is my code:
val df = createDataframeFromRow(Row("mlk", "kpo", "opi"), "a b c")
val cols = df.columns.sorted
val colsorteddf = df.select(cols.map(x => col(x)): _*)
val valcols = cols.filterNot(x => x.equals("U_ID"))
So far so good; I'm only sorting the columns of my DataFrame.
val pdd = colsorteddf.map(row => {
(row(0).toString, (row(1).toString, row(2).toString))
})
val tdd = pdd.flatMap(x => {
val rowKey = PLong.INSTANCE.toBytes(x._1)
for(i <- 0 until valcols.length - 1) yield {
val colname = valcols(i).toString
val colvalue = x._2.productElement(i).toString
val colfam = "data"
(rowKey, (colfam, colname, colvalue))
}
})
After this, I transform each row into the key-value format (rowKey, (colfam, colname, colvalue)).
Now here's where the problem happens. I try to map each row of tdd into a pair of (ImmutableBytesWritable, KeyValue):
import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
val output = tdd.map(x => {
val rowKey: Array[Byte] = x._1
val immutableRowKey = new ImmutableBytesWritable(rowKey)
val colfam = x._2._1
val colname = x._2._2
val colvalue = x._2._3
val kv = new KeyValue(
rowKey,
colfam.getBytes(),
colname.getBytes(),
Bytes.toBytes(colvalue.toString)
)
(immutableRowKey, kv)
})
It produces this stack trace:
java.lang.AssertionError: assertion failed: no symbol could be loaded from interface org.apache.hadoop.hbase.classification.InterfaceAudience$Public in object InterfaceAudience with name Public and classloader scala.reflect.internal.util.ScalaClassLoader$URLClassLoader@3269cbb7
at scala.reflect.runtime.JavaMirrors$JavaMirror.scala$reflect$runtime$JavaMirrors$JavaMirror$$classToScala1(JavaMirrors.scala:1021)
at scala.reflect.runtime.JavaMirrors$JavaMirror$$anonfun$classToScala$1.apply(JavaMirrors.scala:980)
at scala.reflect.runtime.JavaMirrors$JavaMirror$$anonfun$classToScala$1.apply(JavaMirrors.scala:980)
at scala.reflect.runtime.JavaMirrors$JavaMirror$$anonfun$toScala$1.apply(JavaMirrors.scala:97)
at scala.reflect.runtime.TwoWayCaches$TwoWayCache$$anonfun$toScala$1.apply(TwoWayCaches.scala:38)
at scala.reflect.runtime.Gil$class.gilSynchronized(Gil.scala:19)
at scala.reflect.runtime.JavaUniverse.gilSynchronized(JavaUniverse.scala:16)
at scala.reflect.runtime.TwoWayCaches$TwoWayCache.toScala(TwoWayCaches.scala:33)
at scala.reflect.runtime.JavaMirrors$JavaMirror.toScala(JavaMirrors.scala:95)
at scala.reflect.runtime.JavaMirrors$JavaMirror.classToScala(JavaMirrors.scala:980)
at scala.reflect.runtime.JavaMirrors$JavaMirror$JavaAnnotationProxy.<init>(JavaMirrors.scala:163)
at scala.reflect.runtime.JavaMirrors$JavaMirror$JavaAnnotationProxy$.apply(JavaMirrors.scala:162)
at scala.reflect.runtime.JavaMirrors$JavaMirror$JavaAnnotationProxy$.apply(JavaMirrors.scala:162)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at scala.reflect.runtime.JavaMirrors$JavaMirror.scala$reflect$runtime$JavaMirrors$JavaMirror$$copyAnnotations(JavaMirrors.scala:683)
at scala.reflect.runtime.JavaMirrors$JavaMirror$FromJavaClassCompleter.load(JavaMirrors.scala:733)
at scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$$anonfun$typeParams$1.apply(SynchronizedSymbols.scala:140)
at scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$$anonfun$typeParams$1.apply(SynchronizedSymbols.scala:133)
at scala.reflect.runtime.Gil$class.gilSynchronized(Gil.scala:19)
at scala.reflect.runtime.JavaUniverse.gilSynchronized(JavaUniverse.scala:16)
at scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$class.gilSynchronizedIfNotThreadsafe(SynchronizedSymbols.scala:123)
at scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$$anon$8.gilSynchronizedIfNotThreadsafe(SynchronizedSymbols.scala:168)
at scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$class.typeParams(SynchronizedSymbols.scala:132)
at scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$$anon$8.typeParams(SynchronizedSymbols.scala:168)
at scala.reflect.internal.Types$NoArgsTypeRef.typeParams(Types.scala:1926)
at scala.reflect.internal.Types$NoArgsTypeRef.isHigherKinded(Types.scala:1925)
at scala.reflect.internal.transform.UnCurry$class.scala$reflect$internal$transform$UnCurry$$expandAlias(UnCurry.scala:22)
at scala.reflect.internal.transform.UnCurry$$anon$2.apply(UnCurry.scala:26)
at scala.reflect.internal.transform.UnCurry$$anon$2.apply(UnCurry.scala:24)
at scala.collection.immutable.List.loop$1(List.scala:173)
at scala.collection.immutable.List.mapConserve(List.scala:189)
at scala.reflect.internal.tpe.TypeMaps$TypeMap.mapOver(TypeMaps.scala:115)
at scala.reflect.internal.transform.UnCurry$$anon$2.apply(UnCurry.scala:46)
at scala.reflect.internal.transform.Transforms$class.transformedType(Transforms.scala:43)
at scala.reflect.internal.SymbolTable.transformedType(SymbolTable.scala:16)
at scala.reflect.internal.Types$TypeApiImpl.erasure(Types.scala:225)
at scala.
It seems like a Scala issue. Has anyone run into the same problem? If so, how did you overcome it?
PS: I'm running this code through spark-shell.

solr add document error

I am trying to load a CSV file into Solr documents, and I am trying to do it using Scala. I am new to Scala. With the case class structure, if I pass one set of values it works fine, but if I want to send all of the values read from the CSV, it gives an error. I am not sure how to do this in Scala; any help is greatly appreciated.
object BasicParseCsv {
case class Person(id: String, name: String,age: String, addr: String )
val schema = ArrayBuffer[Person]()
def main(args: Array[String]) {
val master = args(0)
val inputFile = args(1)
val outputFile = args(2)
val sc = new SparkContext(master, "BasicParseCsv", System.getenv("SPARK_HOME"))
val params = new ModifiableSolrParams
val Solr = new HttpSolrServer("http://localhost:8983/solr/person1")
//Preparing the Solr document
val doc = new SolrInputDocument()
val input = sc.textFile(inputFile)
val result = input.map{ line =>
val reader = new CSVReader(new StringReader(line));
reader.readNext();
}
def getSolrDocument(person: Person): SolrInputDocument = {
val document = new SolrInputDocument()
document.addField("id",person.id)
document.addField("name", person.name)
document.addField("age",person.age)
document.addField("addr", person.addr)
document
}
def send(persons:List[Person]){
persons.foreach(person=>Solr.add(getSolrDocument(person)))
Solr.commit()
}
val people = result.map(x => Person(x(0), x(1),x(2),x(3)))
val book1 = new Person("101","xxx","20","abcd")
send(List(book1))
people.map(person => send(List(Person(person.id, person.name, person.age,person.addr))))
System.out.println("Documents added")
}
}
people.map(person => send(List(Person(person.id, person.name, person.age,person.addr)))) ==> gives error
val book1 = new Person("101","xxx","20","abcd") ==> works fine
Update: I get the error below.
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2067)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:324)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:323)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.map(RDD.scala:323)
at BasicParseCsv$.main(BasicParseCsv.scala:90)
at BasicParseCsv.main(BasicParseCsv.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)
Caused by: java.io.NotSerializableException: org.apache.http.impl.client.SystemDefaultHttpClient
Serialization stack:
- object not serializable (class: org.apache.http.impl.client.SystemDefaultHttpClient, value: org.apache.http.impl.client.SystemDefaultHttpClient@1dbd580)
- field (class: org.apache.solr.client.solrj.impl.HttpSolrServer, name: httpClient, type: interface org.apache.http.client.HttpClient)
- object (class org.apache.solr.client.solrj.impl.HttpSolrServer, org.apache.solr.client.solrj.impl.HttpSolrServer@17e0827)
- field (class: BasicParseCsv$$anonfun$main$1, name: Solr$1, type: class org.apache.solr.client.solrj.impl.HttpSolrServer)
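The serialization stack at the bottom shows what is being dragged into the closure: the HttpSolrServer, via its non-serializable HttpClient field. A common workaround is to build the Solr client on the executors instead of shipping it from the driver; a minimal sketch, assuming the same Person case class and getSolrDocument helper as above:
people.foreachPartition { persons =>
  // constructed on the executor, so nothing non-serializable has to cross the wire
  val solr = new HttpSolrServer("http://localhost:8983/solr/person1")
  persons.foreach(person => solr.add(getSolrDocument(person)))
  solr.commit()
}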

Scala-Spark NullPointerError when submitting jar, not in shell

My Spark job raises a null pointer exception that I cannot track down. When I print potentially null variables, they're all populated on every worker. My data does not contain null values, since the same job works within the spark-shell. The execute function of the job is below, followed by the error message.
All helper methods not defined in the function are defined within the body of the Spark job object, so I believe closures are not the problem.
override def execute(sc:SparkContext) = {
def construct_query(targetTypes:List[String]) = Map("query" ->
Map("nested" ->
Map("path"->"annotations.entities.items",
"query"-> Map("terms"->
Map("annotations.entities.items.type"-> targetTypes)))))
val sourceConfig = HashMap(
"es.nodes" -> params.targetClientHost
)
// Base elastic search RDD returning articles which match the above query on entity types
val rdd = EsSpark.esJsonRDD(sc,
params.targetIndex,
toJson(construct_query(params.entityTypes)),
sourceConfig
).sample(false,params.sampleRate)
// Mapping ES json into news article object, then extracting the entities list of
// well defined annotations
val objectsRDD = rdd.map(tuple => {
val maybeArticle =
try {
Some(JavaJsonUtils.fromJson(tuple._2, classOf[SearchableNewsArticle]))
}catch {
case e: Exception => None
}
(tuple._1,maybeArticle)
}
).filter(tuple => {tuple._2.isDefined && tuple._2.get.annotations.isDefined &&
tuple._2.get.annotations.get.entities.isDefined}).map(tuple => (tuple._1, tuple._2.get.annotations.get.entities.get))
// flat map the RDD of entities lists into a list of (entity text, (entity type, 1)) tuples
(line 79) val entityDataMap: RDD[(String, (String, Int))] = objectsRDD.flatMap(tuple => tuple._2.items.collect({
case item if (item.`type`.isDefined) && (item.text.isDefined) &&
(line 81)(params.entityTypes.contains(item.`type`.get)) => (cleanUpText(item.text.get), (item.`type`.get, 1))
}))
// bucketize the tuples RDD into entity text, List(entity_type, entity_count) to make count aggregation and file writeouts
// easier to follow
val finalResults: Array[(String, (String, Int))] = entityDataMap.reduceByKey((x, y) => (x._1, x._2+y._2)).collect()
val entityTypeMapping = Map(
"HealthCondition" -> "HEALTH_CONDITION",
"Drug" -> "DRUG",
"FieldTerminology" -> "FIELD_TERMINOLOGY"
)
for (finalTuple <- finalResults) {
val entityText = finalTuple._1
val entityType = finalTuple._2._1
if(entityTypeMapping.contains(entityType))
{
if(!Files.exists(Paths.get(entityTypeMapping.get(entityType).get+".txt"))){
val myFile = new java.io.FileOutputStream(new File(entityTypeMapping.get(entityType).get+".txt"),false)
printToFile(myFile) {p => p.println(entityTypeMapping.get(entityType))}
}
}
val myFile = new java.io.FileOutputStream(new File(entityTypeMapping.get(entityType).get+".txt"),true)
printToFile(myFile) {p => p.println(entityText)}
}
}
And the error message below:
java.lang.NullPointerException
at com.quid.gazetteers.GazetteerGenerator$$anonfun$4$$anonfun$apply$1.isDefinedAt(GazetteerGenerator.scala:81)
at com.quid.gazetteers.GazetteerGenerator$$anonfun$4$$anonfun$apply$1.isDefinedAt(GazetteerGenerator.scala:79)
at scala.collection.TraversableLike$$anonfun$collect$1.apply(TraversableLike.scala:278)
at scala.collection.immutable.List.foreach(List.scala:318)
at scala.collection.TraversableLike$class.collect(TraversableLike.scala:278)
at scala.collection.AbstractTraversable.collect(Traversable.scala:105)
at com.quid.gazetteers.GazetteerGenerator$$anonfun$4.apply(GazetteerGenerator.scala:79)
at com.quid.gazetteers.GazetteerGenerator$$anonfun$4.apply(GazetteerGenerator.scala:79)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:189)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:64)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
This question has been resolved. The params attribute was not serialized and available to the Spark workers. The solution is to create a Spark broadcast variable in the scope where the params attribute is needed.
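A minimal sketch of that fix, assuming params is the job's configuration object (only the values used inside the closures need to be broadcast):
val entityTypesBroadcast = sc.broadcast(params.entityTypes)
// read the broadcast value on the executors instead of touching params directly
val entityDataMap: RDD[(String, (String, Int))] = objectsRDD.flatMap(tuple => tuple._2.items.collect({
  case item if item.`type`.isDefined && item.text.isDefined &&
    entityTypesBroadcast.value.contains(item.`type`.get) =>
      (cleanUpText(item.text.get), (item.`type`.get, 1))
}))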

Scala : java.lang.IllegalArgumentException: requirement failed: nonEmpty input

I have written code to process TIFF files, do map algebra operations, and store the result back as a TIFF file. When I run the code I get this error. I am passing only one argument to the code, which is a file path. What could be causing the error?
Error :
Exception in thread "main" java.lang.IllegalArgumentException: requirement failed: nonEmpty input
at scala.Predef$.require(Predef.scala:233)
at geotrellis.spark.stitch.TileLayoutStitcher$.stitch(StitchRDDMethods.scala:21)
at geotrellis.spark.stitch.SpatialTileLayoutRDDMethods.stitch(StitchRDDMethods.scala:42)
at RasterData.RasterOperations.MapAlgebraOperations$.readHdfsTiles(MapAlgebraOperations.scala:97)
at RasterData.RasterOperations.MapAlgebraOperations$.main(MapAlgebraOperations.scala:27)
at RasterData.RasterOperations.MapAlgebraOperations.main(MapAlgebraOperations.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:685)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
code:
object MapAlgebraOperations {
def main(args: Array[String]) : Unit = {
readHdfsTiles(args(0))
}
def readHdfsTiles(readpath: String): TileLayerRDD[SpatialKey] = {
implicit val sc = SparkUtils.createSparkContext("MapAlgebra")
//layer name for reading the RasterRDD
val nlcd1 = new String("nlcd1")
val nlcd2 = new String("nlcd2")
//Hadoop Tiles Location
val HdfsPath = new Path(readpath)
val reader = HadoopLayerReader(HdfsPath)(sc)
val rastereurope = reader.read[SpatialKey, Tile, TileLayerMetadata[SpatialKey]](LayerId(nlcd1, 1))
val rasterasia = reader.read[SpatialKey, Tile, TileLayerMetadata[SpatialKey]](LayerId(nlcd2, 1))
// Local Operations on Tiles
// Add 100 to each cell on raster Europe
val AddRasterEurope: RDD[(SpatialKey, Tile)] with Metadata[TileLayerMetadata[SpatialKey]] = rastereurope.withContext { _ localAdd 100 }
// Subtract 1 from each cell on raster Asia
val SubtractRasterAsia: RDD[(SpatialKey, Tile)] with Metadata[TileLayerMetadata[SpatialKey]] = rasterasia.withContext { _ localSubtract 1 }
// Union
val EuropeUnionAsia: RDD[(SpatialKey, Tile)] with Metadata[TileLayerMetadata[SpatialKey]] = rastereurope.withContext { _ union(SubtractRasterAsia) }
//Intersection
val EuropeIntersectAsia: RDD[(SpatialKey, Tile)] with Metadata[TileLayerMetadata[SpatialKey]] = rastereurope.withContext { _ intersection(rasterasia) }
//focal operations on Europe
val focalEurope = rastereurope.focalMean(Circle(5))
//Chain Focal Operations
val focalEuropeChain = focalEurope.focalMean(Circle(5))
//LeftOuterJoin
//val LeftOuterJoin = rastereurope.leftOuterJoin(rasterasia).updateValues(Add(_, _))
//Crop India from Asia
//Spatial Join
//val EuropeSpatialJoinAsia = rastereurope.spatialJoin(rasterasia)
//Define new layer
//val layerid1 = new LayerId("newnlcd1",1)
//val layerid2 = new LayerId("newnlcd2",1)
//val layerid3 = new LayerId("newnlcd3",1)
//Writing the resultant RDD to HDFS
//val writer = HadoopLayerWriter(HdfsPath)(sc)
//writer.write(layerid1, rddWithContext, ZCurveKeyIndexMethod)
//writer.write(layerid2, rddWithContext1, ZCurveKeyIndexMethod)
//writer.write(layerid3, rddWithContext2, ZCurveKeyIndexMethod)
//Convert the data into GeoTiff
val addrasterEurope = AddRasterEurope.stitch
GeoTiff(addrasterEurope, AddRasterEurope.metadata.crs).write("/home/brillio/tiffs/AddRasterEurope.tiff")
val subrasterAsia = SubtractRasterAsia.stitch
GeoTiff(subrasterAsia, SubtractRasterAsia.metadata.crs).write("/home/brillio/tiffs/SubtractRasterAsia.tiff")
val rasterUnion = EuropeUnionAsia.stitch
GeoTiff(rasterUnion, EuropeUnionAsia.metadata.crs).write("/home/brillio/tiffs/EuropeUnionAsia.tiff")
val rasterIntersect = EuropeIntersectAsia.stitch
GeoTiff(rasterIntersect, EuropeIntersectAsia.metadata.crs).write("/home/brillio/tiffs/EuropeIntersectAsia.tiff")
val rasterfocal = focalEurope.stitch
GeoTiff(rasterfocal, focalEurope.metadata.crs).write("/home/brillio/tiffs/rasterfocal.tiff")
val rasterChainfocal = focalEuropeChain.stitch
GeoTiff(rasterChainfocal, focalEuropeChain.metadata.crs).write("/home/brillio/tiffs/rasterChainfocal.tiff")
//val EuropeLeftOuterAsia = LeftOuterJoin.stitch
//GeoTiff(EuropeLeftOuterAsia, LeftOuterJoin.metadata.crs).write("/home/brillio/tiffs/rasterChainfocal.tiff")
//val rasterres3 = EuropeSpatialJoinAsia.stitch
//GeoTiff(rasterres3, EuropeSpatialJoinAsia.metadata.crs).write("/home/brillio/tiffs/EuropeSpatialJoinAsia.tiff")
AddRasterEurope
}
}
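Since the requirement that fails is the non-empty check inside stitch, the first thing worth verifying is that the two layers read back from HDFS actually contain tiles; a quick diagnostic sketch against the code above:
println(s"nlcd1 tiles read: ${rastereurope.count()}")
println(s"nlcd2 tiles read: ${rasterasia.count()}")
// if either count is 0, the LayerId name/zoom or the HDFS catalog path is the
// thing to chase down, not the map algebra itself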

NoSuchElementException while trying to read header from excel

I have a class that parses an XLS file for me and maps it by the headers of the XLS.
I have another class that is the object for each line of the XLS; it uses the headers I mapped earlier to know which cell goes to which attribute.
For some reason I'm getting a NoSuchElementException on one of the headers that is actually there, and there are no typos... it worked before, and I don't know what's wrong now.
This is the DataSource.scala class (it's a trait) that handles the XLS:
import java.io.File
import com.github.tototoshi.csv.CSVReader
import jxl.{Cell, Workbook}
import scala.collection.mutable
trait DataSource {
def read (fileName: String): Seq[Map[String, String]]
}
object CsvDataSource extends DataSource {
import com.github.tototoshi.csv.CSVFormat
import com.github.tototoshi.csv.Quoting
import com.github.tototoshi.csv.QUOTE_MINIMAL
implicit object VATBoxFormat extends CSVFormat {
val delimiter: Char = '\t'
val quoteChar: Char = '"'
val escapeChar: Char = '"'
val lineTerminator: String = "\r\n"
val quoting: Quoting = QUOTE_MINIMAL
val treatEmptyLineAsNil: Boolean = false
}
override def read(file: String): Seq[Map[String, String]] = {
val reader = CSVReader.open(file, "UTF-16")(VATBoxFormat)
reader.readNext()
val country = reader.readNext().get(5)
reader.readNext()
reader.iteratorWithHeaders.toSeq.map(c => c + ("country" -> country))
}
}
object ExecDataSource extends DataSource {
override def read(file: String): Seq[Map[String, String]] = {
val workbook = Workbook.getWorkbook(new File(file))
val sheet = workbook.getSheet(0)
val rowsUsed: Int = sheet.getRows
val headers = sheet.getRow(3).toList
// println(headers.map(_.getContents))
val country = sheet.getCell(5, 1).getContents
(4 until rowsUsed).map { i =>
val c = headers.zip(sheet.getRow(i)).map{case (k,v) => (k.getContents, v.getContents)}.toMap
c + ("country" -> country)
}
}
}
This is the PurchaseInfo class, which creates an object for each line of the Excel file:
case class PurchaseInfo(
something1: String,
something2: String,
something3: String,
something4: String) {
}
object PurchaseInfo {
private def changeDateFormat(dateInString: String): String = {
//System.out.println(dateInString)
val formatter: SimpleDateFormat = new SimpleDateFormat("MMM dd, yyyy")
val formatter2: SimpleDateFormat = new SimpleDateFormat("dd/MM/yyyy")
val date: Date = formatter.parse(dateInString)
return formatter2.format(date).toString
}
def fromDataSource (ds: DataSource)(fileName: String): Seq[PurchaseInfo] = {
ds.read(fileName).map { c =>
PurchaseInfo(
something1 = c("Supplier Address Street Number"),
something2 = c("Supplier Address Route"),
something3 = c("Supplier Address Locality"),
something4 = c("Supplier Address Postal Code")
)
}
}
}
(I've cut some of the fields in PurchaseInfo to make it shorter for the question.)
Now, this is the error I'm getting when running my code (from a different class that runs my actions; this is an automation project where I use Selenium):
Exception in thread "main" java.util.NoSuchElementException: key not found: Supplier Address Street Number
at scala.collection.MapLike$class.default(MapLike.scala:228)
at scala.collection.AbstractMap.default(Map.scala:59)
at scala.collection.MapLike$class.apply(MapLike.scala:141)
at scala.collection.AbstractMap.apply(Map.scala:59)
at PurchaseInfo$$anonfun$fromDataSource$1.apply(PurchaseInfo.scala:50)
at PurchaseInfo$$anonfun$fromDataSource$1.apply(PurchaseInfo.scala:48)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
at scala.collection.Iterator$class.foreach(Iterator.scala:742)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1194)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:245)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at PurchaseInfo$.fromDataSource(PurchaseInfo.scala:48)
at HolandPortal$.main(HolandPortal.scala:22)
at HolandPortal.main(HolandPortal.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:140)
Process finished with exit code 1
Can someone see the issue? I don't know why it can't find "Supplier Address Street Number"; in the XLS I have this header spelled exactly the same. :/
Thanks
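The exception itself just means the row map has no entry for "Supplier Address Street Number" at the moment c(...) is called, so a quick way to narrow it down is to print the keys ExecDataSource actually produced and compare them character by character with the literals used in fromDataSource (a stray space or a shifted header row would be enough to break the lookup). A small diagnostic sketch, assuming the classes above and an XLS path in xlsPath:
ExecDataSource.read(xlsPath).headOption.foreach { row =>
  // brackets make stray leading/trailing whitespace visible
  row.keys.toList.sorted.foreach(k => println(s"[$k]"))
}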