Scala : java.lang.IllegalArgumentException: requirement failed: nonEmpty input - scala

I have written a code to process tiff files , do map algebra operations and store the result back as tiff file. When am running the code am getting this error. I am passing only one argument to the code which is a file path. What could be causing the error?
Error :
Exception in thread "main" java.lang.IllegalArgumentException: requirement failed: nonEmpty input
at scala.Predef$.require(Predef.scala:233)
at geotrellis.spark.stitch.TileLayoutStitcher$.stitch(StitchRDDMethods.scala:21)
at geotrellis.spark.stitch.SpatialTileLayoutRDDMethods.stitch(StitchRDDMethods.scala:42)
at RasterData.RasterOperations.MapAlgebraOperations$.readHdfsTiles(MapAlgebraOperations.scala:97)
at RasterData.RasterOperations.MapAlgebraOperations$.main(MapAlgebraOperations.scala:27)
at RasterData.RasterOperations.MapAlgebraOperations.main(MapAlgebraOperations.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:685)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
code:
object MapAlgebraOperations {
def main(args: Array[String]) : Unit = {
readHdfsTiles(args(0))
}
def readHdfsTiles(readpath: String): TileLayerRDD[SpatialKey] = {
implicit val sc = SparkUtils.createSparkContext("MapAlgebra")
//layer name for reading the RasterRDD
val nlcd1 = new String("nlcd1")
val nlcd2 = new String("nlcd2")
//Hadoop Tiles Location
val HdfsPath = new Path(readpath)
val reader = HadoopLayerReader(HdfsPath)(sc)
val rastereurope = reader.read[SpatialKey, Tile, TileLayerMetadata[SpatialKey]](LayerId(nlcd1, 1))
val rasterasia = reader.read[SpatialKey, Tile, TileLayerMetadata[SpatialKey]](LayerId(nlcd2, 1))
// Local Operations on Tiles
// Add 100 to each cell on raster Europe
val AddRasterEurope: RDD[(SpatialKey, Tile)] with Metadata[TileLayerMetadata[SpatialKey]] = rastereurope.withContext { _ localAdd 100 }
// Subtract 5 to each cell on raster Asia
val SubtractRasterAsia: RDD[(SpatialKey, Tile)] with Metadata[TileLayerMetadata[SpatialKey]] = rasterasia.withContext { _ localSubtract 1 }
// Union
val EuropeUnionAsia: RDD[(SpatialKey, Tile)] with Metadata[TileLayerMetadata[SpatialKey]] = rastereurope.withContext { _ union(SubtractRasterAsia) }
//Intersection
val EuropeIntersectAsia: RDD[(SpatialKey, Tile)] with Metadata[TileLayerMetadata[SpatialKey]] = rastereurope.withContext { _ intersection(rasterasia) }
//focal operations on Europe
val focalEurope = rastereurope.focalMean(Circle(5))
//Chain Focal Operations
val focalEuropeChain = focalEurope.focalMean(Circle(5))
//LeftOuterJoin
//val LeftOuterJoin = rastereurope.leftOuterJoin(rasterasia).updateValues(Add(_, _))
//Crop India from Asia
//Spatial Join
//val EuropeSpatialJoinAsia = rastereurope.spatialJoin(rasterasia)
//Define new layer
//val layerid1 = new LayerId("newnlcd1",1)
//val layerid2 = new LayerId("newnlcd2",1)
//val layerid3 = new LayerId("newnlcd3",1)
//Writing the resultant RDD to HDFS
//val writer = HadoopLayerWriter(HdfsPath)(sc)
//writer.write(layerid1, rddWithContext, ZCurveKeyIndexMethod)
//writer.write(layerid2, rddWithContext1, ZCurveKeyIndexMethod)
//writer.write(layerid3, rddWithContext2, ZCurveKeyIndexMethod)
//Convert the data into GeoTiff
val addrasterEurope = AddRasterEurope.stitch
GeoTiff(addrasterEurope, AddRasterEurope.metadata.crs).write("/home/brillio/tiffs/AddRasterEurope.tiff")
val subrasterAsia = SubtractRasterAsia.stitch
GeoTiff(subrasterAsia, SubtractRasterAsia.metadata.crs).write("/home/brillio/tiffs/SubtractRasterAsia.tiff")
val rasterUnion = EuropeUnionAsia.stitch
GeoTiff(rasterUnion, EuropeUnionAsia.metadata.crs).write("/home/brillio/tiffs/EuropeUnionAsia.tiff")
val rasterIntersect = EuropeIntersectAsia.stitch
GeoTiff(rasterIntersect, EuropeIntersectAsia.metadata.crs).write("/home/brillio/tiffs/EuropeIntersectAsia.tiff")
val rasterfocal = focalEurope.stitch
GeoTiff(rasterfocal, focalEurope.metadata.crs).write("/home/brillio/tiffs/rasterfocal.tiff")
val rasterChainfocal = focalEuropeChain.stitch
GeoTiff(rasterChainfocal, focalEuropeChain.metadata.crs).write("/home/brillio/tiffs/rasterChainfocal.tiff")
//val EuropeLeftOuterAsia = LeftOuterJoin.stitch
//GeoTiff(EuropeLeftOuterAsia, LeftOuterJoin.metadata.crs).write("/home/brillio/tiffs/rasterChainfocal.tiff")
//val rasterres3 = EuropeSpatialJoinAsia.stitch
//GeoTiff(rasterres3, EuropeSpatialJoinAsia.metadata.crs).write("/home/brillio/tiffs/EuropeSpatialJoinAsia.tiff")
AddRasterEurope
}
}

Related

Spark: FlatMap and CountVectorizer pipeline

I working on the pipeline and try to split the column value before passing it to CountVectorizer.
For this purpose I made a custom Transformer.
class FlatMapTransformer(override val uid: String)
extends Transformer {
/**
* Param for input column name.
* #group param
*/
final val inputCol = new Param[String](this, "inputCol", "The input column")
final def getInputCol: String = $(inputCol)
/**
* Param for output column name.
* #group param
*/
final val outputCol = new Param[String](this, "outputCol", "The output column")
final def getOutputCol: String = $(outputCol)
def setInputCol(value: String): this.type = set(inputCol, value)
def setOutputCol(value: String): this.type = set(outputCol, value)
def this() = this(Identifiable.randomUID("FlatMapTransformer"))
private val flatMap: String => Seq[String] = { input: String =>
input.split(",")
}
override def copy(extra: ParamMap): SplitString = defaultCopy(extra)
override def transform(dataset: Dataset[_]): DataFrame = {
val flatMapUdf = udf(flatMap)
dataset.withColumn($(outputCol), explode(flatMapUdf(col($(inputCol)))))
}
override def transformSchema(schema: StructType): StructType = {
val dataType = schema($(inputCol)).dataType
require(
dataType.isInstanceOf[StringType],
s"Input column must be of type StringType but got ${dataType}")
val inputFields = schema.fields
require(
!inputFields.exists(_.name == $(outputCol)),
s"Output column ${$(outputCol)} already exists.")
DataTypes.createStructType(
Array(
DataTypes.createStructField($(outputCol), DataTypes.StringType, false)))
}
}
The code seems legit, but when I try to chain it with other operation the problem occurs. Here is my pipeline:
val train = reader.readTrainingData()
val cat_features = getFeaturesByType(taskConfig, "categorical")
val num_features = getFeaturesByType(taskConfig, "numeric")
val cat_ohe_features = getFeaturesByType(taskConfig, "categorical", Some("ohe"))
val cat_features_string_index = cat_features.
filter { feature: String => !cat_ohe_features.contains(feature) }
val catIndexer = cat_features_string_index.map {
feature =>
new StringIndexer()
.setInputCol(feature)
.setOutputCol(feature + "_index")
.setHandleInvalid("keep")
}
val flatMapper = cat_ohe_features.map {
feature =>
new FlatMapTransformer()
.setInputCol(feature)
.setOutputCol(feature + "_transformed")
}
val countVectorizer = cat_ohe_features.map {
feature =>
new CountVectorizer()
.setInputCol(feature + "_transformed")
.setOutputCol(feature + "_vectorized")
.setVocabSize(10)
}
// val countVectorizer = cat_ohe_features.map {
// feature =>
//
// val flatMapper = new FlatMapTransformer()
// .setInputCol(feature)
// .setOutputCol(feature + "_transformed")
//
// new CountVectorizer()
// .setInputCol(flatMapper.getOutputCol)
// .setOutputCol(feature + "_vectorized")
// .setVocabSize(10)
// }
val cat_features_index = cat_features_string_index.map {
(feature: String) => feature + "_index"
}
val count_vectorized_index = cat_ohe_features.map {
(feature: String) => feature + "_vectorized"
}
val catFeatureAssembler = new VectorAssembler()
.setInputCols(cat_features_index)
.setOutputCol("cat_features")
val oheFeatureAssembler = new VectorAssembler()
.setInputCols(count_vectorized_index)
.setOutputCol("cat_ohe_features")
val numFeatureAssembler = new VectorAssembler()
.setInputCols(num_features)
.setOutputCol("num_features")
val featureAssembler = new VectorAssembler()
.setInputCols(Array("cat_features", "num_features", "cat_ohe_features_vectorized"))
.setOutputCol("features")
val pipelineStages = catIndexer ++ flatMapper ++ countVectorizer ++
Array(
catFeatureAssembler,
oheFeatureAssembler,
numFeatureAssembler,
featureAssembler)
val pipeline = new Pipeline().setStages(pipelineStages)
pipeline.fit(dataset = train)
Running this code, I receive an error:
java.lang.IllegalArgumentException: Field "my_ohe_field_trasformed" does not exist.
[info] java.lang.IllegalArgumentException: Field "from_expdelv_areas_transformed" does not exist.
[info] at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:266)
[info] at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:266)
[info] at scala.collection.MapLike$class.getOrElse(MapLike.scala:128)
[info] at scala.collection.AbstractMap.getOrElse(Map.scala:59)
[info] at org.apache.spark.sql.types.StructType.apply(StructType.scala:265)
[info] at org.apache.spark.ml.util.SchemaUtils$.checkColumnTypes(SchemaUtils.scala:56)
[info] at org.apache.spark.ml.feature.CountVectorizerParams$class.validateAndTransformSchema(CountVectorizer.scala:75)
[info] at org.apache.spark.ml.feature.CountVectorizer.validateAndTransformSchema(CountVectorizer.scala:123)
[info] at org.apache.spark.ml.feature.CountVectorizer.transformSchema(CountVectorizer.scala:188)
When I uncomment the stringSplitter and countVectorizer the error is raised in my Transformer
java.lang.IllegalArgumentException: Field "my_ohe_field" does not exist. at
val dataType = schema($(inputCol)).dataType
Result of calling pipeline.getStages:
strIdx_3c2630a738f0
strIdx_0d76d55d4200
FlatMapTransformer_fd8595c2969c
FlatMapTransformer_2e9a7af0b0fa
cntVec_c2ef31f00181
cntVec_68a78eca06c9
vecAssembler_a81dd9f43d56
vecAssembler_b647d348f0a0
vecAssembler_b5065a22d5c8
vecAssembler_d9176b8bb593
I might follow the wrong way. Any comments are appreciated.
Your FlatMapTransformer #transform is incorrect, your kind of dropping/ignoring all other columns when you select only on outputCol
please modify your method to -
override def transform(dataset: Dataset[_]): DataFrame = {
val flatMapUdf = udf(flatMap)
dataset.withColumn($(outputCol), explode(flatMapUdf(col($(inputCol)))))
}
Also, Modify your transformSchema to check input column first before checking its datatype-
override def transformSchema(schema: StructType): StructType = {
require(schema.names.contains($(inputCol)), "inputCOl is not there in the input dataframe")
//... rest as it is
}
Update-1 based on comments
PLease modify the copy method (Though it's not the cause for exception you facing)-
override def copy(extra: ParamMap): FlatMapTransformer = defaultCopy(extra)
please note that the CountVectorizer takes the column having columns of type ArrayType(StringType, true/false) and since the FlatMapTransformer output columns becomes the input of CountVectorizer, you need to make sure output column of FlatMapTransformer must be of ArrayType(StringType, true/false). I think, this is not the case, your code today is as following-
override def transform(dataset: Dataset[_]): DataFrame = {
val flatMapUdf = udf(flatMap)
dataset.withColumn($(outputCol), explode(flatMapUdf(col($(inputCol)))))
}
The explode functions converts the array<string> to string, so the output of the transformer becomes StringType. you may wanted to change this code to-
override def transform(dataset: Dataset[_]): DataFrame = {
val flatMapUdf = udf(flatMap)
dataset.withColumn($(outputCol), flatMapUdf(col($(inputCol))))
}
modify transformSchema method to output ArrayType(StringType)
override def transformSchema(schema: StructType): StructType = {
val dataType = schema($(inputCol)).dataType
require(
dataType.isInstanceOf[StringType],
s"Input column must be of type StringType but got ${dataType}")
val inputFields = schema.fields
require(
!inputFields.exists(_.name == $(outputCol)),
s"Output column ${$(outputCol)} already exists.")
schema.add($(outputCol), ArrayType(StringType))
}
change vector assembler to this-
val featureAssembler = new VectorAssembler()
.setInputCols(Array("cat_features", "num_features", "cat_ohe_features"))
.setOutputCol("features")
I tried to execute your pipeline on dummy dataframe, it worked well. Please refer this gist for full code.

Spark Streaming - Broadcast variable - Case Class

My requirement is to enrich data stream data with profile information from a HBase table. I was looking to use a broadcast variable. Enclosed the whole code here.
The output of HBase data is as follows
In the Driver node HBaseReaderBuilder
(org.apache.spark.SparkContext#3c58b102,hbase_customer_profile,Some(data),WrappedArray(gender, age),None,None,List()))
In the Worker node
HBaseReaderBuilder(null,hbase_customer_profile,Some(data),WrappedArray(gender, age),None,None,List()))
As you can see it has lost the spark context. When i issue the statement val
myRdd = bcdocRdd.map(r => Profile(r._1, r._2, r._3)) i get a NullPointerException
java.lang.NullPointerException
at it.nerdammer.spark.hbase.HBaseReaderBuilderConversions$class.toSimpleHBaseRDD(HBaseReaderBuilder.scala:83)
at it.nerdammer.spark.hbase.package$.toSimpleHBaseRDD(package.scala:5)
at it.nerdammer.spark.hbase.HBaseReaderBuilderConversions$class.toHBaseRDD(HBaseReaderBuilder.scala:67)
at it.nerdammer.spark.hbase.package$.toHBaseRDD(package.scala:5)
at testPartition$$anonfun$main$1$$anonfun$apply$1$$anonfun$apply$2.apply(testPartition.scala:34)
at testPartition$$anonfun$main$1$$anonfun$apply$1$$anonfun$apply$2.apply(testPartition.scala:33)
object testPartition {
def main(args: Array[String]) : Unit = {
val sparkMaster = "spark://x.x.x.x:7077"
val ipaddress = "x.x.x.x:2181" // Zookeeper
val hadoopHome = "/home/hadoop/software/hadoop-2.6.0"
val topicname = "new_events_test_topic"
val mainConf = new SparkConf().setMaster(sparkMaster).setAppName("testingPartition")
val mainSparkContext = new SparkContext(mainConf)
val ssc = new StreamingContext(mainSparkContext, Seconds(30))
val eventsStream = KafkaUtils.createStream(ssc,"x.x.x.x:2181","receive_rest_events",Map(topicname.toString -> 2))
val docRdd = mainSparkContext.hbaseTable[(String, Option[String], Option[String])]("hbase_customer_profile").select("gender","age").inColumnFamily("data")
println ("docRDD from Driver ",docRdd)
val broadcastedprof = mainSparkContext.broadcast(docRdd)
eventsStream.foreachRDD(dstream => {
dstream.foreachPartition(records => {
println("Broadcasted docRDD - in Worker ", broadcastedprof.value)
val bcdocRdd = broadcastedprof.value
records.foreach(record => {
//val myRdd = bcdocRdd.map(r => Profile(r._1, r._2, r._3))
//myRdd.foreach(println)
val Rows = record._2.split("\r\n")
})
})
})
ssc.start()
ssc.awaitTermination()
}
}

Exception in thread "main" java.lang.AbstractMethodError

I am trying to access spark GRAPHX .
i have tried following code :
/**
* Created by xyz on 5/7/16.
*/
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx._
object simple {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("untitled1")
val sc = new SparkContext(conf)
val graph = GraphLoader.edgeListFile(sc, "graphx/data/followers.txt", true).partitionBy(PartitionStrategy.RandomVertexCut)
val triCounts = graph.triangleCount().vertices
val users = sc.textFile("graphx/data/users.txt").map { line =>
val fields = line.split(",")
(fields(0).toLong, fields(1))
}
val triCountByUsername = users.join(triCounts).map { case (id, (username, tc)) =>
(username, tc)
}
println(triCountByUsername.collect().mkString("\n"))
}
}
val conf = new SparkConf().setAppName("untitled1")
val sc = new SparkContext(conf)
val graph = GraphLoader.edgeListFile(sc, "graphx/data/followers.txt", true).partitionBy(PartitionStrategy.RandomVertexCut)
val triCounts = graph.triangleCount().vertices
val users = sc.textFile("graphx/data/users.txt").map { line =>
val fields = line.split(",")
(fields(0).toLong, fields(1))
}
val triCountByUsername = users.join(triCounts).map { case (id, (username, tc)) =>
(username, tc)
}
println(triCountByUsername.collect().mkString("\n"))
========================================
but its giving me following error :
Exception in thread "main" java.lang.AbstractMethodError
at org.apache.spark.Logging$class.log(Logging.scala:52)
at org.apache.spark.graphx.GraphLoader$.log(GraphLoader.scala:26)
at org.apache.spark.Logging$class.logInfo(Logging.scala:59)
at org.apache.spark.graphx.GraphLoader$.logInfo(GraphLoader.scala:26)
at org.apache.spark.graphx.GraphLoader$.edgeListFile(GraphLoader.scala:84)
at simple$.main(simple.scala:18)
at simple.main(simple.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)
I have a data file having following content :
2 1
4 1
1 2
6 3
7 3
7 6
6 7
3 7

Spark Hadoop Failed to get broadcast

Running a spark-submit job and receiving a "Failed to get broadcast_58_piece0..." error. I'm really not sure what I'm doing wrong. Am I overusing UDFs? Too complicated a function?
As a summary of my objective, I am parsing text from pdfs, which are stored as base64 encoded strings in JSON objects. I'm using Apache Tika to get the text, and trying to make copious use of data frames to make things easier.
I had written a piece of code that ran the text extraction through tika as a function outside of "main" on the data as a RDD, and that worked flawlessly. When I try to bring the extraction into main as a UDF on data frames, though, it borks in various different ways. Before I got here I was actually trying to write the final data frame as:
valid.toJSON.saveAsTextFile(hdfs_dir)
This was giving me all sorts of "File/Path already exists" headaches.
Current code:
object Driver {
def main(args: Array[String]):Unit = {
val hdfs_dir = args(0)
val spark_conf = new SparkConf().setAppName("Spark Tika HDFS")
val sc = new SparkContext(spark_conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
// load json data into dataframe
val df = sqlContext.read.json("hdfs://hadoophost.com:8888/user/spark/data/in/*")
val extractInfo: (Array[Byte] => String) = (fp: Array[Byte]) => {
val parser:Parser = new AutoDetectParser()
val handler:BodyContentHandler = new BodyContentHandler(Integer.MAX_VALUE)
val config:TesseractOCRConfig = new TesseractOCRConfig()
val pdfConfig:PDFParserConfig = new PDFParserConfig()
val inputstream:InputStream = new ByteArrayInputStream(fp)
val metadata:Metadata = new Metadata()
val parseContext:ParseContext = new ParseContext()
parseContext.set(classOf[TesseractOCRConfig], config)
parseContext.set(classOf[PDFParserConfig], pdfConfig)
parseContext.set(classOf[Parser], parser)
parser.parse(inputstream, handler, metadata, parseContext)
handler.toString
}
val extract_udf = udf(extractInfo)
val df2 = df.withColumn("unbased_media", unbase64($"media_file")).drop("media_file")
val dfRenamed = df2.withColumn("media_corpus", extract_udf(col("unbased_media"))).drop("unbased_media")
val depuncter: (String => String) = (corpus: String) => {
val r = corpus.replaceAll("""[\p{Punct}]""", "")
val s = r.replaceAll("""[0-9]""", "")
s
}
val depuncter_udf = udf(depuncter)
val withoutPunct = dfRenamed.withColumn("sentence", depuncter_udf(col("media_corpus")))
val model = sc.objectFile[org.apache.spark.ml.PipelineModel]("hdfs://hadoophost.com:8888/user/spark/hawkeye-nb-ml-v2.0").first()
val with_predictions = model.transform(withoutPunct)
val fullNameChecker: ((String, String, String, String, String) => String) = (fname: String, mname: String, lname: String, sfx: String, text: String) =>{
val newtext = text.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val new_fname = fname.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val new_mname = mname.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val new_lname = lname.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val new_sfx = sfx.replaceAll(" ", "").replaceAll("""[0-9]""", "").replaceAll("""[\p{Punct}]""", "").toLowerCase
val name_full = new_fname.concat(new_mname).concat(new_lname).concat(new_sfx)
val c = name_full.r.findAllIn(newtext).length
c match {
case 0 => "N"
case _ => "Y"
}
}
val fullNameChecker_udf = udf(fullNameChecker)
val stringChecker: ((String, String) => String) = (term: String, text: String) => {
val termLower = term.replaceAll("""[\p{Punct}]""", "").toLowerCase
val textLower = text.replaceAll("""[\p{Punct}]""", "").toLowerCase
val c = termLower.r.findAllIn(textLower).length
c match {
case 0 => "N"
case _ => "Y"
}
}
val stringChecker_udf = udf(stringChecker)
val stringChecker2: ((String, String) => String) = (term: String, text: String) => {
val termLower = term takeRight 4
val textLower = text
val c = termLower.r.findAllIn(textLower).length
c match {
case 0 => "N"
case _ => "Y"
}
}
val stringChecker2_udf = udf(stringChecker)
val valids = with_predictions.withColumn("fname_valid", stringChecker_udf(col("first_name"), col("media_corpus")))
.withColumn("lname_valid", stringChecker_udf(col("last_name"), col("media_corpus")))
.withColumn("fname2_valid", stringChecker_udf(col("first_name_2"), col("media_corpus")))
.withColumn("lname2_valid", stringChecker_udf(col("last_name_2"), col("media_corpus")))
.withColumn("camt_valid", stringChecker_udf(col("chargeoff_amount"), col("media_corpus")))
.withColumn("ocan_valid", stringChecker2_udf(col("original_creditor_account_nbr"), col("media_corpus")))
.withColumn("dpan_valid", stringChecker2_udf(col("debt_provider_account_nbr"), col("media_corpus")))
.withColumn("full_name_valid", fullNameChecker_udf(col("first_name"), col("middle_name"), col("last_name"), col("suffix"), col("media_corpus")))
.withColumn("full_name_2_valid", fullNameChecker_udf(col("first_name_2"), col("middle_name_2"), col("last_name_2"), col("suffix_2"), col("media_corpus")))
valids.write.mode(SaveMode.Overwrite).format("json").save(hdfs_dir)
}
}
Full stack trace starting with error:
16/06/14 15:02:01 WARN TaskSetManager: Lost task 0.0 in stage 4.0 (TID 53, hdpd11n05.squaretwofinancial.com): org.apache.spark.SparkException: Task failed while writing rows.
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:272)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Caused by: java.io.IOException: org.apache.spark.SparkException: Failed to get broadcast_58_piece0 of broadcast_58
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1222)
at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:165)
at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:88)
at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
at org.apache.spark.ml.feature.CountVectorizerModel$$anonfun$9$$anonfun$apply$7.apply(CountVectorizer.scala:222)
at org.apache.spark.ml.feature.CountVectorizerModel$$anonfun$9$$anonfun$apply$7.apply(CountVectorizer.scala:221)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:34)
at org.apache.spark.ml.feature.CountVectorizerModel$$anonfun$9.apply(CountVectorizer.scala:221)
at org.apache.spark.ml.feature.CountVectorizerModel$$anonfun$9.apply(CountVectorizer.scala:218)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.evalExpr43$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.execution.Project$$anonfun$1$$anonfun$apply$1.apply(basicOperators.scala:51)
at org.apache.spark.sql.execution.Project$$anonfun$1$$anonfun$apply$1.apply(basicOperators.scala:49)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:263)
... 8 more
Caused by: org.apache.spark.SparkException: Failed to get broadcast_58_piece0 of broadcast_58
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1$$anonfun$2.apply(TorrentBroadcast.scala:138)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply$mcVI$sp(TorrentBroadcast.scala:137)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$org$apache$spark$broadcast$TorrentBroadcast$$readBlocks$1.apply(TorrentBroadcast.scala:120)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.apache.spark.broadcast.TorrentBroadcast.org$apache$spark$broadcast$TorrentBroadcast$$readBlocks(TorrentBroadcast.scala:120)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$readBroadcastBlock$1.apply(TorrentBroadcast.scala:175)
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1219)
... 25 more
I encountered a similar error.
It turns out to be caused by the broadcast usage in CounterVectorModel. Following is the detailed cause in my case:
When model.transform() is called , the vocabulary is broadcasted and saved as an attribute broadcastDic in model implicitly. Therefore, if the CounterVectorModel is saved after calling model.transform(), the private var attribute broadcastDic is also saved. But unfortunately, in Spark, broadcasted object is context-sensitive, which means it is embedded in SparkContext. If that CounterVectorModel is loaded in a different SparkContext, it will fail to find the previous saved broadcastDic.
So either solution is to prevent calling model.transform() before saving the model, or clone the model by method model.copy().
For anyone coming across this, it turns out the model I was loading was malformed. I found out by using spark-shell in yarn-client mode and stepping through the code. When I tried to load the model it was fine, but running it against the datagram (model.transform) through errors about not finding a metadata directory.
I went back and found a good model, ran against that and it worked fine. This code is actually sound.

Exception in thread "main" java.lang.NumberFormatException

I am new to scala. When i try to run the example program PageRank its showing the following error..
Exception in thread "main" java.lang.NumberFormatException: For input
string: "5" at
scala.collection.immutable.StringLike$class.parseBoolean(StringLike.scala:240)
at
scala.collection.immutable.StringLike$class.toBoolean(StringLike.scala:228)
at scala.collection.immutable.StringOps.toBoolean(StringOps.scala:31)
at
spark.bagel.examples.WikipediaPageRank$.main(WikipediaPageRank.scala:30)
at
spark.bagel.examples.WikipediaPageRank.main(WikipediaPageRank.scala)
import spark._
import spark.SparkContext._
import spark.bagel._
import spark.bagel.Bagel._
import scala.xml.{XML,NodeSeq}
object WikipediaPageRank {
def main(args: Array[String]) {
if (args.length < 5) {
System.err.println("Usage: WikipediaPageRank <inputFile> <threshold> <numPartitions> <host> <usePartitioner>")
System.exit(-1)
}
System.setProperty("spark.serializer", "spark.KryoSerializer")
System.setProperty("spark.kryo.registrator", classOf[PRKryoRegistrator].getName)
val inputFile = args(0)
val threshold = args(1).toDouble
val numPartitions = args(2).toInt
val host = args(3)
val usePartitioner = args(4).toBoolean
val sc = new SparkContext(host, "WikipediaPageRank")
// Parse the Wikipedia page data into a graph
val input = sc.textFile(inputFile)
println("Counting vertices...")
val numVertices = input.count()
println("Done counting vertices.")
println("Parsing input file...")
var vertices = input.map(line => {
val fields = line.split("\t")
val (title, body) = (fields(1), fields(3).replace("\\n", "\n"))
val links =
if (body == "\\N")
NodeSeq.Empty
else
try {
XML.loadString(body) \\ "link" \ "target"
} catch {
case e: org.xml.sax.SAXParseException =>
System.err.println("Article \""+title+"\" has malformed XML in body:\n"+body)
NodeSeq.Empty
}
val outEdges = links.map(link => new String(link.text)).toArray
val id = new String(title)
(id, new PRVertex(1.0 / numVertices, outEdges))
})
if (usePartitioner)
vertices = vertices.partitionBy(new HashPartitioner(sc.defaultParallelism)).cache
else
vertices = vertices.cache
println("Done parsing input file.")
// Do the computation
val epsilon = 0.01 / numVertices
val messages = sc.parallelize(Array[(String, PRMessage)]())
val utils = new PageRankUtils
val result =
Bagel.run(
sc, vertices, messages, combiner = new PRCombiner(),
numPartitions = numPartitions)(
utils.computeWithCombiner(numVertices, epsilon))
// Print the result
System.err.println("Articles with PageRank >= "+threshold+":")
val top =
(result
.filter { case (id, vertex) => vertex.value >= threshold }
.map { case (id, vertex) => "%s\t%s\n".format(id, vertex.value) }
.collect.mkString)
println(top)
}
}
Please help me in solving the error.