Logistic Regression on a Dataset from generateLinearRDD fails with java.lang.IllegalArgumentException - scala

So as a proof of concept for something I was trying to generate a DataFrame with sample data from LinearDataGenerator.generateLinearRDD, then perform a logistic regression on it.
Assuming that generateLinearRDD would generate data suitable for performing a linear regression, I stuck it in a pipeline with a Binarizer to create a threshold column suitable for a logistic regression.
My code is as follows:
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.util.{LinearDataGenerator, MLUtils}
import org.apache.spark.ml.feature.Binarizer
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
// databricks users can comment out lines between here...
val spark = SparkSession
.builder()
.appName("Java Spark SQL basic example")
.config("spark.master", "local")
.getOrCreate()
import spark.implicits._
// ...and here
val data = {
val tmp = LinearDataGenerator.generateLinearRDD(spark.sparkContext, 10000, 4, 0.05).toDF()
MLUtils.convertVectorColumnsToML(tmp, "features").withColumnRenamed("label", "continuousLabel")
}
val binarizer = new Binarizer()
.setInputCol("continuousLabel")
.setOutputCol("label")
.setThreshold(0)
val logisticRegression = new LogisticRegression()
val pipeline = new Pipeline()
.setStages(Array(binarizer, logisticRegression))
val pipelineModel = pipeline.fit(data)
println(pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel].binarySummary.accuracy)
The stacktrace from the exception looks like this:
Exception in thread "main" java.lang.IllegalArgumentException
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.spark.util.ClosureCleaner$.getClassReader(ClosureCleaner.scala:46)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:449)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:432)
at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:134)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:134)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:236)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)
at scala.collection.mutable.HashMap$$anon$1.foreach(HashMap.scala:134)
at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)
at org.apache.spark.util.FieldAccessFinder$$anon$3.visitMethodInsn(ClosureCleaner.scala:432)
at org.apache.xbean.asm5.ClassReader.a(Unknown Source)
at org.apache.xbean.asm5.ClassReader.b(Unknown Source)
at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:262)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:261)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:261)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:159)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2299)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2073)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:743)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:742)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:742)
at org.apache.spark.mllib.evaluation.MulticlassMetrics.tpByClass$lzycompute(MulticlassMetrics.scala:48)
at org.apache.spark.mllib.evaluation.MulticlassMetrics.tpByClass(MulticlassMetrics.scala:44)
at org.apache.spark.mllib.evaluation.MulticlassMetrics.accuracy$lzycompute(MulticlassMetrics.scala:168)
at org.apache.spark.mllib.evaluation.MulticlassMetrics.accuracy(MulticlassMetrics.scala:168)
at org.apache.spark.ml.classification.LogisticRegressionSummary$class.accuracy(LogisticRegression.scala:1445)
at org.apache.spark.ml.classification.LogisticRegressionSummaryImpl.accuracy(LogisticRegression.scala:1641)
at crossvalidation_graphs$.delayedEndpoint$crossvalidation_graphs$1(crossvalidation_graphs.scala:35)
at crossvalidation_graphs$delayedInit$body.apply(crossvalidation_graphs.scala:9)
at scala.Function0$class.apply$mcV$sp(Function0.scala:34)
at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:12)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.generic.TraversableForwarder$class.foreach(TraversableForwarder.scala:35)
at scala.App$class.main(App.scala:76)
at crossvalidation_graphs$.main(crossvalidation_graphs.scala:9)
at crossvalidation_graphs.main(crossvalidation_graphs.scala)
My schema currently looks like this:
root
|-- continuousLabel: double (nullable = false)
|-- features: vector (nullable = true)
I'm running Spark 2.3.1 with Scala 2.11.12

Similar to this guy, my actual problem was that I was using Java 10 instead of Java 8. When I switched back to Java 8 my code worked without problems.

Related

Implement XGBoost in Scala Spark, dataproc zeppelin notebook

I am trying to implement an xgboost model in scala, using zeppelin in dataproc (google cloud). This is the code I'm implementing:
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SaveMode, SparkSession}
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
import scala.collection.mutable
import org.apache.spark.sql.{DataFrame, _}
import spark.implicits._
import org.apache.spark.ml.{Pipeline, PipelineStage}
Adding deppendency (also added jar in zeppelin notebook dependencies)
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark</artifactId>
<version>0.72</version>
</dependency>
Dummy data:
val someData = Seq(
Row(8, 15 1),
Row(64, 25 1),
Row(27, 22 0)
)
val someSchema = List(
StructField("var1", IntegerType, true),
StructField("var2", IntegerType, true),
StructField("Classification", IntegerType, true)
)
val data= spark.createDataFrame(
spark.sparkContext.parallelize(someData),
StructType(someSchema)
)
Model implementation:
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
val stringIndexer = new StringIndexer().
setInputCol("Classification").
setOutputCol("label").
fit(data)
val labelTransformed = stringIndexer.transform(data).drop("Classification")
val vectorAssembler = new VectorAssembler().
setInputCols(Array("var1", "var2")).
setOutputCol("features")
val xgbInput = vectorAssembler.transform(labelTransformed).select("features", "label")
import ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator
val paramMap = Map[String, Any]("objective" -> "binary:logistic", "nworkers" -> 2)
val est = new XGBoostEstimator(paramMap)
val model = est.fit(xgbInput)
Everything works except for the very last line, where I get the following error:
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.156.0.33, DMLC_TRACKER_PORT=9091, DMLC_NUM_WORKER=2}
ml.dmlc.xgboost4j.java.XGBoostError: XGBoostModel training failed
at ml.dmlc.xgboost4j.scala.spark.XGBoost$.ml$dmlc$xgboost4j$scala$spark$XGBoost$$postTrackerReturnProcessing(XGBoost.scala:406)
at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$trainDistributed$4.apply(XGBoost.scala:356)
at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$trainDistributed$4.apply(XGBoost.scala:337)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.List.foreach(List.scala:381)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.immutable.List.map(List.scala:285)
at ml.dmlc.xgboost4j.scala.spark.XGBoost$.trainDistributed(XGBoost.scala:336)
at ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator.train(XGBoostEstimator.scala:139)
at ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator.train(XGBoostEstimator.scala:36)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
... 69 elided
Once again, using scala on zeppelin through dataproc, spark version is 2.4.5.
Can anyone help me?
EDIT: Full error logs:
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.156.0.9, DMLC_TRACKER_PORT=9091, DMLC_NUM_WORKER=2}
ml.dmlc.xgboost4j.java.XGBoostError: XGBoostModel training failed
at ml.dmlc.xgboost4j.scala.spark.XGBoost$.ml$dmlc$xgboost4j$scala$spark$XGBoost$$postTrackerReturnProcessing(XGBoost.scala:406)
at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$trainDistributed$4.apply(XGBoost.scala:356)
at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$trainDistributed$4.apply(XGBoost.scala:337)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.List.foreach(List.scala:381)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.immutable.List.map(List.scala:285)
at ml.dmlc.xgboost4j.scala.spark.XGBoost$.trainDistributed(XGBoost.scala:336)
at ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator.train(XGBoostEstimator.scala:139)
at ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator.train(XGBoostEstimator.scala:36)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.liftedTree1$1(<console>:63)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:61)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:74)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:76)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:78)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:80)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:82)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:84)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:86)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:88)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:90)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:92)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:94)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:96)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:98)
at $line15134072657.$read$$iw$$iw$$iw$$iw$$iw.<init>(<console>:100)
at $line15134072657.$read$$iw$$iw$$iw$$iw.<init>(<console>:102)
at $line15134072657.$read$$iw$$iw$$iw.<init>(<console>:104)
at $line15134072657.$read$$iw$$iw.<init>(<console>:106)
at $line15134072657.$read$$iw.<init>(<console>:108)
at $line15134072657.$read.<init>(<console>:110)
at $line15134072657.$read$.<init>(<console>:114)
at $line15134072657.$read$.<clinit>(<console>)
at $line15134072657.$eval$.$print$lzycompute(<console>:7)
at $line15134072657.$eval$.$print(<console>:6)
at $line15134072657.$eval.$print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:786)
at scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1047)
at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:638)
at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:637)
at scala.reflect.internal.util.ScalaClassLoader$class.asContext(ScalaClassLoader.scala:31)
at scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:19)
at scala.tools.nsc.interpreter.IMain$WrappedRequest.loadAndRunReq(IMain.scala:637)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:569)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:565)
at org.apache.zeppelin.spark.SparkScala211Interpreter.scalaInterpret(SparkScala211Interpreter.scala:108)
at org.apache.zeppelin.spark.BaseSparkScalaInterpreter$$anonfun$_interpret$1$1.apply(BaseSparkScalaInterpreter.scala:100)
at org.apache.zeppelin.spark.BaseSparkScalaInterpreter$$anonfun$_interpret$1$1.apply(BaseSparkScalaInterpreter.scala:94)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at scala.Console$.withOut(Console.scala:65)
at org.apache.zeppelin.spark.BaseSparkScalaInterpreter._interpret$1(BaseSparkScalaInterpreter.scala:94)
at org.apache.zeppelin.spark.BaseSparkScalaInterpreter.interpret(BaseSparkScalaInterpreter.scala:125)
at org.apache.zeppelin.spark.NewSparkInterpreter.interpret(NewSparkInterpreter.java:147)
at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:73)
at org.apache.zeppelin.interpreter.LazyOpenInterpreter.interpret(LazyOpenInterpreter.java:103)
at org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:632)
at org.apache.zeppelin.scheduler.Job.run(Job.java:188)
at org.apache.zeppelin.scheduler.FIFOScheduler$1.run(FIFOScheduler.java:140)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
ml.dmlc.xgboost4j.java.XGBoostError: XGBoostModel training failed

Spark Scala - rdd distinct nullpointerexception

I'm doing baby steps with spark and my exercise loads a JSON file into RDD, select a column, and then use the distinct to get unique values.
The column I'm filtering contains multiple values (CSV line) and has to be split.
val sqlContext = spark.sqlContext
import org.apache.spark.sql.hive.HiveContext
val hiveCtx = new HiveContext(sc)
import hiveCtx.implicits._
val bizDF = hiveCtx.jsonFile("/home/xpto/Documents/PersonalProjects/Yelp_P1/yelp_academic_dataset_business.json")
val catRdd = bizDF.select("categories").rdd.flatMap(row => (row.getString(0).split(",").map(_.trim))).distinct
When I run the statement "catRdd.take (10).foreach (println)" returns an exception:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 234.0 failed 1 times, most recent failure: Lost task 0.0 in stage 234.0 (TID 682, 192.168.0.122, executor driver):
java.lang.NullPointerException
at $anonfun$catRdd$1(<console>:39)
at $anonfun$catRdd$1$adapted(<console>:39)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
at org.apache.spark.rdd.RDD.$anonfun$take$1(RDD.scala:1423)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
at org.apache.spark.rdd.RDD.take(RDD.scala:1396)
... 48 elided
Caused by: java.lang.NullPointerException
at $anonfun$catRdd$1(<console>:39)
at $anonfun$catRdd$1$adapted(<console>:39)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
... 3 more
The spark version I'm running is 2.12-3.0.1
I found a solution that fits for my requirement:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
val v1 = bizDF.withColumn("categories", split(col("categories"), ","))
.select(col("categories")(0).as("description"))
.distinct
.coalesce(1)
.orderBy(asc("description"))
val windowSpec = Window.orderBy("description")
val v2 = v1.withColumn("id",row_number.over(windowSpec))
val v3 = v2.select("id","description")
Your json file has multiple lines and is not supported by HiveCtx. Try this instead using spark session:
val bizDF = spark.read.format("json").option("multiline", "true").load("/home/xpto/Documents/PersonalProjects/Yelp_P1/yelp_academic_dataset_business.json")
val catRdd = bizDF.select("categories").rdd.flatMap(row => (row.getString(0).split(",").map(_.trim))).distinct
catRdd.take(10).foreach(println)

Apache Spark Throwing Deserialization Error when using take method on RDD

I am new to Spark, and I'm using Scala 2.12.8 with Spark 2.4.0. I'm trying to use the Random Forest classifier in Spark MLLib. I can build and train the classifier, and the classifier can predict if I use the first() function on the resulting RDD. However, if I try to use the take(n) function, I get a pretty big, ugly stack trace. Does anyone know what I'm doing wrong? The error is occurring in the line: ".take(3)". I am aware that this is the first effectful operation that I'm performing on the RDD, so if anyone can explain to me why it's failing and how to fix it, I would be really grateful.
object ItsABreeze {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkSession
.builder()
.appName("test")
.getOrCreate()
//Do stuff to file
val data: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(spark.sparkContext, "file.svm")
// Split the data into training and test sets (30% held out for testing)
val splits: Array[RDD[LabeledPoint]] = data.randomSplit(Array(0.7, 0.3))
val (trainingData, testData) = (splits(0), splits(1))
// Train a RandomForest model.
// Empty categoricalFeaturesInfo indicates all features are continuous
val numClasses = 4
val categoricaFeaturesInfo = Map[Int, Int]()
val numTrees = 3
val featureSubsetStrategy = "auto"
val impurity = "gini"
val maxDepth = 5
val maxBins = 32
val model: RandomForestModel = RandomForest.trainClassifier(
trainingData,
numClasses,
categoricaFeaturesInfo,
numTrees,
featureSubsetStrategy,
impurity,
maxDepth,
maxBins
)
testData
.map((point: LabeledPoint) => model.predict(point.features))
.take(3)
.foreach(println)
spark.stop()
}
}
The top portion of the stack trace follows:
java.io.IOException: unexpected exception type
at java.io.ObjectStreamClass.throwMiscException(ObjectStreamClass.java:1736)
at java.io.ObjectStreamClass.invokeReadResolve(ObjectStreamClass.java:1266)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2078)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:83)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at java.lang.invoke.SerializedLambda.readResolve(SerializedLambda.java:230)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at java.io.ObjectStreamClass.invokeReadResolve(ObjectStreamClass.java:1260)
... 25 more
Caused by: java.lang.BootstrapMethodError: java.lang.NoClassDefFoundError: scala/runtime/LambdaDeserialize
at ItsABreeze$.$deserializeLambda$(ItsABreeze.scala)
... 35 more
Caused by: java.lang.NoClassDefFoundError: scala/runtime/LambdaDeserialize
... 36 more
Caused by: java.lang.ClassNotFoundException: scala.runtime.LambdaDeserialize
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
The code that I was trying to run was a slightly modified version of the classification example on this page (from the Spark Machine Learning Library documentation).
Both commenters on my original question were correct: I changed the version of Scala that I was using from 2.12.8 to 2.11.12, and I reverted Spark to 2.2.1, and the code ran just as it was.
For anyone watching this issue that is qualified to answer, here is a followup question: Spark 2.4.0 claims to have new, experimental support for Scala 2.12.x. Are there a lot of known issues with the 2.12.x support?

Spark & Scala for Twitter streaming

I am trying to stream live tweets using Spark/Scala. I am having some difficulties.
I am using Spark 2.0, scala 2.11.8, spark-streaming_2.11-2.0.0.jar & spark-streaming-twitter_2.11-2.0.0.jar.
It runs for the first time and immediately throws an error.
ssc.awaitTermination() is the culprit.
Attaching code snippet as well as error, any idea what am I doing wrong?
import org.apache.log4j._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter._
import twitter4j.TwitterFactory
import twitter4j.conf.ConfigurationBuilder
import java.util.Properties
import org.apache.spark.storage.StorageLevel
import twitter4j.auth.OAuthAuthorization
object TStreaming {
Logger.getLogger("org").setLevel(Level.ERROR)
def main (args: Array[String]) {
val ssc = new StreamingContext("local[2]", "TweeterStreaming", Seconds(10))
val hashTags = "Hurricane Florence"
val cb = new ConfigurationBuilder()
val prop = new Properties()
//prop.load(Thread.currentThread().getContextClassLoader.getResourceAsStream("twitter.properties"))
cb.setDebugEnabled(true)
.setOAuthConsumerKey("***************")
.setOAuthConsumerSecret("***************")
.setOAuthAccessToken("***************")
.setOAuthAccessTokenSecret("***************")
val bld = cb.build()
val tf = new TwitterFactory(bld)
val twitter = tf.getInstance()
val filters = Array(hashTags).toSeq
val auth = new OAuthAuthorization(bld)
val twitterStream = TwitterUtils.createStream(ssc, Some(auth), filters, StorageLevel.MEMORY_ONLY)
twitterStream.cache()
val lines = twitterStream.map(status => status.getText)
lines.print()
val words = lines.flatMap(_.split(" "))
val pairs = words.map(word => (word, 1))
val wordCounts = pairs.reduceByKey(_ + _)
wordCounts.print()
ssc.start() // Start the computation
ssc.awaitTermination()
}
}
Here is the error...
18/09/29 10:27:10 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.NoSuchMethodError: twitter4j.TwitterStream.addListener(Ltwitter4j/StreamListener;)V
at org.apache.spark.streaming.twitter.TwitterReceiver.onStart(TwitterInputDStream.scala:72)
at org.apache.spark.streaming.receiver.ReceiverSupervisor.startReceiver(ReceiverSupervisor.scala:149)
at org.apache.spark.streaming.receiver.ReceiverSupervisor.start(ReceiverSupervisor.scala:131)
at org.apache.spark.streaming.scheduler.ReceiverTracker$ReceiverTrackerEndpoint$$anonfun$9.apply(ReceiverTracker.scala:597)
at org.apache.spark.streaming.scheduler.ReceiverTracker$ReceiverTrackerEndpoint$$anonfun$9.apply(ReceiverTracker.scala:587)
at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974)
at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
18/09/29 10:27:10 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-0,5,main]
java.lang.NoSuchMethodError: twitter4j.TwitterStream.addListener(Ltwitter4j/StreamListener;)V
at org.apache.spark.streaming.twitter.TwitterReceiver.onStart(TwitterInputDStream.scala:72)
at org.apache.spark.streaming.receiver.ReceiverSupervisor.startReceiver(ReceiverSupervisor.scala:149)
at org.apache.spark.streaming.receiver.ReceiverSupervisor.start(ReceiverSupervisor.scala:131)
at org.apache.spark.streaming.scheduler.ReceiverTracker$ReceiverTrackerEndpoint$$anonfun$9.apply(ReceiverTracker.scala:597)
at org.apache.spark.streaming.scheduler.ReceiverTracker$ReceiverTrackerEndpoint$$anonfun$9.apply(ReceiverTracker.scala:587)
at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974)
at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
18/09/29 10:27:10 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
18/09/29 10:27:10 ERROR ReceiverTracker: Deregistered receiver for stream 0: Stopped by driver
-------------------------------------------
Time: 1538242030000 ms
-------------------------------------------
-------------------------------------------
Time: 1538242030000 ms
-------------------------------------------
Here is the code snippet
Here is the error
Thank you in advance.
It's probably your build tool configuration. You might not be creating your uberjar correctly and the class is not found.

Issue in saving Xgboost model in spark scala

I trained a xgboost model in spark scala as follows:
val xgbParamGrid = new ParamGridBuilder()
.addGrid(xgb.maxDepth, Array(5, 10))
.addGrid(xgb.maxBins, Array(2))
.addGrid(xgb.minChildWeight, Array(0.2))
.addGrid(xgb.eta, Array(0.015))
.addGrid(xgb.alpha, Array(0.8, 0.9))
.addGrid(xgb.lambda, Array(0.9, 1.0))
.build()
// Create the XGBoost pipeline
val pipeline = new Pipeline().setStages(Array(xgb))
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val cv = new CrossValidator()
.setEstimator(pipeline)
.setEvaluator(evaluator)
.setEstimatorParamMaps(xgbParamGrid)
.setNumFolds(3)
val xgbModel = cv.fit(trainingData)
val xgbBest = xgbModel.bestModel.asInstanceOf[PipelineModel].stages(0).asInstanceOf[XGBoostClassificationModel]
Then I tried to save it to local as:
xgbBest.write.overwrite.save(modelSavePath)
but got error message as:
Exception in thread "main" java.lang.ClassCastException: java.lang.Integer cannot be cast to java.lang.Long
at scala.runtime.BoxesRunTime.unboxToLong(BoxesRunTime.java:105)
at org.apache.spark.ml.param.LongParam.jsonEncode(params.scala:480)
at ml.dmlc.xgboost4j.scala.spark.params.DefaultXGBoostParamsWriter$$anonfun$1$$anonfun$3.apply(DefaultXGBoostParamsWriter.scala:73)
at ml.dmlc.xgboost4j.scala.spark.params.DefaultXGBoostParamsWriter$$anonfun$1$$anonfun$3.apply(DefaultXGBoostParamsWriter.scala:71)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at ml.dmlc.xgboost4j.scala.spark.params.DefaultXGBoostParamsWriter$$anonfun$1.apply(DefaultXGBoostParamsWriter.scala:71)
at ml.dmlc.xgboost4j.scala.spark.params.DefaultXGBoostParamsWriter$$anonfun$1.apply(DefaultXGBoostParamsWriter.scala:69)
at scala.Option.getOrElse(Option.scala:121)
at ml.dmlc.xgboost4j.scala.spark.params.DefaultXGBoostParamsWriter$.getMetadataToSave(DefaultXGBoostParamsWriter.scala:69)
at ml.dmlc.xgboost4j.scala.spark.params.DefaultXGBoostParamsWriter$.saveMetadata(DefaultXGBoostParamsWriter.scala:51)
at ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel$XGBoostClassificationModelWriter.saveImpl(XGBoostClassifier.scala:480)
at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:103)
at classificationWithXgboost$.main(classificationWithXgboost.scala:125)
at classificationWithXgboost.main(classificationWithXgboost.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:894)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:198)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:228)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:137)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Can anyone help with the problem? Thanks