Apache spark MultilayerPerceptronClassifier fails with ArrayIndexOutOfBoundsException - scala

I'm using this code to try to predict :
import org.apache.spark.sql.functions.col
import org.apache.spark.Logging
import org.apache.spark.graphx._
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.SparkContext._
import org.apache.spark.sql.SQLContext._
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.sql.functions.col
import org.apache.spark.ml.feature.VectorAssembler
object NN extends App {
Logger.getLogger("org").setLevel(Level.OFF)
Logger.getLogger("akka").setLevel(Level.OFF)
val sc = new SparkContext(new SparkConf().setMaster("local[2]")
.setAppName("cs"))
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
val df = sc.parallelize(Seq(
("3", "1", "1"),
("2", "1", "1"),
("2", "3", "3"),
("3", "3", "3"),
("0", "1", "0")))
.toDF("label", "feature1", "feature2")
val numeric = df
.select(df.columns.map(c => col(c).cast("double").alias(c)): _*)
val assembler = new VectorAssembler()
.setInputCols(Array("feature1", "feature2"))
.setOutputCol("features")
val data = assembler.transform(numeric)
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
val layers = Array[Int](2, 3, 5, 4) // Note 2 neurons in the input layer
val trainer = new MultilayerPerceptronClassifier()
.setLayers(layers)
.setBlockSize(128)
.setSeed(1234L)
.setMaxIter(100)
val model = trainer.fit(data)
model.transform(data).show
}
For the dataframe (df) if I use
("4", "1", "1") instead of ("3", "1", "1") I receive error :
Java HotSpot(TM) 64-Bit Server VM warning: ignoring option MaxPermSize=256m; support was removed in 8.0
[info] Set current project to spark-applications1458853926-master (in build file:/C:/Users/Desktop/spark-applications1458853926-master/)
[info] Compiling 1 Scala source to C:\Users\Desktop\spark-applications1458853926-master\target\scala-2.11\classes...
[info] Running NN
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
16/04/06 12:42:11 INFO Remoting: Starting remoting
16/04/06 12:42:11 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem#10.95.132.202:64056]
[error] (run-main-0) org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.ArrayIndexOutOfBoundsException: 4
[error] at org.apache.spark.ml.classification.LabelConverter$.encodeLabeledPoint(MultilayerPerceptronClassifier.scala:85)
[error] at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$2.apply(MultilayerPerceptronClassifier.scala:165)
[error] at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$2.apply(MultilayerPerceptronClassifier.scala:165)
[error] at scala.collection.Iterator$$anon$11.next(Iterator.scala:370)
[error] at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:934)
[error] at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:949)
[error] at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:986)
[error] at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:990)
[error] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:369)
[error] at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1595)
[error] at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1143)
[error] at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1143)
[error] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
[error] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
[error] at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
[error] at org.apache.spark.scheduler.Task.run(Task.scala:89)
[error] at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
[error] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
[error] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
[error] at java.lang.Thread.run(Thread.java:745)
[error]
[error] Driver stacktrace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.ArrayIndexOutOfBoundsException: 4
at org.apache.spark.ml.classification.LabelConverter$.encodeLabeledPoint(MultilayerPerceptronClassifier.scala:85)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$2.apply(MultilayerPerceptronClassifier.scala:165)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$2.apply(MultilayerPerceptronClassifier.scala:165)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:370)
at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:934)
at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:949)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:986)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:990)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:369)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1595)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1143)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
at org.apache.spark.rdd.RDD.count(RDD.scala:1143)
at org.apache.spark.mllib.optimization.LBFGS$.runLBFGS(LBFGS.scala:170)
at org.apache.spark.mllib.optimization.LBFGS.optimize(LBFGS.scala:117)
at org.apache.spark.ml.ann.FeedForwardTrainer.train(Layer.scala:878)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:170)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:110)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:90)
at NN$.delayedEndpoint$NN$1(NN.scala:56)
at NN$delayedInit$body.apply(NN.scala:15)
at scala.Function0$class.apply$mcV$sp(Function0.scala:34)
at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:12)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.collection.immutable.List.foreach(List.scala:381)
at scala.collection.generic.TraversableForwarder$class.foreach(TraversableForwarder.scala:35)
at scala.App$class.main(App.scala:76)
at NN$.main(NN.scala:15)
at NN.main(NN.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 4
at org.apache.spark.ml.classification.LabelConverter$.encodeLabeledPoint(MultilayerPerceptronClassifier.scala:85)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$2.apply(MultilayerPerceptronClassifier.scala:165)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$2.apply(MultilayerPerceptronClassifier.scala:165)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:370)
at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:934)
at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:949)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:986)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:990)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:369)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1595)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1143)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
[trace] Stack trace suppressed: run last compile:run for the full output.
java.lang.RuntimeException: Nonzero exit code: 1
at scala.sys.package$.error(package.scala:27)
[trace] Stack trace suppressed: run last compile:run for the full output.
[error] (compile:run) Nonzero exit code: 1
[error] Total time: 19 s, completed 06-Apr-2016 12:42:20
Why do I receive ArrayIndexOutOfBoundsException , I'm not setting up my labels correctly ? Can labels not take any value as they are just labels? In this example it appears they have to be in the range 0-3 ?

The output layer uses one-hot encoding; that is, a label of "3" is converted to (0,0,0,1) where the 'third' element is 1 and the rest are 0. When you have 4 output nodes and a label of 4, the LabelConverter function (whose source is visible here) will fail. (labelCount is 4, labeledPoint.label.toInt is 4, hence your error.)
val output = Array.fill(labelCount)(0.0)
output(labeledPoint.label.toInt) = 1.0
(labeledPoint.features, Vectors.dense(output))
So change this line:
val layers = Array[Int](2, 3, 5, 4) // Note 2 neurons in the input layer
to this:
val layers = Array[Int](2, 3, 5, 5) // Note 2 neurons in the input layer and 5 neurons in the output layer
and I expect it to work.

The last layer appears (I am writing my first serious sample myself) to represent labels rounded to int values so by declaring 4 as this value you expect labels of 0,1,2,3 -
Apparently the code is designed to create a neural net to classify the output into a series of states based on the input -
I am trying to determine how to write t tic tac toe player using this capability

Related

Spark Scala - rdd distinct nullpointerexception

I'm doing baby steps with spark and my exercise loads a JSON file into RDD, select a column, and then use the distinct to get unique values.
The column I'm filtering contains multiple values (CSV line) and has to be split.
val sqlContext = spark.sqlContext
import org.apache.spark.sql.hive.HiveContext
val hiveCtx = new HiveContext(sc)
import hiveCtx.implicits._
val bizDF = hiveCtx.jsonFile("/home/xpto/Documents/PersonalProjects/Yelp_P1/yelp_academic_dataset_business.json")
val catRdd = bizDF.select("categories").rdd.flatMap(row => (row.getString(0).split(",").map(_.trim))).distinct
When I run the statement "catRdd.take (10).foreach (println)" returns an exception:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 234.0 failed 1 times, most recent failure: Lost task 0.0 in stage 234.0 (TID 682, 192.168.0.122, executor driver):
java.lang.NullPointerException
at $anonfun$catRdd$1(<console>:39)
at $anonfun$catRdd$1$adapted(<console>:39)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
at org.apache.spark.rdd.RDD.$anonfun$take$1(RDD.scala:1423)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
at org.apache.spark.rdd.RDD.take(RDD.scala:1396)
... 48 elided
Caused by: java.lang.NullPointerException
at $anonfun$catRdd$1(<console>:39)
at $anonfun$catRdd$1$adapted(<console>:39)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
... 3 more
The spark version I'm running is 2.12-3.0.1
I found a solution that fits for my requirement:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
val v1 = bizDF.withColumn("categories", split(col("categories"), ","))
.select(col("categories")(0).as("description"))
.distinct
.coalesce(1)
.orderBy(asc("description"))
val windowSpec = Window.orderBy("description")
val v2 = v1.withColumn("id",row_number.over(windowSpec))
val v3 = v2.select("id","description")
Your json file has multiple lines and is not supported by HiveCtx. Try this instead using spark session:
val bizDF = spark.read.format("json").option("multiline", "true").load("/home/xpto/Documents/PersonalProjects/Yelp_P1/yelp_academic_dataset_business.json")
val catRdd = bizDF.select("categories").rdd.flatMap(row => (row.getString(0).split(",").map(_.trim))).distinct
catRdd.take(10).foreach(println)

For loop over list while filtering

I have 2 DataFrames called df1 and df2, where they both have the same column names. I wish to run a for loop over unique dates, from df1 and apply the same date filter to df2. I created a list of unique dates and then tried to iterate through that. However what I have is throwing errors.
Here is what I have:
val unique_weeks = df1.select(df1("date")).distinct
for( week <- unique_weeks) {
val df1_filtered = df1.filter($"date" === week)
val df2_filtered = df2.filter($"date" === week)
/// will run a join here and more code
}
I think <- this part may be incorrect - but not sure how I can filter the DataFrames using another method.
Here is the error:
[error] (run-main-0) org.apache.spark.SparkException: Job aborted due to stage failure: Task 35 in stage 3.0 failed 1 times, most recent failure: Lost task 35.0 in stage 3.0 (TID 399, localhost, executor driver): java.lang.RuntimeException: Unsupported literal type class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema [1591772400000]
[error] at org.apache.spark.sql.catalyst.expressions.Literal$.apply(literals.scala:75)
[error] at org.apache.spark.sql.functions$.lit(functions.scala:101)
[error] at org.apache.spark.sql.Column.$eq$eq$eq(Column.scala:267)
[error] at spark_pkg.SparkMain$$anonfun$main$1.apply(SparkMain.scala:880)
[error] at spark_pkg.SparkMain$$anonfun$main$1.apply(SparkMain.scala:878)
[error] at scala.collection.Iterator$class.foreach(Iterator.scala:893)
[error] at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
[error] at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:917)
[error] at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:917)
[error] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
[error] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
[error] at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
[error] at org.apache.spark.scheduler.Task.run(Task.scala:99)
[error] at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
[error] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
[error] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
[error] at java.lang.Thread.run(Thread.java:748)
[error]
[error] Driver stacktrace:
[error] org.apache.spark.SparkException: Job aborted due to stage failure: Task 35 in stage 3.0 failed 1 times, most recent failure: Lost task 35.0 in stage 3.0 (TID 399, localhost, executor driver): java.lang.RuntimeException: Unsupported literal type class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema [1591772400000]
[error] at org.apache.spark.sql.catalyst.expressions.Literal$.apply(literals.scala:75)
[error] at org.apache.spark.sql.functions$.lit(functions.scala:101)
[error] at org.apache.spark.sql.Column.$eq$eq$eq(Column.scala:267)
[error] at spark_pkg.SparkMain$$anonfun$main$1.apply(SparkMain.scala:880)
[error] at spark_pkg.SparkMain$$anonfun$main$1.apply(SparkMain.scala:878)
[error] at scala.collection.Iterator$class.foreach(Iterator.scala:893)
[error] at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
[error] at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:917)
[error] at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:917)
[error] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
[error] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
[error] at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
[error] at org.apache.spark.scheduler.Task.run(Task.scala:99)
[error] at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
[error] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
[error] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
[error] at java.lang.Thread.run(Thread.java:748)
[error]
[error] Driver stacktrace:
[error] at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
[error] at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
[error] at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
[error] at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
[error] at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
[error] at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
[error] at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
[error] at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
[error] at scala.Option.foreach(Option.scala:257)
[error] at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
[error] at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
[error] at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
[error] at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
[error] at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
[error] at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
[error] at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
[error] at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
[error] at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
[error] at org.apache.spark.SparkContext.runJob(SparkContext.scala:1958)
[error] at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:917)
[error] at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:915)
[error] at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
[error] at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
[error] at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
[error] at org.apache.spark.rdd.RDD.foreach(RDD.scala:915)
[error] at org.apache.spark.sql.Dataset$$anonfun$foreach$1.apply$mcV$sp(Dataset.scala:2286)
[error] at org.apache.spark.sql.Dataset$$anonfun$foreach$1.apply(Dataset.scala:2286)
[error] at org.apache.spark.sql.Dataset$$anonfun$foreach$1.apply(Dataset.scala:2286)
[error] at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
[error] at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2765)
[error] at org.apache.spark.sql.Dataset.foreach(Dataset.scala:2285)
[error] at spark_pkg.SparkMain$.main(SparkMain.scala:878)
[error] at spark_pkg.SparkMain.main(SparkMain.scala)
[error] at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
[error] at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
[error] at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
[error] at java.lang.reflect.Method.invoke(Method.java:498)
[error] Caused by: java.lang.RuntimeException: Unsupported literal type class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema [1591772400000]
[error] at org.apache.spark.sql.catalyst.expressions.Literal$.apply(literals.scala:75)
[error] at org.apache.spark.sql.functions$.lit(functions.scala:101)
[error] at org.apache.spark.sql.Column.$eq$eq$eq(Column.scala:267)
[error] at spark_pkg.SparkMain$$anonfun$main$1.apply(SparkMain.scala:880)
[error] at spark_pkg.SparkMain$$anonfun$main$1.apply(SparkMain.scala:878)
[error] at scala.collection.Iterator$class.foreach(Iterator.scala:893)
[error] at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
[error] at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:917)
[error] at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:917)
[error] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
[error] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
[error] at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
[error] at org.apache.spark.scheduler.Task.run(Task.scala:99)
[error] at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
[error] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
[error] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
[error] at java.lang.Thread.run(Thread.java:748)
[error] stack trace is suppressed; run 'last Compile / bgRun' for the full output
[error] Nonzero exit code: 1
[error] (Compile / run) Nonzero exit code: 1
[error] Total time: 137 s (02:17), completed Aug 20, 2020 1:16:02 PM
A dataframe is not an iterator, and therefore, you cannot run a for loop over it. You can run something like this - but I don't think it will do what you're hoping to achieve based on your other code.
unique_weeks.foreachPartition{ weeks : Iterator[YourData] =>
for( week <- weeks) {
}
}
Your question suggests your mental model of what a dataframe is and how Spark works is not quite complete. Think of a Dataframe more as a List[List[YourData]], except each inner List[YourData] is located on an independent piece of a machine, and may not necessarily know or interact with any of the other Lists until you collect them back to the driver.

org.apache.spark.SparkException: job aborted due to stage failure: Task 0 in stage 2.0

I was trying to solve the below problem statement using spark-shell in cloudera VM but getting error. Tried to follow few suggestions raised in other tickets on this site but not following. As I am new to this big data world so any help will be appreciated.
org.apache.spark.SparkException: **Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1, localhost, executor driver): java.lang.NumberFormatException: For input string: "patientID" at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)**
Problem Statement:
You have been given below patient data in csv format,
patientID,name,dateOfBirth,lastVisitDate
1001,Ah Teck,1991-12-31,2012-01-20
1002,Kumar,2011-10-29,2012-09-20
1003,Ali,2011-01-30,2012-10-21
Accomplish following activities.
1 . Find all the patients whose lastVisitDate between current time and '2012-09-15'
Error message with scala code written in Spark-shell
[cloudera#quickstart ~]$ spark-shell
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel).
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/zookeeper/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/flume-ng/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/parquet/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/avro/avro-tools-1.7.6-cdh5.12.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 1.6.0
/_/
Using Scala version 2.10.5 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_67)
Type in expressions to have them evaluated.
Type :help for more information.
17/11/06 20:15:44 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark context available as sc (master = local[*], app id = local-1510028148251).
17/11/06 20:15:53 WARN shortcircuit.DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
SQL context available as sqlContext.
scala> val sqlContext = new org.apache.spark.sql.SQLContext(sc)
sqlContext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext#3de10177
scala> import sqlContext.implicits._
import sqlContext.implicits._
scala> import org.apache.spark.sql._
import org.apache.spark.sql._
scala> val patients = sc.textFile("sparksql3/patients.csv")
patients: org.apache.spark.rdd.RDD[String] = sparksql3/patients.csv MapPartitionsRDD[1] at textFile at <console>:35
scala> patients.first()
res0: String = 1001,Ah Teck,1999-12-31,2012-01-20
scala> case class Patient(patientId:Integer,name:String,dateOfBirth:String,lastVisitDate:String)
defined class Patient
scala> val patRDD = patients.map(_.split(",")).map(p=>Patient(p(0).toInt,p(1),p(2),p(3)))
patRDD: org.apache.spark.rdd.RDD[Patient] = MapPartitionsRDD[4] at map at <console>:39
scala> patRDD.count()
17/11/06 20:27:01 ERROR executor.Executor: Exception in task 0.0 in stage 1.0 (TID 1)
java.lang.NumberFormatException: For input string: "patientID"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:492)
at java.lang.Integer.parseInt(Integer.java:527)
at scala.collection.immutable.StringLike$class.toInt(StringLike.scala:229)
at scala.collection.immutable.StringOps.toInt(StringOps.scala:31)
at $line35.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:39)
at $line35.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:39)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1635)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1888)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1888)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:242)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
17/11/06 20:27:02 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 1.0 (TID 1, localhost, executor driver): java.lang.NumberFormatException: For input string: "patientID"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:492)
at java.lang.Integer.parseInt(Integer.java:527)
at scala.collection.immutable.StringLike$class.toInt(StringLike.scala:229)
at scala.collection.immutable.StringOps.toInt(StringOps.scala:31)
at $line35.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:39)
at $line35.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:39)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1635)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1888)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1888)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:242)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
17/11/06 20:27:02 ERROR scheduler.TaskSetManager: Task 0 in stage 1.0 failed 1 times; aborting job
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1, localhost, executor driver): java.lang.NumberFormatException: For input string: "patientID"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:492)
at java.lang.Integer.parseInt(Integer.java:527)
at scala.collection.immutable.StringLike$class.toInt(StringLike.scala:229)
at scala.collection.immutable.StringOps.toInt(StringOps.scala:31)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:39)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:39)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1635)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1888)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1888)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:242)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1457)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1445)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1444)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1444)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1668)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1627)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1616)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1862)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1875)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1888)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1959)
at org.apache.spark.rdd.RDD.count(RDD.scala:1157)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:42)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:47)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:49)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:51)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:53)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:55)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:57)
at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:59)
at $iwC$$iwC$$iwC$$iwC.<init>(<console>:61)
at $iwC$$iwC$$iwC.<init>(<console>:63)
at $iwC$$iwC.<init>(<console>:65)
at $iwC.<init>(<console>:67)
at <init>(<console>:69)
at .<init>(<console>:73)
at .<clinit>(<console>)
at .<init>(<console>:7)
at .<clinit>(<console>)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1045)
at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1326)
at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:821)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:852)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:800)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1064)
at org.apache.spark.repl.Main$.main(Main.scala:35)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:730)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.NumberFormatException: For input string: "patientID"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:492)
at java.lang.Integer.parseInt(Integer.java:527)
at scala.collection.immutable.StringLike$class.toInt(StringLike.scala:229)
at scala.collection.immutable.StringOps.toInt(StringOps.scala:31)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:39)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$2.apply(<console>:39)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1635)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1888)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1888)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:242)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
You are reading .csv file along with the header. Before creating the RDD you have to remove the header.
val patients = sc.textFile("sparksql3/patients.csv")
val header = patients.first()
val new_record = patients.filter(row => row!= header)
case class Patient(patientId:Integer,name:String,dateOfBirth:String,lastVisitDate:String)
val patRDD = new_record.map(_.split(",")).map(p=>Patient(p(0).toInt,p(1),p(2),p(3)))
patRDD.count() // 3

Multiple apps are getting submitted to spark Cluster and keeps in waiting and then exits withError

I am submitting a spark application to the Cluster by using the following command
/root/spark/bin/spark-submit --conf spark.driver.momory=10g --class com.knoldus.SampleApp /pathToJar/Application.jar
But what is happening is : Multiple apps are getting submitted and one is running and all others are waiting and then after sometime the code exits with an exception.
The Spark UI looks something like this :
After this the code exits with this error :
8.149.243): java.io.IOException: Failed to write statements to keyspace.tableName.
at com.datastax.spark.connector.writer.TableWriter$$anonfun$write$1.apply(TableWriter.scala:167)
at com.datastax.spark.connector.writer.TableWriter$$anonfun$write$1.apply(TableWriter.scala:135)
at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$withSessionDo$1.apply(CassandraConnector.scala:111)
at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$withSessionDo$1.apply(CassandraConnector.scala:110)
at com.datastax.spark.connector.cql.CassandraConnector.closeResourceAfterUse(CassandraConnector.scala:140)
at com.datastax.spark.connector.cql.CassandraConnector.withSessionDo(CassandraConnector.scala:110)
at com.datastax.spark.connector.writer.TableWriter.write(TableWriter.scala:135)
at com.datastax.spark.connector.RDDFunctions$$anonfun$saveToCassandra$1.apply(RDDFunctions.scala:37)
at com.datastax.spark.connector.RDDFunctions$$anonfun$saveToCassandra$1.apply(RDDFunctions.scala:37)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/12/14 06:26:28 WARN TaskSetManager: Lost task 0.1 in stage 2.0 (TID 561, 10.178.149.243): java.util.NoSuchElementException: None.get
at scala.None$.get(Option.scala:347)
at scala.None$.get(Option.scala:345)
at org.apache.spark.storage.BlockInfoManager.releaseAllLocksForTask(BlockInfoManager.scala:343)
at org.apache.spark.storage.BlockManager.releaseAllLocksForTask(BlockManager.scala:644)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:281)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/12/14 06:26:28 ERROR TaskSetManager: Task 0 in stage 2.0 failed 4 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 4 times, most recent failure: Lost task 0.3 in stage 2.0 (TID 563, 10.178.149.225): java.util.NoSuchElementException: None.get
at scala.None$.get(Option.scala:347)
at scala.None$.get(Option.scala:345)
at org.apache.spark.storage.BlockInfoManager.releaseAllLocksForTask(BlockInfoManager.scala:343)
at org.apache.spark.storage.BlockManager.releaseAllLocksForTask(BlockManager.scala:644)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:281)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1450)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1438)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1437)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1437)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1659)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1618)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1607)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1871)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1884)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1904)
at com.datastax.spark.connector.RDDFunctions.saveToCassandra(RDDFunctions.scala:37)
at com.knoldus.xml.RNF2Driver$.main(RNFIngestPipeline.scala:56)
at com.knoldus.xml.RNF2Driver.main(RNFIngestPipeline.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:729)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:185)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:210)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:124)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.util.NoSuchElementException: None.get
at scala.None$.get(Option.scala:347)
at scala.None$.get(Option.scala:345)
at org.apache.spark.storage.BlockInfoManager.releaseAllLocksForTask(BlockInfoManager.scala:343)
at org.apache.spark.storage.BlockManager.releaseAllLocksForTask(BlockManager.scala:644)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:281)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
My Spark-conf is :
private val conf = new SparkConf()
.setAppName("SampleApp")
.setMaster(sparkClusterIP)
.set("spark.sql.shuffle.partitions", "8")
.set("spark.cassandra.connection.host", cassandraIP)
.set("spark.sql.crossJoin.enabled", "true")
.set("spark.kryoserializer.buffer.max", "640m")
.set("spark.executor.memory", "10g")
.set("spark.executor.cores", "3")
.set("spark.cassandra.output.batch.size.rows", "10")
.set("spark.cassandra.output.batch.size.bytes", "20480")
This is my sample code. Can anyone please let me know what the problem is :
val cassandraIDs = sc.cassandraTable[A](keySpace,tableName).map(_.filename.split("/").last.split("\\.")(0).toLong).collect()
val broadCastList = sc.broadcast(cassandraIDs)
val files = sc.wholeTextFiles(hdFSIP).map(_._1).filter { path =>
val listOfCassandraID = broadCastList.value
!listOfCassandraID.contains(path.split("/").last.split("\\.")(0).toLong)
}.take(100)
import sqlContext.implicits._
val fileNameRDD = sc.parallelize(files)
val cassandraRdd = fileNameRDD.map { path =>
...
//do some task
}.toDF(columnNames)
cassandraRdd.saveToCassandra(keySpace,tablename)
println(s"Completed Processing of $numOfDocs in ${System.currentTimeMillis() - start} milliseconds")
sc.stop()
Why multiple apps are getting submitted?
Because you are submitting multiple spark jobs in your driver code. Each one of the below statements will trigger a new job in the current spark context,
val cassandraIDs = sc.cassandraTable[A]....toLong).collect()
sc.wholeTextFiles
sc.parallelize(files)
cassandraRdd.saveToCassandra(keySpace,tablename)
.toDF(columnNames)
sc.broadcast
(Not sure about the last two)
I haven't worked with cassandra via spark much. But, above code doesn't utilize the power of spark.
You should pipeline the tasks as much as possible, so that spark can plan and run the tasks in distributed way.
Example:
In your code, you are creating files by calling take(100) and then creating an RDD called fileNameRDD using sc.parallelize(files).
This triggers two spark jobs, one to take 100 items, and one to create a new RDD using those 100 items.
Instead you should combine these two tasks so that they can be pipelined by spark.
sc.wholeTextFiles(hdFSIP).map(_._1).filter { path =>
val listOfCassandraID = broadCastList.value
!listOfCassandraID.contains(path.split("/").last.split("\\.")(0).toLong)
}.map { path => /*<---Combine the two tasks like this*/
...
//do some task
}.toDF(columnNames)
Note: I have skipped take(100) part, but you should be able to do that using filters
Why are you getting NoSuchElementException: None.get error
This is more likely because of scala version mismatch between your code and the scala version that is used to build the spark.

Inserting Dataframe Data into RDD

I currently have a dataframe containing the results of the SQL query. The dataframe has columns patientID, date, and code.
val res1 = sqlContext.sql("select encounter.Member_ID AS patientID, encounter.Encounter_DateTime AS date, diag.code from encounter join diag on encounter.Encounter_ID = diag.Encounter_ID")
I am attempting to take this dataframe and place it into an RDD of the format RDD[Diagnostic] where Diagnostic is a case class of the form:
case class Diagnostic(patientID:String, date: Date, code: String)
Is this possible? My current attempt is throwing back a scala.MatchError coming from the below line.
val diagnostic: RDD[Diagnostic] = res1.map {
case Row(patientID:String, date:java.util.Date, code:String) => Diagnostic(patientID=patientID, date=date, code=code)
}
Schema:
root
|-- patientID: string (nullable = true)
|-- date: string (nullable = true)
|-- code: string (nullable = true)
Error message from res1.as[Diagnostic]:
Main.scala:170: overloaded method value as with alternatives:
[error] (alias: Symbol)org.apache.spark.sql.DataFrame <and>
[error] (alias: String)org.apache.spark.sql.DataFrame
[error] does not take type parameters
[error] val testlol: RDD[Diagnostic] = res1.as[Diagnostic]
[error] ^
[error] one error found
[error] (compile:compileIncremental) Compilation failed
[error] Total time: 3 s, completed Oct 9, 2016 3:16:38 PM
Entire error message:
[Stage 4:=======================================> (2 +
1) / 3]16/10/09 14:23:32 ERROR Executor: Exception in task 0.0 in stage 6.0 (TID 8)
scala.MatchError: [000961291-01,2005-06-21T19:45:00Z,584.9] (of class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema)
at edu.gatech.cse8803.main.Main$$anonfun$11.apply(Main.scala:168)
at edu.gatech.cse8803.main.Main$$anonfun$11.apply(Main.scala:168)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$33.apply(RDD.scala:1177)
at org.apache.spark.rdd.RDD$$anonfun$33.apply(RDD.scala:1177)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1498)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1498)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
16/10/09 14:23:32 WARN TaskSetManager: Lost task 0.0 in stage 6.0 (TID 8, localhost): scala.MatchError: [000961291-01,2005-06-21T19:45:00Z,584.9] (of class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema)
at edu.gatech.cse8803.main.Main$$anonfun$11.apply(Main.scala:168)
at edu.gatech.cse8803.main.Main$$anonfun$11.apply(Main.scala:168)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$33.apply(RDD.scala:1177)
at org.apache.spark.rdd.RDD$$anonfun$33.apply(RDD.scala:1177)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1498)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1498)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
16/10/09 14:23:32 ERROR TaskSetManager: Task 0 in stage 6.0 failed 1 times; aborting job
[error] (run-main-0) org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 6.0 failed 1 times, most recent failure: Lost task 0.0 in stage 6.0 (TID 8, localhost): scala.MatchError: [000961291-01,2005-06-21T19:45:00Z,584.9] (of class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema)
[error] at edu.gatech.cse8803.main.Main$$anonfun$11.apply(Main.scala:168)
[error] at edu.gatech.cse8803.main.Main$$anonfun$11.apply(Main.scala:168)
[error] at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
[error] at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
[error] at scala.collection.Iterator$class.foreach(Iterator.scala:727)
[error] at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
[error] at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
[error] at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
[error] at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
[error] at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
[error] at scala.collection.AbstractIterator.to(Iterator.scala:1157)
[error] at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
[error] at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
[error] at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
[error] at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
[error] at org.apache.spark.rdd.RDD$$anonfun$33.apply(RDD.scala:1177)
[error] at org.apache.spark.rdd.RDD$$anonfun$33.apply(RDD.scala:1177)
[error] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1498)
[error] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1498)
[error] at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
[error] at org.apache.spark.scheduler.Task.run(Task.scala:64)
[error] at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
[error] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
[error] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
[error] at java.lang.Thread.run(Thread.java:745)
[error]
[error] Driver stacktrace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 6.0 failed 1 times, most recent failure: Lost task 0.0 in stage 6.0 (TID 8, localhost): scala.MatchError: [000961291-01,2005-06-21T19:45:00Z,584.9] (of class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema)
at edu.gatech.cse8803.main.Main$$anonfun$11.apply(Main.scala:168)
at edu.gatech.cse8803.main.Main$$anonfun$11.apply(Main.scala:168)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$33.apply(RDD.scala:1177)
at org.apache.spark.rdd.RDD$$anonfun$33.apply(RDD.scala:1177)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1498)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1498)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1204)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1193)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1192)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
[trace] Stack trace suppressed: run last compile:run for the full output.
16/10/09 14:23:32 ERROR ContextCleaner: Error in cleaning thread
java.lang.InterruptedException
at java.lang.Object.wait(Native Method)
at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:135)
at org.apache.spark.ContextCleaner$$anonfun$org$apache$spark$ContextCleaner$$keepCleaning$1.apply$mcV$sp(ContextCleaner.scala:146)
at org.apache.spark.ContextCleaner$$anonfun$org$apache$spark$ContextCleaner$$keepCleaning$1.apply(ContextCleaner.scala:144)
at org.apache.spark.ContextCleaner$$anonfun$org$apache$spark$ContextCleaner$$keepCleaning$1.apply(ContextCleaner.scala:144)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1618)
at org.apache.spark.ContextCleaner.org$apache$spark$ContextCleaner$$keepCleaning(ContextCleaner.scala:143)
at org.apache.spark.ContextCleaner$$anon$3.run(ContextCleaner.scala:65)
16/10/09 14:23:32 ERROR Utils: Uncaught exception in thread SparkListenerBus
java.lang.InterruptedException
at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedInterruptibly(AbstractQueuedSynchronizer.java:996)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireSharedInterruptibly(AbstractQueuedSynchronizer.java:1303)
at java.util.concurrent.Semaphore.acquire(Semaphore.java:317)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1.apply$mcV$sp(AsynchronousListenerBus.scala:62)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1.apply(AsynchronousListenerBus.scala:61)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1.apply(AsynchronousListenerBus.scala:61)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1618)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1.run(AsynchronousListenerBus.scala:60)
java.lang.RuntimeException: Nonzero exit code: 1
at scala.sys.package$.error(package.scala:27)
[trace] Stack trace suppressed: run last compile:run for the full output.
[error] (compile:run) Nonzero exit code: 1
[error] Total time: 13 s, completed Oct 9, 2016 2:23:32 PM
java.util.Date is not data type that can be stored in a DataFrame. From the looks of it date is a Timestamp String. If I am right case class should be defined as:
case class Diagnostic(patientID: String, date: java.sql.Timestamp, code: String)
you should replace pattern:
case Row(patientID: String, date: java.util.Date, code: String)
with:
case Row(patientID: String, date: java.sql.Timestamp, code: String)
and cast date to timestamp:
res1.select($"patientID", $"date".cast("timestamp"), $"code")
Finally you should use rdd method before mapping for the forward compatibility:
res1.select($"patientID", $"date".cast("timestamp"), $"code").rdd.map {
...
}
In general I would recommend using as method:
res1.as[Diagnostic].rdd