I'm using spark-cassandra-connector-assembly-2.0.0-M1-13-gd0df0.jar with Spark 2.0.0, starting spark-shell with:
bin/spark-shell --jars jars/spark-cassandra-connector-assembly-2.0.0-M1-13-gd0df0.jar
When I try to retrieve data from Cassandra, I get the following error:
scala> sc.stop
scala> import com.datastax.spark.connector._, org.apache.spark.SparkContext, org.apache.spark.SparkContext._, org.apache.spark.SparkConf
import com.datastax.spark.connector._
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
scala> val conf = new SparkConf(true).set("spark.cassandra.connection.host", "localhost")
conf: org.apache.spark.SparkConf = org.apache.spark.SparkConf@a9a8ec3
scala> val sc = new SparkContext(conf)
sc: org.apache.spark.SparkContext = org.apache.spark.SparkContext@6451a288
scala> val test_spark_rdd = sc.cassandraTable("test_spark", "test")
test_spark_rdd: com.datastax.spark.connector.rdd.CassandraTableScanRDD[com.datastax.spark.connector.CassandraRow] = CassandraTableScanRDD[0] at RDD at CassandraRDD.scala:15
scala> test_spark_rdd.first
16/08/16 15:02:19 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.NoClassDefFoundError: scala/collection/GenTraversableOnce$class
at com.datastax.spark.connector.util.CountingIterator.<init>(CountingIterator.scala:4)
at com.datastax.spark.connector.rdd.CassandraTableScanRDD.compute(CassandraTableScanRDD.scala:336)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: scala.collection.GenTraversableOnce$class
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 10 more
16/08/16 15:02:19 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-0,5,main]
java.lang.NoClassDefFoundError: scala/collection/GenTraversableOnce$class
at com.datastax.spark.connector.util.CountingIterator.<init>(CountingIterator.scala:4)
at com.datastax.spark.connector.rdd.CassandraTableScanRDD.compute(CassandraTableScanRDD.scala:336)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: scala.collection.GenTraversableOnce$class
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 10 more
16/08/16 15:02:19 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.NoClassDefFoundError: scala/collection/GenTraversableOnce$class
at com.datastax.spark.connector.util.CountingIterator.<init>(CountingIterator.scala:4)
at com.datastax.spark.connector.rdd.CassandraTableScanRDD.compute(CassandraTableScanRDD.scala:336)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: scala.collection.GenTraversableOnce$class
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 10 more
16/08/16 15:02:19 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
16/08/16 15:02:19 ERROR LiveListenerBus: SparkListenerBus has already stopped! Dropping event SparkListenerStageCompleted(org.apache.spark.scheduler.StageInfo@12164ec5)
16/08/16 15:02:19 ERROR LiveListenerBus: SparkListenerBus has already stopped! Dropping event SparkListenerJobEnd(0,1471384939706,JobFailed(org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.NoClassDefFoundError: scala/collection/GenTraversableOnce$class
at com.datastax.spark.connector.util.CountingIterator.<init>(CountingIterator.scala:4)
at com.datastax.spark.connector.rdd.CassandraTableScanRDD.compute(CassandraTableScanRDD.scala:336)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: scala.collection.GenTraversableOnce$class
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 10 more
Driver stacktrace:))
After this error message, the Scala prompt closes automatically.
Cassandra version: 3.0.8
Scala version: 2.11.8
In cassandra.yaml:
native_transport_port: 9042
rpc_address: localhost
rpc_port: 9160
Any help would be appreciated.
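Side note on the symptom: a NoClassDefFoundError for a scala.collection...$class name usually points to a Scala binary-version mismatch, so it is worth checking that the connector assembly was built for the same Scala 2.11 that Spark 2.0.0 ships with. Separately, the LiveListenerBus "already stopped" errors above come from the shell's original context being stopped; a sketch of the same setup without sc.stop, passing the host through spark-shell's standard --conf flag:

bin/spark-shell --jars jars/spark-cassandra-connector-assembly-2.0.0-M1-13-gd0df0.jar \
  --conf spark.cassandra.connection.host=localhost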
I get an org.apache.spark.SparkException: A master URL must be set in your configuration error.
It occurs on an executor, not on the driver.
The ========conf======== line that I log after starting the SparkSession prints correctly.
But when I partition the Dataset and run an action, the job fails because an executor tries to create a SparkSession.
(If I call collect() instead of coalesce(3), the job succeeds.)
I can't find any solution.
Below is my Spark app:
val builder: SparkSession.Builder = SparkSession.builder().enableHiveSupport()
_spark = builder.getOrCreate()
println("================START================")
println(s"=========profile:${ProfileUtils.getEnvironment()}============")
println("========conf========")
val odsBundles = findBundles()
odsBundles.coalesce(3).foreach((row: Row) => {
  val bundleLifecycleIdKey = s"${row.getAs[Long]("bundleId")}:${mapBundleType(row.getAs[String]("shippingDeliveryType"))}"
  val newBundleLifecycleId: UUID = UUID.nameUUIDFromBytes(bundleLifecycleIdKey.getBytes())
  println(s"###### find: ${newBundleLifecycleId.toString}")
})
println("================END================")
Driver error
22/02/18 16:40:33 INFO cluster.YarnClusterScheduler: Cancelling stage 4
22/02/18 16:40:33 INFO scheduler.DAGScheduler: ResultStage 4 (foreach at BundleMappingCheckApp.scala:73) failed in 9.881 s due to Job aborted due to stage failure: Task 0 in stage 4.0 failed 4 times, most recent failure: Lost task 0.3 in stage 4.0 (TID 2595, ip-10-211-213-216.ap-northeast-2.compute.internal, executor 2): java.lang.ExceptionInInitializerError
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:74)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:73)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: A master URL must be set in your configuration
at org.apache.spark.SparkContext.<init>(SparkContext.scala:367)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2493)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:934)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:925)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:925)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.initializeContext(BundleMappingCheckApp.scala:29)
at com.coupang.flow.validation.base.spark.app.BaseSparkApp$class.$init$(BaseSparkApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<init>(BundleMappingCheckApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<clinit>(BundleMappingCheckApp.scala:48)
... 14 more
Driver stacktrace:
22/02/18 16:40:33 INFO scheduler.DAGScheduler: Job 0 failed: foreach at BundleMappingCheckApp.scala:73, took 94.454855 s
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0 failed 4 times, most recent failure: Lost task 0.3 in stage 4.0 (TID 2595, ip-10-211-213-216.ap-northeast-2.compute.internal, executor 2): java.lang.ExceptionInInitializerError
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:74)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:73)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: A master URL must be set in your configuration
at org.apache.spark.SparkContext.<init>(SparkContext.scala:367)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2493)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:934)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:925)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:925)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.initializeContext(BundleMappingCheckApp.scala:29)
at com.coupang.flow.validation.base.spark.app.BaseSparkApp$class.$init$(BaseSparkApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<init>(BundleMappingCheckApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<clinit>(BundleMappingCheckApp.scala:48)
... 14 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1803)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1791)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1790)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1790)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:871)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2024)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1973)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1962)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:682)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:927)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:925)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.foreach(RDD.scala:925)
at org.apache.spark.sql.Dataset$$anonfun$foreach$1.apply$mcV$sp(Dataset.scala:2661)
at org.apache.spark.sql.Dataset$$anonfun$foreach$1.apply(Dataset.scala:2661)
at org.apache.spark.sql.Dataset$$anonfun$foreach$1.apply(Dataset.scala:2661)
at org.apache.spark.sql.Dataset$$anonfun$withNewRDDExecutionId$1.apply(Dataset.scala:3244)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.Dataset.withNewRDDExecutionId(Dataset.scala:3240)
at org.apache.spark.sql.Dataset.foreach(Dataset.scala:2660)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.runApp(BundleMappingCheckApp.scala:73)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.main(BundleMappingCheckApp.scala:51)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp.main(BundleMappingCheckApp.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$4.run(ApplicationMaster.scala:721)
Caused by: java.lang.ExceptionInInitializerError
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:74)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:73)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: A master URL must be set in your configuration
at org.apache.spark.SparkContext.<init>(SparkContext.scala:367)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2493)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:934)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:925)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:925)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.initializeContext(BundleMappingCheckApp.scala:29)
at com.coupang.flow.validation.base.spark.app.BaseSparkApp$class.$init$(BaseSparkApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<init>(BundleMappingCheckApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<clinit>(BundleMappingCheckApp.scala:48)
... 14 more
22/02/18 16:40:33 ERROR yarn.ApplicationMaster: User class threw exception: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0 failed 4 times, most recent failure: Lost task 0.3 in stage 4.0 (TID 2595, ip-10-211-213-216.ap-northeast-2.compute.internal, executor 2): java.lang.ExceptionInInitializerError
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:74)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:73)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: A master URL must be set in your configuration
at org.apache.spark.SparkContext.<init>(SparkContext.scala:367)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2493)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:934)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:925)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:925)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.initializeContext(BundleMappingCheckApp.scala:29)
at com.coupang.flow.validation.base.spark.app.BaseSparkApp$class.$init$(BaseSparkApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<init>(BundleMappingCheckApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<clinit>(BundleMappingCheckApp.scala:48)
... 14 more
Executor error
22/02/18 16:40:33 ERROR spark.SparkContext: Error initializing SparkContext.
org.apache.spark.SparkException: A master URL must be set in your configuration
at org.apache.spark.SparkContext.<init>(SparkContext.scala:367)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2493)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:934)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:925)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:925)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.initializeContext(BundleMappingCheckApp.scala:29)
at com.coupang.flow.validation.base.spark.app.BaseSparkApp$class.$init$(BaseSparkApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<init>(BundleMappingCheckApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<clinit>(BundleMappingCheckApp.scala:48)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:74)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:73)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
22/02/18 16:40:33 ERROR util.Utils: Uncaught exception in thread Executor task launch worker for task 2595
java.lang.NullPointerException
at org.apache.spark.SparkContext.org$apache$spark$SparkContext$$postApplicationEnd(SparkContext.scala:2389)
at org.apache.spark.SparkContext$$anonfun$stop$1.apply$mcV$sp(SparkContext.scala:1904)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1361)
at org.apache.spark.SparkContext.stop(SparkContext.scala:1903)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:579)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2493)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:934)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:925)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:925)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.initializeContext(BundleMappingCheckApp.scala:29)
at com.coupang.flow.validation.base.spark.app.BaseSparkApp$class.$init$(BaseSparkApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<init>(BundleMappingCheckApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<clinit>(BundleMappingCheckApp.scala:48)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:74)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:73)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
22/02/18 16:40:33 INFO spark.SparkContext: Successfully stopped SparkContext
22/02/18 16:40:33 ERROR executor.Executor: Exception in task 0.3 in stage 4.0 (TID 2595)
java.lang.ExceptionInInitializerError
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:74)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$$anonfun$runApp$1.apply(BundleMappingCheckApp.scala:73)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:927)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: A master URL must be set in your configuration
at org.apache.spark.SparkContext.<init>(SparkContext.scala:367)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2493)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:934)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:925)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:925)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.initializeContext(BundleMappingCheckApp.scala:29)
at com.coupang.flow.validation.base.spark.app.BaseSparkApp$class.$init$(BaseSparkApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<init>(BundleMappingCheckApp.scala:12)
at com.coupang.flow.validation.spark.app.bundle.BundleMappingCheckApp$.<clinit>(BundleMappingCheckApp.scala:48)
... 14 more
22/02/18 16:40:33 INFO executor.CoarseGrainedExecutorBackend: Driver commanded a shutdown
Sample rddDate row: [2016-08-01,"pm",5,"ri"]
Some rows in this RDD have an incorrectly formatted date, so I can't count the rows; counting throws an IndexOutOfBoundsException.
The date type used is java.sql.Date, and the expected format for every row is "yyyy-mm-dd", e.g. 2016-08-01.
To verify the date format in the RDD, I implemented the code below:
val rddVerified: RDD[(Date, String, Long, String)] = rddDate.map { a =>
  val fmt = DateTimeFormat forPattern "yyyy-mm-dd"
  val input = a._1.toString
  try {
    val output = fmt parseDateTime input
  } catch {
    case e: Exception =>
      val v1 = new java.util.Date("2016-08-01")
      val v2 = new Date(v1.getTime)
      val ed: (Date, String, Int, String) = (v2, "p1", 2, "r1")
      Some(ed) // This gives a compile-time error
  } finally {
    Some(a._1, a._2, a._3, a._4)
  }
}
I am not able to handle the exception in the catch section. I want to either remove that row from the RDD or correct the format of the date in that row.
I want the returned RDD in this format:
RDD[(Date, String, Long, String)]
Thanks.
UPDATE
Exception when counting the DataFrame:
[error] o.a.s.e.Executor - Exception in task 0.0 in stage 7.0 (TID 7)
java.lang.IndexOutOfBoundsException: 1
at scala.collection.LinearSeqOptimized$class.apply(LinearSeqOptimized.scala:65)
at scala.collection.immutable.List.apply(List.scala:84)
at controllers.Spark$$anonfun$5.apply(Spark.scala:78)
at controllers.Spark$$anonfun$5.apply(Spark.scala:78)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:464)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
[warn] o.a.s.s.TaskSetManager - Lost task 0.0 in stage 7.0 (TID 7, localhost, executor driver): java.lang.IndexOutOfBoundsException: 1
at scala.collection.LinearSeqOptimized$class.apply(LinearSeqOptimized.scala:65)
at scala.collection.immutable.List.apply(List.scala:84)
at controllers.Spark$$anonfun$5.apply(Spark.scala:78)
at controllers.Spark$$anonfun$5.apply(Spark.scala:78)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:464)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
[error] o.a.s.s.TaskSetManager - Task 0 in stage 7.0 failed 1 times; aborting job
[warn] o.a.s.s.BlockManager - Putting block rdd_1_1 failed due to an exception
[warn] o.a.s.s.BlockManager - Block rdd_1_1 could not be removed as it was not found on disk or in memory
[warn] o.a.s.s.BlockManager - Putting block rdd_1_2 failed due to an exception
[warn] o.a.s.s.BlockManager - Block rdd_1_2 could not be removed as it was not found on disk or in memory
[warn] o.a.s.s.TaskSetManager - Lost task 1.0 in stage 7.0 (TID 8, localhost, executor driver): TaskKilled (unknown reason)
[warn] o.a.s.s.TaskSetManager - Lost task 2.0 in stage 7.0 (TID 9, localhost, executor driver): TaskKilled (unknown reason)
[error] application -
stage failure: Task 0 in stage 7.0 failed 1 times, most recent failure: Lost task 0.0 in stage 7.0 (TID 7, localhost, executor driver): java.lang.IndexOutOfBoundsException: 1
at scala.collection.LinearSeqOptimized$class.apply(LinearSeqOptimized.scala:65)
at scala.collection.immutable.List.apply(List.scala:84)
at controllers.Spark$$anonfun$5.apply(Spark.scala:78)
at controllers.Spark$$anonfun$5.apply(Spark.scala:78)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:464)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:]]
at play.api.http.HttpErrorHandlerExceptions$.throwableToUsefulException(HttpErrorHandler.scala:293)
at play.api.http.DefaultHttpErrorHandler.onServerError(HttpErrorHandler.scala:220)
at play.api.GlobalSettings$class.onError(GlobalSettings.scala:160)
at play.api.DefaultGlobal$.onError(GlobalSettings.scala:188)
at play.api.http.GlobalSettingsHttpErrorHandler.onServerError(HttpErrorHandler.scala:100)
at play.core.server.netty.PlayRequestHandler$$anonfun$2$$anonfun$apply$1.applyOrElse(PlayRequestHandler.scala:100)
at play.core.server.netty.PlayRequestHandler$$anonfun$2$$anonfun$apply$1.applyOrElse(PlayRequestHandler.scala:99)
at scala.concurrent.Future$$anonfun$recoverWith$1.apply(Future.scala:346)
at scala.concurrent.Future$$anonfun$recoverWith$1.apply(Future.scala:345)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:36)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 7.0 failed 1 times, most recent failure: Lost task 0.0 in stage 7.0 (TID 7, localhost, executor driver): java.lang.IndexOutOfBoundsException: 1
at scala.collection.LinearSeqOptimized$class.apply(LinearSeqOptimized.scala:65)
at scala.collection.immutable.List.apply(List.scala:84)
at controllers.Spark$$anonfun$5.apply(Spark.scala:78)
at controllers.Spark$$anonfun$5.apply(Spark.scala:78)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:464)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
Caused by: java.lang.IndexOutOfBoundsException: 1
at scala.collection.LinearSeqOptimized$class.apply(LinearSeqOptimized.scala:65)
at scala.collection.immutable.List.apply(List.scala:84)
at controllers.Spark$$anonfun$5.apply(Spark.scala:78)
at controllers.Spark$$anonfun$5.apply(Spark.scala:78)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:464)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
I would recommend using Dataset. It is not only faster and simpler, but also future-friendly:
import org.apache.spark.sql.functions.to_date

val spark: SparkSession = ???
import spark.implicits._

rddDate.toDF
  .withColumn("_1", to_date($"_1"))
  .na.drop(Seq("_1"))
  .as[(java.sql.Date, String, Long, String)]
Edit
But the problem is somewhere else in your code:
Caused by: java.lang.IndexOutOfBoundsException: 1
This suggests a mistake, probably in the parsing logic. You have to step back to the place where you call apply and add exception handling there.
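If you would rather stay on the RDD API, a minimal sketch of dropping the bad rows instead (assuming the first field can be rendered to a string and re-parsed; names are illustrative):

import java.sql.Date
import scala.util.Try

// flatMap with Try keeps only the rows whose date re-parses, dropping the rest:
val rddClean: org.apache.spark.rdd.RDD[(Date, String, Long, String)] =
  rddDate.flatMap { a =>
    Try(Date.valueOf(a._1.toString)) // Date.valueOf expects strict "yyyy-[m]m-[d]d"
      .toOption
      .map(d => (d, a._2, a._3, a._4))
  }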
I need to convert SQL rows, stored in a var named rows, to a vector. I use the steps below:
val df = sqlContext.sql("SELECT age,gender FROM test.test2")
val rows: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = df.rdd
val doubVals = rows.map{ row => row.getDouble(0) }
val vector = Vectors.dense{ doubVals.collect}
but it gives a lot of exceptions, like this ClassNotFoundException:
scala> val vector = Vectors.dense{ doubVals.collect}
WARN 2017-07-14 02:12:09,477 org.apache.spark.scheduler.TaskSetManager: Lost task 0.0 in stage 2.0 (TID 7, 192.168.110.200): java.lang.ClassNotFoundException: $line31.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1826)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1713)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2000)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:422)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
[Stage 2:> (0 + 3) / 7]
ERROR 2017-07-14 02:12:09,787 org.apache.spark.scheduler.TaskSetManager: Task 2 in stage 2.0 failed 4 times; aborting job
org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 2.0 failed 4 times, most recent failure: Lost task 2.3 in stage 2.0 (TID 21, 192.168.110.200): java.lang.ClassNotFoundException: $anonfun$1
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1826)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1713)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2000)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:422)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
It gives me a ClassNotFoundException. Could you please help me solve this error?
Look at the following steps (they work for me):
scala> val df = Seq(2.0,3.0,3.2,2.3,1.2).toDF("col")
df: org.apache.spark.sql.DataFrame = [col: double]
scala> import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vectors
scala> val rows = df.rdd
rows: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[3] at rdd at <console>:31
scala> val doubVals = rows.map{ row => row.getDouble(0) }
doubVals: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[4] at map at <console>:33
scala> val vector = Vectors.dense{ doubVals.collect}
vector: org.apache.spark.mllib.linalg.Vector = [2.0,3.0,3.2,2.3,1.2]
This should give you hints for debugging yours.
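If the root cause is the executor at 192.168.110.200 failing to load the REPL-generated $anonfun class, a hedged workaround may be to avoid shipping a REPL lambda at all and let the SQL layer extract the column (sketch assumes a Spark 2.x session named spark):

import org.apache.spark.mllib.linalg.Vectors
import spark.implicits._

// select + as[Double] relies on built-in encoders instead of a REPL-defined closure:
val vector = Vectors.dense(df.select($"age").as[Double].collect())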
I am submitting a Spark application to the cluster using the following command:
/root/spark/bin/spark-submit --conf spark.driver.memory=10g --class com.knoldus.SampleApp /pathToJar/Application.jar
But what happens is: multiple apps get submitted, only one runs while all the others wait, and after some time the code exits with an exception. (The Spark UI screenshot showed the extra queued applications.) The code then exits with this error:
8.149.243): java.io.IOException: Failed to write statements to keyspace.tableName.
at com.datastax.spark.connector.writer.TableWriter$$anonfun$write$1.apply(TableWriter.scala:167)
at com.datastax.spark.connector.writer.TableWriter$$anonfun$write$1.apply(TableWriter.scala:135)
at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$withSessionDo$1.apply(CassandraConnector.scala:111)
at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$withSessionDo$1.apply(CassandraConnector.scala:110)
at com.datastax.spark.connector.cql.CassandraConnector.closeResourceAfterUse(CassandraConnector.scala:140)
at com.datastax.spark.connector.cql.CassandraConnector.withSessionDo(CassandraConnector.scala:110)
at com.datastax.spark.connector.writer.TableWriter.write(TableWriter.scala:135)
at com.datastax.spark.connector.RDDFunctions$$anonfun$saveToCassandra$1.apply(RDDFunctions.scala:37)
at com.datastax.spark.connector.RDDFunctions$$anonfun$saveToCassandra$1.apply(RDDFunctions.scala:37)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/12/14 06:26:28 WARN TaskSetManager: Lost task 0.1 in stage 2.0 (TID 561, 10.178.149.243): java.util.NoSuchElementException: None.get
at scala.None$.get(Option.scala:347)
at scala.None$.get(Option.scala:345)
at org.apache.spark.storage.BlockInfoManager.releaseAllLocksForTask(BlockInfoManager.scala:343)
at org.apache.spark.storage.BlockManager.releaseAllLocksForTask(BlockManager.scala:644)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:281)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/12/14 06:26:28 ERROR TaskSetManager: Task 0 in stage 2.0 failed 4 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 4 times, most recent failure: Lost task 0.3 in stage 2.0 (TID 563, 10.178.149.225): java.util.NoSuchElementException: None.get
at scala.None$.get(Option.scala:347)
at scala.None$.get(Option.scala:345)
at org.apache.spark.storage.BlockInfoManager.releaseAllLocksForTask(BlockInfoManager.scala:343)
at org.apache.spark.storage.BlockManager.releaseAllLocksForTask(BlockManager.scala:644)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:281)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1450)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1438)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1437)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1437)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1659)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1618)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1607)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1871)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1884)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1904)
at com.datastax.spark.connector.RDDFunctions.saveToCassandra(RDDFunctions.scala:37)
at com.knoldus.xml.RNF2Driver$.main(RNFIngestPipeline.scala:56)
at com.knoldus.xml.RNF2Driver.main(RNFIngestPipeline.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:729)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:185)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:210)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:124)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.util.NoSuchElementException: None.get
at scala.None$.get(Option.scala:347)
at scala.None$.get(Option.scala:345)
at org.apache.spark.storage.BlockInfoManager.releaseAllLocksForTask(BlockInfoManager.scala:343)
at org.apache.spark.storage.BlockManager.releaseAllLocksForTask(BlockManager.scala:644)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:281)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
My Spark conf is:
private val conf = new SparkConf()
.setAppName("SampleApp")
.setMaster(sparkClusterIP)
.set("spark.sql.shuffle.partitions", "8")
.set("spark.cassandra.connection.host", cassandraIP)
.set("spark.sql.crossJoin.enabled", "true")
.set("spark.kryoserializer.buffer.max", "640m")
.set("spark.executor.memory", "10g")
.set("spark.executor.cores", "3")
.set("spark.cassandra.output.batch.size.rows", "10")
.set("spark.cassandra.output.batch.size.bytes", "20480")
This is my sample code. Can anyone please let me know what the problem is?
val cassandraIDs = sc.cassandraTable[A](keySpace,tableName).map(_.filename.split("/").last.split("\\.")(0).toLong).collect()
val broadCastList = sc.broadcast(cassandraIDs)
val files = sc.wholeTextFiles(hdFSIP).map(_._1).filter { path =>
  val listOfCassandraID = broadCastList.value
  !listOfCassandraID.contains(path.split("/").last.split("\\.")(0).toLong)
}.take(100)
import sqlContext.implicits._
val fileNameRDD = sc.parallelize(files)
val cassandraRdd = fileNameRDD.map { path =>
  ...
  //do some task
}.toDF(columnNames)
cassandraRdd.saveToCassandra(keySpace,tablename)
println(s"Completed Processing of $numOfDocs in ${System.currentTimeMillis() - start} milliseconds")
sc.stop()
Why are multiple apps getting submitted?
Because you are submitting multiple Spark jobs from your driver code. Each of the statements below triggers a new job in the current Spark context:
val cassandraIDs = sc.cassandraTable[A]....toLong).collect()
sc.wholeTextFiles
sc.parallelize(files)
cassandraRdd.saveToCassandra(keySpace,tablename)
.toDF(columnNames)
sc.broadcast
(Not sure about the last two.)
I haven't worked with Cassandra via Spark much, but the code above doesn't utilize the power of Spark.
You should pipeline the tasks as much as possible, so that Spark can plan and run them in a distributed way.
Example: in your code, you create files by calling take(100) and then create an RDD called fileNameRDD from it using sc.parallelize(files).
This triggers two Spark jobs: one to take 100 items, and one to create a new RDD from those 100 items.
Instead, you should combine these two tasks so that Spark can pipeline them:
sc.wholeTextFiles(hdFSIP).map(_._1).filter { path =>
val listOfCassandraID = broadCastList.value
!listOfCassandraID.contains(path.split("/").last.split("\\.")(0).toLong)
}.map { path => /*<---Combine the two tasks like this*/
...
//do some task
}.toDF(columnNames)
Note: I have skipped the take(100) part, but you should be able to do that using filters; see the sketch below.
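A hedged sketch of that cap without the take(100) + parallelize round-trip through the driver (filteredPaths stands for the filtered wholeTextFiles RDD above):

// zipWithIndex runs one extra job to size the partitions, but the
// selected paths never have to be collected to the driver:
val first100 = filteredPaths
  .zipWithIndex()
  .filter { case (_, idx) => idx < 100 }
  .keys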
Why are you getting the NoSuchElementException: None.get error?
This is most likely a Scala version mismatch between your code and the Scala version used to build Spark.
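For example, a minimal build.sbt sketch keeping the versions aligned (version numbers are illustrative):

// Must match the Scala binary version your Spark distribution was built with:
scalaVersion := "2.11.8"

// %% appends the Scala binary suffix (_2.11), keeping library and compiler in sync:
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.0.0" % "provided"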
I am following an example from the book Learning Spark: Lightning-Fast Big Data Analysis:
// custom partitioner
import java.net.URI
import org.apache.spark.Partitioner

class DomainNamePartitioner(numParts: Int) extends Partitioner {
  override def numPartitions: Int = numParts

  override def getPartition(key: Any): Int = {
    val domain = new URI(key.toString).getHost
    val code = (domain.hashCode % numPartitions)
    if (code < 0) {
      code + numPartitions
    } else {
      code
    }
  }

  override def equals(other: Any): Boolean = other match {
    case dnp: DomainNamePartitioner => dnp.numPartitions == numPartitions
    case _ => false
  }
}
The project is here: https://github.com/kindlychung/learnSpark
I got the following error when running the main function:
16/02/02 21:25:04 ERROR Executor: Exception in task 1.0 in stage 49.0 (TID 193)
java.lang.NullPointerException
at DomainNamePartitioner.getPartition(HasRun.scala:20)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:151)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/02/02 21:25:04 ERROR Executor: Exception in task 3.0 in stage 49.0 (TID 195)
java.lang.NullPointerException
at DomainNamePartitioner.getPartition(HasRun.scala:20)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:151)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/02/02 21:25:04 ERROR Executor: Exception in task 0.0 in stage 49.0 (TID 192)
java.lang.NullPointerException
at DomainNamePartitioner.getPartition(HasRun.scala:20)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:151)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/02/02 21:25:04 ERROR Executor: Exception in task 2.0 in stage 49.0 (TID 194)
java.lang.NullPointerException
at DomainNamePartitioner.getPartition(HasRun.scala:20)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:151)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/02/02 21:25:04 WARN TaskSetManager: Lost task 0.0 in stage 49.0 (TID 192, localhost): java.lang.NullPointerException
at DomainNamePartitioner.getPartition(HasRun.scala:20)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:151)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/02/02 21:25:04 ERROR TaskSetManager: Task 0 in stage 49.0 failed 1 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 49.0 failed 1 times, most recent failure: Lost task 0.0 in stage 49.0 (TID 192, localhost): java.lang.NullPointerException
at DomainNamePartitioner.getPartition(HasRun.scala:20)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:151)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
at Intro$$anon$14.run(Intro.scala:248)
at Intro$.main(Intro.scala:270)
at Intro.main(Intro.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)
Caused by: java.lang.NullPointerException
at DomainNamePartitioner.getPartition(HasRun.scala:20)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:151)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Process finished with exit code 1
Quite cryptic. Can anyone help? Thanks!
It looks like things are failing at this line:
val code = (domain.hashCode % numPartitions)
Likely because domain is null. This is probably happening because the keys you're passing into the URI constructor on the previous line are ill-formed.
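A minimal null-safe sketch of getPartition, assuming it is acceptable to route host-less keys to partition 0:

override def getPartition(key: Any): Int =
  Option(new URI(key.toString).getHost) match {
    case Some(domain) =>
      val code = domain.hashCode % numPartitions
      if (code < 0) code + numPartitions else code
    case None => 0 // getHost returns null for ill-formed or host-less URIs
  }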