We have a big project with multiple tests suites and every test suite has in average 3 tests.
For our unit tests, we use the Spark Standalone, and so no Yarn as a resource manager.
Every test suite :
initiliazes spark session :
implicit val spark = SparkSession
.builder()
.config(sparkConf)
.getOrCreate()
extends BeforeAndAfterAll:
class MyTestsSpec extends WordSpec
with Matchers
with BeforeAndAfterAll {
...
}
and redefine afterAll :
override def afterAll: Unit = {
try {
spark.stop()
} finally {
super.afterAll
}
}
Our solution has an CI job in Jenkins, and the Jenkins job started to be so often Unstable due to tests that fail because of the following Error :
Message d'erreur
Job 9 cancelled because SparkContext was shut down
Pile d'exécution
org.apache.spark.SparkException: Job 9 cancelled because SparkContext was shut down
at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:820)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:818)
at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:818)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1732)
at org.apache.spark.util.EventLoop.stop(EventLoop.scala:83)
at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1651)
at org.apache.spark.SparkContext$$anonfun$stop$8.apply$mcV$sp(SparkContext.scala:1921)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1317)
at org.apache.spark.SparkContext.stop(SparkContext.scala:1920)
at org.apache.spark.SparkContext$$anonfun$2.apply$mcV$sp(SparkContext.scala:581)
at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1954)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:2853)
at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2390)
at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2390)
at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:2390)
// some business classes
at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85)
at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
at org.scalatest.Transformer.apply(Transformer.scala:22)
at org.scalatest.Transformer.apply(Transformer.scala:20)
at org.scalatest.WordSpecLike$$anon$1.apply(WordSpecLike.scala:1078)
at org.scalatest.TestSuite$class.withFixture(TestSuite.scala:196)
at org.scalatest.WordSpec.withFixture(WordSpec.scala:1881)
at org.scalatest.WordSpecLike$class.invokeWithFixture$1(WordSpecLike.scala:1075)
at org.scalatest.WordSpecLike$$anonfun$runTest$1.apply(WordSpecLike.scala:1088)
at org.scalatest.WordSpecLike$$anonfun$runTest$1.apply(WordSpecLike.scala:1088)
at org.scalatest.SuperEngine.runTestImpl(Engine.scala:289)
at org.scalatest.WordSpecLike$class.runTest(WordSpecLike.scala:1088)
at org.scalatest.WordSpec.runTest(WordSpec.scala:1881)
at org.scalatest.WordSpecLike$$anonfun$runTests$1.apply(WordSpecLike.scala:1147)
at org.scalatest.WordSpecLike$$anonfun$runTests$1.apply(WordSpecLike.scala:1147)
at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:396)
at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:384)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:384)
at org.scalatest.SuperEngine.org$scalatest$SuperEngine$$runTestsInBranch(Engine.scala:373)
at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:410)
at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:384)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:384)
at org.scalatest.SuperEngine.org$scalatest$SuperEngine$$runTestsInBranch(Engine.scala:379)
at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:461)
at org.scalatest.WordSpecLike$class.runTests(WordSpecLike.scala:1147)
at org.scalatest.WordSpec.runTests(WordSpec.scala:1881)
at org.scalatest.Suite$class.run(Suite.scala:1147)
at org.scalatest.WordSpec.org$scalatest$WordSpecLike$$super$run(WordSpec.scala:1881)
at org.scalatest.WordSpecLike$$anonfun$run$1.apply(WordSpecLike.scala:1192)
at org.scalatest.WordSpecLike$$anonfun$run$1.apply(WordSpecLike.scala:1192)
at org.scalatest.SuperEngine.runImpl(Engine.scala:521)
at org.scalatest.WordSpecLike$class.run(WordSpecLike.scala:1192)
// some business classes
at org.scalatest.BeforeAndAfterAll$class.liftedTree1$1(BeforeAndAfterAll.scala:213)
at org.scalatest.BeforeAndAfterAll$class.run(BeforeAndAfterAll.scala:210)
// some business classes
at org.scalatest.tools.Framework.org$scalatest$tools$Framework$$runSuite(Framework.scala:314)
at org.scalatest.tools.Framework$ScalaTestTask.execute(Framework.scala:480)
at sbt.TestRunner.runTest$1(TestFramework.scala:106)
at sbt.TestRunner.run(TestFramework.scala:117)
at sbt.TestFramework$$anon$2$$anonfun$$lessinit$greater$1.$anonfun$apply$1(TestFramework.scala:262)
at sbt.TestFramework$.sbt$TestFramework$$withContextLoader(TestFramework.scala:233)
at sbt.TestFramework$$anon$2$$anonfun$$lessinit$greater$1.apply(TestFramework.scala:262)
at sbt.TestFramework$$anon$2$$anonfun$$lessinit$greater$1.apply(TestFramework.scala:262)
at sbt.TestFunction.apply(TestFramework.scala:271)
at sbt.Tests$.processRunnable$1(Tests.scala:307)
at sbt.Tests$.$anonfun$makeSerial$1(Tests.scala:313)
at sbt.std.Transform$$anon$3.$anonfun$apply$2(System.scala:46)
at sbt.std.Transform$$anon$4.work(System.scala:66)
at sbt.Execute.$anonfun$submit$2(Execute.scala:262)
at sbt.internal.util.ErrorHandling$.wideConvert(ErrorHandling.scala:16)
at sbt.Execute.work(Execute.scala:271)
at sbt.Execute.$anonfun$submit$1(Execute.scala:262)
at sbt.ConcurrentRestrictions$$anon$4.$anonfun$submitValid$1(ConcurrentRestrictions.scala:174)
at sbt.CompletionService$$anon$2.call(CompletionService.scala:36)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
And when we run the test alone it succeeds, with no problem.
I had a similar issue.
I had multiple tests using Spark and only the first suite worked. I was calling spark.close() in all of them. After removing this call from all suites it worked.
After investigating SparkSession code, my conclusion was that since it is only possible to have one SparkContext per JVM and the tests are run on the same JVM, when you stop it the first time, it becomes unusable for that "JVM session".
Related
I'm doing baby steps with spark and my exercise loads a JSON file into RDD, select a column, and then use the distinct to get unique values.
The column I'm filtering contains multiple values (CSV line) and has to be split.
val sqlContext = spark.sqlContext
import org.apache.spark.sql.hive.HiveContext
val hiveCtx = new HiveContext(sc)
import hiveCtx.implicits._
val bizDF = hiveCtx.jsonFile("/home/xpto/Documents/PersonalProjects/Yelp_P1/yelp_academic_dataset_business.json")
val catRdd = bizDF.select("categories").rdd.flatMap(row => (row.getString(0).split(",").map(_.trim))).distinct
When I run the statement "catRdd.take (10).foreach (println)" returns an exception:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 234.0 failed 1 times, most recent failure: Lost task 0.0 in stage 234.0 (TID 682, 192.168.0.122, executor driver):
java.lang.NullPointerException
at $anonfun$catRdd$1(<console>:39)
at $anonfun$catRdd$1$adapted(<console>:39)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
at org.apache.spark.rdd.RDD.$anonfun$take$1(RDD.scala:1423)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
at org.apache.spark.rdd.RDD.take(RDD.scala:1396)
... 48 elided
Caused by: java.lang.NullPointerException
at $anonfun$catRdd$1(<console>:39)
at $anonfun$catRdd$1$adapted(<console>:39)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
... 3 more
The spark version I'm running is 2.12-3.0.1
I found a solution that fits for my requirement:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
val v1 = bizDF.withColumn("categories", split(col("categories"), ","))
.select(col("categories")(0).as("description"))
.distinct
.coalesce(1)
.orderBy(asc("description"))
val windowSpec = Window.orderBy("description")
val v2 = v1.withColumn("id",row_number.over(windowSpec))
val v3 = v2.select("id","description")
Your json file has multiple lines and is not supported by HiveCtx. Try this instead using spark session:
val bizDF = spark.read.format("json").option("multiline", "true").load("/home/xpto/Documents/PersonalProjects/Yelp_P1/yelp_academic_dataset_business.json")
val catRdd = bizDF.select("categories").rdd.flatMap(row => (row.getString(0).split(",").map(_.trim))).distinct
catRdd.take(10).foreach(println)
I get below error when I run the unit tests using sbt console twice or more. I do not get any error when I run for the first time. If I get out from Sbt (Ctrl+D) and go back to Sbt, there is no issue with running the tests for the first time.Using IntelliJ.
My test suites contain many test cases like below
val timeOutSpan = 5
val intervalSpan = 500
implicit val defaultPatience =
PatienceConfig(timeout = Span(timeOutSpan, Seconds), interval = Span(intervalSpan, Millis))
it should "get houses" in {
val index: Future[Result] = destinations.index
.apply(FakeRequest(GET, "/houses"))
whenReady(index){ index =>
index.header.status shouldBe 200
}
}
Getting below error
java.lang.ExceptionInInitializerError
at play.api.http.HttpConfiguration$.current(HttpConfiguration.scala:149)
at play.api.mvc.GlobalStateHttpConfiguration$.httpConfiguration(Http.scala:22)
at play.api.mvc.Session$.path(Http.scala:713)
at play.api.i18n.DefaultMessagesApi.setLang(Messages.scala:502)
at play.api.i18n.I18nSupport$ResultWithLang.withLang(I18nSupport.scala:49)
at controllers.BaseController.decorate(BaseController.scala:38)
at modules.home.controllers.Home$$anonfun$index$1$$anonfun$apply$1$$anonfun$apply$2$$anonfun$apply$3$$anonfun$apply$4$$anonfun$apply$5$$anonfun$apply$9$$anonfun$apply$10.apply(Home.scala:51)
at modules.home.controllers.Home$$anonfun$index$1$$anonfun$apply$1$$anonfun$apply$2$$anonfun$apply$3$$anonfun$apply$4$$anonfun$apply$5$$anonfun$apply$9$$anonfun$apply$10.apply(Home.scala:40)
at scala.util.Success$$anonfun$map$1.apply(Try.scala:237)
at scala.util.Try$.apply(Try.scala:192)
at scala.util.Success.map(Try.scala:237)
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:237)
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:237)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at scala.concurrent.impl.ExecutionContextImpl$AdaptedForkJoinTask.exec(ExecutionContextImpl.scala:121)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Caused by: java.lang.ClassCastException: org.apache.xerces.parsers.XIncludeAwareParserConfiguration cannot be cast to org.apache.xerces.xni.parser.XMLParserConfiguration
at org.apache.xerces.parsers.SAXParser.<init>(Unknown Source)
at org.apache.xerces.parsers.SAXParser.<init>(Unknown Source)
at org.apache.xerces.jaxp.SAXParserImpl$JAXPSAXParser.<init>(Unknown Source)
at org.apache.xerces.jaxp.SAXParserImpl.<init>(Unknown Source)
at org.apache.xerces.jaxp.SAXParserImpl.<init>(Unknown Source)
at org.apache.xerces.jaxp.SAXParserFactoryImpl.newSAXParserImpl(Unknown Source)
at org.apache.xerces.jaxp.SAXParserFactoryImpl.setFeature(Unknown Source)
at play.api.Play$.<init>(Play.scala:46)
at play.api.Play$.<clinit>(Play.scala)
... 19 more
Including OneAppPerSuite in my test suite solved the issue.
class ExampleSpec extends FlatSpec with OneAppPerSuite {
test cases go here
}
details about OneAppPerSuite is here
I am submitting a spark application to the Cluster by using the following command
/root/spark/bin/spark-submit --conf spark.driver.momory=10g --class com.knoldus.SampleApp /pathToJar/Application.jar
But what is happening is : Multiple apps are getting submitted and one is running and all others are waiting and then after sometime the code exits with an exception.
The Spark UI looks something like this :
After this the code exits with this error :
8.149.243): java.io.IOException: Failed to write statements to keyspace.tableName.
at com.datastax.spark.connector.writer.TableWriter$$anonfun$write$1.apply(TableWriter.scala:167)
at com.datastax.spark.connector.writer.TableWriter$$anonfun$write$1.apply(TableWriter.scala:135)
at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$withSessionDo$1.apply(CassandraConnector.scala:111)
at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$withSessionDo$1.apply(CassandraConnector.scala:110)
at com.datastax.spark.connector.cql.CassandraConnector.closeResourceAfterUse(CassandraConnector.scala:140)
at com.datastax.spark.connector.cql.CassandraConnector.withSessionDo(CassandraConnector.scala:110)
at com.datastax.spark.connector.writer.TableWriter.write(TableWriter.scala:135)
at com.datastax.spark.connector.RDDFunctions$$anonfun$saveToCassandra$1.apply(RDDFunctions.scala:37)
at com.datastax.spark.connector.RDDFunctions$$anonfun$saveToCassandra$1.apply(RDDFunctions.scala:37)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/12/14 06:26:28 WARN TaskSetManager: Lost task 0.1 in stage 2.0 (TID 561, 10.178.149.243): java.util.NoSuchElementException: None.get
at scala.None$.get(Option.scala:347)
at scala.None$.get(Option.scala:345)
at org.apache.spark.storage.BlockInfoManager.releaseAllLocksForTask(BlockInfoManager.scala:343)
at org.apache.spark.storage.BlockManager.releaseAllLocksForTask(BlockManager.scala:644)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:281)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/12/14 06:26:28 ERROR TaskSetManager: Task 0 in stage 2.0 failed 4 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 4 times, most recent failure: Lost task 0.3 in stage 2.0 (TID 563, 10.178.149.225): java.util.NoSuchElementException: None.get
at scala.None$.get(Option.scala:347)
at scala.None$.get(Option.scala:345)
at org.apache.spark.storage.BlockInfoManager.releaseAllLocksForTask(BlockInfoManager.scala:343)
at org.apache.spark.storage.BlockManager.releaseAllLocksForTask(BlockManager.scala:644)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:281)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1450)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1438)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1437)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1437)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1659)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1618)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1607)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1871)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1884)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1904)
at com.datastax.spark.connector.RDDFunctions.saveToCassandra(RDDFunctions.scala:37)
at com.knoldus.xml.RNF2Driver$.main(RNFIngestPipeline.scala:56)
at com.knoldus.xml.RNF2Driver.main(RNFIngestPipeline.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:729)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:185)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:210)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:124)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.util.NoSuchElementException: None.get
at scala.None$.get(Option.scala:347)
at scala.None$.get(Option.scala:345)
at org.apache.spark.storage.BlockInfoManager.releaseAllLocksForTask(BlockInfoManager.scala:343)
at org.apache.spark.storage.BlockManager.releaseAllLocksForTask(BlockManager.scala:644)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:281)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
My Spark-conf is :
private val conf = new SparkConf()
.setAppName("SampleApp")
.setMaster(sparkClusterIP)
.set("spark.sql.shuffle.partitions", "8")
.set("spark.cassandra.connection.host", cassandraIP)
.set("spark.sql.crossJoin.enabled", "true")
.set("spark.kryoserializer.buffer.max", "640m")
.set("spark.executor.memory", "10g")
.set("spark.executor.cores", "3")
.set("spark.cassandra.output.batch.size.rows", "10")
.set("spark.cassandra.output.batch.size.bytes", "20480")
This is my sample code. Can anyone please let me know what the problem is :
val cassandraIDs = sc.cassandraTable[A](keySpace,tableName).map(_.filename.split("/").last.split("\\.")(0).toLong).collect()
val broadCastList = sc.broadcast(cassandraIDs)
val files = sc.wholeTextFiles(hdFSIP).map(_._1).filter { path =>
val listOfCassandraID = broadCastList.value
!listOfCassandraID.contains(path.split("/").last.split("\\.")(0).toLong)
}.take(100)
import sqlContext.implicits._
val fileNameRDD = sc.parallelize(files)
val cassandraRdd = fileNameRDD.map { path =>
...
//do some task
}.toDF(columnNames)
cassandraRdd.saveToCassandra(keySpace,tablename)
println(s"Completed Processing of $numOfDocs in ${System.currentTimeMillis() - start} milliseconds")
sc.stop()
Why multiple apps are getting submitted?
Because you are submitting multiple spark jobs in your driver code. Each one of the below statements will trigger a new job in the current spark context,
val cassandraIDs = sc.cassandraTable[A]....toLong).collect()
sc.wholeTextFiles
sc.parallelize(files)
cassandraRdd.saveToCassandra(keySpace,tablename)
.toDF(columnNames)
sc.broadcast
(Not sure about the last two)
I haven't worked with cassandra via spark much. But, above code doesn't utilize the power of spark.
You should pipeline the tasks as much as possible, so that spark can plan and run the tasks in distributed way.
Example:
In your code, you are creating files by calling take(100) and then creating an RDD called fileNameRDD using sc.parallelize(files).
This triggers two spark jobs, one to take 100 items, and one to create a new RDD using those 100 items.
Instead you should combine these two tasks so that they can be pipelined by spark.
sc.wholeTextFiles(hdFSIP).map(_._1).filter { path =>
val listOfCassandraID = broadCastList.value
!listOfCassandraID.contains(path.split("/").last.split("\\.")(0).toLong)
}.map { path => /*<---Combine the two tasks like this*/
...
//do some task
}.toDF(columnNames)
Note: I have skipped take(100) part, but you should be able to do that using filters
Why are you getting NoSuchElementException: None.get error
This is more likely because of scala version mismatch between your code and the scala version that is used to build the spark.
Since migrating to Spark 2.0.0, some SparkContext and SparkSql functions(e.g. textFile, parallelize, read.load) no longer work
in the scala unit test environment although they do work outside of the test environment in my Main class.
When the code attempts to execute each of these functions in the test environment, a java.lang.ExceptionInInitializerError occurs.
Note that the same problem exists when using Spark 2.0.0 whether I use SparkSession or the old SparkContext and SparkSql.
Also, it is important noting that the same functions do work in the test environment when I use Spark 1.6.
Any ideas for a solution to this problem? Both the test code and the stack trace (associated with the execution of
spark.sparkContext.textFile) with the error are below:
import org.apache.spark.sql.SparkSession
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfterAll, FunSuite}
#RunWith(classOf[JUnitRunner])
class TestSparkFunctions extends FunSuite with BeforeAndAfterAll {
val master = "local[2]"
val appName = "test"
val spark = SparkSession
.builder()
.appName(appName)
.master(master)
.config("spark.sql.warehouse.dir", System.getProperty("user.dir"))
.getOrCreate()
val textFile = spark.sparkContext.textFile("people.txt")
test("test spark functions") {
// val peopleDf = spark.read.format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat")
// .option("header", "true")
// .option("delimiter", "\\t")
// .option("mode", "PERMISSIVE")
// .option("inferSchema", "true")
// .load(getClass.getResource("/people.txt").getPath)
// val lines = Source.fromFile(getClass.getResource("/people.txt").getPath).getLines.toSeq
// val linesRdd: RDD[String] = spark.sparkContext.parallelize(lines)
}
override def afterAll() = {
if (spark != null) spark.stop()
}
}
Stack Trace Error:
java.lang.ExceptionInInitializerError
at org.apache.spark.SparkContext.withScope(SparkContext.scala:682)
at org.apache.spark.SparkContext.textFile(SparkContext.scala:800)
at com.corelogic.TestSparkFunctions.<init>(TestSparkFunctions.scala:19)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
at java.lang.Class.newInstance(Class.java:442)
at org.scalatest.tools.Framework$ScalaTestTask.execute(Framework.scala:468)
at sbt.TestRunner.runTest$1(TestFramework.scala:76)
at sbt.TestRunner.run(TestFramework.scala:85)
at sbt.TestFramework$$anon$2$$anonfun$$init$$1$$anonfun$apply$8.apply(TestFramework.scala:202)
at sbt.TestFramework$$anon$2$$anonfun$$init$$1$$anonfun$apply$8.apply(TestFramework.scala:202)
at sbt.TestFramework$.sbt$TestFramework$$withContextLoader(TestFramework.scala:185)
at sbt.TestFramework$$anon$2$$anonfun$$init$$1.apply(TestFramework.scala:202)
at sbt.TestFramework$$anon$2$$anonfun$$init$$1.apply(TestFramework.scala:202)
at sbt.TestFunction.apply(TestFramework.scala:207)
at sbt.Tests$$anonfun$9.apply(Tests.scala:216)
at sbt.Tests$$anonfun$9.apply(Tests.scala:216)
at sbt.std.Transform$$anon$3$$anonfun$apply$2.apply(System.scala:44)
at sbt.std.Transform$$anon$3$$anonfun$apply$2.apply(System.scala:44)
at sbt.std.Transform$$anon$4.work(System.scala:63)
at sbt.Execute$$anonfun$submit$1$$anonfun$apply$1.apply(Execute.scala:226)
at sbt.Execute$$anonfun$submit$1$$anonfun$apply$1.apply(Execute.scala:226)
at sbt.ErrorHandling$.wideConvert(ErrorHandling.scala:17)
at sbt.Execute.work(Execute.scala:235)
at sbt.Execute$$anonfun$submit$1.apply(Execute.scala:226)
at sbt.Execute$$anonfun$submit$1.apply(Execute.scala:226)
at sbt.ConcurrentRestrictions$$anon$4$$anonfun$1.apply(ConcurrentRestrictions.scala:159)
at sbt.CompletionService$$anon$2.call(CompletionService.scala:28)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: com.fasterxml.jackson.databind.JsonMappingException: Jackson version is too old 2.4.4
at com.fasterxml.jackson.module.scala.JacksonModule$class.setupModule(JacksonModule.scala:56)
at com.fasterxml.jackson.module.scala.DefaultScalaModule.setupModule(DefaultScalaModule.scala:19)
at com.fasterxml.jackson.databind.ObjectMapper.registerModule(ObjectMapper.java:549)
at org.apache.spark.rdd.RDDOperationScope$.<init>(RDDOperationScope.scala:82)
at org.apache.spark.rdd.RDDOperationScope$.<clinit>(RDDOperationScope.scala)
at org.apache.spark.SparkContext.withScope(SparkContext.scala:682)
at org.apache.spark.SparkContext.textFile(SparkContext.scala:800)
at com.corelogic.TestSparkFunctions.<init>(TestSparkFunctions.scala:19)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
at java.lang.Class.newInstance(Class.java:442)
at org.scalatest.tools.Framework$ScalaTestTask.execute(Framework.scala:468)
at sbt.TestRunner.runTest$1(TestFramework.scala:76)
at sbt.TestRunner.run(TestFramework.scala:85)
at sbt.TestFramework$$anon$2$$anonfun$$init$$1$$anonfun$apply$8.apply(TestFramework.scala:202)
at sbt.TestFramework$$anon$2$$anonfun$$init$$1$$anonfun$apply$8.apply(TestFramework.scala:202)
at sbt.TestFramework$.sbt$TestFramework$$withContextLoader(TestFramework.scala:185)
at sbt.TestFramework$$anon$2$$anonfun$$init$$1.apply(TestFramework.scala:202)
at sbt.TestFramework$$anon$2$$anonfun$$init$$1.apply(TestFramework.scala:202)
at sbt.TestFunction.apply(TestFramework.scala:207)
at sbt.Tests$$anonfun$9.apply(Tests.scala:216)
at sbt.Tests$$anonfun$9.apply(Tests.scala:216)
at sbt.std.Transform$$anon$3$$anonfun$apply$2.apply(System.scala:44)
at sbt.std.Transform$$anon$3$$anonfun$apply$2.apply(System.scala:44)
at sbt.std.Transform$$anon$4.work(System.scala:63)
at sbt.Execute$$anonfun$submit$1$$anonfun$apply$1.apply(Execute.scala:226)
at sbt.Execute$$anonfun$submit$1$$anonfun$apply$1.apply(Execute.scala:226)
at sbt.ErrorHandling$.wideConvert(ErrorHandling.scala:17)
at sbt.Execute.work(Execute.scala:235)
at sbt.Execute$$anonfun$submit$1.apply(Execute.scala:226)
at sbt.Execute$$anonfun$submit$1.apply(Execute.scala:226)
at sbt.ConcurrentRestrictions$$anon$4$$anonfun$1.apply(ConcurrentRestrictions.scala:159)
at sbt.CompletionService$$anon$2.call(CompletionService.scala:28)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
I ran into same issue. Underlying issue here is:
Jackson version is too old 2.4.4
Resolved this by using newer version "2.6.5" that Spark 2.x supports. "com.fasterxml.jackson.core" % "jackson-databind" % "2.6.5"
ref:spark pom.xml
I tried to run a word count spark app every time I get this error please help, following is the wordcount.scala file and after sbt package I ran the spark-submit command
package main
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
object WordCount {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Word Count")
val sc = new SparkContext(conf)
val textfile = sc.textFile("file:///usr/local/spark/README.md")
val tokenizeddata = textfile.flatMap(line => line.split(" "))
val countprep = tokenizeddata.map(word => (word,1))
val counts = countprep.reduceByKey((accumvalue,newvalue)=>(accumvalue+newvalue))
val sortedcount = counts.sortBy(kvpair=>kvpair._2,false)
sortedcount.saveAsTextFile("file:///usr/local/wordcount")
}
}
I ran the next command.
bin/spark-submit --class "main.WordCount" --master "local[*]" "/home/hadoop/SparkApps/target/scala-2.10/word-count_2.10-1.0.jar"
Spark assembly has been built with Hive, including Datanucleus jars on
classpath Java HotSpot(TM) 64-Bit Server VM warning:
ignoring option MaxPermSize=128m; support was removed in 8.0 15/11/28 07:38:51 ERROR Executor: Exception in task 1.0 in stage 1.0
(TID 1) java.net.NoRouteToHostException: No route to host
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:589)
at sun.net.NetworkClient.doConnect(NetworkClient.java:175)
at sun.net.www.http.HttpClient.openServer(HttpClient.java:432)
at sun.net.www.http.HttpClient.openServer(HttpClient.java:527)
at sun.net.www.http.HttpClient.(HttpClient.java:211)
at sun.net.www.http.HttpClient.New(HttpClient.java:308)
at sun.net.www.http.HttpClient.New(HttpClient.java:326)
at sun.net.www.protocol.http.HttpURLConnection.getNewHttpClient(HttpURLConnection.java:1169)
at sun.net.www.protocol.http.HttpURLConnection.plainConnect0(HttpURLConnection.java:1105)
at sun.net.www.protocol.http.HttpURLConnection.plainConnect(HttpURLConnection.java:999)
at sun.net.www.protocol.http.HttpURLConnection.connect(HttpURLConnection.java:933)
at org.apache.spark.util.Utils$.fetchFile(Utils.scala:375)
at org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$updateDependencies$6.apply(Executor.scala:325)
at org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$updateDependencies$6.apply(Executor.scala:323)
at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:772)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:226)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:39)
at scala.collection.mutable.HashMap.foreach(HashMap.scala:98)
at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:771)
at org.apache.spark.executor.Executor.org$apache$spark$executor$Executor$$updateDependencies(Executor.scala:323)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:158)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Maybe you should add .setMaster("local")