SparkSql.scala
import org.apache.spark.sql.SparkSession
object SparkSql
{
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local[*]").appName("SparkSql1").getOrCreate()
//val data = spark.sparkContext.textFile(("C:\\Users\\blaskar\\Desktop\\Biswajit\\data\\sample.txt"))
val data = spark.read.format("csv").option("inferSchema","true").option("header","true").load("C:\\Users\\blaskar\\Desktop\\Biswajit\\data\\SampleCSVFile.csv")
val columnsRenamed = Seq("S.no", "Eldonbase", "platnium","Macintyre","count1","count2","count3","Nunavut","Storage","count4")
val original_data = data.toDF(columnsRenamed:_*).persist()
original_data.show(10)
}
}
build.sbt
name := "Spark-Sample1"
version := "1.0"
scalaVersion := "2.11.8"
libraryDependencies ++= Seq(
"org.apache.spark" % "spark-core_2.11" % "2.1.0" exclude("org.apache.hadoop", "hadoop-yarn-server-web-proxy"),
"org.apache.spark" % "spark-sql_2.11" % "2.1.0" exclude("org.apache.hadoop", "hadoop-yarn-server-web-proxy"))
VERSIONS OF SOFTWARE THE SYSTEM IS USING
spark: 2.1.0
scala: 2.11.8
I am using IntelliJ. The code is running fine but when I am submitting the job to my standalone cluster it is showing the below error:
SPARK-SUBMIT ERROR:
spark-submit --class "SparkSql" C:\Users\blaskar\Desktop\sbt_projects\Spark-Sample1
\out\artifacts\Spark_Sample1_jar\Spark-Sample1.jar
ERROR
java.lang.ClassNotFoundException: SparkSql
at java.net.URLClassLoader.findClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Unknown Source)
at org.apache.spark.util.Utils$.classForName(Utils.scala:229)
at
org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:695)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:187)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:212)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:126)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Related
I'm working on an integration project of Kafka and Spark and I'm trying to read a Kafka topic using Spark 2.4.5, Scala 2.12.11 and Kafka 2.5.0.
My sbt file is:
name := "Test"
version := "1.0"
scalaVersion := "2.12.11"
libraryDependencies ++= Seq(
"org.apache.spark" % "spark-sql_2.12" % "2.4.5",
"org.apache.spark" % "spark-sql-kafka-0-10_2.12" % "2.4.5",
"org.apache.spark" % "spark-streaming-kafka-0-10-assembly_2.12" % "2.4.5",
"org.apache.kafka" % "kafka-clients" % "2.5.0"
)
my code is:
object Test{
def main(args: Array[String]) = {
import org.apache.spark.sql.SparkSession
val spark = SparkSession
.builder()
.appName("SparkTest")
.master("local[*]")
.getOrCreate()
import spark.implicits._
val df = spark.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "localhost:9092")
.option("subscribe", "test")
.option("startingOffsets", "earliest")
.load()
df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
.as[(String, String)]
df.printSchema()
}}
After having created the topic on Kafka, started zookeeper and Kafka itself, when I launch the code with:
./spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:2.4.5 --class Test /home/luca/Projects/Test/target/scala-2.12/test_2.12-1.0.jar
I run into the following error:
20/05/06 15:40:29 INFO StateStoreCoordinatorRef: Registered StateStoreCoordinator endpoint
Exception in thread "main" java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.kafka010.KafkaSourceProvider could not be instantiated
at java.util.ServiceLoader.fail(ServiceLoader.java:232)
at java.util.ServiceLoader.access$100(ServiceLoader.java:185)
at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384)
at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
at scala.collection.convert.Wrappers$JIteratorWrapper.next(Wrappers.scala:43)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.filterImpl(TraversableLike.scala:247)
at scala.collection.TraversableLike$class.filter(TraversableLike.scala:259)
at scala.collection.AbstractTraversable.filter(Traversable.scala:104)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:630)
at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:161)
at Test$.main(projectfile.scala:24)
at Test.main(projectfile.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:845)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:920)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:929)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.NoSuchMethodError: org.apache.spark.internal.Logging.$init$(Lorg/apache/spark/internal/Logging;)V
at org.apache.spark.sql.kafka010.KafkaSourceProvider.<init>(KafkaSourceProvider.scala:44)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at java.lang.Class.newInstance(Class.java:442)
at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:380)
... 26 more
Can someone help me out with this?
Kafka- client version can be one of the reasons. Otherwise, try with spark 2.4.0 and scala 2.12 earlier versions. Looks like a compatibility issue
Deploying a spark application written in scala to an EMR cluster with the following command and i cannot figure out why I am receiving a missing dependency error message when deployed to the EMR cluster instance.
error message:
User class threw exception: java.lang.NoClassDefFoundError: com/amazonaws/services/kinesis/clientlibrary/lib/worker/InitialPositionInStream
aws emr add-steps --cluster-id j-xxxxxxx --steps Type=spark,Name=ScalaStream,Args=[\
--class,"ScalaStream",\
--deploy-mode,cluster,\
--master,yarn,\
--jars,s3://xxx.xxx.xxx/aws-java-sdk-1.11.715.jar,\
--conf,spark.yarn.submit.waitAppCompletion=false,\
s3://xxx.xxxx.xxxx/simple-project_2.12-1.0.jar\
],ActionOnFailure=CONTINUE
and sbt file
name := "Simple Project"
version := "1.0"
scalaVersion := "2.12.8"
libraryDependencies += "org.apache.spark" % "spark-sql_2.12" % "2.4.4"
libraryDependencies += "org.apache.spark" % "spark-streaming_2.12" % "2.4.4"
libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.11.715"
libraryDependencies += "org.apache.spark" % "spark-streaming-kinesis-asl_2.12" % "2.4.4"
partial code below
...
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
...
val streamingContext = new StreamingContext(sparkContext, batchInterval)
// Populate the appropriate variables from the given args
val streamAppName = "xxxxxx"
val streamName = "xxxxxx"
val endpointUrl = "https://kinesis.xxxxx.amazonaws.com"
val regionName = "xx-xx-x"
val initialPosition = InitialPositionInStream.LATEST
val checkpointInterval = batchInterval
val storageLevel = StorageLevel.MEMORY_AND_DISK_2
val kinesisStream = KinesisUtils.createStream(streamingContext, streamAppName, streamAppName, endpointUrl, regionName, initialPosition, checkpointInterval, storageLevel)
val initialPosition = InitialPositionInStream.LATEST
val checkpointInterval = batchInterval
val storageLevel = StorageLevel.MEMORY_AND_DISK_2
val kinesisStream = KinesisUtils.createStream(streamingContext, streamAppName, streamAppName, endpointUrl, regionName, initialPosition, checkpointInterval, storageLevel)
20/02/05 21:43:10 ERROR ApplicationMaster: User class threw exception: java.lang.NoClassDefFoundError: com/amazonaws/services/kinesis/clientlibrary/lib/worker/InitialPositionInStream
java.lang.NoClassDefFoundError: com/amazonaws/services/kinesis/clientlibrary/lib/worker/InitialPositionInStream
at ScalaStream$.main(stream.scala:32)
at ScalaStream.main(stream.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:684)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 7 more
20/02/05 21:43:10 INFO ApplicationMaster: Final app status: FAILED, exitCode: 15, (reason: User class threw exception: java.lang.NoClassDefFoundError: com/amazonaws/services/kinesis/clientlibrary/lib/worker/InitialPositionInStream
at ScalaStream$.main(stream.scala:32)
at ScalaStream.main(stream.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:684)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 7 more
)
I've tried including the aws dependencies both in the sbt file and also --jars paramater of spark-submit but cannot see why the dependency is missing?
fixed by updating the following
sbt
name := "Simple Project"
version := "1.0"
scalaVersion := "2.12.8"
libraryDependencies += "org.apache.spark" % "spark-sql_2.12" % "2.4.4"
libraryDependencies += "org.apache.spark" % "spark-streaming_2.12" % "2.4.4"
libraryDependencies += "org.apache.spark" % "spark-streaming-kinesis-asl_2.12" % "2.4.4"
deploy script
aws emr add-steps --cluster-id j-xxxxxxx --steps Type=spark,Name=ScalaStream,Args=[\
--class,"ScalaStream",\
--deploy-mode,cluster,\
--master,yarn,\
--packages,\'org.apache.spark:spark-streaming-kinesis-asl_2.11:2.4.0,org.postgresql:postgresql:42.2.9,com.facebook.presto:presto-jdbc:0.60\',\
--conf,spark.yarn.submit.waitAppCompletion=false,\
--conf,yarn.log-aggregation-enable=true,\
--conf,spark.dynamicAllocation.enabled=true,\
--conf,spark.cores.max=4,\
--conf,spark.network.timeout=300,\
s3://xxx.xxx/simple-project_2.12-1.0.jar\
],ActionOnFailure=CONTINUE
the key being the --packages flag added to the aws emr add-steps. Mistakenly thought sbt package bundled the required dependencies.
This seems like a very odd and specific issue which has me stumped.
When using a DataFrame built by a spark.sql("select * from table") query on a Hive table, I get a timeout exception whenever I try to use an HTTP client in a transform or action step on that DataFrame.
Example:
import scalaj.http._
import org.apache.spark.sql.SparkSession
object Example {
def postDoc(doc: String): Unit = {
val resp = Http("https://example.com/endpoint")
.postData(doc)
.header("content-type", "application/json")
.asString
}
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
import spark.implicits._
val df = spark.sql("select id, json_doc from some_table")
df.map(r => r.getAs[String]("json_doc")).foreach(postDoc _)
}
}
I can however hit the service through a DataFrame I create manually; i.e. Seq((1, "{\"a\": 1}")).toDF("id", "json_doc").foreach(postDoc _).
I've also tried creating temp tables and using spark.sql to select from them; which works on a DataFrame that I create manually but not on ones sourced from a Hive table.
My partial build.sbt
scalaVersion := "2.11.12"
libraryDependencies ++= {
val sparkVersion = "2.1.3"
Seq(
"org.apache.spark" %% "spark-core" % sparkVersion % "provided",
"org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
"org.scalaj" %% "scalaj-http" % "2.4.2"
)
}
Stacktrace
java.net.SocketTimeoutException: Read timed out
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at sun.net.www.protocol.http.HttpURLConnection$10.run(HttpURLConnection.java:1950)
at sun.net.www.protocol.http.HttpURLConnection$10.run(HttpURLConnection.java:1945)
at java.security.AccessController.doPrivileged(Native Method)
at sun.net.www.protocol.http.HttpURLConnection.getChainedException(HttpURLConnection.java:1944)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1514)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498)
at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:480)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.getResponseCode(HttpsURLConnectionImpl.java:352)
at scalaj.http.HttpRequest.scalaj$http$HttpRequest$$doConnection(Http.scala:367)
at scalaj.http.HttpRequest.exec(Http.scala:343)
at scalaj.http.HttpRequest.asString(Http.scala:492)
at com.gm.avalanche.collect.Collector$.postDoc(Collector.scala:34)
at com.gm.avalanche.collect.Collector$$anonfun$sqlTest$2.apply(Collector.scala:71)
at com.gm.avalanche.collect.Collector$$anonfun$sqlTest$2.apply(Collector.scala:71)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:918)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:918)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1954)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1954)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:100)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:325)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at sun.security.ssl.InputRecord.readFully(InputRecord.java:465)
at sun.security.ssl.InputRecord.read(InputRecord.java:503)
at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:975)
at sun.security.ssl.SSLSocketImpl.readDataRecord(SSLSocketImpl.java:933)
at sun.security.ssl.AppInputStream.read(AppInputStream.java:105)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:735)
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:678)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1593)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.getInputStream(HttpsURLConnectionImpl.java:268)
at scalaj.http.HttpRequest.scalaj$http$HttpRequest$$doConnection(Http.scala:365)
... 17 more
Turns out the socket was being closed remotely by the Ingress controller running in front of the Kubernates environment where the Elasticsearch instance is running. It was set to the default of a one minute timeout.
I have a simple scala object file with the following content:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
object X {
def main(args: Array[String]) {
val params = Map[String, String](
"abc" -> "22",)
println("Creating Spark Configuration");
val conf = new SparkConf().setAppName("X")
val sc = new SparkContext(conf)
val txtFileLines = sc.textFile("/tmp/x.txt", 2).cache()
val count = txtFileLines.count()
println("Count" + count)
}
}
My build.sbt looks like:
name := "x"
version := "1.0"
scalaVersion := "2.11.7"
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.5.2" % "provided"
I then do sbt package to create x.jar under target/scala-2.11/
When I execute the above code as:
spark-submit --class X --master local[2] x.jar
I get the following error:
Creating Spark Configuration
Exception in thread "main" java.lang.NoSuchMethodError: scala.Predef$.ArrowAssoc(Ljava/lang/Object;)Ljava/lang/Object;
at Sweeper$.main(Sweeper.scala:14)
at Sweeper.main(Sweeper.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
As you are using Scala 2.11 in your project. You should use spark core library build for Scala 2.11.
Can download spark-core_2.11 from here http://mvnrepository.com/search?q=Spark
Refer spark-core_2.11 jar in project.
I’m having an issue connecting Spark SQL to a PostgreSQL data source. I’ve downloaded the Postgres JDBC jar and included it in an uber jar using sbt-assembly.
My (failing) source code:
https://gist.github.com/geowa4/a9bc238ca7c372b95267.
I’ve also tried using sqlContext.jdbc() preceded with classOf[org.postgresql.Driver] as well. It appears the driver can access the Driver just fine.
Any help would be much appreciated. Thanks.
SimpleApp.scala:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
object SimpleApp {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Simple Application")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._
val commits = sqlContext.load("jdbc", Map(
"url" -> "jdbc:postgresql://192.168.59.103:5432/postgres",
"dbtable" -> "commits",
"driver" -> "org.postgresql.Driver"))
commits.select("message").show(1)
}
}
simple.sbt:
name := "simple-project"
version := "1.0"
scalaVersion := "2.11.6"
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.3.1" % "provided"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.3.1" % "provided"
libraryDependencies += "org.postgresql" % "postgresql" % "9.4-1201-jdbc41"
output (Edited):
Exception in thread "main" java.lang.ClassNotFoundException: org.postgresql.Driver
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:264)
at org.apache.spark.sql.jdbc.DefaultSource.createRelation(JDBCRelation.scala:102)
at org.apache.spark.sql.sources.ResolvedDataSource$.apply(ddl.scala:219)
at org.apache.spark.sql.SQLContext.load(SQLContext.scala:697)
at SimpleApp$.main(SimpleApp.scala:17)
at SimpleApp.main(SimpleApp.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:569)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:166)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:189)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:110)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
EDIT: I changed the Scala version to 2.10.5 and the output changed to this. I feel like I'm making progress.
There is a problem with general problem with JDBC, where the primordial classloader must know about the jar. In Spark 1.3 this can be addressed using the SPARK_CLASSPATH option as described here:
https://spark.apache.org/docs/1.3.0/sql-programming-guide.html#jdbc-to-other-databases
In Spark 1.4, this should be fixed by #5782.
1) Copy file into your jar location
2) Add jar in path as follows
spark-submit --jars /usr/share/java/postgresql-jdbc.jar --class com.examples.WordCount .. .. ..