I am trying to fetch HBase data using Spark and Scala. However, I am getting an error that I am not able to understand.
Code
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.HConstants
import org.apache.hadoop.hbase.client.{HBaseAdmin, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes

object HBase {
  def main(args: Array[String]): Unit = {
    val tableName = "posts"
    val sc = new SparkContext(new SparkConf().setAppName("HBaseReadWrite").setMaster("local[4]"))
    val conf = HBaseConfiguration.create()
    conf.set(HConstants.ZOOKEEPER_QUORUM, "localhost")
    conf.set(TableInputFormat.INPUT_TABLE, tableName)
    val admin = new HBaseAdmin(conf)
    // verify the table exists before scanning it
    if (!admin.isTableAvailable(tableName)) {
      println("Table doesn't exist")
      return
    }
    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable], classOf[Result])
    println(hBaseRDD.map(x => x._2).map(result => Bytes.toString(result.getRow)).collect().take(5).mkString("\n"))
  }
}
build.sbt
name := "NLPAnnotationController"
version := "1.0"
scalaVersion := "2.10.5"
resolvers += "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/"
resolvers += "sonatype snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/"
organization := "com.scryAnalytics"
val hadoop_version = "0.98.19-hadoop2"
libraryDependencies ++= Seq(
"org.apache.spark" % "spark-core_2.10" % "1.2.0",
"org.apache.hbase" % "hbase-spark" % "1.2.0-cdh5.7.2",
"org.apache.hbase" % "hbase-client" % hadoop_version excludeAll(ExclusionRule(organization = "javax.servlet", name="javax.servlet-api"), ExclusionRule(organization = "org.mortbay.jetty", name="jetty"), ExclusionRule(organization = "org.mortbay.jetty", name="servlet-api-2.5")),
"org.apache.hbase" % "hbase-common" % hadoop_version excludeAll(ExclusionRule(organization = "javax.servlet", name="javax.servlet-api"), ExclusionRule(organization = "org.mortbay.jetty", name="jetty"), ExclusionRule(organization = "org.mortbay.jetty", name="servlet-api-2.5")),
"org.apache.hbase" % "hbase-server" % hadoop_version excludeAll(ExclusionRule(organization = "javax.servlet", name="javax.servlet-api"), ExclusionRule(organization = "org.mortbay.jetty", name="jetty"), ExclusionRule(organization = "org.mortbay.jetty", name="servlet-api-2.5")),
"org.scala-lang" % "scala-library" % "2.10.5",
"it.nerdammer.bigdata" % "spark-hbase-connector_2.10" % "1.0.3"
)
Error
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/hbase/exceptions/TimeoutIOException
at HBase$.main(HBase.scala:20)
at HBase.main(HBase.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hbase.exceptions.TimeoutIOException
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
I have tried changing the versions of the dependencies, but still no progress.
Any help would be great. Thanks in advance.
Most probably, the jar file you submit to Spark doesn't contain the classes from your dependencies. Build a "fat" jar using sbt-assembly and submit that to Spark instead.
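For example, a minimal sbt-assembly setup might look like the sketch below (the plugin version and the merge strategy are assumptions; adjust them to your sbt release and dependencies):
project/plugins.sbt:
// assumed plugin version; pick one compatible with your sbt release
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")
build.sbt additions:
// resolve the duplicate-file conflicts that Hadoop/HBase jars typically cause during assembly
assemblyMergeStrategy in assembly := {
  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
  case _ => MergeStrategy.first
}
It also helps to mark spark-core as "provided", since Spark itself is supplied by spark-submit at runtime. Running sbt assembly then produces a single jar under target/scala-2.10/ that you pass to spark-submit instead of the plain sbt package output.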
Related
I've checked a bunch of other forums and posts, but I can't seem to narrow down the issue. All I keep seeing is people saying not to use Spark's Logging and that it is deprecated, but I don't even know where I'm using it in my code.
When I run the following code:
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.twitter._
import twitter4j.Status
object TrendingHashTags {
  def main(args: Array[String]): Unit = {
    if (args.length < 8) {
      System.err.println("Usage: TrendingHashTags <consumer key> <consumer secret> " +
        "<access token> <access token secret> " +
        "<language> <batch interval> <min-threshold> <show-count> " +
        "[<filters>]")
      System.exit(1)
    }
    val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret,
      lang, batchInterval, minThreshold, showCount) = args.take(8)
    val filters = args.takeRight(args.length - 8)
    System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
    System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
    System.setProperty("twitter4j.oauth.accessToken", accessToken)
    System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)
    val conf = new SparkConf().setMaster("local[4]").setAppName("TrendingHashTags")
    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))
    ssc.checkpoint("checkpoint")
    val tweets = TwitterUtils.createStream(ssc, None, filters)
    val tweetsFilteredByLang = tweets.filter{tweet => tweet.getLang() == lang}
    val statuses = tweetsFilteredByLang.map{tweet => tweet.getText()}
    val words = statuses.flatMap{status => status.split("""\s+""")}
    val hashTags = words.filter{word => word.startsWith("#")}
    val hashTagPairs = hashTags.map{hashtag => (hashtag, 1)}
    val tagsWithCounts = hashTagPairs.updateStateByKey(
      (counts: Seq[Int], prevCount: Option[Int]) =>
        prevCount.map{c => c + counts.sum}.orElse{Some(counts.sum)}
    )
    val topHashTags = tagsWithCounts.filter { case (t, c) =>
      c > minThreshold.toInt
    }
    val sortedTopHashTags = topHashTags.transform{rdd =>
      rdd.sortBy({case (w, c) => c}, false)
    }
    sortedTopHashTags.print(showCount.toInt)
    ssc.start()
    ssc.awaitTermination()
  }
}
I get the following error stack trace:
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/spark/Logging
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:763)
at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
at java.net.URLClassLoader.defineClass(URLClassLoader.java:467)
at java.net.URLClassLoader.access$100(URLClassLoader.java:73)
at java.net.URLClassLoader$1.run(URLClassLoader.java:368)
at java.net.URLClassLoader$1.run(URLClassLoader.java:362)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:361)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at org.apache.spark.streaming.twitter.TwitterUtils$.createStream(TwitterUtils.scala:44)
at TrendingHashTags$.main(TrendingHashTags.scala:28)
at TrendingHashTags.main(TrendingHashTags.scala)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.Logging
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
Here are my build.sbt contents:
name := "sparkStreaming"
version := "0.1"
scalaVersion := "2.11.12"
libraryDependencies ++= Seq("org.apache.spark" %% "spark-core" % "2.4.5",
"org.apache.spark" %% "spark-sql" % "2.4.5",
"org.apache.spark" %% "spark-streaming" % "2.4.5" % "provided",
"org.apache.spark" %% "spark-streaming-twitter" % "1.6.3")
This is a clear indication that somewhere you are internally pulling in a lower version of Spark: org.apache.spark.Logging existed in Spark 1.x but was removed in Spark 2.0, and spark-streaming-twitter 1.6.3 is built against Spark 1.6.
sbt inspect tree clean
You can check with this.
For Maven users, mvn dependency:tree will list all the dependencies that are used.
One more thing: you are using
"org.apache.spark" %% "spark-streaming" % "2.4.5" % "provided",
Change it to the default Maven scope (compile) and see.
A similar question and answers are here.
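As a sketch of what an aligned build might look like (the Bahir artifact and version below are my assumption, since the Twitter receiver moved out of org.apache.spark after Spark 1.6; check which release matches your Spark version):
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.4.5",
  "org.apache.spark" %% "spark-sql" % "2.4.5",
  "org.apache.spark" %% "spark-streaming" % "2.4.5",
  // Twitter receiver built against Spark 2.x, published under Apache Bahir (assumed artifact/version)
  "org.apache.bahir" %% "spark-streaming-twitter" % "2.4.0"
)
With every artifact on the same Spark 2.4.x line, nothing on the classpath still references the removed org.apache.spark.Logging class.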
I am deploying a Spark application written in Scala to an EMR cluster with the following command, and I cannot figure out why I am receiving a missing-dependency error message when it runs on the EMR cluster instance.
error message:
User class threw exception: java.lang.NoClassDefFoundError: com/amazonaws/services/kinesis/clientlibrary/lib/worker/InitialPositionInStream
aws emr add-steps --cluster-id j-xxxxxxx --steps Type=spark,Name=ScalaStream,Args=[\
--class,"ScalaStream",\
--deploy-mode,cluster,\
--master,yarn,\
--jars,s3://xxx.xxx.xxx/aws-java-sdk-1.11.715.jar,\
--conf,spark.yarn.submit.waitAppCompletion=false,\
s3://xxx.xxxx.xxxx/simple-project_2.12-1.0.jar\
],ActionOnFailure=CONTINUE
and sbt file
name := "Simple Project"
version := "1.0"
scalaVersion := "2.12.8"
libraryDependencies += "org.apache.spark" % "spark-sql_2.12" % "2.4.4"
libraryDependencies += "org.apache.spark" % "spark-streaming_2.12" % "2.4.4"
libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.11.715"
libraryDependencies += "org.apache.spark" % "spark-streaming-kinesis-asl_2.12" % "2.4.4"
partial code below
...
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
...
val streamingContext = new StreamingContext(sparkContext, batchInterval)
// Populate the appropriate variables from the given args
val streamAppName = "xxxxxx"
val streamName = "xxxxxx"
val endpointUrl = "https://kinesis.xxxxx.amazonaws.com"
val regionName = "xx-xx-x"
val initialPosition = InitialPositionInStream.LATEST
val checkpointInterval = batchInterval
val storageLevel = StorageLevel.MEMORY_AND_DISK_2
val kinesisStream = KinesisUtils.createStream(streamingContext, streamAppName, streamAppName, endpointUrl, regionName, initialPosition, checkpointInterval, storageLevel)
20/02/05 21:43:10 ERROR ApplicationMaster: User class threw exception: java.lang.NoClassDefFoundError: com/amazonaws/services/kinesis/clientlibrary/lib/worker/InitialPositionInStream
java.lang.NoClassDefFoundError: com/amazonaws/services/kinesis/clientlibrary/lib/worker/InitialPositionInStream
at ScalaStream$.main(stream.scala:32)
at ScalaStream.main(stream.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:684)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 7 more
20/02/05 21:43:10 INFO ApplicationMaster: Final app status: FAILED, exitCode: 15, (reason: User class threw exception: java.lang.NoClassDefFoundError: com/amazonaws/services/kinesis/clientlibrary/lib/worker/InitialPositionInStream
at ScalaStream$.main(stream.scala:32)
at ScalaStream.main(stream.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:684)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 7 more
)
I've tried including the AWS dependencies both in the sbt file and via the --jars parameter of spark-submit, but I cannot see why the dependency is missing.
Fixed by updating the following:
sbt
name := "Simple Project"
version := "1.0"
scalaVersion := "2.12.8"
libraryDependencies += "org.apache.spark" % "spark-sql_2.12" % "2.4.4"
libraryDependencies += "org.apache.spark" % "spark-streaming_2.12" % "2.4.4"
libraryDependencies += "org.apache.spark" % "spark-streaming-kinesis-asl_2.12" % "2.4.4"
deploy script
aws emr add-steps --cluster-id j-xxxxxxx --steps Type=spark,Name=ScalaStream,Args=[\
--class,"ScalaStream",\
--deploy-mode,cluster,\
--master,yarn,\
--packages,\'org.apache.spark:spark-streaming-kinesis-asl_2.11:2.4.0,org.postgresql:postgresql:42.2.9,com.facebook.presto:presto-jdbc:0.60\',\
--conf,spark.yarn.submit.waitAppCompletion=false,\
--conf,yarn.log-aggregation-enable=true,\
--conf,spark.dynamicAllocation.enabled=true,\
--conf,spark.cores.max=4,\
--conf,spark.network.timeout=300,\
s3://xxx.xxx/simple-project_2.12-1.0.jar\
],ActionOnFailure=CONTINUE
The key was the --packages flag added to aws emr add-steps. I had mistakenly thought that sbt package bundled the required dependencies; it only packages the project's own classes.
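As a quick way to see that for yourself (the jar name below just mirrors the one in the step definition above), list the contents of the jar produced by sbt package and look for the Kinesis client classes:
jar tf target/scala-2.12/simple-project_2.12-1.0.jar | grep -i kinesis
# no output: the jar only contains your own classes, so InitialPositionInStream
# has to come from --packages (or a fat/assembly jar) at submit time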
Good morning,
I have a problem connecting Apache Spark with HBase on an AWS EMR cluster. I think the problem comes from an incompatibility between "shc-core" and "json4s"; I always get the same error below, despite changing the versions of Spark, Scala, and shc-core.
Exception in thread "main" java.lang.NoSuchMethodError: org.json4s.jackson.JsonMethods$.parse(Lorg/json4s/JsonInput;Z)Lorg/json4s/JsonAST$JValue;
at org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog$.apply(HBaseTableCatalog.scala:257)
at org.apache.spark.sql.execution.datasources.hbase.HBaseRelation.<init>(HBaseRelation.scala:80)
at org.apache.spark.sql.execution.datasources.hbase.DefaultSource.createRelation(HBaseRelation.scala:59)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:156)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
at Dataflow$.main(Dataflow.scala:56)
at Dataflow.main(Dataflow.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:853)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:928)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:937)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
The build.sbt configuration is below:
name := "hive_spark_hbase"
version := "1.0"
scalaVersion := "2.11.12"
val sparkVersion = "2.3.4"
resolvers += "jar_hortonworks" at "http://repo.hortonworks.com/content/groups/public"
libraryDependencies ++= Seq(
"org.scala-lang" % "scala-library" % scalaVersion.value,
"org.apache.spark" %% "spark-sql" % sparkVersion,
"org.apache.spark" %% "spark-sql-kafka-0-10" % sparkVersion,
"org.apache.spark" %% "spark-mllib" % sparkVersion,
"com.hortonworks" % "shc-core" % "1.1.1-2.1-s_2.11"
)
My main class to load data into HBase:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.execution.datasources.hbase._
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkConf

case class Employee(id: String, name: String, lastname: String, mname: String, adress: String, city: String, state: String, zipcode: String)

object Dataflow {
  def main(args: Array[String]): Unit = {
    val catalog =
      s"""{
         |"table":{"namespace":"default","name":"employee"},
         |"rowkey":"key",
         |"columns":{
         |"key":{"cf":"rowkey","col":"key","type":"string"},
         |"fName":{"cf":"person","col":"firstName","type":"string"},
         |"lName":{"cf":"person","col":"lastName","type":"string"},
         |"mName":{"cf":"person","col":"middleName","type":"string"},
         |"addressLine":{"cf":"address","col":"addressLine","type":"string"},
         |"city":{"cf":"address","col":"city","type":"string"},
         |"state":{"cf":"address","col":"state","type":"string"},
         |"zipCode":{"cf":"address","col":"zipCode","type":"string"}
         |}
         |}""".stripMargin
    val data = Seq(
      Employee("1", "Abby", "Smith", "K", "3456main", "Orlando", "FL", "45235"),
      Employee("2", "Amaya", "Williams", "L", "123Orange", "Newark", "NJ", "27656"),
      Employee("3", "Alchemy", "Davis", "P", "Warners", "Sanjose", "CA", "34789")
    )
    val master = "yarn"
    val appName = "MyApp"
    val conf: SparkConf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
      .set("spark.driver.allowMultipleContexts", "false")
      .set("spark.ui.enabled", "false")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._
    val df = spark.sparkContext.parallelize(data).toDF
    df.write.options(
      Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "4"))
      .format("org.apache.spark.sql.execution.datasources.hbase")
      .save()
  }
}
And finally I submit the Spark job like this:
spark-submit --class Dataflow --master yarn --packages com.hortonworks:shc-core:1.1.1-2.1-s_2.11 --repositories http://repo.hortonworks.com/content/groups/public/ --files /etc/hbase/conf/hbase-site.xml /home/hadoop/spark_hbase/target/scala-2.11/hive_spark_hbase_2.11-1.0.jar
Thanks a lot.
I'm using Spark 1.3.1 (on Ubuntu 14.04) standalone, sbt 0.13.10, and trying to execute the following script:
package co.some.sheker
import java.sql.Date
import org.apache.spark.{SparkContext, SparkConf}
import SparkContext._
import org.apache.spark.sql.{Row, SQLContext}
import com.datastax.spark.connector._
import java.sql._
import org.apache.spark.sql._
import org.apache.spark.sql.cassandra.CassandraSQLContext
import java.io.PushbackReader
import java.lang.{ StringBuilder => JavaStringBuilder }
import java.io.StringReader
import com.datastax.spark.connector.cql.CassandraConnector
import org.joda.time.{DateTimeConstants}
case class TableKey(key1: String, key2: String)
object myclass {
  def main(args: scala.Array[String]) {
    val conf = ...
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val csc = new CassandraSQLContext(sc)
    val data_x = csc.sql("select distinct key1, key2 from keyspace.table where key1 = 'sheker'").map(row => (row(0).toString, row(1).toString))
    println("Done cross mapping")
    val snapshotsFiltered = data_x.map(x => TableKey(x._1, x._2)).joinWithCassandraTable("keyspace", "table")
    println("Done join")
    val jsons = snapshotsFiltered.map(_._2.getString("json"))
    ...
    sc.stop()
    println("Done.")
  }
}
By using:
/home/user/spark-1.3.1/bin/spark-submit --master spark://1.1.1.1:7077 --driver-class-path /home/user/spark-cassandra-connector-java-assembly-1.3.1-FAT.jar --properties-file prop.conf --class "myclass" "myjar.jar"
The prop.conf file is:
spark.cassandra.connection.host myhost
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.eventLog.enabled true
spark.eventLog.dir /var/tmp/eventLog
spark.executor.extraClassPath /home/ubuntu/spark-cassandra-connector-java-assembly-1.3.1-FAT.jar
And I get this exception:
Done cross mapping
Exception in thread "main" java.lang.NoSuchMethodError: com.datastax.spark.connector.mapper.ColumnMapper$.defaultColumnMapper(Lscala/reflect/ClassTag;Lscala/reflect/api/TypeTags$TypeTag;)Lcom/datastax/spark/connector/mapper/ColumnMapper;
at co.crowdx.aggregation.CassandraToElasticTransformater$.main(CassandraToElasticTransformater.scala:79)
at co.crowdx.aggregation.CassandraToElasticTransformater.main(CassandraToElasticTransformater.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:569)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:166)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:189)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:110)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Done Sending Signal aggregation job to Spark
The strange part is that when I run the commands from the script in the shell, they work fine. I'm using:
/home/user/spark-1.3.1/bin/spark-shell --master spark://1.1.1.1:7077 --driver-class-path /home/ubuntu/spark-cassandra-connector-java-assembly-1.3.1-FAT.jar --properties-file prop.conf
The Build.scala file is:
import sbt._
import Keys._
import sbtassembly.Plugin._
import AssemblyKeys._
object AggregationsBuild extends Build {
  lazy val buildSettings = Defaults.defaultSettings ++ Seq(
    version := "1.0.0",
    organization := "co.sheker",
    scalaVersion := "2.10.4"
  )

  lazy val app = Project(
    "geo-aggregations",
    file("."),
    settings = buildSettings ++ assemblySettings ++ Seq(
      parallelExecution in Test := false,
      libraryDependencies ++= Seq(
        "com.datastax.spark" %% "spark-cassandra-connector" % "1.2.1",
        // spark will already be on classpath when using spark-submit.
        // marked as provided, so that it isn't included in assembly.
        "org.apache.spark" %% "spark-core" % "1.2.1" % "provided",
        "org.apache.spark" %% "spark-catalyst" % "1.2.1" % "provided",
        "org.apache.spark" %% "spark-sql" % "1.2.1" % "provided",
        "org.scalatest" %% "scalatest" % "2.1.5" % "test",
        "org.postgresql" % "postgresql" % "9.4-1201-jdbc41",
        "com.github.nscala-time" %% "nscala-time" % "2.4.0",
        "org.elasticsearch" % "elasticsearch-hadoop" % "2.2.0" % "provided"
      ),
      resolvers += "conjars.org" at "http://conjars.org/repo",
      resolvers += "clojars" at "https://clojars.org/repo"
    )
  )
}
What is wrong? Why does it fail on submit but not in the shell?
You said that you are using Spark 1.3, but your build contains Spark 1.2.1 dependencies.
As I said in the comment, I believe that your Spark driver's version is different from the one your application was built against, which leads to the error you are getting.
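A minimal sketch of how the dependency block could be aligned with the 1.3.1 cluster (the connector version here is my assumption; pick the spark-cassandra-connector release that matches both your Spark and Cassandra versions):
libraryDependencies ++= Seq(
  // connector line built against Spark 1.3.x (assumed version)
  "com.datastax.spark" %% "spark-cassandra-connector" % "1.3.1",
  // Spark artifacts matching the 1.3.1 installation, still provided by spark-submit
  "org.apache.spark" %% "spark-core" % "1.3.1" % "provided",
  "org.apache.spark" %% "spark-catalyst" % "1.3.1" % "provided",
  "org.apache.spark" %% "spark-sql" % "1.3.1" % "provided"
)
That way the application is compiled against the same connector line as the spark-cassandra-connector-java-assembly-1.3.1-FAT.jar already on the driver and executor classpaths.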
I'm trying to set up a ZeroMQ data stream to Spark. Basically I took the ZeroMQWordCount.scala app and tried to recompile it and run it.
I have ZeroMQ 2.1 installed, and Spark 1.2.1.
Here is my Scala code:
package org.apache.spark.examples.streaming
import akka.actor.ActorSystem
import akka.actor.actorRef2Scala
import akka.zeromq._
import akka.zeromq.Subscribe
import akka.util.ByteString
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.zeromq._
import scala.language.implicitConversions
import org.apache.spark.SparkConf
object ZmqBenchmark {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: ZmqBenchmark <zeroMQurl> <topic>")
      System.exit(1)
    }
    //StreamingExamples.setStreamingLogLevels()
    val Seq(url, topic) = args.toSeq
    val sparkConf = new SparkConf().setAppName("ZmqBenchmark")
    // Create the context and set the batch size
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    def bytesToStringIterator(x: Seq[ByteString]) = (x.map(_.utf8String)).iterator
    // For this stream, a zeroMQ publisher should be running.
    val lines = ZeroMQUtils.createStream(ssc, url, Subscribe(topic), bytesToStringIterator _)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
and this is my .sbt file for dependencies:
name := "ZmqBenchmark"
version := "1.0"
scalaVersion := "2.10.4"
resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
resolvers += "Sonatype (releases)" at "https://oss.sonatype.org/content/repositories/releases/"
libraryDependencies += "org.apache.spark" % "spark-core_2.10" % "1.2.1"
libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.2.1"
libraryDependencies += "org.apache.spark" % "spark-streaming-zeromq_2.10" % "1.2.1"
libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.2.0"
libraryDependencies += "org.zeromq" %% "zeromq-scala-binding" % "0.0.6"
libraryDependencies += "com.typesafe.akka" % "akka-zeromq_2.10.0-RC5" % "2.1.0-RC6"
libraryDependencies += "org.apache.spark" % "spark-examples_2.10" % "1.1.1"
libraryDependencies += "org.spark-project.zeromq" % "zeromq-scala-binding_2.11" % "0.0.7-spark"
The application compiles without any errors using sbt package; however, when I run the application with spark-submit, I get an error:
zaid#zaid-VirtualBox:~/spark-1.2.1$ ./bin/spark-submit --master local[*] ./zeromqsub/example/target/scala-2.10/zmqbenchmark_2.10-1.0.jar tcp://127.0.0.1:5553 hello
15/03/06 10:21:11 WARN Utils: Your hostname, zaid-VirtualBox resolves to a loopback address: 127.0.1.1; using 192.168.220.175 instead (on interface eth0)
15/03/06 10:21:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/spark/streaming/zeromq/ZeroMQUtils$
at ZmqBenchmark$.main(ZmqBenchmark.scala:78)
at ZmqBenchmark.main(ZmqBenchmark.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.launch(SparkSubmit.scala:358)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:75)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.streaming.zeromq.ZeroMQUtils$
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
... 9 more
Any ideas why this happens? I know the app should work, because when I run the same example using the run-example script and point it at the ZeroMQWordCount app from Spark, it runs without the exception. My guess is that the sbt file is incorrect; what else do I need to have in the sbt file?
Thanks
You are using ZeroMQUtils.createStream but the line
Caused by: java.lang.ClassNotFoundException: org.apache.spark.streaming.zeromq.ZeroMQUtils
shows that the bytecode for ZeroMQUtils was not located. When the Spark examples are run, they are run against a jar file (like spark-1.2.1/examples/target/scala-2.10/spark-examples-1.2.1-hadoop1.0.4.jar) that includes the ZeroMQUtils class. A solution would be to use the --jars flag so that the spark-submit command can find the bytecode. In your case, this could be something like
spark-submit --jars /opt/spark/spark-1.2.1/examples/target/scala-2.10/spark-examples-1.2.1-hadoop1.0.4.jar --master local[*] ./zeromqsub/example/target/scala-2.10/zmqbenchmark_2.10-1.0.jar tcp://127.0.0.1:5553 hello
assuming that you have installed spark-1.2.1 in /opt/spark.
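Alternatively (a sketch of my own, not part of the original answer): instead of shipping the whole examples jar, you could trim the build so that only the ZeroMQ receiver is bundled with the application and build a fat jar with sbt-assembly, as in the sketch after the first answer above. The versions below simply mirror the build file in the question:
libraryDependencies ++= Seq(
  // provided by the Spark 1.2.1 installation at runtime, so excluded from the assembly
  "org.apache.spark" % "spark-core_2.10" % "1.2.1" % "provided",
  "org.apache.spark" %% "spark-streaming" % "1.2.1" % "provided",
  // bundled so that ZeroMQUtils and its Akka/ZeroMQ bindings reach the driver and executors
  "org.apache.spark" % "spark-streaming-zeromq_2.10" % "1.2.1"
)
Then run sbt assembly and spark-submit the resulting assembly jar instead of the plain package jar.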