I am very new to scala spark and Mongo. while trying to load some data to MongoDB by spark with the following code.
import com.mongodb.spark.config.WriteConfig
import com.mongodb.spark.toDocumentRDDFunctions
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.bson.Document
object MongoTest {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.master("local[*]")
.appName(this.getClass.getSimpleName)
.getOrCreate()
val conf = new SparkConf().setAppName(this.getClass.getSimpleName).set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
val documents = sc.parallelize((1 to 10).map(i => Document.parse(s"{test: $i}")))
documents.saveToMongoDB(WriteConfig(Map("spark.mongodb.output.uri" -> "mongodb://127.0.0.1:27017/sampledb.testMongo")))
}
}
The error occurs and my spark submit fails with following error:
java.lang.NoSuchMethodError: com.mongodb.Mongo.<init>(Lcom/mongodb/MongoClientURI;)V
at com.mongodb.MongoClient.<init>(MongoClient.java:328)
at com.mongodb.spark.connection.DefaultMongoClientFactory.create(DefaultMongoClientFactory.scala:43)
at com.mongodb.spark.connection.MongoClientCache.acquire(MongoClientCache.scala:55)
at com.mongodb.spark.MongoConnector.acquireClient(MongoConnector.scala:239)
at com.mongodb.spark.MongoConnector.withMongoClientDo(MongoConnector.scala:152)
at com.mongodb.spark.MongoConnector.withDatabaseDo(MongoConnector.scala:171)
at com.mongodb.spark.MongoConnector.withCollectionDo(MongoConnector.scala:184)
at com.mongodb.spark.MongoSpark$$anonfun$save$1.apply(MongoSpark.scala:116)
at com.mongodb.spark.MongoSpark$$anonfun$save$1.apply(MongoSpark.scala:115)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:403)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1405)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:409)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I use Spark version 2.4.0 and Scala version 2.11.12. Any idea where I am wrong.?
Related
Below is the code where I am trying to load table dynamically in to Hbase , But I am getting Null pointer Exception which I am not able to resolve. Please let me know if there is any way to load any table dynamically to Hbase.
import scala.util.Failure
import org.apache.spark.sql.DataFrame
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.Connection
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import java.lang.String
import org.apache.spark.sql.functions._
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{HBaseAdmin,HTable,Put,Get}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HColumnDescriptor
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.client.HTable
def hbaseDataLoad(tableName:String,hbaseTname:String,columnFname:String):Unit ={
val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
hiveContext.setConf("hive.exec.dynamic.partition", "true")
hiveContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
val conf = HBaseConfiguration.create()
val admin = new HBaseAdmin(conf)
val df = hiveContext.sql(s"select t.* from $tableName t order by t.firstname")
if (!admin.isTableAvailable(hbaseTname)) {
val tableDesc = new HTableDescriptor(hbaseTname)
admin.createTable(tableDesc)
}
val columnNameIndex = df.columns.zipWithIndex.map(x => (x._2, x._1)).toMap
df.foreach( elmt => {
val conf = HBaseConfiguration.create()
val admin = new HBaseAdmin(conf)
conf.set("hbase.rootdir","hdfs://")
conf.set("hbase.zookeeper.quorum","")
conf.setInt("hbase.zookeeper.property.clientPort", 2181)
conf.set(TableInputFormat.INPUT_TABLE, hbaseTname)
val myTable = new HTable(conf, hbaseTname)
var p = new Put(elmt.getString(0).getBytes())
for(i <- 1 until df.columns.length ) {
p.addColumn(columnFname.getBytes(),columnNameIndex.getOrElse(i, s"c$i").getBytes,elmt.getString(i).getBytes)
}
myTable.put(p)
})
}
Exception which I am getting,even through the table has all the columns and data with column family present in hbase.
scala> hbaseDataLoad("test.tablename","hbase_test","cf1")
[Stage 13:> (0 + 1) / 7]18/10/30 17:56:15 ERROR scheduler.TaskSetManager: Task 0 in stage 13.0 failed 4 times; aborting job
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 13.0 failed 4 times, most recent failure: Lost task 0.3 in stage 13.0 (TID 49, ebdp-avdc-d177p.sys.comcast.net, executor 9): java.lang.NullPointerException
at org.apache.spark.sql.Dataset.schema(Dataset.scala:452)
at org.apache.spark.sql.Dataset.columns(Dataset.scala:503)
at $anonfun$hbaseDataLoad$1.apply(<console>:111)
at $anonfun$hbaseDataLoad$1.apply(<console>:102)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
Thanks in advance
I want to separately use TF-IDF features on the title and description fields, respectively and then combine those features in the VectorAssembler so that the final classifier can operate on those features.
It works fine if I use a single serial flow that is simply
titleTokenizer -> titleHashingTF -> VectorAssembler
But I need both like so:
titleTokenizer -> titleHashingTF
-> VectorAssembler
descriptionTokenizer -> descriptionHashingTF
Code here:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer, VectorAssembler}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.log4j.{Level, Logger}
object SimplePipeline {
def main(args: Array[String]) {
// setup boilerplate
val conf = new SparkConf()
.setAppName("Pipeline example")
val sc = new SparkContext(conf)
val spark = SparkSession
.builder()
.appName("Session for SimplePipeline")
.getOrCreate()
val all_df = spark.read.json("file:///Users/me/data.json")
val numLabels = all_df.count()
// split into training and testing
val Array(training, testing) = all_df.randomSplit(Array(0.75, 0.25))
val nTraining = training.count();
val nTesting = testing.count();
println(s"Loaded $nTraining training labels...");
println(s"Loaded $nTesting testing labels...");
// convert string labels to integers
val indexer = new StringIndexer()
.setInputCol("rating")
.setOutputCol("label")
// tokenize our string inputs
val titleTokenizer = new Tokenizer()
.setInputCol("title")
.setOutputCol("title_words")
val descriptionTokenizer = new Tokenizer()
.setInputCol("description")
.setOutputCol("description_words")
// count term frequencies
val titleHashingTF = new HashingTF()
.setNumFeatures(1000)
.setInputCol(titleTokenizer.getOutputCol)
.setOutputCol("title_tfs")
val descriptionHashingTF = new HashingTF()
.setNumFeatures(1000)
.setInputCol(descriptionTokenizer.getOutputCol)
.setOutputCol("description_tfs")
// combine features together
val assembler = new VectorAssembler()
.setInputCols(Array(titleHashingTF.getOutputCol, descriptionHashingTF.getOutputCol))
.setOutputCol("features")
// set params for our model
val lr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(0.01)
// pipeline that combines all stages
val stages = Array(indexer, titleTokenizer, titleHashingTF, descriptionTokenizer, descriptionHashingTF, assembler, lr);
val pipeline = new Pipeline().setStages(stages);
// Fit the pipeline to training documents.
val model = pipeline.fit(training)
// Make predictions.
val predictions = model.transform(testing)
// Select example rows to display.
predictions.select("label", "rawPrediction", "prediction").show()
sc.stop()
}
}
and my data file is simply a line-break separated file of JSON objects:
{"title" : "xxxxxx", "description" : "yyyyy" .... }
{"title" : "zzzzzz", "description" : "zxzxzx" .... }
The error I get is very long a difficult to understand, but the important part (I think) is a java.lang.NullPointerException:
ERROR Executor: Exception in task 0.0 in stage 9.0 (TID 12)
org.apache.spark.SparkException: Failed to execute user defined function($anonfun$createTransformFunc$1: (string) => array<string>)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:215)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:957)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:948)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:888)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:948)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:694)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.NullPointerException
at org.apache.spark.ml.feature.Tokenizer$$anonfun$createTransformFunc$1.apply(Tokenizer.scala:39)
at org.apache.spark.ml.feature.Tokenizer$$anonfun$createTransformFunc$1.apply(Tokenizer.scala:39)
... 23 more
How should I be properly crafting my Pipeline to do this?
(Also I'm completely new to Scala)
The problem here is that you don't validate the data and some of the values are NULL. It is pretty easy to reproduce this:
val df = Seq((1, Some("abcd bcde cdef")), (2, None)).toDF("id", "description")
val tokenizer = new Tokenizer().setInputCol("description")
tokenizer.transform(df).foreach(_ => ())
org.apache.spark.SparkException: Failed to execute user defined function($anonfun$createTransformFunc$1: (string) => array<string>)
at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1072)
...
Caused by: java.lang.NullPointerException
at org.apache.spark.ml.feature.Tokenizer$$anonfun$createTransformFunc$1.apply(Tokenizer.scala:39)
...
You can for example drop:
tokenizer.transform(df.na.drop(Array("description")))
or replace these with empty strings:
tokenizer.transform(df.na.fill(Map("description" -> "")))
whichever makes more sense in your application.
I am reading Hbase table using spark scala.
code is follows:
package HBase
import org.apache.hadoop.hbase.client.{HBaseAdmin, Result}
import org.apache.hadoop.hbase.{ HBaseConfiguration, HTableDescriptor }
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import it.nerdammer.spark.hbase._
import org.apache.spark._
object Connector {
def main(args: Array[String]) {
val sparkConf = new SparkConf().setAppName("HBaseApp").setMaster("local[2]")
val sc = new SparkContext(sparkConf)
val conf = HBaseConfiguration.create()
val tableName = "cars"
conf.set("hbase.master", "10.163.12.87")
conf.setInt("timeout", 40000)
conf.set("hbase.zookeeper.quorum", "10.163.12.87")
conf.set("zookeeper.znode.parent", "/hbase-unsecure")
conf.set(TableInputFormat.INPUT_TABLE, tableName)
val admin = new HBaseAdmin(conf)
if (!admin.isTableAvailable(tableName)) {
val tableDesc = new HTableDescriptor(tableName)
admin.createTable(tableDesc)
}
val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])
println("Number of Records found : " + hBaseRDD.count())
sc.stop()
}
}
I am getting below error:
Exception in thread "main" java.lang.NumberFormatException: For input string: "16000��I���PBUF
HDP1.Node1�}ڞ���*
at java.lang.NumberFormatException.forInputString(Unknown Source)
at java.lang.Integer.parseInt(Unknown Source)
at java.lang.Integer.parseInt(Unknown Source)
at org.apache.hadoop.hbase.HServerAddress.<init>(HServerAddress.java:63)
at org.apache.hadoop.hbase.MasterAddressTracker.getMasterAddress(MasterAddressTracker.java:63)
at org.apache.hadoop.hbase.client.HConnectionManager$HConnectionImplementation.getMaster(HConnectionManager.java:353)
at org.apache.hadoop.hbase.client.HBaseAdmin.<init>(HBaseAdmin.java:89)
at HBase.Connector$.main(Connector.scala:32)
at HBase.Connector.main(Connector.scala)
Try setting the port number in the hbase.master config.
conf.set("hbase.master", "10.163.12.87:60000")
I am trying to query data stored in Hive table from Spark2. Environment: 1.cloudera-quickstart-vm-5.7.0-0-vmware 2. Eclipse with Scala2.11.8 plugin 3. Spark2 and Maven under
I did not change spark default configuration. Do I need configure anything in Spark or Hive?
Code
import org.apache.spark._
import org.apache.spark.sql.SparkSession
object hiveTest {
def main (args: Array[String]){
val sparkSession = SparkSession.builder.
master("local")
.appName("HiveSQL")
.enableHiveSupport()
.getOrCreate()
val data= sparkSession2.sql("select * from test.mark")
}
}
Getting error
16/08/29 00:18:10 INFO SparkSqlParser: Parsing command: select * from test.mark
Exception in thread "main" java.lang.ExceptionInInitializerError
at org.apache.spark.sql.hive.HiveSharedState.metadataHive$lzycompute(HiveSharedState.scala:48)
at org.apache.spark.sql.hive.HiveSharedState.metadataHive(HiveSharedState.scala:47)
at org.apache.spark.sql.hive.HiveSharedState.externalCatalog$lzycompute(HiveSharedState.scala:54)
at org.apache.spark.sql.hive.HiveSharedState.externalCatalog(HiveSharedState.scala:54)
at org.apache.spark.sql.hive.HiveSessionState.catalog$lzycompute(HiveSessionState.scala:50)
at org.apache.spark.sql.hive.HiveSessionState.catalog(HiveSessionState.scala:48)
at org.apache.spark.sql.hive.HiveSessionState$$anon$1.<init>(HiveSessionState.scala:63)
at org.apache.spark.sql.hive.HiveSessionState.analyzer$lzycompute(HiveSessionState.scala:63)
at org.apache.spark.sql.hive.HiveSessionState.analyzer(HiveSessionState.scala:62)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:49)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:582)
at hiveTest$.main(hiveTest.scala:34)
at hiveTest.main(hiveTest.scala)
Caused by: java.lang.IllegalArgumentException: requirement failed: Duplicate SQLConfigEntry. spark.sql.hive.convertCTAS has been registered
at scala.Predef$.require(Predef.scala:224)
at org.apache.spark.sql.internal.SQLConf$.org$apache$spark$sql$internal$SQLConf$$register(SQLConf.scala:44)
at org.apache.spark.sql.internal.SQLConf$SQLConfigBuilder$$anonfun$apply$1.apply(SQLConf.scala:51)
at org.apache.spark.sql.internal.SQLConf$SQLConfigBuilder$$anonfun$apply$1.apply(SQLConf.scala:51)
at org.apache.spark.internal.config.TypedConfigBuilder$$anonfun$createWithDefault$1.apply(ConfigBuilder.scala:122)
at org.apache.spark.internal.config.TypedConfigBuilder$$anonfun$createWithDefault$1.apply(ConfigBuilder.scala:122)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.internal.config.TypedConfigBuilder.createWithDefault(ConfigBuilder.scala:122)
at org.apache.spark.sql.hive.HiveUtils$.<init>(HiveUtils.scala:103)
at org.apache.spark.sql.hive.HiveUtils$.<clinit>(HiveUtils.scala)
... 14 more
Any suggestion is appreciated
Thanks
Robin
This is what I am using:
import org.apache.spark.sql.SparkSession
object LoadCortexDataLake extends App {
val spark = SparkSession.builder().appName("Cortex-Batch").enableHiveSupport().getOrCreate()
spark.read.parquet(file).createOrReplaceTempView("temp")
spark.sql(s"insert overwrite table $table_nm partition(year='$yr',month='$mth',day='$dt') select * from temp")
I think you should use 'sparkSession.sql' instead of 'sparkSession2.sql'
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
val spark = SparkSession.
builder().
appName("Connect to Hive").
config("hive.metastore.warehouse.uris","thrift://cdh-hadoop-master:Port").
enableHiveSupport().
getOrCreate()
val df = spark.sql("SELECT * FROM table_name")
I have been trying to understand how spark streaming and hbase connect, but have not been successful. What I am trying to do is given a spark stream, process that stream and store the results in an hbase table. So far this is what I have:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.storage.StorageLevel
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{HBaseAdmin,HTable,Put,Get}
import org.apache.hadoop.hbase.util.Bytes
def blah(row: Array[String]) {
val hConf = new HBaseConfiguration()
val hTable = new HTable(hConf, "table")
val thePut = new Put(Bytes.toBytes(row(0)))
thePut.add(Bytes.toBytes("cf"), Bytes.toBytes(row(0)), Bytes.toBytes(row(0)))
hTable.put(thePut)
}
val ssc = new StreamingContext(sc, Seconds(1))
val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.map(_.split(","))
val store = words.foreachRDD(rdd => rdd.foreach(blah))
ssc.start()
I am currently running the above code in spark-shell. I am not sure what I am doing wrong.
I get the following error in the shell:
14/09/03 16:21:03 ERROR scheduler.JobScheduler: Error running job streaming job 1409786463000 ms.0
org.apache.spark.SparkException: Job aborted due to stage failure: Task not serializable: java.io.NotSerializableException: org.apache.spark.streaming.StreamingContext
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1033)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1017)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1015)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1015)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitMissingTasks(DAGScheduler.scala:770)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitStage(DAGScheduler.scala:713)
at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1176)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
I also double checked the hbase table, just in case, and nothing new is written in there.
I am running nc -lk 9999 on another terminal to feed in data into the spark-shell for testing.
With help from users on the spark user group, I was able to figure out how to get this to work. It looks like I needed to wrap my streaming, mapping and foreach call around a serializable object:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.storage.StorageLevel
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{HBaseAdmin,HTable,Put,Get}
import org.apache.hadoop.hbase.util.Bytes
object Blaher {
def blah(row: Array[String]) {
val hConf = new HBaseConfiguration()
val hTable = new HTable(hConf, "table")
val thePut = new Put(Bytes.toBytes(row(0)))
thePut.add(Bytes.toBytes("cf"), Bytes.toBytes(row(0)), Bytes.toBytes(row(0)))
hTable.put(thePut)
}
}
object TheMain extends Serializable{
def run() {
val ssc = new StreamingContext(sc, Seconds(1))
val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.map(_.split(","))
val store = words.foreachRDD(rdd => rdd.foreach(Blaher.blah))
ssc.start()
}
}
TheMain.run()
Seems to be a typical antipattern.
See "Design Patterns for using foreachRDD" chapter at http://spark.apache.org/docs/latest/streaming-programming-guide.html for correct pattern.