Anaylze twitter datas with Spark - scala

Anyone else help me about how can i analyze twitter data based on 'keys' whatever i write.I found this code but this is give me an error.
import java.io.File
import com.google.gson.Gson
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Collect at least the specified number of tweets into json text files.
*/
object Collect {
private var numTweetsCollected = 0L
private var partNum = 0
private var gson = new Gson()
def main(args: Array[String]) {
// Process program arguments and set properties
if (args.length < 3) {
System.err.println("Usage: " + this.getClass.getSimpleName +
"<outputDirectory> <numTweetsToCollect> <intervalInSeconds> <partitionsEachInterval>")
System.exit(1)
}
val Array(outputDirectory, Utils.IntParam(numTweetsToCollect), Utils.IntParam(intervalSecs), Utils.IntParam(partitionsEachInterval)) =
Utils.parseCommandLineWithTwitterCredentials(args)
val outputDir = new File(outputDirectory.toString)
if (outputDir.exists()) {
System.err.println("ERROR - %s already exists: delete or specify another directory".format(
outputDirectory))
System.exit(1)
}
outputDir.mkdirs()
println("Initializing Streaming Spark Context...")
val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(intervalSecs))
val tweetStream = TwitterUtils.createStream(ssc, Utils.getAuth)
.map(gson.toJson(_))
tweetStream.foreachRDD((rdd, time) => {
val count = rdd.count()
if (count > 0) {
val outputRDD = rdd.repartition(partitionsEachInterval)
outputRDD.saveAsTextFile(outputDirectory + "/tweets_" + time.milliseconds.toString)
numTweetsCollected += count
if (numTweetsCollected > numTweetsToCollect) {
System.exit(0)
}
}
})
ssc.start()
ssc.awaitTermination()
}
}
Error is
object gson is not a member of package com.google
If you know any link about it or fix this problem can you share with me,because i want to analyze twitter datas with spark.
Thanks.:)

Like Peter pointed out, you are missing the gson dependency. So you'll need to add the following dependency to your build.sbt :
libraryDependencies += "com.google.code.gson" % "gson" % "2.4"
You can also do the following to define all the dependencies in one sequence :
libraryDependencies ++= Seq(
"com.google.code.gson" % "gson" % "2.4",
"org.apache.spark" %% "spark-core" % "1.2.0",
"org.apache.spark" %% "spark-streaming" % "1.2.0",
"org.apache.spark" %% "spark-streaming-twitter" % "1.2.0"
)
Bonus: In case of other missing dependencies, you can try to search your dependency on the http://mvnrepository.com/ and if you need to find the associated jar/dependency for a given class, you can also use the findjar website

Related

Trying to read file from s3 with FLINK using the IDE getting Class org.apache.hadoop.fs.s3a.S3AFileSystem not found

I am trying to read a file from s3 using Flink from IntelliJ and getting the following exception:
Caused by: java.lang.ClassNotFoundException: Class
org.apache.hadoop.fs.s3a.S3AFileSystem not found
This how my code looks like :
import org.apache.flink.api.scala.createTypeInformation
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.column.page.PageReadStore
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.hadoop.util.HadoopInputFile
import org.apache.parquet.io.ColumnIOFactory
class ParquetSourceFunction extends SourceFunction[String]{
override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
val inputPath = "s3a://path-to-bucket/"
val outputPath = "s3a://path-to-output-bucket/"
val conf = new Configuration()
conf.set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
val readFooter = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(inputPath), conf))
val metadata = readFooter.getFileMetaData
val schema = metadata.getSchema
val parquetFileReader = new ParquetFileReader(conf, metadata, new Path(inputPath), readFooter.getRowGroups, schema.getColumns)
// val parquetFileReader2 = new ParquetFileReader(new Path(inputPath), ParquetReadOptions)
var pages: PageReadStore = null
try {
while ({ pages = parquetFileReader.readNextRowGroup; pages != null }) {
val rows = pages.getRowCount
val columnIO = new ColumnIOFactory().getColumnIO(schema)
val recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema))
(0L until rows).foreach { _ =>
val group = recordReader.read()
val myString = group.getString("field_name", 0)
ctx.collect(myString)
}
}
}
}
override def cancel(): Unit = ???
}
object Job {
def main(args: Array[String]): Unit = {
// set up the execution environment
lazy val env = StreamExecutionEnvironment.getExecutionEnvironment
lazy val stream = env.addSource(new ParquetSourceFunction)
stream.print()
env.execute()
}
}
Sbt dependencies :
val flinkVersion = "1.12.1"
val flinkDependencies = Seq(
"org.apache.flink" %% "flink-clients" % flinkVersion,// % "provided",
"org.apache.flink" %% "flink-scala" % flinkVersion,// % "provided",
"org.apache.flink" %% "flink-streaming-scala" % flinkVersion, // % "provided")
"org.apache.flink" %% "flink-parquet" % flinkVersion)
lazy val root = (project in file(".")).
settings(
libraryDependencies ++= flinkDependencies,
libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "3.3.0" ,
libraryDependencies += "org.apache.parquet" % "parquet-hadoop" % "1.11.1",
libraryDependencies += "org.apache.flink" %% "flink-table-planner-blink" % "1.12.1" //% "provided"
)
S3 is only supported by adding the respective flink-s3-fs-hadoop to your plugin folder as described on the plugin docs. For an IDE local setup, the root that should contain the plugins dir is the working directory by default. You can override it by using the env var FLINK_PLUGINS_DIR.
To get the flink-s3-fs-hadoop into plugin, I'm guessing some sbt glue is necessary (or you do it once manually). In gradle, I'd define a plugin scope and copy the jars in a custom task to the plugin dir.

Spark Job is not posting message to Kafka topic

I have written spark job to read one file, convert the data to json and post the data to Kafka:
I tried all options like
1.Putting thread.sleep
2.changing linger.ms lesser than thread.sleep.But nothing is working out..it Just not post any thing to kafKa .I have tried producer.flush()/producer.close().No error is coming in log.But still it is just not posting any thing.
If i write a plain standalone producer to post the message to same kafka topic ,it is going without any issue.
Hence there is no issue with Kafka as such.
4.I can see my send method is getting called from log .Also at end close is getting called .No error.
Please help!!!!!!!!!!!!
Here is my Important files of the project:
build.sbt:
name := "SparkStreamingExample"
//version := "0.1"
scalaVersion := "2.11.8"
val spark="2.3.1"
val kafka="0.10.1"
// https://mvnrepository.com/artifact/org.apache.kafka/kafka
dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-core" % "2.9.6"
dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.9.6"
dependencyOverrides += "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.9.6"
// https://mvnrepository.com/artifact/com.fasterxml.jackson.dataformat/jackson-dataformat-cbor
dependencyOverrides += "com.fasterxml.jackson.dataformat" % "jackson-dataformat-cbor" % "2.9.6"
libraryDependencies += "org.apache.kafka" % "kafka_2.11" % "2.0.0"
// https://mvnrepository.com/artifact/org.apache.kafka/kafka
libraryDependencies += "org.apache.spark" % "spark-streaming_2.11" % spark
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.3.1"
libraryDependencies +="com.typesafe.play" %"play-json_2.11" % "2.6.6" exclude("com.fasterxml.jackson.core","jackson-databind")
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.3.1"
libraryDependencies +="com.typesafe" % "config" %"1.3.2"
MySparkKafkaProducer.scala
import java.util.Properties
import java.util.concurrent.Future
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
class MySparkKafkaProducer(createProducer: () => KafkaProducer[String, String]) extends Serializable {
/* This is the key idea that allows us to work around running into
NotSerializableExceptions. */
#transient lazy val producer = createProducer()
def send(topic: String, key: String, value: String): Future[RecordMetadata] = {
println("inside send method")
producer.send(new ProducerRecord(topic, key, value))
}
def send(topic: String, value: String)= {
// println("inside send method")
producer.send(new ProducerRecord(topic, value))
}
// producer.send(new ProducerRecord[K, V](topic, value))
}
object MySparkKafkaProducer extends Serializable {
import scala.collection.JavaConversions._
def apply(config:Properties):MySparkKafkaProducer={
val f = () =>{
val producer =new KafkaProducer[String,String](config)
sys.addShutdownHook({
println("calling Closeeeeeeeeeee")
producer.flush()
producer.close
})
producer
}
new MySparkKafkaProducer(f)
}
}
AlibababaMainJob.scala
import java.util.Properties
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import com.typesafe.config.ConfigFactory
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.codehaus.jackson.map.ser.std.StringSerializer
object AlibababaMainJob {
def main(args:Array[String]) {
val ss = SparkSession.builder().master("local[*]").appName("AlibabaCoreJob").getOrCreate()
val conf = new SparkConf().setMaster("local[2]").setAppName("AlibabaCoreJob")
//val ssc = new StreamingContext(conf, Seconds(1))
// val ssc= new StreamingContext(getSparkConf(),6)
val coreJob = new AlibabaCoreJob()
val configuration = Configuration.apply(ConfigFactory.load.resolve)
implicit val rollUpProducer: Broadcast[MySparkKafkaProducer] = ss.sparkContext.broadcast(MySparkKafkaProducer(producerProps(configuration)))
println(s"==========Kafka Config======${configuration.kafka}")
coreJob.processRecordFromFile(ss, rollUpProducer)
Thread.sleep(1000)
//ssc.start()
// println(s"==========Spark context Sarted ]======${ssc.sparkContext.appName}")
/// ssc.awaitTermination()
//
//val ss = SparkSession.builder().master("local[*]").appName("AlibabaCoreJob").getOrCreate()
//Set Up kakfa Configuration:https://stackoverflow.com/questions/31590592/spark-streaming-read-and-write-on-kafka-topic
}
def producerProps(jobConfig:Configuration,extras:(String,String)*):Properties={
val p =new Properties()
p.put("bootstrap.servers",jobConfig.kafka.brokerList)
p.put("key.serializer","org.apache.kafka.common.serialization.StringSerializer")
p.put("value.serializer","org.apache.kafka.common.serialization.StringSerializer")
p.put("acks","all")
p.put("retries","3")
p.put("linger.ms", "1")
p
}
// coreJob.processRecordFromFile(ss,rollUpProducer)
//}
}
AlibabaCoreJob.scala
import java.util.Properties
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
import play.api.libs.json._
import org.apache.kafka.clients.producer.ProducerConfig
class AlibabaCoreJob extends Serializable {
// implicit val transctionWrites = Json.writes[Transction]
//case class Transction(productCode:String,description:String,brand:String,category:String,unitPrice:String,ordItems:String,mode:String) extends Serializable
def processRecordFromFile(ss:SparkSession,kafkaProducer:Broadcast[MySparkKafkaProducer]):Unit={
println("Entering processRecordFromFile")
val rddFromFile = ss.sparkContext.textFile("src/main/resources/12_transactions_case_study.csv")
println("Entering loaded file")
val fileAfterHeader=rddFromFile.mapPartitionsWithIndex(
(idx,iterator)=>if(idx==0)
iterator.drop(0) else iterator)
println("Removed header")
processRdd(fileAfterHeader,kafkaProducer:Broadcast[MySparkKafkaProducer])
}
//Set Up kakfa Configuration:https://stackoverflow.com/questions/31590592/spark-streaming-read-and-write-on-kafka-topic
def processRdd(fileAfterHeader: RDD[String],kafkaProducer:Broadcast[MySparkKafkaProducer]) = {
println("Entering processRdd")
val rddList = fileAfterHeader.mapPartitions(
line => {
// println("lineeeeee>>>"+line)
line.map(x => x.split(",")).map(y => Transction(y(0), y(1), y(2), y(3), y(4), y(5), y(6))).toList.toIterator
})
rddList.foreach(lineitem=>{
// println("Entering foreach>>>>")
val jsonString:String=Json.stringify(Json.toJson(lineitem))
//val jsonString:String=lineitem.A
// println("calling kafka producer")
kafkaProducer.value.send("topic",jsonString)
// println("done calling kafka producer")
})
}
//Suppose you want to drop 1s 3 lines from file
// val da = fi.mapPartitionsWithIndex{ (id_x, iter) => if (id_x == 0) iter.drop(3) else iter }
//Create RowRDD by mapping each line to the required fields
// val rowRdd = da.map(x=>Row(x(0), x(1)))
//Map Partitions:
}
In your project, add the following dependencies: Spark-Sql,Spark-Core,Spark-Streaming,Spa-Streaming-Kafka-0-10.
You can the read the given file in a dataframe, perform any sort of processing that you would want, and then when your processing is finished, you can write the dataframe to kafka as follows
resultDF.writeStream.format("kafka")
.option("kafka.bootstrap.servers", "host1:port1,host2:port2")
.option("topic", "topic1")
You refer the doc here for further clarity.
Note that I have assumed your results of processing would be stored in a Dataframe called resultDF

Spark Scala: "cannot resolve symbol saveAsTextFile (reduceByKey)" - IntelliJ Idea

I suppose some dependencies are not defined in build.sbt file.
I've added library dependencies in build.sbt file, but still I'm getting this error mentioned from title of this question. Try to search for solution on the google but couldn't find it
My spark scala source code (filterEventId100.scala) :
package com.projects.setTopBoxDataAnalysis
import java.lang.System._
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.sql.SparkSession
object filterEventId100 extends App {
if (args.length < 2) {
println("Usage: JavaWordCount <Input-File> <Output-file>")
exit(1)
}
val spark = SparkSession
.builder
.appName("FilterEvent100")
.getOrCreate()
val data = spark.read.textFile(args(0)).rdd
val result = data.flatMap{line: String => line.split("\n")}
.map{serverData =>
val serverDataArray = serverData.replace("^", "::")split("::")
val evenId = serverDataArray(2)
if (evenId.equals("100")) {
val serverId = serverDataArray(0)
val timestempTo = serverDataArray(3)
val timestempFrom = serverDataArray(6)
val server = new Servers(serverId, timestempFrom, timestempTo)
val res = (serverId, server.dateDiff(server.timestampFrom, server.timestampTo))
res
}
}.reduceByKey{
case(x: Long, y: Long) => if ((x, y) != null) {
if (x > y) x else y
}
}
result.saveAsTextFile(args(1))
spark.stop
}
class Servers(val serverId: String, val timestampFrom: String, val timestampTo: String) {
val DATE_FORMAT = "yyyy-MM-dd hh:mm:ss.SSS"
private def convertStringToDate(s: String): Date = {
val dateFormat = new SimpleDateFormat(DATE_FORMAT)
dateFormat.parse(s)
}
private def convertDateStringToLong(dateAsString: String): Long = {
convertStringToDate(dateAsString).getTime
}
def dateDiff(tFrom: String, tTo: String): Long = {
val dDiff = convertDateStringToLong(tTo) - tFrom.toLong
dDiff
}
}
My build.sbt file:
name := "SetTopProject"
version := "0.1"
scalaVersion := "2.12.8"
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "2.4.3" exclude ("org.apache.hadoop","hadoop-yarn-server-web-proxy"),
"org.apache.spark" %% "spark-sql_2.12" % "2.4.3" exclude ("org.apache.hadoop","hadoop-yarn-server-web-proxy"),
"org.apache.hadoop" %% "hadoop-common" % "3.2.0" exclude ("org.apache.hadoop","hadoop-yarn-server-web-proxy"),
"org.apache.spark" %% "spark-sql_2.12" % "2.4.3" exclude ("org.apache.hadoop","hadoop-yarn-server-web-proxy"),
"org.apache.spark" %% "spark-hive_2.12" % "2.4.3" exclude ("org.apache.hadoop","hadoop-yarn-server-web-proxy"),
"org.apache.spark" %% "spark-yarn_2.12" % "2.4.3" exclude ("org.apache.hadoop","hadoop-yarn-server-web-proxy")
)
I was expecting everything will be fine because
val spark = SparkSession
.builder
.appName("FilterEvent100")
.getOrCreate()
is defined well (without any compiler's errors) and I use spark value to define data value:
val data = spark.read.textFile(args(0)).rdd
which calls saveAsTextFile and reducedByKey functions:
val result = data.flatMap{line: String => line.split("\n")}...
}.reducedByKey {case(x: Long, y: Long) => if ((x, y) != null) {
if (x > y) x else y
}
result.saveAsTextFile(args(1))
What I should to to remove compiler errors for saveAsTextFile and reduceByKey functions calls?
Replace
val spark = SparkSession
.builder
.appName("FilterEvent100")
.getOrCreate()
val data = spark.read.textFile(args(0)).rdd
to
val conf = new SparkConf().setAppName("FilterEvent100")
val sc = new SparkContext(conf)
val spark = SparkSession.builder.config(sc.getConf).getOrCreate()
val data = sc.textfile(args(0))

Java Class not Found Exception while doing Spark-submit Scala using sbt

Here is my code that i wrote in scala
package normalisation
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import org.apache.hadoop.fs.{FileSystem,Path}
object Seasonality {
val amplitude_list_c1: Array[Nothing] = Array()
val amplitude_list_c2: Array[Nothing] = Array()
def main(args: Array[String]){
val conf = new SparkConf().setAppName("Normalization")
val sc = new SparkContext(conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val line = "MP"
val ps = "Test"
val location = "hdfs://ipaddress/user/hdfs/{0}/ps/{1}/FS/2018-10-17".format(line,ps)
val files = FileSystem.get(sc.hadoopConfiguration ).listStatus(new Path(location))
for (each <- files) {
var ps_data = sqlContext.read.json(each)
}
println(ps_data.show())
}
The error I received when compiled using sbt package is hereimage
Here is my build.sbt file
name := "OV"
scalaVersion := "2.11.8"
// https://mvnrepository.com/artifact/org.apache.spark/spark-core
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.3.1"
// https://mvnrepository.com/artifact/org.apache.spark/spark-sql
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.3.1"
in Spark Versions > 2 you should generally use SparkSession.
See https://spark.apache.org/docs/2.3.1/api/scala/#org.apache.spark.sql.SparkSession
also then you should be able to do
val spark:SparkSession = ???
val location = "hdfs://ipaddress/user/hdfs/{0}/ps/{1}/FS/2018-10-17".format(line,ps)
spark.read.json(location)
to read all your json files in the directory.
Also I think you'd also get another compile error at
for (each <- files) {
var ps_data = sqlContext.read.json(each)
}
println(ps_data.show())
for ps_data being out of scope.
If you need to use SparkContext for some reason it should indeed be in spark-core. Have you tried restarting your IDE, cleaned caches, etc?
EDIT: I just notices that build.sbt is probably not in the directory where you call sbt package from so sbt won't pick it up

java.lang.ClassNotFoundException: org.apache.spark.sql.DataFrame error when running Scala MongoDB connector

I am trying to run a Scala example with SBT to read data from MongoDB. I am getting this error whenever I try to access the data read from Mongo into the RDD.
Exception in thread "dag-scheduler-event-loop" java.lang.NoClassDefFoundError: org/apache/spark/sql/DataFrame
at java.lang.Class.getDeclaredMethods0(Native Method)
at java.lang.Class.privateGetDeclaredMethods(Class.java:2701)
at java.lang.Class.getDeclaredMethod(Class.java:2128)
at java.io.ObjectStreamClass.getPrivateMethod(ObjectStreamClass.java:1431)
at java.io.ObjectStreamClass.access$1700(ObjectStreamClass.java:72)
at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:494)
at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:468)
at java.security.AccessController.doPrivileged(Native Method)
at java.io.ObjectStreamClass.<init>(ObjectStreamClass.java:468)
at java.io.ObjectStreamClass.lookup(ObjectStreamClass.java:365)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1134)
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
I have imported the Dataframe explicitly, even though it is not used in my code. Can anyone help with this issue?
My code:
package stream
import org.apache.spark._
import org.apache.spark.SparkContext._
import com.mongodb.spark._
import com.mongodb.spark.config._
import com.mongodb.spark.rdd.MongoRDD
import org.bson.Document
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.DataFrame
object SpaceWalk {
def main(args: Array[String]) {
val sparkConf = new SparkConf().setAppName("SpaceWalk")
.setMaster("local[*]")
.set("spark.mongodb.input.uri", "mongodb://127.0.0.1/nasa.eva")
.set("spark.mongodb.output.uri", "mongodb://127.0.0.1/nasa.astronautTotals")
val sc = new SparkContext(sparkConf)
val rdd = sc.loadFromMongoDB()
def breakoutCrew ( document: Document ): List[(String,Int)] = {
println("INPUT"+document.get( "Duration").asInstanceOf[String])
var minutes = 0;
val timeString = document.get( "Duration").asInstanceOf[String]
if( timeString != null && !timeString.isEmpty ) {
val time = document.get( "Duration").asInstanceOf[String].split( ":" )
minutes = time(0).toInt * 60 + time(1).toInt
}
import scala.util.matching.Regex
val pattern = new Regex("(\\w+\\s\\w+)")
val names = pattern findAllIn document.get( "Crew" ).asInstanceOf[String]
var tuples : List[(String,Int)] = List()
for ( name <- names ) { tuples = tuples :+ (( name, minutes ) ) }
return tuples
}
val logs = rdd.flatMap( breakoutCrew ).reduceByKey( (m1: Int, m2: Int) => ( m1 + m2 ) )
//logs.foreach(println)
def mapToDocument( tuple: (String, Int ) ): Document = {
val doc = new Document();
doc.put( "name", tuple._1 )
doc.put( "minutes", tuple._2 )
return doc
}
val writeConf = WriteConfig(sc)
val writeConfig = WriteConfig(Map("collection" -> "astronautTotals", "writeConcern.w" -> "majority", "db" -> "nasa"), Some(writeConf))
logs.map( mapToDocument ).saveToMongoDB( writeConfig )
import org.apache.spark.sql.SQLContext
import com.mongodb.spark.sql._
import org.apache.spark.sql.DataFrame
// load the first dataframe "EVAs"
val sqlContext = new SQLContext(sc);
import sqlContext.implicits._
val evadf = sqlContext.read.mongo()
evadf.printSchema()
evadf.registerTempTable("evas")
// load the 2nd dataframe "astronautTotals"
val astronautDF = sqlContext.read.option("collection", "astronautTotals").mongo[astronautTotal]()
astronautDF.printSchema()
astronautDF.registerTempTable("astronautTotals")
sqlContext.sql("SELECT astronautTotals.name, astronautTotals.minutes FROM astronautTotals" ).show()
sqlContext.sql("SELECT astronautTotals.name, astronautTotals.minutes, evas.Vehicle, evas.Duration FROM " +
"astronautTotals JOIN evas ON astronautTotals.name LIKE evas.Crew" ).show()
}
}
case class astronautTotal ( name: String, minutes: Integer )
This is my sbt file -
name := "Project"
version := "1.0"
scalaVersion := "2.11.7"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.0.0"
libraryDependencies += "org.apache.spark" %% "spark-streaming" % "2.0.0"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.0.0"
//libraryDependencies += "org.apache.spark" %% "spark-streaming-twitter" % "1.2.1"
libraryDependencies += "org.apache.bahir" %% "spark-streaming-twitter" % "2.0.0"
libraryDependencies += "org.mongodb.spark" %% "mongo-spark-connector" % "0.1"
addCommandAlias("c1", "run-main stream.SaveTweets")
addCommandAlias("c2", "run-main stream.SpaceWalk")
outputStrategy := Some(StdoutOutput)
//outputStrategy := Some(LoggedOutput(log: Logger))
fork in run := true
This error message is because you are using an incompatible library that only supports Spark 1.x. You should use mongo-spark-connector 2.0.0+ instead. See: https://docs.mongodb.com/spark-connector/v2.0/