Spark job is not posting messages to Kafka topic - Scala

I have written a Spark job to read a file, convert the data to JSON, and post the data to Kafka.
I have tried several things:
1. Adding Thread.sleep at the end of the job.
2. Setting linger.ms lower than the sleep time. Nothing works; it just does not post anything to Kafka. I have also tried producer.flush() and producer.close(). No error appears in the log, but still nothing gets posted.
3. If I write a plain standalone producer to post a message to the same Kafka topic, it goes through without any issue, so there is no problem with Kafka itself.
4. I can see from the log that my send method is getting called, and close is called at the end. No error.
Please help!
Here are the important files of the project:
build.sbt:
name := "SparkStreamingExample"
//version := "0.1"
scalaVersion := "2.11.8"
val spark="2.3.1"
val kafka="0.10.1"
// https://mvnrepository.com/artifact/org.apache.kafka/kafka
dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-core" % "2.9.6"
dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.9.6"
dependencyOverrides += "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.9.6"
// https://mvnrepository.com/artifact/com.fasterxml.jackson.dataformat/jackson-dataformat-cbor
dependencyOverrides += "com.fasterxml.jackson.dataformat" % "jackson-dataformat-cbor" % "2.9.6"
libraryDependencies += "org.apache.kafka" % "kafka_2.11" % "2.0.0"
// https://mvnrepository.com/artifact/org.apache.kafka/kafka
libraryDependencies += "org.apache.spark" % "spark-streaming_2.11" % spark
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.3.1"
libraryDependencies +="com.typesafe.play" %"play-json_2.11" % "2.6.6" exclude("com.fasterxml.jackson.core","jackson-databind")
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.3.1"
libraryDependencies +="com.typesafe" % "config" %"1.3.2"
MySparkKafkaProducer.scala
import java.util.Properties
import java.util.concurrent.Future
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
class MySparkKafkaProducer(createProducer: () => KafkaProducer[String, String]) extends Serializable {
/* This is the key idea that allows us to work around running into
NotSerializableExceptions. */
@transient lazy val producer = createProducer()
def send(topic: String, key: String, value: String): Future[RecordMetadata] = {
println("inside send method")
producer.send(new ProducerRecord(topic, key, value))
}
def send(topic: String, value: String)= {
// println("inside send method")
producer.send(new ProducerRecord(topic, value))
}
// producer.send(new ProducerRecord[K, V](topic, value))
}
object MySparkKafkaProducer extends Serializable {
import scala.collection.JavaConversions._
def apply(config:Properties):MySparkKafkaProducer={
val f = () =>{
val producer =new KafkaProducer[String,String](config)
sys.addShutdownHook({
println("calling Closeeeeeeeeeee")
producer.flush()
producer.close
})
producer
}
new MySparkKafkaProducer(f)
}
}
AlibababaMainJob.scala
import java.util.Properties
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import com.typesafe.config.ConfigFactory
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.codehaus.jackson.map.ser.std.StringSerializer
object AlibababaMainJob {
def main(args:Array[String]) {
val ss = SparkSession.builder().master("local[*]").appName("AlibabaCoreJob").getOrCreate()
val conf = new SparkConf().setMaster("local[2]").setAppName("AlibabaCoreJob")
//val ssc = new StreamingContext(conf, Seconds(1))
// val ssc= new StreamingContext(getSparkConf(),6)
val coreJob = new AlibabaCoreJob()
val configuration = Configuration.apply(ConfigFactory.load.resolve)
implicit val rollUpProducer: Broadcast[MySparkKafkaProducer] = ss.sparkContext.broadcast(MySparkKafkaProducer(producerProps(configuration)))
println(s"==========Kafka Config======${configuration.kafka}")
coreJob.processRecordFromFile(ss, rollUpProducer)
Thread.sleep(1000)
//ssc.start()
// println(s"==========Spark context Sarted ]======${ssc.sparkContext.appName}")
/// ssc.awaitTermination()
//
//val ss = SparkSession.builder().master("local[*]").appName("AlibabaCoreJob").getOrCreate()
//Set Up kakfa Configuration:https://stackoverflow.com/questions/31590592/spark-streaming-read-and-write-on-kafka-topic
}
def producerProps(jobConfig:Configuration,extras:(String,String)*):Properties={
val p =new Properties()
p.put("bootstrap.servers",jobConfig.kafka.brokerList)
p.put("key.serializer","org.apache.kafka.common.serialization.StringSerializer")
p.put("value.serializer","org.apache.kafka.common.serialization.StringSerializer")
p.put("acks","all")
p.put("retries","3")
p.put("linger.ms", "1")
p
}
// coreJob.processRecordFromFile(ss,rollUpProducer)
//}
}
AlibabaCoreJob.scala
import java.util.Properties
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
import play.api.libs.json._
import org.apache.kafka.clients.producer.ProducerConfig
class AlibabaCoreJob extends Serializable {
// implicit val transctionWrites = Json.writes[Transction]
//case class Transction(productCode:String,description:String,brand:String,category:String,unitPrice:String,ordItems:String,mode:String) extends Serializable
def processRecordFromFile(ss:SparkSession,kafkaProducer:Broadcast[MySparkKafkaProducer]):Unit={
println("Entering processRecordFromFile")
val rddFromFile = ss.sparkContext.textFile("src/main/resources/12_transactions_case_study.csv")
println("Entering loaded file")
val fileAfterHeader = rddFromFile.mapPartitionsWithIndex(
(idx, iterator) => if (idx == 0)
iterator.drop(1) else iterator) // drop(1) skips the header row in the first partition
println("Removed header")
processRdd(fileAfterHeader, kafkaProducer)
}
//Set Up kakfa Configuration:https://stackoverflow.com/questions/31590592/spark-streaming-read-and-write-on-kafka-topic
def processRdd(fileAfterHeader: RDD[String],kafkaProducer:Broadcast[MySparkKafkaProducer]) = {
println("Entering processRdd")
val rddList = fileAfterHeader.mapPartitions(
line => {
// println("lineeeeee>>>"+line)
line.map(x => x.split(",")).map(y => Transction(y(0), y(1), y(2), y(3), y(4), y(5), y(6))).toList.toIterator
})
rddList.foreach(lineitem=>{
// println("Entering foreach>>>>")
val jsonString:String=Json.stringify(Json.toJson(lineitem))
//val jsonString:String=lineitem.A
// println("calling kafka producer")
kafkaProducer.value.send("topic",jsonString)
// println("done calling kafka producer")
})
}
//Suppose you want to drop 1s 3 lines from file
// val da = fi.mapPartitionsWithIndex{ (id_x, iter) => if (id_x == 0) iter.drop(3) else iter }
//Create RowRDD by mapping each line to the required fields
// val rowRdd = da.map(x=>Row(x(0), x(1)))
//Map Partitions:
}

In your project, add the following dependencies: spark-sql, spark-core, spark-streaming, and spark-sql-kafka-0-10 (the last one provides the "kafka" data source used below).
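A sketch of the extra sbt line, assuming the Spark 2.3.1 / Scala 2.11 versions already used in your build.sbt (adjust the version to match your Spark build):
libraryDependencies += "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.3.1"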
You can then read the given file into a DataFrame, perform whatever processing you want, and once the processing is finished, write the DataFrame to Kafka as follows:
resultDF.writeStream.format("kafka")
.option("kafka.bootstrap.servers", "host1:port1,host2:port2")
.option("topic", "topic1")
.start()
(A streaming query to the kafka sink also needs a checkpointLocation option; if you read the file with spark.read, the DataFrame is a batch one, so use resultDF.write ... .save() instead.)
You can refer to the Structured Streaming + Kafka integration guide for further clarity.
Note that I have assumed the results of your processing are stored in a DataFrame called resultDF.
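Putting it together, here is a minimal batch sketch (not your exact job): it reads the CSV from the question, serializes each row to JSON, and writes it to the Kafka sink. The file path, broker list, and topic name are placeholders, and it assumes spark-sql-kafka-0-10 is on the classpath.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, struct, to_json}

object FileToKafkaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("FileToKafkaSketch").getOrCreate()

    // Read the CSV as a batch DataFrame (header assumed; adjust options to your file).
    val df = spark.read.option("header", "true").csv("src/main/resources/12_transactions_case_study.csv")

    // The kafka sink expects a string/binary "value" column (and optionally a "key" column).
    val resultDF = df.select(to_json(struct(col("*"))).alias("value"))

    resultDF.write
      .format("kafka")
      .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
      .option("topic", "topic1")
      .save()

    spark.stop()
  }
}
This avoids broadcasting a producer entirely; Spark handles the Kafka writes through the data source.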

Related

SharedSparkSession is not working in Spark MemoryStream scala testing

I have tried to write a Spark MemoryStream unit test case, and SharedSparkSession is not importing in my test case program.
import org.apache.spark.sql.test.SharedSparkSession
class MemoryStreamTest extends AnyFunSuite with SharedSparkSession {
....
}
My build.sbt configuration is below:
scalaVersion := "2.12.0"
val sparkVersion = "3.0.0"
libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion
libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion
libraryDependencies += "org.apache.spark" %% "spark-streaming" % sparkVersion
libraryDependencies += "org.apache.spark" %% "spark-sql-kafka-0-10" % sparkVersion
libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka-0-10" % sparkVersion
libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.5" % "test"
libraryDependencies += "com.novocode" % "junit-interface" % "0.11" % "test"**
Do I need to add any other dependency artifacts, or is a different scalatest version required?
The program below has an import issue for SharedSparkSession.
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.execution.streaming.{LongOffset, MemoryStream}
import org.scalatest.funsuite.AnyFunSuite
import org.apache.spark.sql.test.SharedSparkSession
class MemoryStreamTest extends AnyFunSuite with SharedSparkSession {
test("spark structured streaming can read from memory socket") {
// We can import sql implicits
implicit val sqlCtx = sparkSession.sqlContext
import sqlImplicits._
val events = MemoryStream[String]
val queryName: String = "calleventaggs"
// Add events to MemoryStream as if they came from Kafka
val batch = Seq(
"this is a value to read",
"and this is another value"
)
val currentOffset = events.addData(batch)
val streamingQuery = StreamingDataFrames.writeData(events.toDF(), "memory", queryName)
streamingQuery.processAllAvailable()
events.commit(currentOffset.asInstanceOf[LongOffset])
val result: DataFrame = sparkSession.table(queryName)
result.show
streamingQuery.awaitTermination(1000L)
assertResult(batch.size)(result.count)
val values = result.take(2)
assertResult(batch(0))(values(0).getString(0))
assertResult(batch(1))(values(1).getString(0))
}
}
The SharedSparkSession is an internal test utility for the Apache-Spark project and not accessible through the packages you have provided in your sbt file.
The ScalaDocs do not mention the SharedSparkSession.
You will see that the trait SharedSparkSession extends SQLTestUtils which is another testing utility.
For your unit tests it is usually sufficient to just create a local SparkSession.
See the working code below.
import module.JsValueToString
import org.apache.log4j.{Level, Logger}
import org.scalatest.funsuite.AnyFunSuite
import org.apache.spark.sql.functions.{col, concat, current_timestamp, date_format, from_json, from_unixtime, from_utc_timestamp, lit, regexp_replace, sha2, struct, to_json, to_utc_timestamp, udf}
import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession}
import org.apache.spark.sql.execution.streaming.{LongOffset, MemoryStream}
import org.scalatest.BeforeAndAfterAll
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
class KafkaProducerFlattenerTestCase extends AnyFunSuite with BeforeAndAfterAll {
Logger.getLogger("org").setLevel(Level.ERROR)
@transient var spark: SparkSession = _
override def beforeAll(): Unit = {
spark = SparkSession
.builder()
.appName("KafkaProducerFlattenerTestCase")
.master("local[*]")
.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
}
override def afterAll(): Unit = {
spark.stop()
}
test("MemoryStream testcase for Flattener JSON") {
implicit val sparkSession: SparkSession = spark
implicit val ctx = spark.sqlContext
import sparkSession.implicits._
val input = MemoryStream[String]
val valueDf = input.toDS().selectExpr("CAST(value As STRING)")
val df2 = (valueDf.select(to_json(struct(col("*"))).alias("content")))
df2.printSchema()
print(" Before Write Stream")
val formatName = ("memory")
val query = df2.writeStream
.queryName("testCustomSinkBasic")
.format(formatName)
.start()
val jsonContent = readJson()
input.addData(jsonContent)
assert(query.isActive === true)
query.processAllAvailable()
assert(query.exception === None)
print("query....... "+query.runId)
val eventName = spark.sql("select * from testCustomSinkBasic")
val actualValString = JsValueToString(eventName)
println("actualValString..... "+actualValString)
assert(actualValString === expectValue())
}
def readJson(): String ={
val fileContents = Source.fromFile("src/resources/Json.txt").getLines().mkString
fileContents
}
def expectValue(): String = {
val expectVal = """{"publishTime":"123","name[0].lastname":"def","name[0].fname":"abc","name[1].lastname":"jkl","name[1].fname":"ghi","lpid":"1234"}"""
expectVal
}
}
The class the test is expected to cover:
import com.usb.transformation.JsFlattener
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, struct, to_json}
import play.api.libs.json.{JsObject, JsValue, Json}
object JsValueToString extends Serializable{
var df3 : String = null
def apply(eventName : DataFrame): (String) ={
eventName.foreach(x => {
val content = x.getAs[String]("content").replace("\\", "")
val subStr = content.substring(10, content.length()-2)
println("content ...."+content)
println("subString "+subStr)
val str2Json: JsValue = Json.parse(subStr)
df3 = JsFlattener(str2Json).as[JsObject].toString
println("df3 Value......"+df3)
})
df3
}
}

Using Apache Flink to consume from a Kafka topic then processing the stream with Flink CEP

In this project, I'm trying to consume data from a Kafka topic using Flink and then process the stream to detect a pattern using Flink CEP.
The Kafka connector part works and data is being fetched, but the CEP part doesn't work for some reason.
I'm using Scala in this project.
build.sbt:
version := "0.1"
scalaVersion := "2.11.12"
libraryDependencies += "org.apache.flink" %% "flink-streaming-scala" % "1.12.2"
libraryDependencies += "org.apache.kafka" %% "kafka" % "2.3.0"
libraryDependencies += "org.apache.flink" %% "flink-connector-kafka" % "1.12.2"
libraryDependencies += "org.apache.flink" %% "flink-cep-scala" % "1.12.2"
the main code:
import org.apache.flink.api.common.serialization.SimpleStringSchema
import java.util
import java.util.Properties
import org.apache.flink.cep.PatternSelectFunction
import org.apache.flink.cep.scala.CEP
import org.apache.flink.streaming.api.scala._
import org.apache.flink.cep.scala.pattern.Pattern
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.cep.pattern.conditions.IterativeCondition
object flinkExample {
def main(args: Array[String]): Unit = {
val CLOSE_THRESHOLD: Double = 140.00
val properties = new Properties()
properties.setProperty("bootstrap.servers", "localhost:9092")
properties.setProperty("zookeeper.connect", "localhost:2181")
properties.setProperty("group.id", "test")
val consumer = new FlinkKafkaConsumer[String]("test", new SimpleStringSchema(), properties)
consumer.setStartFromEarliest
val see: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
val src: DataStream[String] = see.addSource(consumer)
val keyedStream: DataStream[Stock] = src.map(v => v)
.map {
v =>
val data = v.split(":")
val date = data(0)
val close = data(1).toDouble
Stock(date,close)
}
val pat = Pattern
.begin[Stock]("start")
.where(_.Adj_Close > CLOSE_THRESHOLD)
val patternStream = CEP.pattern(keyedStream, pat)
val result = patternStream.select(
patternSelectFunction = new PatternSelectFunction[Stock, String]() {
override def select(pattern: util.Map[String, util.List[Stock]]): String = {
val data = pattern.get("first").get(0)
data.toString
}
}
)
result.print()
see.execute("ASK Flink Kafka")
}
case class Stock(date: String,
Adj_Close: Double)
{
override def toString: String = s"Stock date: $date, Adj Close: $Adj_Close"
}
}
Data coming from Kafka are in string format: "date:value"
Scala version: 2.11.12
Flink version: 1.12.2
Kafka version: 2.3.0
I'm building the project using: sbt assembly, and then deploy the jar in the flink dashboard.
With pattern.get("first") you are selecting a pattern named "first" from the pattern sequence, but the pattern sequence only has one pattern, which is named "start". Trying changing "first" to "start".
Also, CEP has to be able to sort the stream into temporal order in order to do pattern matching. You should define a watermark strategy. For processing time semantics you can use WatermarkStrategy.noWatermarks().
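A minimal sketch of both suggestions applied to the stream from the question (Stock, keyedStream, and pat are the names from your code; WatermarkStrategy comes from flink-core):
import org.apache.flink.api.common.eventtime.WatermarkStrategy

// Attach an explicit (no-op) watermark strategy for processing-time semantics.
val stockStream: DataStream[Stock] =
  keyedStream.assignTimestampsAndWatermarks(WatermarkStrategy.noWatermarks[Stock]())

val patternStream = CEP.pattern(stockStream, pat)
val result: DataStream[String] = patternStream.select(
  new PatternSelectFunction[Stock, String]() {
    override def select(pattern: util.Map[String, util.List[Stock]]): String =
      pattern.get("start").get(0).toString // use the name declared in Pattern.begin
  }
)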

Spark Scala: "cannot resolve symbol saveAsTextFile (reduceByKey)" - IntelliJ Idea

I suppose some dependencies are not defined in the build.sbt file.
I've added library dependencies in build.sbt, but I'm still getting the error mentioned in the title of this question. I tried searching for a solution on Google but couldn't find one.
My spark scala source code (filterEventId100.scala) :
package com.projects.setTopBoxDataAnalysis
import java.lang.System._
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.sql.SparkSession
object filterEventId100 extends App {
if (args.length < 2) {
println("Usage: JavaWordCount <Input-File> <Output-file>")
exit(1)
}
val spark = SparkSession
.builder
.appName("FilterEvent100")
.getOrCreate()
val data = spark.read.textFile(args(0)).rdd
val result = data.flatMap{line: String => line.split("\n")}
.map{serverData =>
val serverDataArray = serverData.replace("^", "::")split("::")
val evenId = serverDataArray(2)
if (evenId.equals("100")) {
val serverId = serverDataArray(0)
val timestempTo = serverDataArray(3)
val timestempFrom = serverDataArray(6)
val server = new Servers(serverId, timestempFrom, timestempTo)
val res = (serverId, server.dateDiff(server.timestampFrom, server.timestampTo))
res
}
}.reduceByKey{
case(x: Long, y: Long) => if ((x, y) != null) {
if (x > y) x else y
}
}
result.saveAsTextFile(args(1))
spark.stop
}
class Servers(val serverId: String, val timestampFrom: String, val timestampTo: String) {
val DATE_FORMAT = "yyyy-MM-dd hh:mm:ss.SSS"
private def convertStringToDate(s: String): Date = {
val dateFormat = new SimpleDateFormat(DATE_FORMAT)
dateFormat.parse(s)
}
private def convertDateStringToLong(dateAsString: String): Long = {
convertStringToDate(dateAsString).getTime
}
def dateDiff(tFrom: String, tTo: String): Long = {
val dDiff = convertDateStringToLong(tTo) - tFrom.toLong
dDiff
}
}
My build.sbt file:
name := "SetTopProject"
version := "0.1"
scalaVersion := "2.12.8"
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "2.4.3" exclude ("org.apache.hadoop","hadoop-yarn-server-web-proxy"),
"org.apache.spark" %% "spark-sql_2.12" % "2.4.3" exclude ("org.apache.hadoop","hadoop-yarn-server-web-proxy"),
"org.apache.hadoop" %% "hadoop-common" % "3.2.0" exclude ("org.apache.hadoop","hadoop-yarn-server-web-proxy"),
"org.apache.spark" %% "spark-sql_2.12" % "2.4.3" exclude ("org.apache.hadoop","hadoop-yarn-server-web-proxy"),
"org.apache.spark" %% "spark-hive_2.12" % "2.4.3" exclude ("org.apache.hadoop","hadoop-yarn-server-web-proxy"),
"org.apache.spark" %% "spark-yarn_2.12" % "2.4.3" exclude ("org.apache.hadoop","hadoop-yarn-server-web-proxy")
)
I was expecting everything will be fine because
val spark = SparkSession
.builder
.appName("FilterEvent100")
.getOrCreate()
is defined well (without any compiler's errors) and I use spark value to define data value:
val data = spark.read.textFile(args(0)).rdd
which calls the saveAsTextFile and reduceByKey functions:
val result = data.flatMap{line: String => line.split("\n")}...
}.reduceByKey {case(x: Long, y: Long) => if ((x, y) != null) {
if (x > y) x else y
}
result.saveAsTextFile(args(1))
What should I do to remove the compiler errors for the saveAsTextFile and reduceByKey calls?
Replace
val spark = SparkSession
.builder
.appName("FilterEvent100")
.getOrCreate()
val data = spark.read.textFile(args(0)).rdd
with
val conf = new SparkConf().setAppName("FilterEvent100")
val sc = new SparkContext(conf)
val spark = SparkSession.builder.config(sc.getConf).getOrCreate()
val data = sc.textFile(args(0))

java.lang.ClassNotFoundException: org.apache.spark.sql.DataFrame error when running Scala MongoDB connector

I am trying to run a Scala example with SBT to read data from MongoDB. I am getting this error whenever I try to access the data read from Mongo into the RDD.
Exception in thread "dag-scheduler-event-loop" java.lang.NoClassDefFoundError: org/apache/spark/sql/DataFrame
at java.lang.Class.getDeclaredMethods0(Native Method)
at java.lang.Class.privateGetDeclaredMethods(Class.java:2701)
at java.lang.Class.getDeclaredMethod(Class.java:2128)
at java.io.ObjectStreamClass.getPrivateMethod(ObjectStreamClass.java:1431)
at java.io.ObjectStreamClass.access$1700(ObjectStreamClass.java:72)
at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:494)
at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:468)
at java.security.AccessController.doPrivileged(Native Method)
at java.io.ObjectStreamClass.<init>(ObjectStreamClass.java:468)
at java.io.ObjectStreamClass.lookup(ObjectStreamClass.java:365)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1134)
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
I have imported DataFrame explicitly, even though it is not used in my code. Can anyone help with this issue?
My code:
package stream
import org.apache.spark._
import org.apache.spark.SparkContext._
import com.mongodb.spark._
import com.mongodb.spark.config._
import com.mongodb.spark.rdd.MongoRDD
import org.bson.Document
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.DataFrame
object SpaceWalk {
def main(args: Array[String]) {
val sparkConf = new SparkConf().setAppName("SpaceWalk")
.setMaster("local[*]")
.set("spark.mongodb.input.uri", "mongodb://127.0.0.1/nasa.eva")
.set("spark.mongodb.output.uri", "mongodb://127.0.0.1/nasa.astronautTotals")
val sc = new SparkContext(sparkConf)
val rdd = sc.loadFromMongoDB()
def breakoutCrew ( document: Document ): List[(String,Int)] = {
println("INPUT"+document.get( "Duration").asInstanceOf[String])
var minutes = 0;
val timeString = document.get( "Duration").asInstanceOf[String]
if( timeString != null && !timeString.isEmpty ) {
val time = document.get( "Duration").asInstanceOf[String].split( ":" )
minutes = time(0).toInt * 60 + time(1).toInt
}
import scala.util.matching.Regex
val pattern = new Regex("(\\w+\\s\\w+)")
val names = pattern findAllIn document.get( "Crew" ).asInstanceOf[String]
var tuples : List[(String,Int)] = List()
for ( name <- names ) { tuples = tuples :+ (( name, minutes ) ) }
return tuples
}
val logs = rdd.flatMap( breakoutCrew ).reduceByKey( (m1: Int, m2: Int) => ( m1 + m2 ) )
//logs.foreach(println)
def mapToDocument( tuple: (String, Int ) ): Document = {
val doc = new Document();
doc.put( "name", tuple._1 )
doc.put( "minutes", tuple._2 )
return doc
}
val writeConf = WriteConfig(sc)
val writeConfig = WriteConfig(Map("collection" -> "astronautTotals", "writeConcern.w" -> "majority", "db" -> "nasa"), Some(writeConf))
logs.map( mapToDocument ).saveToMongoDB( writeConfig )
import org.apache.spark.sql.SQLContext
import com.mongodb.spark.sql._
import org.apache.spark.sql.DataFrame
// load the first dataframe "EVAs"
val sqlContext = new SQLContext(sc);
import sqlContext.implicits._
val evadf = sqlContext.read.mongo()
evadf.printSchema()
evadf.registerTempTable("evas")
// load the 2nd dataframe "astronautTotals"
val astronautDF = sqlContext.read.option("collection", "astronautTotals").mongo[astronautTotal]()
astronautDF.printSchema()
astronautDF.registerTempTable("astronautTotals")
sqlContext.sql("SELECT astronautTotals.name, astronautTotals.minutes FROM astronautTotals" ).show()
sqlContext.sql("SELECT astronautTotals.name, astronautTotals.minutes, evas.Vehicle, evas.Duration FROM " +
"astronautTotals JOIN evas ON astronautTotals.name LIKE evas.Crew" ).show()
}
}
case class astronautTotal ( name: String, minutes: Integer )
This is my sbt file -
name := "Project"
version := "1.0"
scalaVersion := "2.11.7"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.0.0"
libraryDependencies += "org.apache.spark" %% "spark-streaming" % "2.0.0"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.0.0"
//libraryDependencies += "org.apache.spark" %% "spark-streaming-twitter" % "1.2.1"
libraryDependencies += "org.apache.bahir" %% "spark-streaming-twitter" % "2.0.0"
libraryDependencies += "org.mongodb.spark" %% "mongo-spark-connector" % "0.1"
addCommandAlias("c1", "run-main stream.SaveTweets")
addCommandAlias("c2", "run-main stream.SpaceWalk")
outputStrategy := Some(StdoutOutput)
//outputStrategy := Some(LoggedOutput(log: Logger))
fork in run := true
This error message is because you are using an incompatible library that only supports Spark 1.x. You should use mongo-spark-connector 2.0.0+ instead. See: https://docs.mongodb.com/spark-connector/v2.0/
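For example, the dependency line would change to something like this (a sketch; pick the connector release that matches your Spark 2.0.0 / Scala 2.11 build per the connector's compatibility table):
libraryDependencies += "org.mongodb.spark" %% "mongo-spark-connector" % "2.0.0"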

Analyze Twitter data with Spark

Can anyone help me with how to analyze Twitter data based on whatever keys I specify? I found this code, but it gives me an error.
import java.io.File
import com.google.gson.Gson
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Collect at least the specified number of tweets into json text files.
*/
object Collect {
private var numTweetsCollected = 0L
private var partNum = 0
private var gson = new Gson()
def main(args: Array[String]) {
// Process program arguments and set properties
if (args.length < 3) {
System.err.println("Usage: " + this.getClass.getSimpleName +
"<outputDirectory> <numTweetsToCollect> <intervalInSeconds> <partitionsEachInterval>")
System.exit(1)
}
val Array(outputDirectory, Utils.IntParam(numTweetsToCollect), Utils.IntParam(intervalSecs), Utils.IntParam(partitionsEachInterval)) =
Utils.parseCommandLineWithTwitterCredentials(args)
val outputDir = new File(outputDirectory.toString)
if (outputDir.exists()) {
System.err.println("ERROR - %s already exists: delete or specify another directory".format(
outputDirectory))
System.exit(1)
}
outputDir.mkdirs()
println("Initializing Streaming Spark Context...")
val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(intervalSecs))
val tweetStream = TwitterUtils.createStream(ssc, Utils.getAuth)
.map(gson.toJson(_))
tweetStream.foreachRDD((rdd, time) => {
val count = rdd.count()
if (count > 0) {
val outputRDD = rdd.repartition(partitionsEachInterval)
outputRDD.saveAsTextFile(outputDirectory + "/tweets_" + time.milliseconds.toString)
numTweetsCollected += count
if (numTweetsCollected > numTweetsToCollect) {
System.exit(0)
}
}
})
ssc.start()
ssc.awaitTermination()
}
}
Error is
object gson is not a member of package com.google
If you know of any link about this or how to fix the problem, can you share it with me? I want to analyze Twitter data with Spark.
Thanks. :)
Like Peter pointed out, you are missing the gson dependency, so you'll need to add the following dependency to your build.sbt:
libraryDependencies += "com.google.code.gson" % "gson" % "2.4"
You can also do the following to define all the dependencies in one sequence:
libraryDependencies ++= Seq(
"com.google.code.gson" % "gson" % "2.4",
"org.apache.spark" %% "spark-core" % "1.2.0",
"org.apache.spark" %% "spark-streaming" % "1.2.0",
"org.apache.spark" %% "spark-streaming-twitter" % "1.2.0"
)
Bonus: in case of other missing dependencies, you can search for your dependency on http://mvnrepository.com/, and if you need to find the associated jar/dependency for a given class, you can also use the findjar website.