I am learning Twitter integretion with Spark streaming.
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.SparkContext._
import org.apache.spark.streaming.twitter._
import org.apache.spark.SparkConf
/**
* Calculates popular hashtags (topics) over sliding 10 and 60 second windows from a Twitter
* stream. The stream is instantiated with credentials and optionally filters supplied by the
* command line arguments.
*
* Run this on your local machine as
*
*/
object TwitterPopularTags {
def main(args: Array[String]) {
if (args.length < 4) {
System.err.println("Usage: TwitterPopularTags <consumer key> <consumer secret> " +
"<access token> <access token secret> [<filters>]")
System.exit(1)
}
StreamingExamples.setStreamingLogLevels()
val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
val filters = args.takeRight(args.length - 4)
// Set the system properties so that Twitter4j library used by twitter stream
// can use them to generat OAuth credentials
System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
System.setProperty("twitter4j.oauth.accessToken", accessToken)
System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)
val sparkConf = new SparkConf().setAppName("TwitterPopularTags").setMaster("local[2]")
val ssc = new StreamingContext(sparkConf, Seconds(2))
val stream = TwitterUtils.createStream(ssc, None, filters)//Dstream
val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))
val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
.map{case (topic, count) => (count, topic)}
.transform(_.sortByKey(false))
val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
.map{case (topic, count) => (count, topic)}
.transform(_.sortByKey(false))
// Print popular hashtags
topCounts60.foreachRDD(rdd => {
val topList = rdd.take(10)
println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))}
})
topCounts10.foreachRDD(rdd => {
val topList = rdd.take(10)
println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))}
})
ssc.start()
ssc.awaitTermination()
}
}
I am not able to understand fully the 2 code lines below:
val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
val filters = args.takeRight(args.length - 4)
Can someone please explain me these 2 lines?
Thanks and Regards,
val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
args is an Array; take(4) returns a sub-Array with the first (left-most) four elements. Assigning these four elements into Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) means that the val consumerKey will hold the value of the first element; consumerSecret will hold the value of the second, and so on. This is a neat Scala trick of "unpacking" an Array (can be done with other collections too) into named values.
val filters = args.takeRight(args.length - 4)
takeRight(n) returns a sub-Array from the right, meaning the last n elements in the array. Here, an Array with everything but the first four elements is assigned into a new value named filters.
Related
There are some examples of how to create MQTT sources [1] [2] for Spark Streaming. However, I want to create an MQTT sink where I can publish the results instead of using the print() method. I tried to create one MqttSink but I am getting object not serializable error. Then I am basing the code on this blog but I cannot find the method send that I created on the MqttSink object.
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf}
import org.fusesource.mqtt.client.QoS
import org.sense.spark.util.{MqttSink, TaxiRideSource}
object TaxiRideCountCombineByKey {
val mqttTopic: String = "spark-mqtt-sink"
val qos: QoS = QoS.AT_LEAST_ONCE
def main(args: Array[String]): Unit = {
val outputMqtt: Boolean = if (args.length > 0 && args(0).equals("mqtt")) true else false
// Create a local StreamingContext with two working thread and batch interval of 1 second.
// The master requires 4 cores to prevent from a starvation scenario.
val sparkConf = new SparkConf()
.setAppName("TaxiRideCountCombineByKey")
.setMaster("local[4]")
val ssc = new StreamingContext(sparkConf, Seconds(1))
val stream = ssc.receiverStream(new TaxiRideSource())
val driverStream = stream.map(taxiRide => (taxiRide.driverId, 1))
val countStream = driverStream.combineByKey(
(v) => (v, 1), //createCombiner
(acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1), //mergeValue
(acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2), // mergeCombiners
new HashPartitioner(3)
)
if (outputMqtt) {
println("Use the command below to consume data:")
println("mosquitto_sub -h 127.0.0.1 -p 1883 -t " + mqttTopic)
val mqttSink = ssc.sparkContext.broadcast(MqttSink)
countStream.foreachRDD { rdd =>
rdd.foreach { message =>
mqttSink.value.send(mqttTopic, message.toString()) // "send" method does not exist
}
}
} else {
countStream.print()
}
ssc.start() // Start the computation
ssc.awaitTermination() // Wait for the computation to terminate
}
}
import org.fusesource.mqtt.client.{FutureConnection, MQTT, QoS}
class MqttSink(createProducer: () => FutureConnection) extends Serializable {
lazy val producer = createProducer()
def send(topic: String, message: String): Unit = {
producer.publish(topic, message.toString().getBytes, QoS.AT_LEAST_ONCE, false)
}
}
object MqttSink {
def apply(): MqttSink = {
val f = () => {
val mqtt = new MQTT()
mqtt.setHost("localhost", 1883)
val producer = mqtt.futureConnection()
producer.connect().await()
sys.addShutdownHook {
producer.disconnect().await()
}
producer
}
new MqttSink(f)
}
}
As an alternative you could also use Structure Streaming with the Apache Bahir Spark Extention for MQTT.
Complete Example
build.sbt:
name := "MQTT_StructuredStreaming"
version := "0.1"
libraryDependencies += "org.apache.spark" % "spark-core_2.12" % "2.4.4"
libraryDependencies += "org.apache.spark" % "spark-sql_2.12" % "2.4.4"
libraryDependencies += "org.apache.spark" % "spark-streaming_2.12" % "2.4.4" % "provided"
libraryDependencies += "org.apache.bahir" % "spark-sql-streaming-mqtt_2.12" % "2.4.0"
Main.scala
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
object Main extends App {
val brokerURL = "tcp://localhost:1883"
val subTopicName = "/my/subscribe/topic"
val pubTopicName = "/my/publish/topic"
val spark: SparkSession = SparkSession
.builder
.appName("MQTT_StructStreaming")
.master("local[*]")
.config("spark.sql.streaming.checkpointLocation", "/my/sparkCheckpoint/dir")
.getOrCreate
spark.sparkContext.setLogLevel("ERROR")
import spark.implicits._
val lines: Dataset[String] = spark.readStream
.format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider")
.option("topic", subTopicName)
.option("clientId", "some-client-id")
.option("persistence", "memory")
.load(brokerURL)
.selectExpr("CAST(payload AS STRING)").as[String]
// Split the lines into words
val words: Dataset[String] = lines.as[String].flatMap(_.split(";"))
// Generate running word count
val wordCounts: DataFrame = words.groupBy("value").count()
// Start running the query that prints the running counts to the console
val query: StreamingQuery = wordCounts.writeStream
.format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSinkProvider")
.outputMode("complete")
.option("topic", pubTopicName)
.option("brokerURL", brokerURL)
.start
query.awaitTermination()
}
this is a working example based on the blog entry Spark and Kafka integration patterns.
package org.sense.spark.app
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf}
import org.fusesource.mqtt.client.QoS
import org.sense.spark.util.{MqttSink, TaxiRideSource}
object TaxiRideCountCombineByKey {
val mqttTopic: String = "spark-mqtt-sink"
val qos: QoS = QoS.AT_LEAST_ONCE
def main(args: Array[String]): Unit = {
val outputMqtt: Boolean = if (args.length > 0 && args(0).equals("mqtt")) true else false
// Create a local StreamingContext with two working thread and batch interval of 1 second.
// The master requires 4 cores to prevent from a starvation scenario.
val sparkConf = new SparkConf()
.setAppName("TaxiRideCountCombineByKey")
.setMaster("local[4]")
val ssc = new StreamingContext(sparkConf, Seconds(1))
val stream = ssc.receiverStream(new TaxiRideSource())
val driverStream = stream.map(taxiRide => (taxiRide.driverId, 1))
val countStream = driverStream.combineByKey(
(v) => (v, 1), //createCombiner
(acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1), //mergeValue
(acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2), // mergeCombiners
new HashPartitioner(3)
)
if (outputMqtt) {
println("Use the command below to consume data:")
println("mosquitto_sub -h 127.0.0.1 -p 1883 -t " + mqttTopic)
val mqttSink = ssc.sparkContext.broadcast(MqttSink())
countStream.foreachRDD { rdd =>
rdd.foreach { message =>
mqttSink.value.send(mqttTopic, message.toString()) // "send" method does not exist
}
}
} else {
countStream.print()
}
ssc.start() // Start the computation
ssc.awaitTermination() // Wait for the computation to terminate
}
}
package org.sense.spark.util
import org.fusesource.mqtt.client.{FutureConnection, MQTT, QoS}
class MqttSink(createProducer: () => FutureConnection) extends Serializable {
lazy val producer = createProducer()
def send(topic: String, message: String): Unit = {
producer.publish(topic, message.toString().getBytes, QoS.AT_LEAST_ONCE, false)
}
}
object MqttSink {
def apply(): MqttSink = {
val f = () => {
val mqtt = new MQTT()
mqtt.setHost("localhost", 1883)
val producer = mqtt.futureConnection()
producer.connect().await()
sys.addShutdownHook {
producer.disconnect().await()
}
producer
}
new MqttSink(f)
}
}
package org.sense.spark.util
import java.io.{BufferedReader, FileInputStream, InputStreamReader}
import java.nio.charset.StandardCharsets
import java.util.Locale
import java.util.zip.GZIPInputStream
import org.apache.spark.storage._
import org.apache.spark.streaming.receiver._
import org.joda.time.DateTime
import org.joda.time.format.{DateTimeFormat, DateTimeFormatter}
case class TaxiRide(rideId: Long, isStart: Boolean, startTime: DateTime, endTime: DateTime,
startLon: Float, startLat: Float, endLon: Float, endLat: Float,
passengerCnt: Short, taxiId: Long, driverId: Long)
object TimeFormatter {
val timeFormatter: DateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withLocale(Locale.US).withZoneUTC()
}
class TaxiRideSource extends Receiver[TaxiRide](StorageLevel.MEMORY_AND_DISK_2) {
val dataFilePath = "/home/flink/nycTaxiRides.gz";
var dataRateListener: DataRateListener = _
/**
* Start the thread that receives data over a connection
*/
def onStart() {
dataRateListener = new DataRateListener()
dataRateListener.start()
new Thread("TaxiRide Source") {
override def run() {
receive()
}
}.start()
}
def onStop() {}
/**
* Periodically generate a TaxiRide event and regulate the emission frequency
*/
private def receive() {
while (!isStopped()) {
val gzipStream = new GZIPInputStream(new FileInputStream(dataFilePath))
val reader: BufferedReader = new BufferedReader(new InputStreamReader(gzipStream, StandardCharsets.UTF_8))
try {
var line: String = null
do {
// start time before reading the line
val startTime = System.nanoTime
// read the line on the file and yield the object
line = reader.readLine
if (line != null) {
val taxiRide: TaxiRide = getTaxiRideFromString(line)
store(taxiRide)
}
// regulate frequency of the source
dataRateListener.busySleep(startTime)
} while (line != null)
} finally {
reader.close
}
}
}
def getTaxiRideFromString(line: String): TaxiRide = {
// println(line)
val tokens: Array[String] = line.split(",")
if (tokens.length != 11) {
throw new RuntimeException("Invalid record: " + line)
}
val rideId: Long = tokens(0).toLong
val (isStart, startTime, endTime) = tokens(1) match {
case "START" => (true, DateTime.parse(tokens(2), TimeFormatter.timeFormatter), DateTime.parse(tokens(3), TimeFormatter.timeFormatter))
case "END" => (false, DateTime.parse(tokens(2), TimeFormatter.timeFormatter), DateTime.parse(tokens(3), TimeFormatter.timeFormatter))
case _ => throw new RuntimeException("Invalid record: " + line)
}
val startLon: Float = if (tokens(4).length > 0) tokens(4).toFloat else 0.0f
val startLat: Float = if (tokens(5).length > 0) tokens(5).toFloat else 0.0f
val endLon: Float = if (tokens(6).length > 0) tokens(6).toFloat else 0.0f
val endLat: Float = if (tokens(7).length > 0) tokens(7).toFloat else 0.0f
val passengerCnt: Short = tokens(8).toShort
val taxiId: Long = tokens(9).toLong
val driverId: Long = tokens(10).toLong
TaxiRide(rideId, isStart, startTime, endTime, startLon, startLat, endLon, endLat, passengerCnt, taxiId, driverId)
}
}
The following piece of code is apart of a Twitter Streaming app that I'm using with Spark Streaming.:
val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
val filters = args.takeRight(args.length - 4)
// Set the system properties so that Twitter4j library used by twitter stream
// can use them to generate OAuth credentials
System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
System.setProperty("twitter4j.oauth.accessToken", accessToken)
System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)
Whenever I go to run the program, I get the following error:
Exception in thread "main" scala.MatchError: [Ljava.lang.String;#323659f8 (of class [Ljava.lang.String;)
at SparkPopularHashTags$.main(SparkPopularHashTags.scala:18)
at SparkPopularHashTags.main(SparkPopularHashTags.scala)
Line 18 is:
val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
I have the Twitter4j.properties file saved in my F:\Software\ItelliJ\Projects\twitterStreamApp\src folder, and it's formatted like so:
oauth.consumerKey=***
oauth.consumerSecret=***
oauth.accessToken=***
oauth.accessTokenSecret=***
Where the "*"s are my keys without quotations around them (i.e. oauth.consumerKey=h12b31289fh7139fbh138ry)
Can anyone assist me with this please?
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.SparkContext._
import org.apache.spark.streaming.twitter._
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.flume._
import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder
object SparkPopularHashTags {
val conf = new SparkConf().setMaster("local[4]").setAppName("Spark Streaming - PopularHashTags")
val sc = new SparkContext(conf)
def main(args: Array[String]) {
sc.setLogLevel("WARN")
val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
// val filters = args.takeRight(args.length - 4)
args.lift(0).foreach { consumerKey =>
System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
}
args.lift(1).foreach { consumerSecret =>
System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
}
args.lift(2).foreach { accessToken =>
System.setProperty("twitter4j.oauth.accessToken", accessToken)
}
args.lift(3).foreach { accessTokenSecret =>
System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)
}
val filters = args.drop(4)
// Set the system properties so that Twitter4j library used by twitter stream
// can use them to generate OAuth credentials
// System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
// System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
// System.setProperty("twitter4j.oauth.accessToken", accessToken)
// System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)
// Set the Spark StreamingContext to create a DStream for every 5 seconds
val ssc = new StreamingContext(sc, Seconds(5))
val stream = TwitterUtils.createStream(ssc, None, filters)
// Split the stream on space and extract hashtags
val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))
// Get the top hashtags over the previous 60 sec window
val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
.map { case (topic, count) => (count, topic) }
.transform(_.sortByKey(false))
// Get the top hashtags over the previous 10 sec window
val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
.map { case (topic, count) => (count, topic) }
.transform(_.sortByKey(false))
// print tweets in the correct DStream
stream.print()
// Print popular hashtags
topCounts60.foreachRDD(rdd => {
val topList = rdd.take(10)
println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
})
topCounts10.foreachRDD(rdd => {
val topList = rdd.take(10)
println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
})
ssc.start()
ssc.awaitTermination()
}
}
This is the problem:
val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
This will fail if there are fewer than 4 arguments because it can't match the four values on the left hand side.
Instead, you need to test the elements of args individually to make sure they are present. For example
args.lift(0).foreach { consumerKey =>
System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
}
args.lift(1).foreach { consumerSecret =>
System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
}
args.lift(2).foreach { accessToken =>
System.setProperty("twitter4j.oauth.accessToken", accessToken)
}
args.lift(3).foreach { accessTokenSecret =>
System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)
}
val filters = args.drop(4)
This should happen only when your not setting your Program arguments or setting insufficient no. of arguments i.e. less than 4
I have following code :-
case class event(imei: String, date: String, gpsdt: String,dt: String,id: String)
case class historyevent(imei: String, date: String, gpsdt: String)
object kafkatesting {
def main(args: Array[String]) {
val clients = new RedisClientPool("192.168.0.40", 6379)
val conf = new SparkConf()
.setAppName("KafkaReceiver")
.set("spark.cassandra.connection.host", "192.168.0.40")
.set("spark.cassandra.connection.keep_alive_ms", "20000")
.set("spark.executor.memory", "3g")
.set("spark.driver.memory", "4g")
.set("spark.submit.deployMode", "cluster")
.set("spark.executor.instances", "4")
.set("spark.executor.cores", "3")
.set("spark.streaming.backpressure.enabled", "true")
.set("spark.streaming.backpressure.initialRate", "100")
.set("spark.streaming.kafka.maxRatePerPartition", "7")
val sc = SparkContext.getOrCreate(conf)
val ssc = new StreamingContext(sc, Seconds(10))
val sqlContext = new SQLContext(sc)
val kafkaParams = Map[String, String](
"bootstrap.servers" -> "192.168.0.113:9092",
"group.id" -> "test-group-aditya",
"auto.offset.reset" -> "largest")
val topics = Set("random")
val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
kafkaStream.foreachRDD { rdd =>
val updatedRDD = rdd.map(a =>
{
implicit val formats = DefaultFormats
val jValue = parse(a._2)
val fleetrecord = jValue.extract[historyevent]
val hash = fleetrecord.imei + fleetrecord.date + fleetrecord.gpsdt
val md5Hash = DigestUtils.md5Hex(hash).toUpperCase()
val now = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime())
event(fleetrecord.imei, fleetrecord.date, fleetrecord.gpsdt, now, md5Hash)
})
.collect()
updatedRDD.foreach(f =>
{
clients.withClient {
client =>
{
val value = f.imei + " , " + f.gpsdt
val zscore = Calendar.getInstance().getTimeInMillis
val key = new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime())
val dt = new SimpleDateFormat("HH:mm:ss").format(Calendar.getInstance().getTime())
val q1 = "00:00:00"
val q2 = "06:00:00"
val q3 = "12:00:00"
val q4 = "18:00:00"
val quater = if (dt > q1 && dt < q2) {
System.out.println(dt + " lies in quarter 1");
" -> 1"
} else if (dt > q2 && dt < q3) {
System.out.println(dt + " lies in quarter 2");
" -> 2"
} else if (dt > q3 && dt < q4) {
System.out.println(dt + " lies in quarter 3");
" -> 3"
} else {
System.out.println(dt + " lies in quarter 4");
" -> 4"
}
client.zadd(key + quater, zscore, value)
println(f.toString())
}
}
})
val collection = sc.parallelize(updatedRDD)
collection.saveToCassandra("db", "table", SomeColumns("imei", "date", "gpsdt","dt","id"))
}
ssc.start()
ssc.awaitTermination()
}
}
I'm using this code to insert data from Kafka into Cassandra and Redis, but facing following issues:-
1) application creates a long queue of active batches while the previous batch is currently being processed. So, I want to have next batch only once the previous batch is finished executing.
2) I have four-node cluster which is processing each batch but it takes around 30-40 sec for executing 700 records.
Is my code is optimized or I need to work on my code for better performance?
Yes you can do all your stuff inside mapPartition. There are APIs from datastax that allow you to save the Dstream directly. Here is how you can do it for C*.
val partitionedDstream = kafkaStream.repartition(5) //change this value as per your data and spark cluster
//Now instead of iterating each RDD work on each partition.
val eventsStream: DStream[event] = partitionedDstream.mapPartitions(x => {
val lst = scala.collection.mutable.ListBuffer[event]()
while (x.hasNext) {
val a = x.next()
implicit val formats = DefaultFormats
val jValue = parse(a._2)
val fleetrecord = jValue.extract[historyevent]
val hash = fleetrecord.imei + fleetrecord.date + fleetrecord.gpsdt
val md5Hash = DigestUtils.md5Hex(hash).toUpperCase()
val now = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime())
lst += event(fleetrecord.imei, fleetrecord.date, fleetrecord.gpsdt, now, md5Hash)
}
lst.toList.iterator
})
eventsStream.cache() //because you are using same Dstream for C* and Redis
//instead of collecting each RDD save whole Dstream at once
import com.datastax.spark.connector.streaming._
eventsStream.saveToCassandra("db", "table", SomeColumns("imei", "date", "gpsdt", "dt", "id"))
Also cassandra accepts timestamp as Long value, so you can also change some part of your code as below
val now = System.currentTimeMillis()
//also change your case class to take `Long` instead of `String`
case class event(imei: String, date: String, gpsdt: String, dt: Long, id: String)
Similarly you can change for Redis as well.
Hi I'm new to spark and scala . I'm trying to stream some tweets through spark streaming with the following code:
object TwitterStreaming {
def main(args: Array[String]): Unit = {
if (args.length < 1) {
System.err.println("WrongUsage: PropertiesFile, [<filters>]")
System.exit(-1)
}
StreamingExamples.setStreaningLogLevels()
val myConfigFile = args(0)
val batchInterval_s = 1
val fileConfig = ConfigFactory.parseFile(new File(myConfigFile))
val appConf = ConfigFactory.load(fileConfig)
// Set the system properties so that Twitter4j library used by twitter stream
// can use them to generate OAuth credentials
System.setProperty("twitter4j.oauth.consumerKey", appConf.getString("consumerKey"))
System.setProperty("twitter4j.oauth.consumerSecret", appConf.getString("consumerSecret"))
System.setProperty("twitter4j.oauth.accessToken", appConf.getString("accessToken"))
System.setProperty("twitter4j.oauth.accessTokenSecret", appConf.getString("accessTokenSecret"))
val sparkConf = new SparkConf().setAppName("TwitterStreaming").setMaster(appConf.getString("SPARK_MASTER"))//local[2]
val ssc = new StreamingContext(sparkConf, Seconds(batchInterval_s)) // creating spark streaming context
val stream = TwitterUtils.createStream(ssc, None)
val tweet_data = stream.map(status => TweetData(status.getId, "#" + status.getUser.getScreenName, status.getText.trim()))
tweet_data.foreachRDD(rdd => {
println(s"A sample of tweets I gathered over ${batchInterval_s}s: ${rdd.take(10).mkString(" ")} (total tweets fetched: ${rdd.count()})")
})
}
}
case class TweetData(id: BigInt, author: String, tweetText: String)
My Error:
Exception in thread "main" com.typesafe.config.ConfigException$WrongType:/WorkSpace/InputFiles/application.conf: 5: Cannot concatenate object or list with a non-object-or-list, ConfigString("local") and SimpleConfigList([2]) are not compatible
at com.typesafe.config.impl.ConfigConcatenation.join(ConfigConcatenation.java:116)
can any one check the the code and tell me where I'm doing wrong?
If your config file contains:
SPARK_MASTER=local[2]
Change it to:
SPARK_MASTER="local[2]"
My RDD is \n separated records that look like
Single RDD
k1=v1,k2=v2,k3=v3
k1=v1,k2=v2,k3=v3
k1=v1,k2=v2,k3=v3
and want to convert it into a Array[Map[k,v]],
where each element in Array will be a different map[k,v] corresponding to a record.
Array will contain N number of such maps depending on the records in a single RDD.
I am new to both scala and spark. Any help in the conversion will help.
object SparkApp extends Logging with App {
override def main(args: Array[ String ]): Unit = {
val myConfigFile = new File("../sparkconsumer/conf/spark.conf")
val fileConfig = ConfigFactory.parseFile(myConfigFile).getConfig(GlobalConstants.CONFIG_ROOT_ELEMENT)
val propConf = ConfigFactory.load(fileConfig)
val topicsSet = propConf.getString(GlobalConstants.KAFKA_WHITE_LIST_TOPIC).split(",").toSet
val kafkaParams = Map[ String, String ]("metadata.broker.list" -> propConf.getString(GlobalConstants.KAFKA_BROKERS))
//logger.info(message = "Hello World , You are entering Spark!!!")
val conf = new SparkConf().setMaster("local[2]").setAppName(propConf.getString(GlobalConstants.JOB_NAME))
conf.set("HADOOP_HOME", "/usr/local/hadoop")
conf.set("hadoop.home.dir", "/usr/local/hadoop")
//Lookup
// logger.info("Window of 5 Seconds Enabled")
val ssc = new StreamingContext(conf, Seconds(5))
ssc.checkpoint("/tmp/checkpoint")
val apiFile = ssc.sparkContext.textFile(propConf.getString(GlobalConstants.API_FILE))
val arrayApi = ssc.sparkContext.broadcast(apiFile.distinct().collect())
val nonApiFile = ssc.sparkContext.textFile(propConf.getString(GlobalConstants.NON_API_FILE))
val arrayNonApi = ssc.sparkContext.broadcast(nonApiFile.distinct().collect())
val messages = KafkaUtils.createDirectStream[ String, String, StringDecoder, StringDecoder ](ssc, kafkaParams, topicsSet)
writeTOHDFS2(messages)
ssc.start()
ssc.awaitTermination()
}
def writeTOHDFS2(messages: DStream[ (String, String) ]): Unit = {
val records = messages.window(Seconds(10), Seconds(10))
val k = records.transform( rdd => rdd.map(r =>r._2)).filter(x=> filterNullImpressions(x))
k.foreachRDD { singleRdd =>
if (singleRdd.count() > 0) {
val maps = singleRdd.map(line => line.split("\n").flatMap(x => x.split(",")).flatMap(x => x.split("=")).foreach( x => new mutable.HashMap().put(x(0),x(1)))
val r = scala.util.Random
val sdf = new SimpleDateFormat("yyyy/MM/dd/HH/mm")
maps.saveAsTextFile("hdfs://localhost:8001/user/hadoop/spark/" + sdf.format(new Date())+r.nextInt)
}
}
}
}
Here's some code that should be pretty self-explainatory.
val lines = "k1=v1,k2=v2,k3=v3\nk1=v1,k2=v2\nk1=v1,k2=v2,k3=v3,k4=v4"
val maps = lines.split("\n")
.map(line => line.split(",")
.map(kvPairString => kvPairString.split("="))
.map(kvPairArray => (kvPairArray(0), kvPairArray(1))))
.map(_.toMap)
// maps is of type Array[Map[String, String]]
println(maps.mkString("\n"))
// prints:
// Map(k1 -> v1, k2 -> v2, k3 -> v3)
// Map(k1 -> v1, k2 -> v2)
// Map(k1 -> v1, k2 -> v2, k3 -> v3, k4 -> v4)
Word of advice - SO is not a "write code for me" platform. I understand that it's pretty hard to just dive into Scala and Spark, but next time please try to solve the problem yourself and post what you tried so far and which problems you ran into.