How to store data into HDFS using spark streaming - scala

I want to store streaming data into hdfs. Its a spark streaming code capture data from kafka topic.
I tried this
lines.saveAsHadoopFiles("hdfs://192.168.10.31:9000/user/spark/mystream/", "abc")
this is my code let me know here to write code for save data into hdfs and how.in console i am receiving output need to store in hdfs
Thanks in advance
package com.spark.cons.conskafka
import java.util.HashMap
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.producer.{ KafkaProducer, ProducerConfig, ProducerRecord }
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.SparkContext._
import org.apache.spark.streaming.twitter._
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.storage.StorageLevel
import _root_.kafka.serializer.StringDecoder
object Consume {
def createContext(brokers: String, topics: String, checkpointDirectory: String): StreamingContext = {
println("Creating new context")
val conf = new SparkConf().setMaster("local[*]").setAppName("Spark Streaming - Kafka DirectReceiver - PopularHashTags").set("spark.executor.memory", "1g")
val sc = new SparkContext(conf)
sc.setLogLevel("WARN")
// Set the Spark StreamingContext to create a DStream for every 2 seconds
val ssc = new StreamingContext(sc, Seconds(2))
ssc.checkpoint("checkpoint")
// Define the Kafka parameters, broker list must be specified
val kafkaParams = Map[String, String](
"metadata.broker.list" -> brokers,
// start from the smallest available offset, ie the beginning of the kafka log
"auto.offset.reset" -> "largest")
// Define which topics to read from
val topicsSet = topics.split(",").toSet
// Map value from the kafka message (k, v) pair
val lines = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)
// Filter hashtags
val hashTags = lines.map(_._2).flatMap(_.split(" ")).filter(_.startsWith("#"))
// Get the top hashtags over the previous 60/10 sec window
val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
.map { case (topic, count) => (count, topic) }
.transform(_.sortByKey(false))
val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
.map { case (topic, count) => (count, topic) }
.transform(_.sortByKey(false))
lines.print()
// Print popular hashtags
topCounts60.foreachRDD(rdd => {
val topList = rdd.take(10)
println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
})
topCounts10.foreachRDD(rdd => {
val topList = rdd.take(10)
println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
})
lines.count().map(cnt => "Received " + cnt + " kafka messages.").print()
ssc
}
def main(args: Array[String]) {
if (args.length < 2) {
System.err.println(s"""
|Usage: KafkaDirectPopularHashTags <brokers> <topics>
| <brokers> is a list of one or more Kafka brokers
| <topics> is a list of one or more kafka topics to consume from
| <checkpointDirectory> the directory where the metadata is stored
|
""".stripMargin)
System.exit(1)
}
// Create an array of arguments: brokers, topicname, checkpoint directory
val Array(brokers, topics, checkpointDirectory) = args
val ssc = StreamingContext.getOrCreate(checkpointDirectory,
() => createContext(brokers, topics, checkpointDirectory))
ssc.start()
ssc.awaitTermination()
}
}

Related

Spark Streaming code to read from Kafka broker and calculate average of numbers by mapWithState

I'm receiving my value from Kafka in the form of character,number in a random manner which is generated by another program. An example of values I receive:
a,4
b,3
d,7
f,5
b,2
...
Here is the program that generates these values and sends it over Kafka topic:
package generator
import java.util.{Date, Properties}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, ProducerConfig}
import scala.util.Random
import kafka.producer.KeyedMessage
object ScalaProducerExample extends App {
def getRandomVal: String = {
val i = Random.nextInt(alphabet.size)
val key = alphabet(i)
val value = Random.nextInt(alphabet.size)
key + "," + value
}
val alphabet = 'a' to 'z'
val events = 10000
val topic = "avg"
val brokers = "localhost:9092"
val rnd = new Random()
val props = new Properties()
props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
props.put(ProducerConfig.CLIENT_ID_CONFIG, "ScalaProducerExample")
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
val producer = new KafkaProducer[String, String](props)
while (true) {
val data = new ProducerRecord[String, String](topic, null, getRandomVal)
producer.send(data)
print(data + "\n")
}
producer.close()
}
My task is to show the up-to-date average for every character based on the sum and number of received values for it from the beginning until now.
I wrote this code for this task and I am receiving from Kafka successfully:
package DirectKafkaWordCount
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka010._
case class Data(key: String, count: Int)
object DirectKafkaWordCount {
def main(args: Array[String]): Unit = {
val Array(brokers, topics) = args
val sparkConf = new SparkConf().setMaster("local[4]").setAppName("DirectKafkaWordCount")
val ssc = new StreamingContext(sparkConf, Seconds(2))
ssc.checkpoint("_checkpoint")
val topicsSet = topics.split(",").toSet
val kafkaParams = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
ConsumerConfig.GROUP_ID_CONFIG -> "1",
ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer])
val messages = KafkaUtils.createDirectStream[String, String](
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topicsSet, kafkaParams))
val pairs = messages.map(_.value).map(x => (x.split(",")(0), x.split(",")(1).toInt))
val wc = pairs.mapWithState(StateSpec.function((key: String, value: Option[Int], state: State[String]) => {
val newNum = value.getOrElse(0)
val sData = state.getOption.getOrElse("a,0,0")
var count = sData.split(",")(1).toInt
var sum = sData.split(",")(2).toInt
sum = sum + newNum
count = count + 1
val output = key + "," + count.toString + "," + sum.toString
state.update(output)
output
}))
wc.map(process _).print()
ssc.start()
ssc.awaitTermination()
}
def process(s: String): String = {
var count = s.split(",")(1).toInt
var sum = s.split(",")(2).toInt
s.split(",")(0) + "," + (sum / count).toString
}
}
My problem is that the average for every character becomes the constant number 12. Is there sth wrong with the mapWithState function? How can I fix it? Something that makes me suspicious is that there isn't only one entry per character in the output, there may be 3 or 4 entries per character. A sample output:
-------------------------------------------
Time: 1651560488000 ms
-------------------------------------------
d,12
t,12
h,12
t,12
h,12
x,12
d,12
h,12
p,12
p,12
...

Kafka and Spark Streaming Simple Producer Consumer

I do not know why the data sent by producer do not reach the consumer.
I am working on cloudera virtual machine.
I am trying to write simple producer consumer where the producer uses Kafka and consumer uses spark streaming.
The Producer Code in scala:
import java.util.Properties
import org.apache.kafka.clients.producer._
object kafkaProducer {
def main(args: Array[String]) {
val props = new Properties()
props.put("bootstrap.servers", "localhost:9092")
props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
val producer = new KafkaProducer[String, String](props)
val TOPIC = "test"
for (i <- 1 to 50) {
Thread.sleep(1000) //every 1 second
val record = new ProducerRecord(TOPIC, generator.getID().toString(),generator.getRandomValue().toString())
producer.send(record)
}
producer.close()
}
}
The Consumer Code in scala :
import java.util
import org.apache.kafka.clients.consumer.KafkaConsumer
import scala.collection.JavaConverters._
import java.util.Properties
import kafka.producer._
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.kafka._
object kafkaConsumer {
def main(args: Array[String]) {
var totalCount = 0L
val sparkConf = new SparkConf().setMaster("local[1]").setAppName("AnyName").set("spark.driver.host", "localhost")
val ssc = new StreamingContext(sparkConf, Seconds(2))
ssc.checkpoint("checkpoint")
val stream = KafkaUtils.createStream(ssc, "localhost:9092", "spark-streaming-consumer-group", Map("test" -> 1))
stream.foreachRDD((rdd: RDD[_], time: Time) => {
val count = rdd.count()
println("\n-------------------")
println("Time: " + time)
println("-------------------")
println("Received " + count + " events\n")
totalCount += count
})
ssc.start()
Thread.sleep(20 * 1000)
ssc.stop()
if (totalCount > 0) {
println("PASSED")
} else {
println("FAILED")
}
}
}
The problem is resolved by changing in the consumer code the line :
val stream = KafkaUtils.createStream(ssc, "localhost:9092", "spark-streaming-consumer-group", Map("test" -> 1))
the second parameter should be the zookeeper port which 2181 not 9092 and the zookeeper will manage to connect to the Kafka port 9092 automatically.
Note: Kafka should be started from terminal before running both the producer and consumer.

Unable to fetch specific content from tweets using scala on spark shell

I am working on Hortonworks.I have stored tweets from twitter to Kafka topic.I am performing sentiment analysis on tweets using Kafka as a Producer and Spark as a Consumer using Scala on Spark-shell.But I want to fetch only specific content from tweets like Text,HashTag,tweets is positive or negative,words from the tweets which i selected as a positive or negative word.my training data is Data.txt.
I added dependencies :
org.apache.spark:spark-streaming-kafka_2.10:1.6.2,org.apache.spark:spark-streaming_2.10:1.6.2
Here is my code:
import org.apache.spark._
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.kafka._
val conf = new SparkConf().setMaster("local[4]").setAppName("KafkaReceiver")
val ssc = new StreamingContext(conf, Seconds(5))
val zkQuorum="sandbox.hortonworks.com:2181"
val group="test-consumer-group"
val topics="test"
val numThreads=5
val args=Array(zkQuorum, group, topics, numThreads)
val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2)
val hashTags = lines.flatMap(_.split(" ")).filter(_.startsWith("#"))
val wordSentimentFilePath = "hdfs://sandbox.hortonworks.com:8020/TwitterData/Data.txt"
val wordSentiments = ssc.sparkContext.textFile(wordSentimentFilePath).map { line =>
val Array(word, happiness) = line.split("\t")
(word, happiness)
} cache()
val happiest60 = hashTags.map(hashTag => (hashTag.tail, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)).transform{topicCount => wordSentiments.join(topicCount)}.map{case (topic, tuple) => (topic, tuple._1 * tuple._2)}.map{case (topic, happinessValue) => (happinessValue, topic)}.transform(_.sortByKey(false))
happiest60.print()
ssc.start()
I got the output like this,
(negative,fear)
(positive,fitness)
I want output like this,
(#sports,Text from the Tweets,fitness,positive)
But I am not getting the solution to store Text and Hashtag like above.

Reuse kafka producer in Spark Streaming

We have a spark streaming application(following is the code) that sources data from kafka and does some transformations(on each message) before inserting the data into MongoDB. We have a middleware application that pushes the messages(in bulk) into Kafka and waits for an acknowledgement(for each message) from spark streaming application. If the acknowledgement is not received by the middleware within a certain period of time(5seconds) after sending the message into Kafka, the middleware application re-sends the message. The spark streaming application is able to receive around 50-100 messages(in one batch) and send acknowledgement for all the messages under 5 seconds. But if the middleware application pushes over 100 messages, it is resulting in middleware application re-sending the message due to delay in spark streaming sending the acknowledgement. In our current implementation, we create the producer each time we want to send an acknowledgement, which is taking 3-4 seconds.
package com.testing
import org.apache.spark.streaming._
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.streaming.kafka._
import org.apache.spark.sql.{ SQLContext, Row, Column, DataFrame }
import java.util.HashMap
import org.apache.kafka.clients.producer.{ KafkaProducer, ProducerConfig, ProducerRecord }
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.joda.time._
import org.joda.time.format._
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._
import com.mongodb.util.JSON
import scala.io.Source._
import java.util.Properties
import java.util.Calendar
import scala.collection.immutable
import org.json4s.DefaultFormats
object Sample_Streaming {
def main(args: Array[String]) {
val sparkConf = new SparkConf().setAppName("Sample_Streaming")
.setMaster("local[4]")
val sc = new SparkContext(sparkConf)
sc.setLogLevel("ERROR")
val sqlContext = new SQLContext(sc)
val ssc = new StreamingContext(sc, Seconds(1))
val props = new HashMap[String, Object]()
val bootstrap_server_config = "127.0.0.100:9092"
val zkQuorum = "127.0.0.101:2181"
props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrap_server_config)
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
val TopicMap = Map("sampleTopic" -> 1)
val KafkaDstream = KafkaUtils.createStream(ssc, zkQuorum, "group", TopicMap).map(_._2)
val schemaDf = sqlContext.read.format("com.mongodb.spark.sql.DefaultSource")
.option("spark.mongodb.input.uri", "connectionURI")
.option("spark.mongodb.input.collection", "schemaCollectionName")
.load()
val outSchema = schemaDf.schema
var outDf = sqlContext.createDataFrame(sc.emptyRDD[Row], outSchema)
KafkaDstream.foreachRDD(rdd => rdd.collect().map { x =>
{
val jsonInput: JValue = parse(x)
/*Do all the transformations using Json libraries*/
val json4s_transformed = "transformed json"
val rdd = sc.parallelize(compact(render(json4s_transformed)) :: Nil)
val df = sqlContext.read.schema(outSchema).json(rdd)
df.write.option("spark.mongodb.output.uri", "connectionURI")
.option("collection", "Collection")
.mode("append").format("com.mongodb.spark.sql").save()
val producer = new KafkaProducer[String, String](props)
val message = new ProducerRecord[String, String]("topic_name", null, "message_received")
producer.send(message)
producer.close()
}
}
)
// Run the streaming job
ssc.start()
ssc.awaitTermination()
}
}
So we tried another approach of creating the producer outside of the foreachRDD and reuse it for the entire batch interval(following is the code). This seem to have helped as we are not creating the producer each time we want to send the acknowledgement. But for some reason, when we monitor the application on the spark UI, the streaming application's memory consumption is increasing steadily, which was not the case before. We tried using the --num-executors 1 option in spark-submit to limit the number of executors that get initiated by yarn.
object Sample_Streaming {
def main(args: Array[String]) {
val sparkConf = new SparkConf().setAppName("Sample_Streaming")
.setMaster("local[4]")
val sc = new SparkContext(sparkConf)
sc.setLogLevel("ERROR")
val sqlContext = new SQLContext(sc)
val ssc = new StreamingContext(sc, Seconds(1))
val props = new HashMap[String, Object]()
val bootstrap_server_config = "127.0.0.100:9092"
val zkQuorum = "127.0.0.101:2181"
props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrap_server_config)
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
val TopicMap = Map("sampleTopic" -> 1)
val KafkaDstream = KafkaUtils.createStream(ssc, zkQuorum, "group", TopicMap).map(_._2)
val schemaDf = sqlContext.read.format("com.mongodb.spark.sql.DefaultSource")
.option("spark.mongodb.input.uri", "connectionURI")
.option("spark.mongodb.input.collection", "schemaCollectionName")
.load()
val outSchema = schemaDf.schema
val producer = new KafkaProducer[String, String](props)
KafkaDstream.foreachRDD(rdd =>
{
rdd.collect().map ( x =>
{
val jsonInput: JValue = parse(x)
/*Do all the transformations using Json libraries*/
val json4s_transformed = "transformed json"
val rdd = sc.parallelize(compact(render(json4s_transformed)) :: Nil)
val df = sqlContext.read.schema(outSchema).json(rdd)
df.write.option("spark.mongodb.output.uri", "connectionURI")
.option("collection", "Collection")
.mode("append").format("com.mongodb.spark.sql").save()
val message = new ProducerRecord[String, String]("topic_name", null, "message_received")
producer.send(message)
producer.close()
}
)
}
)
// Run the streaming job
ssc.start()
ssc.awaitTermination()
}
}
My questions are:
How do I monitor the spark application's memory consumption, currently we are manually monitoring the application every 5 minutes until it exhausts the memory available in our cluster(2 node 16GB each)?
What are the best practices that are followed in the industry while using Spark streaming and kafka?
Kafka is a broker: It gives you delivery guarantees for the producer and the consumer. It's overkill to implement an 'over the top' acknowledge mechanism between the producer and the consumer. Ensure that the producer behaves correctly and that the consumer can recover in case of failure and the end-2-end delivery will be ensured.
Regarding the job, there's no wonder why its performance is poor: The processing is being done sequentially, element by element up to the point of the write to the external DB. This is plain wrong and should be addressed before attempting to fix any memory consumption issues.
This process could be improved like:
val producer = // create producer
val jsonDStream = kafkaDstream.transform{rdd => rdd.map{elem =>
val json = parse(elem)
render(doAllTransformations(json)) // output should be a String-formatted JSON object
}
}
jsonDStream.foreachRDD{ rdd =>
val df = sqlContext.read.schema(outSchema).json(rdd) // transform the complete collection, not element by element
df.write.option("spark.mongodb.output.uri", "connectionURI") // write in bulk, not one by one
.option("collection", "Collection")
.mode("append").format("com.mongodb.spark.sql").save()
val msg = //create message
producer.send(msg)
producer.flush() // force send. *DO NOT Close* otherwise it will not be able to send any more messages
}
This process could be improved further if we could replace all the string-centric JSON transformation by case class instances.

How to deserialize Flume's Avro events coming to Spark?

I have Flume Avro sink and SparkStreaming program that read the sink.
CDH 5.1 , Flume 1.5.0 , Spark 1.0 , using Scala as program lang on Spark
i was able to make the Spark example and count the Flume Avro Events.
however i was not able to De serialize the Flume Avro Event into string\text and then parse the structure row.
Does anyone have an example of how to do so using Scala?
You can deserialize the flume events with the below code:
val eventBody = stream.map(e => new String(e.event.getBody.array))
Here's an example of a spark streaming application for analyzing popular hashtags from twitter using a flume twitter source and avro sink to push the events to spark:
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.SparkContext._
import org.apache.spark.streaming.twitter._
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.flume._
object PopularHashTags {
val conf = new SparkConf().setMaster("local[4]").setAppName("PopularHashTags").set("spark.executor.memory", "1g")
val sc = new SparkContext(conf)
def main(args: Array[String]) {
sc.setLogLevel("WARN")
System.setProperty("twitter4j.oauth.consumerKey", <consumerKey>)
System.setProperty("twitter4j.oauth.consumerSecret", <consumerSecret>)
System.setProperty("twitter4j.oauth.accessToken", <accessToken>)
System.setProperty("twitter4j.oauth.accessTokenSecret", <accessTokenSecret>)
val ssc = new StreamingContext(sc, Seconds(5))
val filter = args.takeRight(args.length)
val stream = FlumeUtils.createStream(ssc, <hostname>, <port>)
val tweets = stream.map(e => new String(e.event.getBody.array))
val hashTags = tweets.flatMap(status => status.split(" ").filter(_.startsWith("#")))
val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
.map { case (topic, count) => (count, topic) }
.transform(_.sortByKey(false))
// Print popular hashtags
topCounts60.foreachRDD(rdd => {
val topList = rdd.take(10)
println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
})
stream.count().map(cnt => "Received " + cnt + " flume events.").print()
ssc.start()
ssc.awaitTermination()
}
}
You can implement a custom decoder inorder to deserialize. Provide the expected type information along with it.
Try the code below:
stream.map(e => "Event:header:" + e.event.get(0).toString
+ "body: " + new String(e.event.getBody.array)).print