Create a Simple Kafka Consumer using Scala

I am currently learning Scala and was trying to create a SimpleConsumer for retrieving messages from a Kafka partition.
The consumer should be able to handle the following tasks:
Keep track of offsets.
Figure out which broker is the lead broker for a topic and partition.
Handle broker leader changes.
I was able to find very good documentation for creating this consumer in Java (https://cwiki.apache.org/confluence/display/KAFKA/0.8.0+SimpleConsumer+Example).
Does anyone have sample Scala code for creating this SimpleConsumer, or can you point me to documentation that will steer me in the right direction? It would be greatly appreciated.

Here is sample code for a simple Kafka consumer written in Scala. I got it working after a few trials and errors.
package com.Kafka.Consumer
import kafka.api.FetchRequest
import kafka.api.FetchRequestBuilder
import kafka.api.PartitionOffsetRequestInfo
import kafka.common.ErrorMapping
import kafka.common.TopicAndPartition
import kafka.javaapi._
import kafka.javaapi.consumer.SimpleConsumer
import kafka.message.MessageAndOffset
import java.nio.ByteBuffer
import java.util.ArrayList
import java.util.Collections
import java.util.HashMap
import java.util.List
import java.util.Map
import SimpleExample._
//remove if not needed
import scala.collection.JavaConversions._
object SimpleExample {
def main(args: Array[String]) {
val example = new SimpleExample()
val maxReads = java.lang.Integer.parseInt(args(0))
val topic = args(1)
val partition = java.lang.Integer.parseInt(args(2))
val seeds = new ArrayList[String]()
seeds.add(args(3))
val port = java.lang.Integer.parseInt(args(4))
try {
example.run(maxReads, topic, partition, seeds, port)
} catch {
case e: Exception => {
println("Oops:" + e)
e.printStackTrace()
}
}
}
def getLastOffset(consumer: SimpleConsumer,
topic: String,
partition: Int,
whichTime: Long,
clientName: String): Long = {
val topicAndPartition = new TopicAndPartition(topic, partition)
val requestInfo = new HashMap[TopicAndPartition, PartitionOffsetRequestInfo]()
requestInfo.put(topicAndPartition, new PartitionOffsetRequestInfo(whichTime, 1))
val request = new kafka.javaapi.OffsetRequest(requestInfo, kafka.api.OffsetRequest.CurrentVersion, clientName)
val response = consumer.getOffsetsBefore(request)
if (response.hasError) {
println("Error fetching data Offset Data the Broker. Reason: " +
response.errorCode(topic, partition))
return 0
}
val offsets = response.offsets(topic, partition)
offsets(0)
}
}
class SimpleExample {
private var m_replicaBrokers: List[String] = new ArrayList[String]()
def run(a_maxReads: Int,
a_topic: String,
a_partition: Int,
a_seedBrokers: List[String],
a_port: Int) {
val metadata = findLeader(a_seedBrokers, a_port, a_topic, a_partition)
if (metadata == null) {
println("Can't find metadata for Topic and Partition. Exiting")
return
}
if (metadata.leader == null) {
println("Can't find Leader for Topic and Partition. Exiting")
return
}
var leadBroker = metadata.leader.host
val clientName = "Client_" + a_topic + "_" + a_partition
var consumer = new SimpleConsumer(leadBroker, a_port, 100000, 64 * 1024, clientName)
var readOffset = getLastOffset(consumer, a_topic, a_partition, kafka.api.OffsetRequest.EarliestTime, clientName)
var numErrors = 0
//while (a_maxReads > 0) {
if (consumer == null) {
consumer = new SimpleConsumer(leadBroker, a_port, 100000, 64 * 1024, clientName)
}
val req = new FetchRequestBuilder().clientId(clientName).addFetch(a_topic, a_partition, readOffset,
100000)
.build()
val fetchResponse = consumer.fetch(req)
if (fetchResponse.hasError) {
numErrors += 1
val code = fetchResponse.errorCode(a_topic, a_partition)
println("Error fetching data from the Broker:" + leadBroker +
" Reason: " +
code)
if (numErrors > 5) {
//break (the original Java example exits the read loop here)
}
if (code == ErrorMapping.OffsetOutOfRangeCode) {
// We asked for an invalid offset; reset to the latest available offset
readOffset = getLastOffset(consumer, a_topic, a_partition, kafka.api.OffsetRequest.LatestTime, clientName)
//continue
}
consumer.close()
consumer = null
leadBroker = findNewLeader(leadBroker, a_topic, a_partition, a_port)
//continue
}
numErrors = 0
var numRead = 0
for (messageAndOffset <- fetchResponse.messageSet(a_topic, a_partition)) {
val currentOffset = messageAndOffset.offset
if (currentOffset < readOffset) {
println("Found an old offset: " + currentOffset + " Expecting: " +
readOffset)
//continue
}
readOffset = messageAndOffset.nextOffset
val payload = messageAndOffset.message.payload
val bytes = Array.ofDim[Byte](payload.limit())
payload.get(bytes)
println(String.valueOf(messageAndOffset.offset) + ": " + new String(bytes, "UTF-8"))
numRead += 1
// a_maxReads -= 1
}
if (numRead == 0) {
try {
Thread.sleep(1000)
} catch {
case ie: InterruptedException =>
}
}
//}
if (consumer != null) consumer.close()
}
private def findNewLeader(a_oldLeader: String,
a_topic: String,
a_partition: Int,
a_port: Int): String = {
for (i <- 0 until 3) {
var goToSleep = false
val metadata = findLeader(m_replicaBrokers, a_port, a_topic, a_partition)
if (metadata == null) {
goToSleep = true
} else if (metadata.leader == null) {
goToSleep = true
} else if (a_oldLeader.equalsIgnoreCase(metadata.leader.host) && i == 0) {
goToSleep = true
} else {
return metadata.leader.host
}
if (goToSleep) {
try {
Thread.sleep(1000)
} catch {
case ie: InterruptedException =>
}
}
}
println("Unable to find new leader after Broker failure. Exiting")
throw new Exception("Unable to find new leader after Broker failure. Exiting")
}
private def findLeader(a_seedBrokers: List[String],
a_port: Int,
a_topic: String,
a_partition: Int): PartitionMetadata = {
var returnMetaData: PartitionMetadata = null
for (seed <- a_seedBrokers) {
var consumer: SimpleConsumer = null
try {
consumer = new SimpleConsumer(seed, a_port, 100000, 64 * 1024, "leaderLookup")
val topics = Collections.singletonList(a_topic)
val req = new TopicMetadataRequest(topics)
val resp = consumer.send(req)
val metaData = resp.topicsMetadata
for (item <- metaData; part <- item.partitionsMetadata){
if (part.partitionId == a_partition) {
returnMetaData = part
//break
}
}
} catch {
case e: Exception => println("Error communicating with Broker [" + seed + "] to find Leader for [" +
a_topic +
", " +
a_partition +
"] Reason: " +
e)
} finally {
if (consumer != null) consumer.close()
}
}
if (returnMetaData != null) {
m_replicaBrokers.clear()
for (replica <- returnMetaData.replicas) {
m_replicaBrokers.add(replica.host)
}
}
returnMetaData
}
}
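To run it, main expects the maximum number of reads, the topic, the partition, one seed broker host, and the broker port, in that order. For example (the topic name and broker address below are placeholders):
object RunSimpleExample {
  def main(args: Array[String]): Unit = {
    // args: maxReads, topic, partition, seed broker host, broker port
    SimpleExample.main(Array("10", "my-topic", "0", "localhost", "9092"))
  }
}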

I built a simple Kafka consumer and producer using Scala.
consumer:
package com.kafka
import java.util.concurrent._
import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.{ConsumerConfig, KafkaConsumer}
import scala.collection.JavaConversions._
class Consumer(val brokers: String,
val groupId: String,
val topic: String) {
val props = createConsumerConfig(brokers, groupId)
val consumer = new KafkaConsumer[String, String](props)
var executor: ExecutorService = null
def shutdown() = {
if (consumer != null)
consumer.close()
if (executor != null)
executor.shutdown()
}
def createConsumerConfig(brokers: String, groupId: String): Properties = {
val props = new Properties()
props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
props.put(ConsumerConfig.GROUP_ID_CONFIG, groupId)
props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true")
props.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000")
props.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, "30000")
props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
props
}
def run() = {
consumer.subscribe(Collections.singletonList(this.topic))
Executors.newSingleThreadExecutor.execute(new Runnable {
override def run(): Unit = {
while (true) {
val records = consumer.poll(1000)
for (record <- records) {
System.out.println("Received message: (" + record.key() + ", " + record.value() + ") at offset " + record.offset())
}
}
}
})
}
}
object Consumer extends App{
val newArgs = Array("localhost:9092", "2","test")
val example = new Consumer(newArgs(0), newArgs(1), newArgs(2))
example.run()
}
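The Consumer class above defines shutdown(), but nothing ever calls it, and KafkaConsumer is not thread-safe, so calling close() from another thread while poll() is blocked is not safe (it can throw ConcurrentModificationException). The usual pattern is to call wakeup() from the other thread and catch WakeupException in the poll loop. A minimal standalone sketch of that pattern (broker address, group id, and topic reuse the values from the Consumer object above):
import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.errors.WakeupException
import scala.collection.JavaConversions._
object PollLoopWithShutdown extends App {
  val props = new Properties()
  props.put("bootstrap.servers", "localhost:9092")
  props.put("group.id", "2")
  props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
  props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
  val consumer = new KafkaConsumer[String, String](props)
  consumer.subscribe(Collections.singletonList("test"))
  // wakeup() is the one KafkaConsumer method that is safe to call from another thread;
  // it makes a blocked poll() throw WakeupException so the loop below can exit cleanly.
  sys.addShutdownHook(consumer.wakeup())
  try {
    while (true) {
      val records = consumer.poll(1000)
      for (record <- records)
        println("Received message: (" + record.key() + ", " + record.value() + ") at offset " + record.offset())
    }
  } catch {
    case _: WakeupException => // expected during shutdown
  } finally {
    consumer.close()
  }
}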
producer:
package com.kafka
import java.util.{Date, Properties}
import org.apache.kafka.clients.producer.KafkaProducer
import org.apache.kafka.clients.producer.ProducerRecord
object Producer extends App{
val newArgs = Array("20","test","localhost:9092")
val events = newArgs(0).toInt
val topic = newArgs(1)
val brokers = newArgs(2)
val props = new Properties()
props.put("bootstrap.servers", brokers)
props.put("client.id", "producer")
props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
val producer = new KafkaProducer[String, String](props)
val t = System.currentTimeMillis()
for (nEvents <- Range(0, events)) {
val key = "messageKey " + nEvents.toString
val msg = "test message"
val data = new ProducerRecord[String, String](topic, key, msg)
//async
//producer.send(data, (m,e) => {})
//sync (note: send is still asynchronous; block on the returned Future, e.g. producer.send(data).get(), for a truly synchronous send)
producer.send(data)
}
System.out.println("sent per second: " + events * 1000 / (System.currentTimeMillis() - t))
producer.close()
}

Related

Scala Kafka Spark: Get all kafkaConsumer endOffsets and assign them to a val

The code below is inefficient: it calls kafkaConsumer.endOffsets for every partition inside the for-loop (where it says <!-- move code below -->). How do I move that call to <!-- move it here --> so I only have to make it once per topic? I believe I have to collect all the TopicPartitions from jsonOffsets and pass them to kafkaConsumer.endOffsets, but I'm not sure how to do that.
endOffsets takes a collection of TopicPartition.
override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {
event.progress.sources
// Ignoring sqs / jms sources their offsets
.filter(source => source.description.toLowerCase().contains("kafka"))
// ex offset :
// "endOffset" : {
// "rcs-enriched-event" : {
// "8" : 31376,
// "11" : 39114,
// } ...
.foreach(source => {
/// Map[Topic,Map[Partition, CurrentOffset]]
val jsonOffsets = objectMapper.readValue(source.endOffset, classOf[Map[String, Map[String, Int]]])
jsonOffsets.keys.filter(key => topics.contains(key))
.foreach(topic => {
val topicPartitionMap = new java.util.HashMap[TopicPartition, OffsetAndMetadata]()
// Map[Partition, CurrentOffset]
val topicOffsetList = new ListBuffer[Int]()
val offsets: Option[Map[String, Int]] = jsonOffsets.get(topic)
<!-- move it here -->
offsets match {
case Some(topicOffsetData) =>
topicOffsetData.keys.foreach(partition => {
/// "4" : 34937
val tp = new TopicPartition(topic, partition.toInt)
val oam = new OffsetAndMetadata(topicOffsetData(partition).toLong)
val bbCurrentOffset = topicOffsetData(partition).toLong
<!-- move code below -->
val kafkaPartitionOffset = kafkaConsumer.endOffsets(java.util.Arrays.asList(tp))
// latest offset
val partitionLatestOffset = kafkaPartitionOffset.get(tp)
// Log for a particular partition
val delta = partitionLatestOffset - bbCurrentOffset
topicOffsetList += delta.abs
topicPartitionMap.put(tp, oam)
})
}
try {
kafkaConsumer.commitSync(topicPartitionMap)
} catch {
case e: Exception => log.error(s"${groupId} Could not commit offset", e)
}
//log.info have the group id (unique id), the topic, cumulative consumer lag delta
//push out to logger
log.info("groupId: " + groupId + " topic: " + topic + " lagDeltaSum: " + topicOffsetList.sum)
})
})
}
It seems to me you should loop over the offsets twice: once to collect the partitions, then once more to use the latest offsets.
.foreach(topic => {
val offsets: Option[Map[String, Int]] = jsonOffsets.get(topic)
// Fetch latest offsets
val consumedPartitions = new ListBuffer[TopicPartition]()
offsets match {
case Some(topicOffsetData) =>
topicOffsetData.keys.foreach(partition => {
val tp = new TopicPartition(topic, partition.toInt)
consumedPartitions += tp
})
case None =>
}
val latestOffsets = kafkaConsumer.endOffsets(consumedPartitions.asJava)
offsets match {
case Some(topicOffsetData) =>
// Use latest offsets, as needed ...
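Filling in that second pass with the names from the question (topicPartitionMap, topicOffsetList, and the commitSync call are assumed to stay as they were), it might look roughly like this:
offsets match {
  case Some(topicOffsetData) =>
    topicOffsetData.keys.foreach { partition =>
      val tp = new TopicPartition(topic, partition.toInt)
      val bbCurrentOffset = topicOffsetData(partition).toLong
      // latestOffsets was fetched once above, outside this loop
      val delta = latestOffsets.get(tp) - bbCurrentOffset
      topicOffsetList += delta.abs
      topicPartitionMap.put(tp, new OffsetAndMetadata(bbCurrentOffset))
    }
  case None =>
}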

How to push a DataStream to a Kafka topic while retaining the order, using a string schema in Flink

I am trying to create a JSON record every 500 ms and push it to a Kafka topic so that I can set up some windows downstream and perform computations. Below is my code:
package KafkaAsSource
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.datastream.DataStream
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.Semantic
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaProducer}
import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper
import java.time.format.DateTimeFormatter
import java.time.LocalDateTime
import java.util.{Optional, Properties}
object PushingDataToKafka {
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setMaxParallelism(256)
env.enableCheckpointing(5000)
val stream: DataStream[String] = env.fromElements(createData())
stream.addSink(sendToTopic(stream))
}
def getProperties(): Properties = {
val properties = new Properties()
properties.setProperty("bootstrap.servers", "localhost:9092")
properties.setProperty("zookeeper.connect", "localhost:2181")
return properties
}
def createData(): String = {
val minRange: Int = 0
val maxRange: Int = 1000
var jsonData = ""
for (a <- minRange to maxRange) {
jsonData = "{\n \"id\":\"" + a + "\",\n \"Category\":\"Flink\",\n \"eventTime\":\"" + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS").format(LocalDateTime.now) + "\"\n \n}"
println(jsonData)
Thread.sleep(500)
}
return jsonData
}
def sendToTopic(): Properties = {
val producer = new FlinkKafkaProducer[String](
"topic"
,
new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema())
,
getProperties(),
FlinkKafkaProducer.Semantic.EXACTLY_ONCE
)
return producer
}
}
It gives me the following error:
type mismatch;
found : Any
required: org.apache.flink.streaming.api.functions.sink.SinkFunction[String]
stream.addSink(sendToTopic())
Modified Code:
object FlinkTest {
def main(ars: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment()
env.setMaxParallelism(256)
var stream = env.fromElements("")
//env.enableCheckpointing(5000)
//val stream: DataStream[String] = env.fromElements("hey mc", "1")
val myProducer = new FlinkKafkaProducer[String](
"maddy", // target topic
new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()), // serialization schema
getProperties(), // producer config
FlinkKafkaProducer.Semantic.EXACTLY_ONCE)
val minRange: Int = 0
val maxRange: Int = 10
var jsonData = ""
for (a <- minRange to maxRange) {
jsonData = "{\n \"id\":\"" + a + "\",\n \"Category\":\"Flink\",\n \"eventTime\":\"" + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS").format(LocalDateTime.now) + "\"\n \n}"
println(a)
Thread.sleep(500)
stream = env.fromElements(jsonData)
println(jsonData)
stream.addSink(myProducer)
}
env.execute("hey")
}
def getProperties(): Properties = {
val properties = new Properties()
properties.setProperty("bootstrap.servers", "localhost:9092")
properties.setProperty("zookeeper.connect", "localhost:2181")
return properties
}
/*
def createData(): String = {
val minRange: Int = 0
val maxRange: Int = 10
var jsonData = ""
for (a <- minRange to maxRange) {
jsonData = "{\n \"id\":\"" + a + "\",\n \"Category\":\"Flink\",\n \"eventTime\":\"" + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS").format(LocalDateTime.now) + "\"\n \n}"
Thread.sleep(500)
}
return jsonData
}
*/
}
The modified code gets the data into the Kafka topic, but it doesn't retain the order. What am I doing wrong in the loops? Also, I had to change the Flink version from 1.13.5 to 1.12.2.
I was initially using Flink 1.13.5 with the connectors and Scala 2.11. What exactly am I missing here?
A couple of things about this loop:
for (a <- minRange to maxRange) {
jsonData =
"{\n \"id\":\"" + a + "\",\n \"Category\":\"Flink\",\n \"eventTime\":\""
+ DateTimeFormatter
.ofPattern("yyyy-MM-dd HH:mm:ss.SSS")
.format(LocalDateTime.now) + "\"\n \n}"
println(a)
Thread.sleep(500)
stream = env.fromElements(jsonData)
println(jsonData)
stream.addSink(myProducer)
}
The sleep is happening in the Flink client, and only affects how long it takes the client to assemble the job graph before submitting it to the cluster. It has no effect on how the job runs.
This loop is creating 10 separate pipelines that will run independently, in parallel, all producing to the same Kafka topic. Those pipelines are going to race against each other.
To get the behavior you're looking for (a global ordering across a single pipeline) you'll want to produce all of the events from a single source (in order, of course), and run the job with a parallelism of one. Something like this would do it:
import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _}
object FlinkTest {
def main(ars: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment()
env.setParallelism(1)
val myProducer = ...
val jsonData = (i: Long) => ...
env.fromSequence(0, 9)
.map(i => jsonData(i))
.addSink(myProducer)
env.execute()
}
}
You can leave maxParallelism at 256 (or at its default value of 128); it's not particularly relevant here. The maxParallelism is the number of hash buckets that keyBy will hash the keys into, and it defines an upper limit on the scalability of the job.
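For completeness, here is one way the elided pieces could look, reusing the producer construction and JSON payload from the question (the maddy topic and localhost:9092 broker are taken from the question's code; AT_LEAST_ONCE is used instead of EXACTLY_ONCE so the sketch works without checkpointing):
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer
import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper
object OrderedKafkaSink {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1) // one pipeline end to end, so element order is preserved
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "localhost:9092")
    val myProducer = new FlinkKafkaProducer[String](
      "maddy",
      new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()),
      properties,
      FlinkKafkaProducer.Semantic.AT_LEAST_ONCE)
    // Build the JSON payload for a given sequence number
    val jsonData = (i: Long) =>
      "{ \"id\":\"" + i + "\", \"Category\":\"Flink\", \"eventTime\":\"" +
        DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS").format(LocalDateTime.now) + "\" }"
    env.fromSequence(0, 9)
      .map(i => jsonData(i))
      .addSink(myProducer)
    env.execute("ordered-kafka-sink")
  }
}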

How do I create an MQTT sink for Spark Streaming?

There are some examples of how to create MQTT sources [1] [2] for Spark Streaming. However, I want to create an MQTT sink where I can publish the results instead of using the print() method. I tried to create an MqttSink, but I am getting an "object not serializable" error. I then based the code on this blog post, but I cannot find the send method that I created on the MqttSink object.
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf}
import org.fusesource.mqtt.client.QoS
import org.sense.spark.util.{MqttSink, TaxiRideSource}
object TaxiRideCountCombineByKey {
val mqttTopic: String = "spark-mqtt-sink"
val qos: QoS = QoS.AT_LEAST_ONCE
def main(args: Array[String]): Unit = {
val outputMqtt: Boolean = if (args.length > 0 && args(0).equals("mqtt")) true else false
// Create a local StreamingContext with two working thread and batch interval of 1 second.
// The master requires 4 cores to prevent from a starvation scenario.
val sparkConf = new SparkConf()
.setAppName("TaxiRideCountCombineByKey")
.setMaster("local[4]")
val ssc = new StreamingContext(sparkConf, Seconds(1))
val stream = ssc.receiverStream(new TaxiRideSource())
val driverStream = stream.map(taxiRide => (taxiRide.driverId, 1))
val countStream = driverStream.combineByKey(
(v) => (v, 1), //createCombiner
(acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1), //mergeValue
(acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2), // mergeCombiners
new HashPartitioner(3)
)
if (outputMqtt) {
println("Use the command below to consume data:")
println("mosquitto_sub -h 127.0.0.1 -p 1883 -t " + mqttTopic)
val mqttSink = ssc.sparkContext.broadcast(MqttSink)
countStream.foreachRDD { rdd =>
rdd.foreach { message =>
mqttSink.value.send(mqttTopic, message.toString()) // "send" method does not exist
}
}
} else {
countStream.print()
}
ssc.start() // Start the computation
ssc.awaitTermination() // Wait for the computation to terminate
}
}
import org.fusesource.mqtt.client.{FutureConnection, MQTT, QoS}
class MqttSink(createProducer: () => FutureConnection) extends Serializable {
lazy val producer = createProducer()
def send(topic: String, message: String): Unit = {
producer.publish(topic, message.toString().getBytes, QoS.AT_LEAST_ONCE, false)
}
}
object MqttSink {
def apply(): MqttSink = {
val f = () => {
val mqtt = new MQTT()
mqtt.setHost("localhost", 1883)
val producer = mqtt.futureConnection()
producer.connect().await()
sys.addShutdownHook {
producer.disconnect().await()
}
producer
}
new MqttSink(f)
}
}
As an alternative you could also use Structured Streaming with the Apache Bahir Spark extension for MQTT.
Complete Example
build.sbt:
name := "MQTT_StructuredStreaming"
version := "0.1"
libraryDependencies += "org.apache.spark" % "spark-core_2.12" % "2.4.4"
libraryDependencies += "org.apache.spark" % "spark-sql_2.12" % "2.4.4"
libraryDependencies += "org.apache.spark" % "spark-streaming_2.12" % "2.4.4" % "provided"
libraryDependencies += "org.apache.bahir" % "spark-sql-streaming-mqtt_2.12" % "2.4.0"
Main.scala
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
object Main extends App {
val brokerURL = "tcp://localhost:1883"
val subTopicName = "/my/subscribe/topic"
val pubTopicName = "/my/publish/topic"
val spark: SparkSession = SparkSession
.builder
.appName("MQTT_StructStreaming")
.master("local[*]")
.config("spark.sql.streaming.checkpointLocation", "/my/sparkCheckpoint/dir")
.getOrCreate
spark.sparkContext.setLogLevel("ERROR")
import spark.implicits._
val lines: Dataset[String] = spark.readStream
.format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider")
.option("topic", subTopicName)
.option("clientId", "some-client-id")
.option("persistence", "memory")
.load(brokerURL)
.selectExpr("CAST(payload AS STRING)").as[String]
// Split the lines into words
val words: Dataset[String] = lines.as[String].flatMap(_.split(";"))
// Generate running word count
val wordCounts: DataFrame = words.groupBy("value").count()
// Start running the query that prints the running counts to the console
val query: StreamingQuery = wordCounts.writeStream
.format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSinkProvider")
.outputMode("complete")
.option("topic", pubTopicName)
.option("brokerURL", brokerURL)
.start
query.awaitTermination()
}
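To feed the source topic while testing, any MQTT publisher will do; for instance, here is a minimal sketch using the same fusesource client that appears elsewhere in this thread (host, port, and topic are the assumed values from this example):
import org.fusesource.mqtt.client.{MQTT, QoS}
object MqttTestPublisher extends App {
  val mqtt = new MQTT()
  mqtt.setHost("localhost", 1883)
  val connection = mqtt.futureConnection()
  connection.connect().await()
  // The streaming query splits the payload on ';', so several words fit in one message
  connection.publish("/my/subscribe/topic", "hello;world;hello".getBytes, QoS.AT_LEAST_ONCE, false).await()
  connection.disconnect().await()
}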
This is a working example based on the blog entry Spark and Kafka integration patterns. Note that it broadcasts MqttSink() (an instance created by the companion object's apply method) rather than the MqttSink companion object itself, which is why the send method can now be resolved.
package org.sense.spark.app
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf}
import org.fusesource.mqtt.client.QoS
import org.sense.spark.util.{MqttSink, TaxiRideSource}
object TaxiRideCountCombineByKey {
val mqttTopic: String = "spark-mqtt-sink"
val qos: QoS = QoS.AT_LEAST_ONCE
def main(args: Array[String]): Unit = {
val outputMqtt: Boolean = if (args.length > 0 && args(0).equals("mqtt")) true else false
// Create a local StreamingContext with two working thread and batch interval of 1 second.
// The master requires 4 cores to prevent from a starvation scenario.
val sparkConf = new SparkConf()
.setAppName("TaxiRideCountCombineByKey")
.setMaster("local[4]")
val ssc = new StreamingContext(sparkConf, Seconds(1))
val stream = ssc.receiverStream(new TaxiRideSource())
val driverStream = stream.map(taxiRide => (taxiRide.driverId, 1))
val countStream = driverStream.combineByKey(
(v) => (v, 1), //createCombiner
(acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1), //mergeValue
(acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2), // mergeCombiners
new HashPartitioner(3)
)
if (outputMqtt) {
println("Use the command below to consume data:")
println("mosquitto_sub -h 127.0.0.1 -p 1883 -t " + mqttTopic)
val mqttSink = ssc.sparkContext.broadcast(MqttSink())
countStream.foreachRDD { rdd =>
rdd.foreach { message =>
mqttSink.value.send(mqttTopic, message.toString())
}
}
} else {
countStream.print()
}
ssc.start() // Start the computation
ssc.awaitTermination() // Wait for the computation to terminate
}
}
package org.sense.spark.util
import org.fusesource.mqtt.client.{FutureConnection, MQTT, QoS}
class MqttSink(createProducer: () => FutureConnection) extends Serializable {
lazy val producer = createProducer()
def send(topic: String, message: String): Unit = {
producer.publish(topic, message.toString().getBytes, QoS.AT_LEAST_ONCE, false)
}
}
object MqttSink {
def apply(): MqttSink = {
val f = () => {
val mqtt = new MQTT()
mqtt.setHost("localhost", 1883)
val producer = mqtt.futureConnection()
producer.connect().await()
sys.addShutdownHook {
producer.disconnect().await()
}
producer
}
new MqttSink(f)
}
}
package org.sense.spark.util
import java.io.{BufferedReader, FileInputStream, InputStreamReader}
import java.nio.charset.StandardCharsets
import java.util.Locale
import java.util.zip.GZIPInputStream
import org.apache.spark.storage._
import org.apache.spark.streaming.receiver._
import org.joda.time.DateTime
import org.joda.time.format.{DateTimeFormat, DateTimeFormatter}
case class TaxiRide(rideId: Long, isStart: Boolean, startTime: DateTime, endTime: DateTime,
startLon: Float, startLat: Float, endLon: Float, endLat: Float,
passengerCnt: Short, taxiId: Long, driverId: Long)
object TimeFormatter {
val timeFormatter: DateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withLocale(Locale.US).withZoneUTC()
}
class TaxiRideSource extends Receiver[TaxiRide](StorageLevel.MEMORY_AND_DISK_2) {
val dataFilePath = "/home/flink/nycTaxiRides.gz";
var dataRateListener: DataRateListener = _
/**
* Start the thread that receives data over a connection
*/
def onStart() {
dataRateListener = new DataRateListener()
dataRateListener.start()
new Thread("TaxiRide Source") {
override def run() {
receive()
}
}.start()
}
def onStop() {}
/**
* Periodically generate a TaxiRide event and regulate the emission frequency
*/
private def receive() {
while (!isStopped()) {
val gzipStream = new GZIPInputStream(new FileInputStream(dataFilePath))
val reader: BufferedReader = new BufferedReader(new InputStreamReader(gzipStream, StandardCharsets.UTF_8))
try {
var line: String = null
do {
// start time before reading the line
val startTime = System.nanoTime
// read the line on the file and yield the object
line = reader.readLine
if (line != null) {
val taxiRide: TaxiRide = getTaxiRideFromString(line)
store(taxiRide)
}
// regulate frequency of the source
dataRateListener.busySleep(startTime)
} while (line != null)
} finally {
reader.close
}
}
}
def getTaxiRideFromString(line: String): TaxiRide = {
// println(line)
val tokens: Array[String] = line.split(",")
if (tokens.length != 11) {
throw new RuntimeException("Invalid record: " + line)
}
val rideId: Long = tokens(0).toLong
val (isStart, startTime, endTime) = tokens(1) match {
case "START" => (true, DateTime.parse(tokens(2), TimeFormatter.timeFormatter), DateTime.parse(tokens(3), TimeFormatter.timeFormatter))
case "END" => (false, DateTime.parse(tokens(2), TimeFormatter.timeFormatter), DateTime.parse(tokens(3), TimeFormatter.timeFormatter))
case _ => throw new RuntimeException("Invalid record: " + line)
}
val startLon: Float = if (tokens(4).length > 0) tokens(4).toFloat else 0.0f
val startLat: Float = if (tokens(5).length > 0) tokens(5).toFloat else 0.0f
val endLon: Float = if (tokens(6).length > 0) tokens(6).toFloat else 0.0f
val endLat: Float = if (tokens(7).length > 0) tokens(7).toFloat else 0.0f
val passengerCnt: Short = tokens(8).toShort
val taxiId: Long = tokens(9).toLong
val driverId: Long = tokens(10).toLong
TaxiRide(rideId, isStart, startTime, endTime, startLon, startLat, endLon, endLat, passengerCnt, taxiId, driverId)
}
}

How can I reduce lag in a Kafka consumer/producer

I am looking to improve this Scala Kafka code. What should I do in the consumer and the producer to reduce lag?
This is the code I got from someone.
I know this code is not difficult, but I have never seen Scala code before and I am just beginning to learn about Kafka, so I have a hard time finding the problem.
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import scala.util.Try
class KafkaMessenger(val servers: String, val sender: String) {
val props = new Properties()
props.put("bootstrap.servers", servers)
props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
props.put("producer.type", "async")
val producer = new KafkaProducer[String, String](props)
def send(topic: String, message: Any): Try[Unit] = Try {
producer.send(new ProducerRecord(topic, message.toString))
}
def close(): Unit = producer.close()
}
object KafkaMessenger {
def apply(host: String, topic: String, sender: String, message: String): Unit = {
val messenger = new KafkaMessenger(host, sender)
messenger.send(topic, message)
messenger.close()
}
}
and this is consumer code.
import java.util.Properties
import java.util.concurrent.Executors
import com.satreci.g2gs.common.impl.utils.KafkaMessageTypes._
import kafka.admin.AdminUtils
import kafka.consumer._
import kafka.utils.ZkUtils
import org.I0Itec.zkclient.{ZkClient, ZkConnection}
import org.slf4j.LoggerFactory
import scala.language.postfixOps
class KafkaListener(val zookeeper: String,
val groupId: String,
val topic: String,
val handleMessage: ByteArrayMessage => Unit,
val workJson: String = ""
) extends AutoCloseable {
private lazy val logger = LoggerFactory.getLogger(this.getClass)
val config: ConsumerConfig = createConsumerConfig(zookeeper, groupId)
val consumer: ConsumerConnector = Consumer.create(config)
val sessionTimeoutMs: Int = 10 * 1000
val connectionTimeoutMs: Int = 8 * 1000
val zkClient: ZkClient = ZkUtils.createZkClient(zookeeper, sessionTimeoutMs, connectionTimeoutMs)
val zkUtils = new ZkUtils(zkClient, new ZkConnection(zookeeper), false)
def createConsumerConfig(zookeeper: String, groupId: String): ConsumerConfig = {
val props = new Properties()
props.put("zookeeper.connect", zookeeper)
props.put("group.id", groupId)
props.put("auto.offset.reset", "smallest")
props.put("zookeeper.session.timeout.ms", "5000")
props.put("zookeeper.sync.time.ms", "200")
props.put("auto.commit.interval.ms", "1000")
props.put("partition.assignment.strategy", "roundrobin")
new ConsumerConfig(props)
}
def run(threadCount: Int = 1): Unit = {
val streams = consumer.createMessageStreamsByFilter(Whitelist(topic), threadCount)
if (!AdminUtils.topicExists(zkUtils, topic)) {
AdminUtils.createTopic(zkUtils, topic, 1, 1)
}
val executor = Executors.newFixedThreadPool(threadCount)
for (stream <- streams) {
executor.submit(new MessageConsumer(stream))
}
logger.debug(s"KafkaListener start with ${threadCount}thread (topic=$topic)")
}
override def close(): Unit = {
consumer.shutdown()
logger.debug(s"$topic Listener close")
}
class MessageConsumer(val stream: MessageStream) extends Runnable {
override def run(): Unit = {
val it = stream.iterator()
while (it.hasNext()) {
val message = it.next().message()
if (workJson == "") {
handleMessage(message)
}
else {
val strMessage = new String(message)
val newMessage = s"$strMessage/#/$workJson"
val outMessage = newMessage.toCharArray.map(c => c.toByte)
handleMessage(outMessage)
}
}
}
}
}
Specifically, I want to change the structure so that a new KafkaProducer object is not created every time I send a message. There also seem to be many other possible improvements for reducing lag.
Increase the number of consumer (KafkaListener) instances with the same group id.
This will increase the consumption rate, so the lag between the producer's writes and the consumer will eventually shrink. On the producer side, reusing a single long-lived KafkaProducer instead of creating one per message also helps; see the sketch below.
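A minimal sketch of that producer-side change: the same settings as the question's KafkaMessenger, but one long-lived KafkaProducer shared by all callers instead of building and closing one per message (the object name and broker address are illustrative):
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
object SharedKafkaMessenger {
  // Creating a KafkaProducer is expensive (connections, metadata fetch, buffers),
  // so do it once per JVM rather than once per message.
  private lazy val producer: KafkaProducer[String, String] = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092") // assumed broker address
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val p = new KafkaProducer[String, String](props)
    sys.addShutdownHook(p.close())
    p
  }
  def send(topic: String, message: String): Unit =
    producer.send(new ProducerRecord[String, String](topic, message))
}
Callers then call SharedKafkaMessenger.send(topic, message) instead of going through KafkaMessenger.apply, which constructs and closes a new producer for every single message.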

My Kafka producer code runs fine without any exception, but no data is sent to the brokers

I have created the topic "test_topic_02", and writing data to the broker manually succeeded. But when I produce data with the following code, no data is written to the broker.
object KafkaProducer {
private val log: slf4j.Logger = LoggerFactory.getLogger(this.getClass)
Logger.getLogger("org").setLevel(Level.WARN)
def main(args: Array[String]): Unit = {
val topic = "test_topic_02"
val brokers = "10.31.31.45:9092"
val props = new Properties()
var partition: Int = 0
//partition
val list: List[Int] = List(0, 1, 2, 3, 4)
props.put("bootstrap.servers", brokers)
props.put("client.id", "KafkaProducer")
props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
//create producer
val producer: KafkaProducer[String, String] = new KafkaProducer[String, String](props)
while (true) {
for (i <- 1 to 100) {
for (j <- list) {
partition = j
try {
val producerData = new ProducerRecord[String, String](topic, Integer.valueOf(partition), "message from simulator_" + Integer.toString(i), Integer.toString(i))
val future: Future[RecordMetadata] = producer.send(producerData)
println(producerData)
//implicit number to long
future.get(long2Long(3), TimeUnit.SECONDS)
println("Message Sent Successfully")
Thread.sleep(1000)
} catch {
case e : Exception =>
log.error("Launching Failed")
}
}
}
}
println("Stop Producing Data")
producer.close()
}
}
You should use the overload of send that takes a callback when writing to Kafka; in the callback you can check for an exception. As written, your catch block swallows the exception and only logs "Launching Failed", so the real cause of the failure never shows up.
ProducerRecord<byte[],byte[]> record = new ProducerRecord<byte[],byte[]>("the-topic", key, value);
producer.send(record,
new Callback() {
public void onCompletion(RecordMetadata metadata, Exception e) {
if(e != null) {
e.printStackTrace();
} else {
System.out.println("The offset of the record we just sent is: " + metadata.offset());
}
}
});
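A minimal Scala equivalent, standalone and using the topic and broker address from the question (everything else is illustrative):
import java.util.Properties
import org.apache.kafka.clients.producer.{Callback, KafkaProducer, ProducerRecord, RecordMetadata}
object CallbackProducerSketch extends App {
  val props = new Properties()
  props.put("bootstrap.servers", "10.31.31.45:9092")
  props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
  props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
  val producer = new KafkaProducer[String, String](props)
  val record = new ProducerRecord[String, String]("test_topic_02", "key-1", "message from simulator_1")
  // The callback runs when the broker acknowledges (or rejects) the record,
  // so the real cause of a silent failure surfaces here instead of being swallowed.
  producer.send(record, new Callback {
    override def onCompletion(metadata: RecordMetadata, e: Exception): Unit = {
      if (e != null) e.printStackTrace()
      else println("The offset of the record we just sent is: " + metadata.offset())
    }
  })
  producer.flush()
  producer.close()
}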