I am creating a simple Kafka consumer as below, following this gist: https://gist.github.com/akhil/6dfda8a04e33eff91a20 .
In that link, asScala is used to print the consumed records, but it is not recognised in my project. Also, how do I iterate over ConsumerRecords[String, String], which is the return type of poll()?
import java.util
import java.util.Properties
import org.apache.kafka.clients.consumer.{ConsumerRecords, KafkaConsumer}
object KafkaConsumerEx extends App {
val topic_name = "newtopic55"
val consumer_group = "KafkaConsumerBatch"
val prot = new Properties()
prot.put("bootstrap.servers","localhost:9092")
prot.put("group.id",consumer_group)
prot.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
prot.put("value.deserializer","org.apache.kafka.common.serialization.StringDeserializer")
val kfk_consumer = new KafkaConsumer[String,String](prot)
kfk_consumer.subscribe(util.Collections.singleton(topic_name))
println("here")
while(true){
val consumer_record : ConsumerRecords[String, String] = kfk_consumer.poll(100)
println("records count : " + consumer_record.count())
println("records partitions: " + consumer_record.partitions())
consumer_record.iterator().
}
}
Thanks in advance.
You can easily do that
for (record <- consumer_record.iterator()) {
println(s"Here's your $record")
}
Remember to add this import:
import scala.collection.JavaConversions._
Adding another answer, since scala.collection.JavaConversions has been deprecated, as mentioned here.
For this question, the code could look like:
import scala.collection.JavaConverters._
for (record <- asScalaIterator(consumer_record.iterator)) {
println(s"Here's your $record")
}
while (true) {
  val consumer_records = kfk_consumer.poll(100)
  val record_iter = consumer_records.iterator()
  while (record_iter.hasNext()) {
    val record = record_iter.next()
    println("records partitions: " + record.partition() +
      ", records_data: " + record.value())
  }
}
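As a side note, on newer Kafka clients (2.0+) the poll(Long) overload is deprecated in favour of poll(Duration). A minimal sketch of the same loop with that change, assuming the kfk_consumer from the question is in scope:
import java.time.Duration
import scala.collection.JavaConverters._

// Same consumption loop, using the non-deprecated poll(Duration) overload
// and asScala to iterate the returned ConsumerRecords.
while (true) {
  val records = kfk_consumer.poll(Duration.ofMillis(100))
  for (record <- records.asScala) {
    println("partition: " + record.partition() + ", value: " + record.value())
  }
}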
I am trying to create a JSON dataset every 500 ms and push it to a Kafka topic so that I can set up some windows downstream and perform computations. Below is my code:
package KafkaAsSource
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.datastream.DataStream
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.Semantic
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaProducer}
import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper
import java.time.format.DateTimeFormatter
import java.time.LocalDateTime
import java.util.{Optional, Properties}
object PushingDataToKafka {
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setMaxParallelism(256)
env.enableCheckpointing(5000)
val stream: DataStream[String] = env.fromElements(createData())
stream.addSink(sendToTopic(stream))
}
def getProperties(): Properties = {
val properties = new Properties()
properties.setProperty("bootstrap.servers", "localhost:9092")
properties.setProperty("zookeeper.connect", "localhost:2181")
return properties
}
def createData(): String = {
val minRange: Int = 0
val maxRange: Int = 1000
var jsonData = ""
for (a <- minRange to maxRange) {
jsonData = "{\n \"id\":\"" + a + "\",\n \"Category\":\"Flink\",\n \"eventTime\":\"" + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS").format(LocalDateTime.now) + "\"\n \n}"
println(jsonData)
Thread.sleep(500)
}
return jsonData
}
def sendToTopic(): Properties = {
val producer = new FlinkKafkaProducer[String](
"topic"
,
new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema())
,
getProperties(),
FlinkKafkaProducer.Semantic.EXACTLY_ONCE
)
return producer
}
}
It gives me below error:
type mismatch;
found : Any
required: org.apache.flink.streaming.api.functions.sink.SinkFunction[String]
stream.addSink(sendToTopic())
Modified Code:
object FlinkTest {
def main(ars: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment()
env.setMaxParallelism(256)
var stream = env.fromElements("")
//env.enableCheckpointing(5000)
//val stream: DataStream[String] = env.fromElements("hey mc", "1")
val myProducer = new FlinkKafkaProducer[String](
"maddy", // target topic
new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()), // serialization schema
getProperties(), // producer config
FlinkKafkaProducer.Semantic.EXACTLY_ONCE)
val minRange: Int = 0
val maxRange: Int = 10
var jsonData = ""
for (a <- minRange to maxRange) {
jsonData = "{\n \"id\":\"" + a + "\",\n \"Category\":\"Flink\",\n \"eventTime\":\"" + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS").format(LocalDateTime.now) + "\"\n \n}"
println(a)
Thread.sleep(500)
stream = env.fromElements(jsonData)
println(jsonData)
stream.addSink(myProducer)
}
env.execute("hey")
}
def getProperties(): Properties = {
val properties = new Properties()
properties.setProperty("bootstrap.servers", "localhost:9092")
properties.setProperty("zookeeper.connect", "localhost:2181")
return properties
}
/*
def createData(): String = {
val minRange: Int = 0
val maxRange: Int = 10
var jsonData = ""
for (a <- minRange to maxRange) {
jsonData = "{\n \"id\":\"" + a + "\",\n \"Category\":\"Flink\",\n \"eventTime\":\"" + DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS").format(LocalDateTime.now) + "\"\n \n}"
Thread.sleep(500)
}
return jsonData
}
*/
}
The modified code gets the data into the Kafka topic, but it doesn't retain the order. What am I doing wrong in the loop? Also, I had to downgrade Flink from 1.13.5 to 1.12.2.
I was initially using Flink 1.13.5 with the 2.11 connectors and Scala 2.11. What exactly am I missing here?
A couple of things about this loop:
for (a <- minRange to maxRange) {
jsonData =
"{\n \"id\":\"" + a + "\",\n \"Category\":\"Flink\",\n \"eventTime\":\""
+ DateTimeFormatter
.ofPattern("yyyy-MM-dd HH:mm:ss.SSS")
.format(LocalDateTime.now) + "\"\n \n}"
println(a)
Thread.sleep(500)
stream = env.fromElements(jsonData)
println(jsonData)
stream.addSink(myProducer)
}
The sleep is happening in the Flink client, and only affects how long it takes the client to assemble the job graph before submitting it to the cluster. It has no effect on how the job runs.
This loop is creating 10 separate pipelines that will run independently, in parallel, all producing to the same Kafka topic. Those pipelines are going to race against each other.
To get the behavior you're looking for (a global ordering across a single pipeline) you'll want to produce all of the events from a single source (in order, of course), and run the job with a parallelism of one. Something like this would do it:
import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _}
object FlinkTest {
def main(ars: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment()
env.setParallelism(1)
val myProducer = ...
val jsonData = (i: Long) => ...
env.fromSequence(0, 9)
.map(i => jsonData(i))
.addSink(myProducer)
env.execute()
}
}
You can leave maxParallelism at 256 (or at its default value of 128); it's not particularly relevant here. The maxParallelism is the number of hash buckets that keyBy will hash the keys into, and it defines an upper limit on the scalability of the job.
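For completeness, here is one way the elided pieces could be filled in, reusing the topic name, producer settings, and JSON format from the question. Treat this as a sketch under those assumptions, not a definitive implementation:
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer
import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper

object FlinkTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1) // a single pipeline, so the events keep their order

    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "localhost:9092")

    val myProducer = new FlinkKafkaProducer[String](
      "maddy", // target topic, as in the question
      new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()),
      properties,
      FlinkKafkaProducer.Semantic.EXACTLY_ONCE)

    // Build one JSON string per sequence number, in the same format as the question.
    val jsonData = (i: Long) =>
      "{ \"id\":\"" + i + "\", \"Category\":\"Flink\", \"eventTime\":\"" +
        DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS").format(LocalDateTime.now) + "\" }"

    env.fromSequence(0, 9)
      .map(i => jsonData(i))
      .addSink(myProducer)

    env.execute()
  }
}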
I am working on a Scala application that uses Kafka. I need to consume messages from a Kafka topic. Since I am writing a test case, I need to get a record and make some assertions on it so that my test case passes. I am using the following code to consume Kafka messages:
import java.util.{Collections, Properties}
import java.util.regex.Pattern
import org.apache.kafka.clients.consumer.KafkaConsumer
import scala.collection.JavaConverters._
object KafkaConsumerSubscribeApp extends App {
val props:Properties = new Properties()
props.put("group.id", "test")
props.put("bootstrap.servers","localhost:9092")
props.put("key.deserializer",
"org.apache.kafka.common.serialization.StringDeserializer")
props.put("value.deserializer",
"org.apache.kafka.common.serialization.StringDeserializer")
props.put("enable.auto.commit", "true")
props.put("auto.commit.interval.ms", "1000")
val consumer = new KafkaConsumer(props)
val topics = List("topic_text")
try {
consumer.subscribe(topics.asJava)
while (true) {
val records = consumer.poll(10)
for (record <- records.asScala) {
println("Topic: " + record.topic() +
",Key: " + record.key() +
",Value: " + record.value() +
", Offset: " + record.offset() +
", Partition: " + record.partition())
}
}
}catch{
case e:Exception => e.printStackTrace()
}finally {
consumer.close()
}
}
There are two problems I am facing with this code. First, IntelliJ warns that the poll method is deprecated. How can I modify this code to fix the deprecation? Second, I want this method to return the message it gets from the Kafka topic; that message is in record.value(). How can I return it? Since while(true) is used, this is an endless loop and it will keep listening for messages from the topic. How can I return record.value() from this method so that I can use the data I got from the topic in other methods?
If you want to return the values, you have to modify the code as follows. To remove the poll deprecation warning, you just need to pass a Duration instead of a plain timeout. For more info, see the KafkaConsumer documentation.
import java.time.Duration
import java.util
import scala.collection.JavaConverters._

def readFromKafka(/* arguments */): List[String] = {
  // other code
  consumer.subscribe(util.Collections.singletonList(topic))
  consumer.poll(Duration.ofMillis(5000)).asScala.toList.map(_.value())
}
To keep reading from Kafka, you have to keep calling the function; for that, you can use a scheduler.
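A minimal sketch of that scheduling idea, assuming a KafkaConsumer[String, String] named consumer that is already subscribed as in the question, and a hypothetical handle callback for the values (KafkaConsumer is not thread-safe, so only the scheduler thread should touch it):
import java.time.Duration
import java.util.concurrent.{Executors, TimeUnit}
import scala.collection.JavaConverters._

// Poll on a single dedicated thread every 5 seconds and hand the values to a callback.
def scheduleRead(handle: List[String] => Unit): Unit = {
  val scheduler = Executors.newSingleThreadScheduledExecutor()
  scheduler.scheduleAtFixedRate(new Runnable {
    override def run(): Unit = {
      val values = consumer.poll(Duration.ofMillis(1000)).asScala.toList.map(_.value())
      if (values.nonEmpty) handle(values)
    }
  }, 0, 5, TimeUnit.SECONDS)
}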
I hope it will help.
Cannot resolve symbol for Foreach
import java.util._
import org.apache.kafka.clients.consumer._
import org.apache.kafka.common.serialization.Deserializer
object ConsumerExample {
def main(args: Array[String]): Unit = {
val T_Name = "CarSensor"
val T_Group_Name = "CarSensorGroup"
val props = new Properties()
props.put("bootstrap.servers", "localhost:9092,localhost:9093,localhost:9094")
props.put("group.id",T_Group_Name)
props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
val Kafka_Consumer=new KafkaConsumer[String,String](props)
Kafka_Consumer.subscribe (Arrays.asList(T_Name))
while(true)
{
val Consumer_Record=Kafka_Consumer.poll(100) //ConsumerRecords Object
// val RecordList=Consumer_Record.toString
for( i <- Consumer_Record)
{ //**This place is where Cannot resolve symbol for Foreach issue shows up for <- symbol.**
println("Supplier id = "+String.valueOf(i.value().getID())+ "Supplier name = " +i.value().getID())
}
}
}
}
I have used the <- symbol in many examples before and it worked. I thought it was an issue with IntelliJ and restarted it. I guess it's a problem with the object being cast to a different type.
Consumer_Record.forEach(i => {
println("Supplier id = "+String.valueOf(i.value().getID())+ "Supplier name = " +i.value().getID())
})
works fine for me.
Except that String doesn't have a getID() method.
You can use for(i <- Consumer_Record.asScala) if you want for syntax, but you have to add import scala.collection.JavaConverters._.
val Kafka_Consumer=new KafkaConsumer[String,String](props)
Kafka_Consumer.subscribe(Arrays.asList(T_Name))
while(true) {
val Consumer_Record=Kafka_Consumer.poll(100) //ConsumerRecords Object
for( i <- Consumer_Record.asScala) {
println("Supplier id = "+String.valueOf(i.value())+ " Supplier name = " +i.key())
}
}
I have created a Kafka producer using Node.js which is basically pushing the live data it receives from Upstox into a Kafka topic. The producer snippet looks something like this:
upstox.on("liveFeed", function(message) {
//message for live feed
var data = JSON.stringify(message);
var payload = [{
topic : 'live-feed',
message: data,
attributes: 1
}];
producer.send(payload, function(error, result) {
console.info('Sent payload to Kafka: ', payload);
if (error) {
console.error(error);
} else {
console.log('result: ', result)
}
});
});
It's giving me the live feed in the following format:
topic: live-feed,
message:{live-feed data},
attributes:1
Now I'm trying to code a spark streaming consumer which streams the data produced by this producer. I came up with something like this:
package com.senpuja.datastream
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
object LiveFeedStream {
def main(args: Array[String]): Unit = {
val brokers = util.Try(args(0)).getOrElse("localhost:9092")
val inTopic = util.Try(args(1)).getOrElse("live-feed")
val sparkConf = new SparkConf()
val spark = new SparkContext(sparkConf)
val streamCtx = new StreamingContext(spark, Seconds(10))
val inTopicSet = Set(inTopic)
val kafkaParams = Map[String, String](
"bootstrap.servers" -> brokers,
"key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
"value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer"
)
val msg = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
streamCtx,
kafkaParams,
inTopicSet
)
msg.print()
streamCtx.start()
streamCtx.awaitTermination()
}
}
But when I submit the code, I get the following output which is just null:
{null}, {null}
{null}, {null}
{null}, {null}
{null}, {null}
{null}, {null}
I want to retrieve the message part from the producer topic. I think it has something to do with the key-value pairing, but I'm not able to figure out a solution. Any help would be really appreciated!
Add enable.auto.commit = false to the Kafka parameters and try again.
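If you try that, the flag goes into the kafkaParams map the question already builds; a sketch only (and, as the answer below shows, it was not the root cause here):
// Same kafkaParams as in the question, with auto-commit disabled.
val kafkaParams = Map[String, String](
  "bootstrap.servers" -> brokers,
  "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
  "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
  "enable.auto.commit" -> "false"
)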
I found that the problem was that I was passing the message directly, while the Spark Streaming code was expecting a key-value pair. So I used KeyedMessage to produce a key-value pair.
upstox.on("liveFeed", function(message) {
// message for live feed
var data = JSON.stringify(message);
var km = new KeyedMessage(Math.floor(Math.random() * 10000), data);
var payload = [{
topic : 'live-feed',
messages: km
}];
producer.send(payload, function(error, result) {
console.info('Sent payload to Kafka: ', payload);
if (error) {
console.error(error);
} else {
console.log('result: ', result)
}
});
});
It solved my problem.
I'm new to Spark.
What I'm trying to do is retrieving all related documents from a Couchbase View with a given Id from Spark Kafka Streaming.
When I try to get these documents from the Spark context, I always get the error "Task not serializable".
From there, I understand that I can't use nested RDDs or multiple Spark contexts in the same JVM, but I want to find a workaround.
Here is my current approach:
package xxx.xxx.xxx
import com.couchbase.client.java.document.JsonDocument
import com.couchbase.client.java.document.json.JsonObject
import com.couchbase.client.java.view.ViewQuery
import com.couchbase.spark._
import org.apache.spark.broadcast.Broadcast
import _root_.kafka.serializer.StringDecoder
import org.apache.kafka.clients.producer.{ProducerRecord, KafkaProducer}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
object Streaming {
// Method to create a Json document from Key and Value
def CreateJsonDocument(s: (String, String)): JsonDocument = {
//println("- Parsing document")
//println(s._1)
//println(s._2)
val return_doc = JsonDocument.create(s._1, JsonObject.fromJson(s._2))
(return_doc)
//(return_doc.content().getString("click"), return_doc)
}
def main(args: Array[String]): Unit = {
// get arguments as key value
val arguments = args.grouped(2).collect { case Array(k,v) => k.replaceAll("--", "") -> v }.toMap
println("----------------------------")
println("Arguments passed to class")
println("----------------------------")
println("- Arguments")
println(arguments)
println("----------------------------")
// If the length of the passed arguments is less than 4
if (arguments.get("brokers") == null || arguments.get("topics") == null) {
// Provide system error
System.err.println("Usage: --brokers <broker1:9092> --topics <topic1,topic2,topic3>")
}
// Create the Spark configuration with app name
val conf = new SparkConf().setAppName("Streaming")
// Create the Spark context
val sc = new SparkContext(conf)
// Create the Spark Streaming Context
val ssc = new StreamingContext(sc, Seconds(2))
// Setup the broker list
val kafkaParams = Map("metadata.broker.list" -> arguments.getOrElse("brokers", ""))
// Setup the topic list
val topics = arguments.getOrElse("topics", "").split(",").toSet
// Get the message stream from kafka
val docs = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
docs
// Separate the key and the content
.map({ case (key, value) => (key, value) })
// Parse the content to transform in JSON Document
.map(s => CreateJsonDocument(s))
// Call the view to all related Review Application Documents
//.map(messagedDoc => RetrieveAllReviewApplicationDocs(messagedDoc, sc))
.map(doc => {
sc.couchbaseView(ViewQuery.from("my-design-document", "stats").key(doc.content.getString("id"))).collect()
})
.foreachRDD(
rdd => {
//Create a report of my documents and store it in Couchbase
rdd.foreach( println )
}
)
// Start the streaming context
ssc.start()
// Wait for termination and catch error if there is a problem in the process
ssc.awaitTermination()
}
}
I found the solution by using the Couchbase client directly instead of the Couchbase Spark context.
I don't know if it is the best way to go performance-wise, but I can retrieve the docs I need for the computation.
package xxx.xxx.xxx
import com.couchbase.client.java.{Bucket, Cluster, CouchbaseCluster}
import com.couchbase.client.java.document.JsonDocument
import com.couchbase.client.java.document.json.JsonObject
import com.couchbase.client.java.view.{ViewResult, ViewQuery}
import _root_.kafka.serializer.StringDecoder
import org.apache.kafka.clients.producer.{ProducerRecord, KafkaProducer}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
object Streaming {
// Method to create a Json document from Key and Value
def CreateJsonDocument(s: (String, String)): JsonDocument = {
//println("- Parsing document")
//println(s._1)
//println(s._2)
val return_doc = JsonDocument.create(s._1, JsonObject.fromJson(s._2))
(return_doc)
//(return_doc.content().getString("click"), return_doc)
}
// Method to retrieve related documents
def RetrieveDocs (doc: JsonDocument, arguments: Map[String, String]): ViewResult = {
val cbHosts = arguments.getOrElse("couchbase-hosts", "")
val cbBucket = arguments.getOrElse("couchbase-bucket", "")
val cbPassword = arguments.getOrElse("couchbase-password", "")
val cluster: Cluster = CouchbaseCluster.create(cbHosts)
val bucket: Bucket = cluster.openBucket(cbBucket, cbPassword)
val docs : ViewResult = bucket.query(ViewQuery.from("my-design-document", "my-view").key(doc.content().getString("id")))
cluster.disconnect()
println(docs)
(docs)
}
def main(args: Array[String]): Unit = {
// get arguments as key value
val arguments = args.grouped(2).collect { case Array(k,v) => k.replaceAll("--", "") -> v }.toMap
println("----------------------------")
println("Arguments passed to class")
println("----------------------------")
println("- Arguments")
println(arguments)
println("----------------------------")
// If the length of the passed arguments is less than 4
if (arguments.get("brokers") == null || arguments.get("topics") == null) {
// Provide system error
System.err.println("Usage: --brokers <broker1:9092> --topics <topic1,topic2,topic3>")
}
// Create the Spark configuration with app name
val conf = new SparkConf().setAppName("Streaming")
// Create the Spark context
val sc = new SparkContext(conf)
// Create the Spark Streaming Context
val ssc = new StreamingContext(sc, Seconds(2))
// Setup the broker list
val kafkaParams = Map("metadata.broker.list" -> arguments.getOrElse("brokers", ""))
// Setup the topic list
val topics = arguments.getOrElse("topics", "").split(",").toSet
// Get the message stream from kafka
val docs = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
// Get broadcast arguments
val argsBC = sc.broadcast(arguments)
docs
// Separate the key and the content
.map({ case (key, value) => (key, value) })
// Parse the content to transform in JSON Document
.map(s => CreateJsonDocument(s))
// Call the view to all related Review Application Documents
.map(doc => RetrieveDocs(doc, argsBC.value))
.foreachRDD(
rdd => {
//Create a report of my documents and store it in Couchbase
rdd.foreach( println )
}
)
// Start the streaming context
ssc.start()
// Wait for termination and catch error if there is a problem in the process
ssc.awaitTermination()
}
}
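One note on the performance concern: the version above creates and disconnects a CouchbaseCluster for every message, which is expensive. A common workaround (a sketch only; the helper name and wiring are made up) is to keep one lazily-initialised bucket per executor JVM and reuse it:
import com.couchbase.client.java.{Bucket, CouchbaseCluster}

// Hypothetical helper: one lazily-created Couchbase bucket per executor JVM.
// A Scala object is initialised independently on each executor, so the
// connection is never serialised with the Spark task.
object CouchbaseConnection {
  private var bucket: Bucket = _

  def getBucket(hosts: String, name: String, password: String): Bucket = synchronized {
    if (bucket == null) {
      bucket = CouchbaseCluster.create(hosts).openBucket(name, password)
    }
    bucket
  }
}
RetrieveDocs could then ask this helper for the bucket instead of creating and tearing down a cluster per record.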