How to read all the records in a Kafka topic - apache-kafka

I am using Kafka (kafka_2.12-2.1.0) with Spring Kafka on the client side and have got stuck on an issue.
I need to load an in-memory map by reading all the existing messages within a kafka topic. I did this by starting a new consumer (with a unique consumer group id and setting the offset to earliest). Then I iterate over the consumer (poll method) to get all messages and stop when the consumer records become empty.
But I noticed that when I start polling, the first few iterations return empty consumer records and only then do the actual records start coming back. This breaks my logic, because our code concludes that there are no records in the topic.
I have tried a few other ways (such as using offset numbers) but haven't been able to come up with any solution, apart from keeping a separate record somewhere of how many messages the topic holds, so I know when to stop reading.
Any ideas, please?

To my understanding, what you are trying to achieve is to have a map constructed in your application based on the values that are already in a specific topic.
For this task, instead of manually polling the topic, you can use a KTable in the Kafka Streams DSL, which will automatically construct a readable key-value store that is fault tolerant, replication-enabled, and automatically filled with new values.
You can do this simply by calling groupByKey on a stream and then using aggregate.
StreamsBuilder builder = new StreamsBuilder();
KStream<String, Long> myKStream = builder.stream("topic_name", Consumed.with(Serdes.String(), Serdes.Long()));
KTable<String, Long> totalCount = myKStream.groupByKey()
        .aggregate(this::initializer, this::aggregator,
                Materialized.<String, Long, KeyValueStore<Bytes, byte[]>>as("total-count-store")
                        .withKeySerde(Serdes.String()).withValueSerde(Serdes.Long()));
(The actual code may vary depending on the Kafka version, your configuration, etc.)
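If you also need to look those values up from your own code (the in-memory map in the question), you can query the materialized store through Kafka Streams interactive queries. A minimal sketch against the 2.1 API, assuming the store name "total-count-store" from the snippet above and a started KafkaStreams instance called streams (ReadOnlyKeyValueStore and QueryableStoreTypes live in org.apache.kafka.streams.state):
// Assumes "streams" was built from the builder above and streams.start() has been called.
ReadOnlyKeyValueStore<String, Long> view =
        streams.store("total-count-store", QueryableStoreTypes.keyValueStore());
Long current = view.get("some-key");   // latest aggregated value for that key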
Read more about Kafka Streams concepts here.
Then I iterate over the consumer (poll method) to get all messages and stop when the consumer records become empty
Kafka is a message streaming platform. Any data you stream is being updated continuously, and you probably should not use it in a way that expects consumption to stop after a certain number of messages. How will you handle a new message that comes in after you stop the consumer?
Also, the reason you are getting empty records is probably related to the records being in different partitions, etc.
What is your specific use case here? There might be a good way to do it with Kafka semantics itself.

You have to use two consumers: one to load the offsets and another one to read all the records.
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.PartitionInfo;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Properties;
import java.util.Set;
import java.util.stream.Collectors;
public class KafkaRecordReader {
static final Map<String, Object> props = new HashMap<>();
static {
props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class);
props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class);
props.put(ConsumerConfig.CLIENT_ID_CONFIG, "sample-client");
}
public static void main(String[] args) {
final Map<TopicPartition, OffsetInfo> partitionOffsetInfos = getOffsets(Arrays.asList("world", "sample"));
final List<ConsumerRecord<byte[], byte[]>> records = readRecords(partitionOffsetInfos);
System.out.println(partitionOffsetInfos);
System.out.println("Read : " + records.size() + " records");
}
private static List<ConsumerRecord<byte[], byte[]>> readRecords(final Map<TopicPartition, OffsetInfo> offsetInfos) {
final Properties readerProps = new Properties();
readerProps.putAll(props);
readerProps.put(ConsumerConfig.CLIENT_ID_CONFIG, "record-reader");
final Map<TopicPartition, Boolean> partitionToReadStatusMap = new HashMap<>();
offsetInfos.forEach((tp, offsetInfo) -> {
partitionToReadStatusMap.put(tp, offsetInfo.beginOffset == offsetInfo.endOffset);
});
final List<ConsumerRecord<byte[], byte[]>> cachedRecords = new ArrayList<>();
try (final KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(readerProps)) {
consumer.assign(offsetInfos.keySet());
for (final Map.Entry<TopicPartition, OffsetInfo> entry : offsetInfos.entrySet()) {
consumer.seek(entry.getKey(), entry.getValue().beginOffset);
}
boolean close = false;
while (!close) {
final ConsumerRecords<byte[], byte[]> consumerRecords = consumer.poll(Duration.ofMillis(100));
for (final ConsumerRecord<byte[], byte[]> record : consumerRecords) {
cachedRecords.add(record);
final TopicPartition currentTp = new TopicPartition(record.topic(), record.partition());
if (record.offset() + 1 == offsetInfos.get(currentTp).endOffset) {
partitionToReadStatusMap.put(currentTp, true);
}
}
boolean done = true;
for (final Map.Entry<TopicPartition, Boolean> entry : partitionToReadStatusMap.entrySet()) {
done &= entry.getValue();
}
close = done;
}
}
return cachedRecords;
}
private static Map<TopicPartition, OffsetInfo> getOffsets(final List<String> topics) {
final Properties offsetReaderProps = new Properties();
offsetReaderProps.putAll(props);
offsetReaderProps.put(ConsumerConfig.CLIENT_ID_CONFIG, "offset-reader");
final Map<TopicPartition, OffsetInfo> partitionOffsetInfo = new HashMap<>();
try (final KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(offsetReaderProps)) {
final List<PartitionInfo> partitionInfos = new ArrayList<>();
topics.forEach(topic -> partitionInfos.addAll(consumer.partitionsFor(topic)));
final Set<TopicPartition> topicPartitions = partitionInfos
.stream()
.map(x -> new TopicPartition(x.topic(), x.partition()))
.collect(Collectors.toSet());
consumer.assign(topicPartitions);
final Map<TopicPartition, Long> beginningOffsets = consumer.beginningOffsets(topicPartitions);
final Map<TopicPartition, Long> endOffsets = consumer.endOffsets(topicPartitions);
for (final TopicPartition tp : topicPartitions) {
partitionOffsetInfo.put(tp, new OffsetInfo(beginningOffsets.get(tp), endOffsets.get(tp)));
}
}
return partitionOffsetInfo;
}
private static class OffsetInfo {
private final long beginOffset;
private final long endOffset;
private OffsetInfo(long beginOffset, long endOffset) {
this.beginOffset = beginOffset;
this.endOffset = endOffset;
}
@Override
public String toString() {
return "OffsetInfo{" +
"beginOffset=" + beginOffset +
", endOffset=" + endOffset +
'}';
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
OffsetInfo that = (OffsetInfo) o;
return beginOffset == that.beginOffset &&
endOffset == that.endOffset;
}
@Override
public int hashCode() {
return Objects.hash(beginOffset, endOffset);
}
}
}

Adding to the answer from @arshad above: the reason you are not getting the records is that you have already read them. See this answer; using earliest or latest does not matter on the consumer once you have a committed offset for the partition.
I would use a seek to the beginning, or to a particular offset if you know the starting offset.
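A minimal sketch of that approach, reusing the props map from the example above; "my-topic" and knownStartingOffset are placeholders:
try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) {
    TopicPartition tp = new TopicPartition("my-topic", 0);
    consumer.assign(Arrays.asList(tp));
    consumer.seekToBeginning(Arrays.asList(tp));
    // or, if you know where to start: consumer.seek(tp, knownStartingOffset);
    while (true) {
        for (ConsumerRecord<byte[], byte[]> record : consumer.poll(Duration.ofMillis(500))) {
            // rebuild the in-memory map here
        }
    }
}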

Related

KafkaConsumer.poll consumes just from one partition

I created a ConsumerConfig class and a service class that contains a function which gets records from a topic.
consumer.poll consumes just from one partition. I think I need to add a loop, but while(true) is not the best choice in this case.
When I debug, I evaluate "records = consumer.poll(Duration.ofMillis(4000))", and each time I evaluate it, it reads from the next partition.
Here is the code of the ConsumerConfig class and the function that gets records from a specific topic:
@Component
public class ConsumerConfig {
@Autowired
KafkaProperties kafkaProperties;
private Map<String, Object> buildDefaultConfig() {
final Map<String, Object> defaultClientConfig = new HashMap<>();
return defaultClientConfig;
}
@Bean
@RequestScope
public <K, V> KafkaConsumer<K, V> getKafkaConsumer() throws JsonProcessingException {
// Build config
final Map<String, Object> kafkaConsumerConfig = buildDefaultConfig();
String ss = new ObjectMapper().writeValueAsString(kafkaProperties.buildConsumerProperties());
log.info("config {}",ss);
kafkaConsumerConfig.putAll(kafkaProperties.buildAdminProperties());
kafkaConsumerConfig.put("key.deserializer", StringDeserializer.class);
kafkaConsumerConfig.put("value.deserializer", StringDeserializer.class);
kafkaConsumerConfig.put("max.poll.records", 500000);
kafkaConsumerConfig.put("max.poll.interval.ms",600000);
//kafkaConsumerConfig.put("fetch.max.wait.ms",2000);
//kafkaConsumerConfig.put("fetch.min.bytes",50000);
//kafkaConsumerConfig.put("groupId","mygr");
return new KafkaConsumer<K, V>(kafkaConsumerConfig);
}
}
The function that gets records:
public List<Record> recordsFromTopic(final String topic) {
// Find all partitions on topic.
final TopicDescription topicDescription = (TopicDescription) adminService.
topicDescription(topic);
final Collection<Integer> partitions = topicDescription
.partitions()
.stream()
.map(TopicPartitionInfo::partition)
.collect(Collectors.toList());
var list = consumeAllRecordsFromTopic(topic, partitions);
var element = list.stream().filter(Objects::nonNull).map(x -> Record
.builder()
.values(x.value())
.offset(x.offset())
.partition(x.partition()).build())
.collect(Collectors.toList());
return element;
}
public <K, V> List<ConsumerRecord<K, V>> consumeAllRecordsFromTopic(final String topic,
final Collection<Integer> partitionIds) {
// Create topic Partitions
final List<TopicPartition> topicPartitions = partitionIds
.stream()
.map((partitionId) -> new TopicPartition(topic, partitionId))
.collect(Collectors.toList());
final List<ConsumerRecord<K, V>> allRecords = new ArrayList<>();
ConsumerRecords<K, V> records;
// Assign topic partitions
consumer.assign(topicPartitions);
consumer.seekToBeginning(topicPartitions);
// Pull records from kafka
records = consumer.poll(Duration.ofMillis(4000));
records.forEach(allRecords::add);
return allRecords;
}
How many partitions do you have in the source topic, and how many consumers are you running? A while(true) loop (or some equivalent) is required for continuous polling of the records through consumer.poll(), iterating over the individual consumer records of each poll. Unless you run more than one consumer instance in a single consumer group listening to the topic, a single consumer instance will read from all of the topic's partitions (if the topic has more than one), provided you don't have some custom partition assignment strategy in place. A sketch of such a loop is below.
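For illustration, here is one way the consumeAllRecordsFromTopic method from the question could keep polling until every assigned partition has been read up to the end offset captured at the start. It assumes the same consumer field as the posted code plus the usual java.util and kafka-clients imports; the one-second poll timeout is arbitrary:
public <K, V> List<ConsumerRecord<K, V>> consumeAllRecordsFromTopic(final String topic,
                                                                    final Collection<Integer> partitionIds) {
    final List<TopicPartition> topicPartitions = partitionIds.stream()
            .map(id -> new TopicPartition(topic, id))
            .collect(Collectors.toList());
    final List<ConsumerRecord<K, V>> allRecords = new ArrayList<>();
    consumer.assign(topicPartitions);
    consumer.seekToBeginning(topicPartitions);
    // Snapshot the end offsets once; stop when every partition has been read up to that point.
    final Map<TopicPartition, Long> endOffsets = new HashMap<>(consumer.endOffsets(topicPartitions));
    while (!endOffsets.isEmpty()) {
        final ConsumerRecords<K, V> records = consumer.poll(Duration.ofMillis(1000));
        records.forEach(allRecords::add);
        // Drop partitions we have fully caught up on.
        endOffsets.entrySet().removeIf(e -> consumer.position(e.getKey()) >= e.getValue());
    }
    return allRecords;
}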

KTable not updating (immediately) when new messages are put on input stream

I have a Kafka topic of Strings with an arbitrary key. I want to build a table keyed by each character of the string, with the string as the value, e.g.:
input("key","value") -> outputs (["v","value"],["a","value"],...)
To keep it simple, my input topic has a single partition, and thus the KTable code should be receiving all messages on a single instance.
I have created the following sandbox code, which builds the new table just fine, but doesn't update when a new item is put onto the original topic:
import java.util.LinkedHashSet;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.common.serialization.StringSerializer;
import org.apache.kafka.common.utils.Bytes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.StoreQueryParameters;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.Consumed;
import org.apache.kafka.streams.kstream.Grouped;
import org.apache.kafka.streams.kstream.Materialized;
import org.apache.kafka.streams.state.KeyValueIterator;
import org.apache.kafka.streams.state.KeyValueStore;
import org.apache.kafka.streams.state.QueryableStoreTypes;
import org.apache.kafka.streams.state.ReadOnlyKeyValueStore;
public class Sandbox
{
private final static String kafkaBootstrapServers = "192.168.1.254:9092";
private final static String kafkaGlobalTablesDirectory = "C:\\Kafka\\tmp\\kafka-streams-global-tables\\";
private final static String topic = "sandbox";
private static KafkaStreams streams;
public static void main(String[] args)
{
// 1. set up the test data
Properties producerProperties = new Properties();
producerProperties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, kafkaBootstrapServers);
producerProperties.put(ProducerConfig.CLIENT_ID_CONFIG, Sandbox.class.getName() + "_testProducer");
producerProperties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
producerProperties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
Producer<String, String> sandboxProducer = new KafkaProducer<>(producerProperties);
sandboxProducer.send(new ProducerRecord<String, String>(topic,"uvw","uvw"));
// 2. read the test data and check it's working
ReadOnlyKeyValueStore<String, String> store = getStore();
printStore(store.all());
System.out.println("-------------ADDING NEW VALUE----------------");
sandboxProducer.send(new ProducerRecord<String, String>(topic,"xyz","xyz"));
System.out.println("-------------ADDED NEW VALUE----------------");
printStore(store.all());
sandboxProducer.close();
streams.close();
}
private static void printStore(KeyValueIterator<String, String> i)
{
System.out.println("-------------PRINT START----------------");
while (i.hasNext())
{
KeyValue<String, String> n = i.next();
System.out.println(n.key + ":" + String.join(",", n.value));
}
System.out.println("-------------PRINT END----------------");
}
private static ReadOnlyKeyValueStore<String, String> getStore()
{
ReadOnlyKeyValueStore<String, String> store = null;
String storeString = "sandbox_store";
StreamsBuilder builder = new StreamsBuilder();
builder.stream(topic
, Consumed.with(Serdes.String(),Serdes.String()))
.filter((k,v)->v!=null)
.flatMap((k,v)->{
Set<KeyValue<String, String>> results = new LinkedHashSet<>();
if (v != null)
{
for (char subChar : v.toCharArray())
{
results.add(KeyValue.<String, String>pair(new String(new char[] {subChar}), v));
}
}
return results;
})
.groupByKey(Grouped.with(Serdes.String(), Serdes.String()))
.aggregate(()->new String()
, (key, value, agg) -> {
agg = agg + value;
return agg;
}
,Materialized.<String, String, KeyValueStore<Bytes, byte[]>>as(storeString)
.withKeySerde(Serdes.String())
.withValueSerde(Serdes.String()));
final Properties streamsConfiguration = new Properties();
streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "sandbox");
streamsConfiguration.put(StreamsConfig.CLIENT_ID_CONFIG, Sandbox.class.getName());
streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, kafkaBootstrapServers);
streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, kafkaGlobalTablesDirectory + "Sandbox");
streamsConfiguration.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
streams = new KafkaStreams(builder.build(), streamsConfiguration);
streams.setUncaughtExceptionHandler((Thread thread, Throwable throwable) -> {
System.out.println("Exception on thread " + thread.getName() + ":" + throwable.getLocalizedMessage());
});
Runtime.getRuntime().addShutdownHook(new Thread(streams::close));
streams.cleanUp(); // clear any old streams data - forces a rebuild of the local caches.
streams.start(); // hangs until the global table is built
StoreQueryParameters<ReadOnlyKeyValueStore<String, String>> storeSqp
= StoreQueryParameters.fromNameAndType(storeString
,QueryableStoreTypes.<String, String>keyValueStore());
// this while loop gives time for Kafka Streams to start up properly before creating the store
while (store == null)
{
try {
TimeUnit.SECONDS.sleep(1);
store = streams.store(storeSqp);
System.out.println("Store " + storeString + " Created successfully.");
} catch (InterruptedException e) {
}
catch (Exception e) {
System.out.println("Exception creating store " + storeString + ". Will try again in 1 second. Message: " + e.getLocalizedMessage());
}
}
return store;
}
}
The output I am getting is as follows:
Store sandbox_store Created successfully.
-------------PRINT START----------------
u:uvw
v:uvw
w:uvw
-------------PRINT END----------------
-------------ADDING NEW VALUE----------------
-------------ADDED NEW VALUE----------------
-------------PRINT START----------------
u:uvw
v:uvw
w:uvw
-------------PRINT END----------------
Note that the xyz I added has gone missing!
(P.S. I know I could use reduce instead of aggregate, but in practice the new value would be a different type, not a string, so it wouldn't work for my actual use case.)
Now, if I add a 10-second pause before printing the second time, or if I then restart the Sandbox class without clearing the topic, the first xyz shows up. So clearly there's a time delay somewhere in the system. In practice I'm dealing with 300 MB+ of messages all going onto the input topic at once, once an hour, so the delay is even longer than just a few seconds.
How can I help speed things up?

How do I set Kafka to not consume from where it left off?

I have a Kafka consumer in Golang. I don't want to consume from where I left off last time, but only the current messages. How can I do it?
You can set enable.auto.commit to false and auto.offset.reset to latest for your consumer group id. This means Kafka will not be automatically committing your offsets.
With auto commit disabled, your consumer group's progress is not saved (unless you commit manually). So whenever the consumer is restarted, for whatever reason, it does not find its saved progress and resets to the latest offset.
Set a new group.id for your consumer.
Then use auto.offset.reset to define the behavior of this new consumer group; in your case: latest.
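The question mentions the Go client, but the settings are the same across clients. A minimal sketch with the Java consumer, where the group id and topic name are placeholders:
Properties props = new Properties();
props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
props.put(ConsumerConfig.GROUP_ID_CONFIG, "fresh-group-id");       // a group with no committed offsets
props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");      // never save progress
props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");      // so the reset lands at the current end
props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
consumer.subscribe(Collections.singletonList("my-topic"));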
The Apache Kafka consumer API provides a method, KafkaConsumer.seekToEnd(), which can be used to ignore the existing messages and consume only messages published after the consumer has started, without changing the consumer's current group ID.
Below is an implementation. The program takes three arguments: topic name, group ID, and starting offset (0 starts from the beginning, -1 receives only messages published after the consumer has started, -2 leaves the committed offset alone, and any other value makes the consumer consume from that offset).
import org.apache.kafka.clients.consumer.*;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.WakeupException;
import java.util.*;
public class Consumer {
private static Scanner in;
public static void main(String[] argv)throws Exception{
if (argv.length != 3) {
System.err.printf("Usage: %s <topicName> <groupId> <startingOffset>\n",
Consumer.class.getSimpleName());
System.exit(-1);
}
in = new Scanner(System.in);
String topicName = argv[0];
String groupId = argv[1];
final long startingOffset = Long.parseLong(argv[2]);
ConsumerThread consumerThread = new ConsumerThread(topicName,groupId,startingOffset);
consumerThread.start();
String line = "";
while (!line.equals("exit")) {
line = in.next();
}
consumerThread.getKafkaConsumer().wakeup();
System.out.println("Stopping consumer .....");
consumerThread.join();
}
private static class ConsumerThread extends Thread{
private String topicName;
private String groupId;
private long startingOffset;
private KafkaConsumer<String,String> kafkaConsumer;
public ConsumerThread(String topicName, String groupId, long startingOffset){
this.topicName = topicName;
this.groupId = groupId;
this.startingOffset=startingOffset;
}
public void run() {
Properties configProperties = new Properties();
configProperties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
configProperties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"); // match KafkaConsumer<String, String>
configProperties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
configProperties.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
configProperties.put(ConsumerConfig.CLIENT_ID_CONFIG, "offset123");
configProperties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG,false);
configProperties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,"earliest");
//Figure out where to start processing messages from
kafkaConsumer = new KafkaConsumer<String, String>(configProperties);
kafkaConsumer.subscribe(Arrays.asList(topicName), new ConsumerRebalanceListener() {
public void onPartitionsRevoked(Collection<TopicPartition> partitions) {
System.out.printf("%s topic-partitions are revoked from this consumer\n", Arrays.toString(partitions.toArray()));
}
public void onPartitionsAssigned(Collection<TopicPartition> partitions) {
System.out.printf("%s topic-partitions are assigned to this consumer\n", Arrays.toString(partitions.toArray()));
Iterator<TopicPartition> topicPartitionIterator = partitions.iterator();
while(topicPartitionIterator.hasNext()){
TopicPartition topicPartition = topicPartitionIterator.next();
System.out.println("Current offset is " + kafkaConsumer.position(topicPartition) + " committed offset is ->" + kafkaConsumer.committed(topicPartition) );
if(startingOffset == -2) {
System.out.println("Leaving it alone");
}else if(startingOffset ==0){
System.out.println("Setting offset to begining");
kafkaConsumer.seekToBeginning(Arrays.asList(topicPartition));
}else if(startingOffset == -1){
System.out.println("Setting it to the end ");
kafkaConsumer.seekToEnd(Arrays.asList(topicPartition));
}else {
System.out.println("Resetting offset to " + startingOffset);
kafkaConsumer.seek(topicPartition, startingOffset);
}
}
}
});
//Start processing messages
try {
while (true) {
ConsumerRecords<String, String> records = kafkaConsumer.poll(100);
for (ConsumerRecord<String, String> record : records) {
System.out.println(record.value());
}
if(startingOffset == -2)
kafkaConsumer.commitSync();
}
}catch(WakeupException ex){
System.out.println("Exception caught " + ex.getMessage());
}finally{
kafkaConsumer.close();
System.out.println("After closing KafkaConsumer");
}
}
public KafkaConsumer<String,String> getKafkaConsumer(){
return this.kafkaConsumer;
}
}
}

Kafka consumer.poll returns no data

I have two Kafka (2.11-0.11.0.1) brokers. The default replication factor of topics is set to 2. Producers write data only to partition zero.
I have a scheduled executor which runs the task periodically. When it consumes a topic with a small number of records per minute (100 per minute), it works like a charm. But for busy topics (10K per minute) the poll method returns no data.
The task is:
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.util.Collections;
import java.util.Properties;
public final class TopicToDbPump implements Runnable {
private static final Logger log = LoggerFactory.getLogger(TopicToDbPump.class);
private final String topic;
private final TopicPartition topicPartition;
private final Properties properties;
public TopicToDbPump(String topic, Properties properties) {
this.topic = topic;
topicPartition = new TopicPartition(topic, 0);
this.properties = properties;
}
@Override
public void run() {
try (final Consumer<String, String> consumer = new KafkaConsumer<>(properties)) {
consumer.assign(Collections.singleton(topicPartition));
final long offset = readOffsetFromDb(topic);
consumer.seek(topicPartition, offset);
final ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(1));
if (records.isEmpty()) {
log.debug("No data from topic " + topic + " available");
return;
}
saveData(records.records(topic));
} catch (Throwable t) {
log.error("Etl process " + topic + " failed with exception", t);
}
}
}
The consumer parameters are:
"bootstrap.servers" = "host-1:9092,host-2:9092",
"group.id" = "my-group",
"enable.auto.commit" = "false",
"key.deserializer" = "org.apache.kafka.common.serialization.StringDeserializer",
"value.deserializer" = "org.apache.kafka.common.serialization.StringDeserializer",
"max.partition.fetch.bytes": "50000000",
"max.poll.records": "10000"
What's wrong?
The Kafka Consumer API does not guarantee that the first call to poll() will return any data.
The consumer first has to connect to the cluster and discover the leaders for all the partitions it's assigned. As you can imagine, this can take a few seconds, so it's unlikely data will have arrived immediately.
You should instead call poll() several times if no data is returned at first.
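For example, the single poll in the question's run() method could be replaced with a bounded retry loop along these lines (the 10-second deadline is arbitrary):
// Poll repeatedly until data arrives or a deadline passes, instead of a single poll().
final long deadline = System.currentTimeMillis() + 10_000;
ConsumerRecords<String, String> records = ConsumerRecords.empty();
while (records.isEmpty() && System.currentTimeMillis() < deadline) {
    records = consumer.poll(Duration.ofSeconds(1));
}
if (records.isEmpty()) {
    log.debug("No data from topic " + topic + " available");
    return;
}
saveData(records.records(topic));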

How to send and get back data from Kafka in a single API call

I would like to send data to Kafka and get the result back in a single API call.
Is this possible? I already know how to make the data go in one direction (e.g., Spark Streaming reads data using the Kafka consumer API). I also know how to sort of 'fake it' by doing two one-way approaches (e.g., the web app is both a producer and a consumer). However, when the web app makes an API call, I only want it to have to deal with its own record, not all of the records in the topic, so this seems like the wrong approach.
Other sub-optimal approaches I've thought of:
Save the Spark Streaming result in a database so that the web app can constantly poll the database until the result shows up. I'm worried that this could consume a lot of resources and delay response time.
Create short-lived / temporary consumers each time I call the Kafka producer. The temporary consumer would filter out all records except for the one it's looking for. When it finds the record it's looking for, the temporary consumer shuts down. I don't think this would work because the record the API caller cares about might go to a different partition, and so it would never be found.
Make a temporary topic for each of the web app's consumer API calls. I'm not sure if Kafka will complain about too many topics though.
Any advice?
What I did was:
Create a synProducer that sends the data with a key and creates a consumer for a topic whose name is the key of the sent message.
Then a synConsumer handles the message and replies to the topic where the consumer from step 1 is waiting.
Delete the temporary topic.
The downside of this approach is that the temporary topics are not deleted immediately.
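For reference, a rough sketch of that per-key reply-topic flow using the modern Java clients is below. The topic names, serializers, and timeouts are assumptions, and a real implementation would need error handling around the futures:
import java.time.Duration;
import java.util.Collections;
import java.util.Properties;
import org.apache.kafka.clients.admin.AdminClient;
import org.apache.kafka.clients.admin.NewTopic;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.TopicPartition;
public class SyncRequestReply {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");
        props.put("group.id", "sync-caller");
        props.put("enable.auto.commit", "false");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        String correlationKey = "request-42";   // key of the request; also used as the reply topic name
        try (AdminClient admin = AdminClient.create(props);
             KafkaProducer<String, String> producer = new KafkaProducer<>(props);
             KafkaConsumer<String, String> replyConsumer = new KafkaConsumer<>(props)) {
            // 1. create a temporary reply topic named after the key
            admin.createTopics(Collections.singleton(new NewTopic(correlationKey, 1, (short) 1))).all().get();
            replyConsumer.assign(Collections.singleton(new TopicPartition(correlationKey, 0)));
            // 2. send the request; the responder is expected to write its answer to the reply topic
            producer.send(new ProducerRecord<>("requests", correlationKey, "payload")).get();
            // 3. wait (bounded) for the reply
            for (ConsumerRecord<String, String> reply : replyConsumer.poll(Duration.ofSeconds(30))) {
                System.out.println("reply: " + reply.value());
            }
            // 4. delete the temporary topic (the broker performs the deletion asynchronously)
            admin.deleteTopics(Collections.singleton(correlationKey)).all().get();
        }
    }
}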
I would suggest your third option, but with two topics: one for the request and one for the response. This is an example:
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import kafka.consumer.ConsumerConfig;
import kafka.consumer.ConsumerIterator;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;
public class ConsumerGroupExample extends Thread {
private final ConsumerConnector consumer;
private final String topic;
private ConsumerIterator<byte[], byte[]> it;
private String mensaje="";
public ConsumerGroupExample(Properties props, String a_topic)
{
consumer = kafka.consumer.Consumer.createJavaConsumerConnector(new ConsumerConfig(props));
this.topic = a_topic;
Map<String, Integer> topicCountMap = new HashMap<String, Integer>();
topicCountMap.put(topic, 1);
Map<String, List<KafkaStream<byte[], byte[]>>> consumerMap = consumer.createMessageStreams(topicCountMap);
List<KafkaStream<byte[], byte[]>> streams = consumerMap.get(topic);
KafkaStream stream = streams.get(0);
it = stream.iterator();
}
public void shutdown()
{
if (consumer != null) consumer.shutdown();
}
public void run()
{
if (it.hasNext())
{
mensaje = new String(it.next().message());
}
System.out.println( mensaje );
}
public String getMensaje()
{
return this.mensaje;
}
public static void main(String[] args) {
Properties props = new Properties();
props.put("zookeeper.connect", "localhost:2181");
props.put("group.id", "Group");
props.put("zookeeper.session.timeout.ms", "400");
props.put("zookeeper.sync.time.ms", "200");
props.put("auto.commit.interval.ms", "1000");
props.put("consumer.timeout.ms", "10000");
ConsumerGroupExample example = new ConsumerGroupExample( props, "topicForResponse");
props = new Properties();
props.put("metadata.broker.list", "localhost:9092");
props.put("serializer.class", "kafka.serializer.StringEncoder");
props.put("request.required.acks", "1");
ProducerConfig config = new ProducerConfig(props);
example.start();
try {
Producer<String, String> colaParaEscritura;
KeyedMessage<String, String> data = new KeyedMessage<String, String>("topicForRequest", " message ");
colaParaEscritura = new kafka.javaapi.producer.Producer<String, String>(config);
colaParaEscritura.send(data);
System.out.println("enviado");
colaParaEscritura.close();
example.join();
System.out.println( "final"+ example.getMensaje() );
}
catch (InterruptedException ie) {
}
example.shutdown();
}
}