How to merge two streams and perform stateful operations on the merged stream using Apache Beam

I have two Kafka streams that I want to merge by some key, and on top of the merged stream I want to perform a stateful operation so that I can sum up counts from both streams.
This is what I tried, but it didn't work:
PCollection<String> stream1 = .. read from kafka
PCollection<String> stream2 = .. read from kafka
PCollection<KV<String,Long>> wordCount1 = stream1.apply(...)
PCollection<KV<String,Long>> wordCount2 = stream2.apply(...)
PCollection<KV<String,Long>> merged = merge wordCount1 and wordCount2 using CoGroupByKey
PCollection<KV<String,Long>> finalStream = merged.apply(...)
apply state to finalStream

public class KafkaWordCount implements Serializable {
private String kafkaBrokers =null;
private String topic =null;
public KafkaWordCount(String brokers, String topic){
this.kafkaBrokers =brokers;
this.topic =topic;
}
public PCollection<KV<String,Long>> build(Pipeline p){
final String myState="HELLO";
PCollection<KV<String,Long>> res =
p.apply(KafkaIO.<Long, String>read()
.withBootstrapServers(this.kafkaBrokers )
.withTopic(this.topic)
.withKeyDeserializer(LongDeserializer.class)
.withValueDeserializer(StringDeserializer.class))
.apply(ParDo.of(new DoFn<KafkaRecord<Long, String>, String>() {
@ProcessElement
public void processElement(ProcessContext processContext) {
KafkaRecord<Long, String> record = processContext.element();
processContext.output(record.getKV().getValue());
}
}))
.apply("ExtractWords",
ParDo.of(new DoFn<String, KV<String, Long>>() {
@ProcessElement
public void processElement(ProcessContext c) {
for (String word : c.element().split("[^\\p{L}]+")) {
if (!word.isEmpty()) {
c.output(KV.of(word,1L));
}
}
}
}));
return res;
}
}
public class DataPipe {
public static void main(String[] args) {
final String stateId = "myMapState";
final String myState = "myState";
PipelineOptions options = PipelineOptionsFactory.create();
options.as(FlinkPipelineOptions.class).setRunner(FlinkRunner.class);
Pipeline p = Pipeline.create(options);
PCollection<KV<String,Long>> stream1 =
new KafkaWordCount("localhost:9092","idm")
.build(p)
.apply(
Window
.<KV<String,Long>>into(
FixedWindows.of(Duration.millis(3600000)))
.triggering(
Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
.withAllowedLateness(Duration.ZERO)
.discardingFiredPanes());
PCollection<KV<String,Long>> stream2 =
new KafkaWordCount("localhost:9092","assist")
.build(p)
.apply(
Window
.<KV<String,Long>>into(
FixedWindows.of(Duration.millis(3600000)))
.triggering(
Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
.withAllowedLateness(Duration.ZERO)
.discardingFiredPanes());
final TupleTag<Long> web = new TupleTag<Long>();
final TupleTag<Long> assist = new TupleTag<Long>();
PCollection<KV<String, CoGbkResult>> joinedStream =
KeyedPCollectionTuple.of(web, stream1)
.and(assist, stream2)
.apply(CoGroupByKey.<String>create());
PCollection<KV<String,Long>> finalCountStream =
joinedStream
.apply(ParDo.of(
new DoFn<KV<String, CoGbkResult>, KV<String,Long>>() {
@StateId(stateId)
private final StateSpec<MapState<String, Long>> mapState =
StateSpecs.map();
@ProcessElement
public void processElement(
ProcessContext processContext,
@StateId(stateId) MapState<String, Long> state) {
KV<String,CoGbkResult> element = processContext.element();
Iterable<Long> count1 = element.getValue().getAll(web);
Iterable<Long> count2 = element.getValue().getAll(assist);
Long sumAmount =
StreamSupport
.stream(
Iterables
.concat(count1, count2)
.spliterator(),
false)
.collect(Collectors.summingLong(n -> n));
System.out.println(element.getKey()+"::"+sumAmount);
// processContext.output(element.getKey()+"::"+sumAmount);
Long currCount = state.get(element.getKey()).read()==null? 0L:state.get(element.getKey()).read();
Long newCount = currCount+sumAmount;
state.put(element.getKey(),sumAmount);
processContext.output(KV.of(element.getKey(),sumAmount));
}
}));
finalCountStream
.apply(ParDo.of(new DoFn<KV<String,Long>, KV<String,Long>>() {
@ProcessElement
public void processElement(ProcessContext processContext) {
processContext.output(processContext.element());
}
}))
.apply("finalState", ParDo.of(new DoFn<KV<String,Long>, String>() {
@StateId(myState)
private final StateSpec<MapState<String, Long>> mapState =
StateSpecs.map();
@ProcessElement
public void processElement(
ProcessContext c,
@StateId(myState) MapState<String, Long> state){
KV<String,Long> e = c.element();
System.out.println("Thread ID :"
+ Thread.currentThread().getId());
Long currCount =
state.get(e.getKey()).read()==null
? 0L
: state.get(e.getKey()).read();
Long newCount = currCount+e.getValue();
state.put(e.getKey(),newCount);
c.output(e.getKey()+":"+newCount);
}
}))
.apply(KafkaIO.<Void, String>write()
.withBootstrapServers("localhost:9092")
.withTopic("test")
.withValueSerializer(StringSerializer.class)
.values());
/* finalCountStream.apply(KafkaIO.<Void, String>write()
.withBootstrapServers("localhost:9092")
.withTopic("test")
.withValueSerializer(StringSerializer.class)
.values()
);*/
//finalCountStream.apply(TextIO.write().to("wordcounts"));
p.run().waitUntilFinish();
}
}
This Beam pipeline reads text from two Kafka topics, splits it into words, merges both streams by word, and finally emits the word counts from both streams to another Kafka topic.

For reference: if I understand the problem correctly, you can simplify the first part of your pipeline by using KafkaIO.withTopics(List<String>) to read from two (or more) topics in one step, so there is no need to join data from different topics afterwards.
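For example, a minimal sketch of that suggestion, reusing the broker and topic names from the question (the variable name allRecords is just for illustration):
PCollection<KV<Long, String>> allRecords =
    p.apply(KafkaIO.<Long, String>read()
        .withBootstrapServers("localhost:9092")
        // read both topics in one step instead of joining them later
        .withTopics(Arrays.asList("idm", "assist"))
        .withKeyDeserializer(LongDeserializer.class)
        .withValueDeserializer(StringDeserializer.class)
        .withoutMetadata());
// then apply the word-extraction and stateful counting transforms to this single PCollection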

Related

write to multiple Kafka topics in apache-beam?

I am executing a simple word count program where I use one Kafka topic (producer) as an input source and then apply a ParDo to it to calculate the word count. Now I need help writing the words to different topics based on their frequency. Let's say all words with an even frequency will go to topic 1 and the rest will go to topic 2.
Can anyone help me with an example?
This can be done with the KafkaIO.writeRecords() method, which takes a ProducerRecord<key, value> constructed via new ProducerRecord<>("topic_name", "key", "value").
Below is the code:
static class ExtractWordsFn extends DoFn<String, String> {
private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines");
private final Distribution lineLenDist =
Metrics.distribution(ExtractWordsFn.class, "lineLenDistro");
@ProcessElement
public void processElement(@Element String element, OutputReceiver<String> receiver) {
lineLenDist.update(element.length());
if (element.trim().isEmpty()) {
emptyLines.inc();
}
String[] words = element.split(ExampleUtils.TOKENIZER_PATTERN, -1);
for (String word : words) {
if (!word.isEmpty()) {
receiver.output(word);
}
}
}
}
public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, ProducerRecord<String,String>> {
@Override
public ProducerRecord<String, String> apply(KV<String, Long> input) {
if(input.getValue()%2==0)
return new ProducerRecord("test",input.getKey(),input.getKey()+" "+input.getValue().toString());
else
return new ProducerRecord("copy",input.getKey(),input.getKey()+" "+input.getValue().toString());
}
}
public static class CountWords
extends PTransform<PCollection<String>, PCollection<KV<String, Long>>> {
@Override
public PCollection<KV<String, Long>> expand(PCollection<String> lines) {
PCollection<String> words = lines.apply(ParDo.of(new ExtractWordsFn()));
PCollection<KV<String, Long>> wordCounts = words.apply(Count.perElement());
return wordCounts;
}
}
p.apply("ReadLines", KafkaIO.<Long, String>read()
.withBootstrapServers("localhost:9092")
.withTopic("copy")// use withTopics(List<String>) to read from multiple topics.
.withKeyDeserializer(LongDeserializer.class)
.withValueDeserializer(StringDeserializer.class)
.updateConsumerProperties(ImmutableMap.of("group.id", "my_beam_app_1"))
.updateConsumerProperties(ImmutableMap.of("enable.auto.commit", "true"))
.withLogAppendTime()
.withReadCommitted()
.commitOffsetsInFinalize()
.withProcessingTime()
.withoutMetadata()
)
.apply(Values.create())
.apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(1))))
.apply(new CountWords())
.apply(MapElements.via(new FormatAsTextFn())) //PCollection<ProducerRecord<string,string>>
.setCoder(ProducerRecordCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))
.apply("WriteCounts", (KafkaIO.<String, String>writeRecords()
.withBootstrapServers("localhost:9092")
//.withTopic("test")
.withKeySerializer(StringSerializer.class)
.withValueSerializer(StringSerializer.class)
))

Kafka Stream fixed window not grouping by key

I get a single Kafka stream. How can I accumulate messages for a specific time window, irrespective of the key?
My use case is to write a file every 10 minutes from the stream, without considering the key.
You'll need to use a Transformer with a state store and schedule a punctuation call to go through the store every 10 minutes and emit the records. The transformer should return null as you are collecting the records in the state store, so you'll also need a filter after the transformer to ignore any null records.
Here's a quick example of something I think is close to what you are asking for. Let me know how it goes.
class WindowedTransformerExample {
public static void main(String[] args) {
final StreamsBuilder builder = new StreamsBuilder();
final String stateStoreName = "stateStore";
final StoreBuilder<KeyValueStore<String, String>> keyValueStoreBuilder =
Stores.keyValueStoreBuilder(Stores.inMemoryKeyValueStore(stateStoreName),
Serdes.String(),
Serdes.String());
builder.addStateStore(keyValueStoreBuilder);
builder.<String, String>stream("topic").transform(new WindowedTransformer(stateStoreName), stateStoreName)
.filter((k, v) -> k != null && v != null)
// Here's where you do something with records emitted after 10 minutes
.foreach((k, v)-> System.out.println());
}
static final class WindowedTransformer implements TransformerSupplier<String, String, KeyValue<String, String>> {
private final String storeName;
public WindowedTransformer(final String storeName) {
this.storeName = storeName;
}
@Override
public Transformer<String, String, KeyValue<String, String>> get() {
return new Transformer<String, String, KeyValue<String, String>>() {
private KeyValueStore<String, String> keyValueStore;
private ProcessorContext processorContext;
@Override
public void init(final ProcessorContext context) {
processorContext = context;
keyValueStore = (KeyValueStore<String, String>) context.getStateStore(storeName);
// could change this to PunctuationType.STREAM_TIME if needed
context.schedule(Duration.ofMinutes(10), PunctuationType.WALL_CLOCK_TIME, (ts) -> {
try(final KeyValueIterator<String, String> iterator = keyValueStore.all()) {
while (iterator.hasNext()) {
final KeyValue<String, String> keyValue = iterator.next();
processorContext.forward(keyValue.key, keyValue.value);
}
}
});
}
@Override
public KeyValue<String, String> transform(String key, String value) {
if (key != null) {
keyValueStore.put(key, value);
}
return null;
}
@Override
public void close() {
}
};
}
}
}

How to read the Header values in the Batch listener error handling scenario

I am trying to handle the exception at the listener:
@KafkaListener(id = PropertiesUtil.ID,
topics = "#{'${kafka.consumer.topic}'}",
groupId = "${kafka.consumer.group.id.config}",
containerFactory = "containerFactory",
errorHandler = "errorHandler")
public void receiveEvents(@Payload List<ConsumerRecord<String, String>> recordList,
Acknowledgment acknowledgment) {
try {
log.info("Consuming the batch of size {} from kafka topic {}", recordList.size(),
recordList.get(0).topic());
processEvent(recordList);
incrementOffset(acknowledgment);
} catch (Exception exception) {
throwOrHandleExceptions(exception, recordList, acknowledgment);
.........
}
}
The Kafka container config:
@Bean
public KafkaListenerContainerFactory<ConcurrentMessageListenerContainer<String, String>>
containerFactory() {
ConcurrentKafkaListenerContainerFactory<String, String> factory =
new ConcurrentKafkaListenerContainerFactory<>();
factory.setConcurrency(this.numberOfConsumers);
factory.getContainerProperties().setAckOnError(false);
factory.getContainerProperties().setAckMode(ContainerProperties.AckMode.MANUAL);
factory.setConsumerFactory(getConsumerFactory());
factory.setBatchListener(true);
return factory;
}
}
The listener error handler implementation:
@Bean
public ConsumerAwareListenerErrorHandler errorHandler() {
return (m, e, c) -> {
MessageHeaders headers = m.getHeaders();
List<String> topics = headers.get(KafkaHeaders.RECEIVED_TOPIC, List.class);
List<Integer> partitions = headers.get(KafkaHeaders.RECEIVED_PARTITION_ID, List.class);
List<Long> offsets = headers.get(KafkaHeaders.OFFSET, List.class);
Map<TopicPartition, Long> offsetsToReset = new HashMap<>();
for (int i = 0; i < topics.size(); i++) {
int index = i;
offsetsToReset.compute(new TopicPartition(topics.get(i), partitions.get(i)),
(k, v) -> v == null ? offsets.get(index) : Math.min(v, offsets.get(index)));
}
...
};
}
When I run the same code without batch processing, I am able to fetch the partition, topic and offset values, but when I enable batch processing and test it, I get only two values inside the headers, i.e. id and timestamp, and the other values are not set. Am I missing anything here?
What version are you using? I just tested it with Boot 2.2.4 (SK 2.3.5) and it works fine...
@SpringBootApplication
public class So60152179Application {
public static void main(String[] args) {
SpringApplication.run(So60152179Application.class, args);
}
@KafkaListener(id = "so60152179", topics = "so60152179", errorHandler = "eh")
public void listen(List<String> in) {
throw new RuntimeException("foo");
}
@Bean
public ConsumerAwareListenerErrorHandler eh() {
return (m, e, c) -> {
System.out.println(m);
return null;
};
}
@Bean
public ApplicationRunner runner(KafkaTemplate<String, String> template) {
return args -> {
template.send("so60152179", "foo");
};
}
@Bean
public NewTopic topic() {
return TopicBuilder.name("so60152179").partitions(1).replicas(1).build();
}
}
spring.kafka.listener.type=batch
spring.kafka.consumer.auto-offset-reset=earliest
GenericMessage [payload=[foo], headers={kafka_offset=[0], kafka_nativeHeaders=[RecordHeaders(headers = [], isReadOnly = false)], kafka_consumer=org.apache.kafka.clients.consumer.KafkaConsumer@2f2e787f, kafka_timestampType=[CREATE_TIME], kafka_receivedMessageKey=[null], kafka_receivedPartitionId=[0], kafka_receivedTopic=[so60152179], kafka_receivedTimestamp=[1581351585253], kafka_groupId=so60152179}]

How to evaluate consuming time in kafka stream application

I have a Kafka Streams 1.0.0 application with the two classes below, FilterByPolicyStreamsApp and FilterByPolicyTransformerSupplier. In my application, I read the events, perform some conditional checks and forward them to the same Kafka in another topic. I am able to get the producing time with the eventsForwardTimeInMs variable in the FilterByPolicyTransformerSupplier class, but I am unable to get the consuming time (with and without (de)serialization). How can I measure this time? Please help me.
FilterByPolicyStreamsApp.java:
public class FilterByPolicyStreamsApp implements CommandLineRunner {
String policyKafkaTopicName="policy";
String policyFilterDataKafkaTopicName = "policy.filter.data";
String bootstrapServers="11.1.1.1:9092";
String sampleEventsKafkaTopicName = "sample-.*";
String applicationId="filter-by-policy-app";
String policyFilteredEventsKafkaTopicName = "policy.filter.events";
public static void main(String[] args) {
SpringApplication.run(FilterByPolicyStreamsApp.class, args);
}
@Override
public void run(String... arg0) {
String policyGlobalTableName = policyKafkaTopicName + ".table";
String policyFilterDataGlobalTable = policyFilterDataKafkaTopicName + ".table";
Properties config = new Properties();
config.put(StreamsConfig.APPLICATION_ID_CONFIG, applicationId);
config.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
config.put(StreamsConfig.DEFAULT_TIMESTAMP_EXTRACTOR_CLASS_CONFIG, WallclockTimestampExtractor.class);
KStreamBuilder builder = new KStreamBuilder();
builder.globalTable(Serdes.String(), new JsonSerde<>(List.class), policyKafkaTopicName,
policyGlobalTableName);
builder.globalTable(Serdes.String(), new JsonSerde<>(PolicyFilterData.class), policyFilterDataKafkaTopicName,
policyFilterDataGlobalTable);
KStream<String, SampleEvent> events = builder.stream(Serdes.String(),
new JsonSerde<>(SampleEvent.class), Pattern.compile(sampleEventsKafkaTopicName));
events = events.transform(new FilterByPolicyTransformerSupplier(policyGlobalTableName,
policyFilterDataGlobalTable));
events.to(Serdes.String(), new JsonSerde<>(SampleEvent.class), policyFilteredEventsKafkaTopicName);
KafkaStreams streams = new KafkaStreams(builder, config);
streams.start();
streams.setUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
@Override
public void uncaughtException(Thread t, Throwable e) {
logger.error(e.getMessage(), e);
}
});
}
}
FilterByPolicyTransformerSupplier.java:
public class FilterByPolicyTransformerSupplier
implements TransformerSupplier<String, SampleEvent, KeyValue<String, SampleEvent>> {
private String policyGlobalTableName;
private String policyFilterDataGlobalTable;
public FilterByPolicyTransformerSupplier(String policyGlobalTableName,
String policyFilterDataGlobalTable) {
this.policyGlobalTableName = policyGlobalTableName;
this.policyFilterDataGlobalTable = policyFilterDataGlobalTable;
}
@Override
public Transformer<String, SampleEvent, KeyValue<String, SampleEvent>> get() {
return new Transformer<String, SampleEvent, KeyValue<String, SampleEvent>>() {
private KeyValueStore<String, List<String>> policyStore;
private KeyValueStore<String, PolicyFilterData> policyMetadataStore;
private ProcessorContext context;
@Override
public void close() {
}
@Override
public void init(ProcessorContext context) {
this.context = context;
// Call punctuate every 1 second
this.context.schedule(1000);
policyStore = (KeyValueStore<String, List<String>>) this.context
.getStateStore(policyGlobalTableName);
policyMetadataStore = (KeyValueStore<String, PolicyFilterData>) this.context
.getStateStore(policyFilterDataGlobalTable);
}
@Override
public KeyValue<String, SampleEvent> punctuate(long arg0) {
return null;
}
@Override
public KeyValue<String, SampleEvent> transform(String key, SampleEvent event) {
long eventsForwardTimeInMs = 0;
long forwardedEventCount = 0;
List<String> policyIds = policyStore.get(event.getCustomerCode().toLowerCase());
if (policyIds != null) {
for (String policyId : policyIds) {
/*
PolicyFilterData policyFilterMetadata = policyMetadataStore.get(policyId);
Do some condition checks on the event. If it satisfies then will forward them.
if(policyFilterMetadata == null){
continue;
}
*/
// Using context forward as event can map to multiple policies
long startForwardTime = System.currentTimeMillis();
context.forward(policyId, event);
forwardedEventCount++;
eventsForwardTimeInMs += System.currentTimeMillis() - startForwardTime;
}
}
return null;
}
};
}
}

How to process multi line input records in Spark

I have each record spread across multiple lines in the input file (a very huge file).
Example:
Id: 2
ASIN: 0738700123
title: Test tile for this product
group: Book
salesrank: 168501
similar: 5 0738700811 1567184912 1567182813 0738700514 0738700915
categories: 2
|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Earth-Based Religions[12472]|Wicca[12484]
|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Earth-Based Religions[12472]|Witchcraft[12486]
reviews: total: 12 downloaded: 12 avg rating: 4.5
2001-12-16 cutomer: A11NCO6YTE4BTJ rating: 5 votes: 5 helpful: 4
2002-1-7 cutomer: A9CQ3PLRNIR83 rating: 4 votes: 5 helpful: 5
How can I identify and process each multi-line record in Spark?
If the multi-line data has a defined record separator, you could use the hadoop support for multi-line records, providing the separator through a hadoop.Configuration object:
Something like this should do:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
val conf = new Configuration
conf.set("textinputformat.record.delimiter", "id:")
val dataset = sc.newAPIHadoopFile("/path/to/data", classOf[TextInputFormat], classOf[LongWritable], classOf[Text], conf)
val data = dataset.map(x=>x._2.toString)
This will provide you with an RDD[String] where each element corresponds to a record. Afterwards you need to parse each record following your application requirements.
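As a rough illustration of that last step, here is a sketch (plain Java, independent of Spark) that splits one such record into a field map; the field layout follows the sample above, and the helper name parseRecord is only illustrative:
import java.util.LinkedHashMap;
import java.util.Map;
public class RecordParser {
    // Turn one multi-line record (e.g. "ASIN: 0738700123\ntitle: ...") into field -> value pairs.
    public static Map<String, String> parseRecord(String record) {
        Map<String, String> fields = new LinkedHashMap<>();
        for (String line : record.split("\n")) {
            int sep = line.indexOf(':');
            if (sep > 0) {
                fields.put(line.substring(0, sep).trim(), line.substring(sep + 1).trim());
            }
        }
        return fields;
    }
}
You could then map this over the RDD of record strings and continue with whatever structure your application needs.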
I have done this by implementing a custom input format and record reader.
public class ParagraphInputFormat extends TextInputFormat {
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) {
return new ParagraphRecordReader();
}
}
public class ParagraphRecordReader extends RecordReader<LongWritable, Text> {
private long end;
private boolean stillInChunk = true;
private LongWritable key = new LongWritable();
private Text value = new Text();
private FSDataInputStream fsin;
private DataOutputBuffer buffer = new DataOutputBuffer();
private byte[] endTag = "\n\r\n".getBytes();
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
FileSplit split = (FileSplit) inputSplit;
Configuration conf = taskAttemptContext.getConfiguration();
Path path = split.getPath();
FileSystem fs = path.getFileSystem(conf);
fsin = fs.open(path);
long start = split.getStart();
end = split.getStart() + split.getLength();
fsin.seek(start);
if (start != 0) {
readUntilMatch(endTag, false);
}
}
public boolean nextKeyValue() throws IOException {
if (!stillInChunk) return false;
boolean status = readUntilMatch(endTag, true);
value = new Text();
value.set(buffer.getData(), 0, buffer.getLength());
key = new LongWritable(fsin.getPos());
buffer.reset();
if (!status) {
stillInChunk = false;
}
return true;
}
public LongWritable getCurrentKey() throws IOException, InterruptedException {
return key;
}
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
public float getProgress() throws IOException, InterruptedException {
return 0;
}
public void close() throws IOException {
fsin.close();
}
private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
if (b == -1) return false;
if (withinBlock) buffer.write(b);
if (b == match[i]) {
i++;
if (i >= match.length) {
return fsin.getPos() < end;
}
} else i = 0;
}
}
}
endTag identifies the end of each record.
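To plug this into Spark you would pass the custom format to newAPIHadoopFile, roughly as follows (a sketch using the Java API, assuming sc is an existing JavaSparkContext and the classes above are on the classpath):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaRDD;
Configuration conf = new Configuration();
JavaRDD<String> records =
    sc.newAPIHadoopFile("/path/to/data", ParagraphInputFormat.class,
            LongWritable.class, Text.class, conf)
      .map(pair -> pair._2().toString()); // keep only the record text
Each element of records is then one multi-line record, ready to be parsed.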