I have been seeing many Kafka consumer rebalances, even when the thread is consuming nothing. I would expect the consumer not to rebalance in this scenario.
Here is the sample code.
import argparse
from kafka.coordinator.assignors.range import RangePartitionAssignor
from kafka import KafkaConsumer
import time
import sys
import logging
from kafka.consumer.subscription_state import ConsumerRebalanceListener
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
def get_config(args):
    config = {
        'bootstrap_servers': args.host,
        'group_id': args.group,
        'key_deserializer': lambda msg: msg,
        'value_deserializer': lambda msg: msg,
        'partition_assignment_strategy': [RangePartitionAssignor],
        'max_poll_records': int(args.records),  # argparse values arrive as strings
        'auto_offset_reset': args.offset,
        # 'max_poll_interval_ms': 300000,
        # 'connections_max_idle_ms': 8 * 60 * 1000
    }
    return config
def start_consumer(args):
    config = get_config(args)
    consumer = KafkaConsumer(**config)
    consumer.subscribe([args.topic],
                       listener=RepartitionListener())
    for record in consumer:
        print(record.offset, record.partition)
        time.sleep(int(args.delay) / 1000.0)
class RepartitionListener(ConsumerRebalanceListener):
    def __init__(self):
        pass

    def on_partitions_revoked(self, revoked):
        print("partition revoked")
        for tp in revoked:
            try:
                print("[{}] revoked topic = {} partition = {}".format(
                    time.strftime("%c"), tp.topic, tp.partition))
                partition_key = "{}_{}".format(tp.topic, str(tp.partition))
            except Exception as e:
                print("Got exception partition_key = {} {}".format(tp, e))

    def on_partitions_assigned(self, assigned):
        pass
def main():
    parser = argparse.ArgumentParser(
        description='Tool to test consumer group with delay')
    named_args = parser.add_argument_group('named arguments')
    named_args.add_argument('-g', '--group', help='group id for the consumer',
                            required=True)
    named_args.add_argument('-r', '--records', help='num records to consume',
                            required=True)
    named_args.add_argument('-k', '--topic', help='kafka topic', required=True)
    named_args.add_argument('-d', '--delay', help='add process delay in ms', required=True)
    named_args.add_argument('-s', '--host', help='Kafka host format host:port', required=False)
    parser.add_argument('-o', '--offset',
                        default='latest',
                        help='offset to read from earliest/latest')
    args = parser.parse_args()
    print(args)
    start_consumer(args)


if __name__ == "__main__":
    main()
How can I avoid triggering the rebalance? From the logs I can see the heartbeat is failing, but I expect the heartbeat to continue even when there are no messages for a period longer than session.timeout.ms.
2019-02-27 20:39:43,281 - kafka.coordinator - WARNING - Heartbeat session expired, marking coordinator dead
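For comparison, here is a minimal kafka-python sketch of the timeout settings involved (the broker, topic, group id and values below are illustrative assumptions, not taken from the setup above): heartbeat_interval_ms has to be well below session_timeout_ms, and the time spent between two poll() calls, including any per-record sleep, has to stay under max_poll_interval_ms, otherwise the group coordinator marks the member dead and triggers a rebalance.

from kafka import KafkaConsumer

# Minimal sketch; broker, topic and group id are placeholder values.
consumer = KafkaConsumer(
    'some-topic',
    bootstrap_servers='localhost:9092',
    group_id='delay-test-group',
    session_timeout_ms=30000,      # broker marks the member dead if no heartbeat within this window
    heartbeat_interval_ms=3000,    # background heartbeat frequency, roughly 1/3 of the session timeout
    max_poll_interval_ms=300000,   # maximum allowed gap between poll() calls
)

for record in consumer:
    # Any processing or sleeping here only has to fit within max_poll_interval_ms;
    # recent kafka-python versions send heartbeats from a background thread.
    print(record.partition, record.offset)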
Related
I have a script to test that messages are consumed at least once.
The producer:
import json
import random
import time
from confluent_kafka import Producer
import config
p = Producer({'bootstrap.servers': ','.join(config.KAFKA_HOST)})
total_count = 0
c = 0
try:
    for i in range(20000):
        num = random.randint(1, 1000000)
        total_count += num
        a = {'t': num, 'time': time.time()}
        p.produce('test-topic-vv', json.dumps(a))
        c += 1
        if c % 100 == 0:
            p.flush()
finally:
    p.flush()
The consumer:
import json
import random
import sys
from confluent_kafka import Consumer, TopicPartition
import config
c = Consumer({
    'bootstrap.servers': ','.join(config.KAFKA_HOST),
    'group.id': 'test-topic-consumer-group',
    'auto.offset.reset': 'earliest',
    'enable.auto.offset.store': False,
    'enable.auto.commit': True,
})
topic = 'test-topic-vv'
def test_for_seek():
    try:
        pp = []
        pp.append(TopicPartition(topic, partition=8))
        c.assign(pp)
        while True:
            msgs = c.consume(num_messages=10, timeout=10)
            if not msgs:
                print('no data and wait')
                for i in c.assignment():
                    print(i.topic, i.partition, i.offset, c.get_watermark_offsets(i))
                continue
            for msg in msgs:
                t1 = msg.partition()
                o1 = msg.offset()
                print('Received message: {} par {} offset {}'.format(msg.value().decode('utf-8'), t1, o1))
            break
    finally:
        c.close()
def test_for_run():
    try:
        c.subscribe([topic])
        total_count = 0
        map_par = {}
        while True:
            msgs = c.consume(num_messages=10, timeout=5)
            if not msgs:
                print('no data and wait')
                for i in c.assignment():
                    print(i.topic, i.partition, i.offset, c.get_watermark_offsets(i))
                continue
            deald = []
            for msg in msgs:
                t1 = msg.partition()
                o1 = msg.offset()
                print('Received message: {} par {} offset {}'.format(msg.value().decode('utf-8'), t1, o1))
                if random.randint(1, 100) == 9:
                    # test for deal failed then retry again
                    print('deal failed will retry msg offset {} partition {}'.format(msg.offset(), msg.partition()))
                    break
                else:
                    total_count += json.loads(msg.value())['t']
                    # test for deal success
                    if t1 in map_par:
                        if map_par[t1] + 1 != o1:
                            raise Exception('deal partition {} except last offset {} current offset {}'.format(t1, map_par[t1], o1))
                    map_par[t1] = o1
                    c.store_offsets(msg)
                    deald.append(msg)
            group_partition = {}
            for msg in msgs:
                if msg in deald:
                    continue
                partition = msg.partition()
                offset = msg.offset()
                if partition in group_partition:
                    group_partition[partition] = min(group_partition[partition], offset)
                else:
                    group_partition[partition] = offset
            # seek to deal failed partition offset
            for k, v in group_partition.items():
                c.seek(TopicPartition(topic, partition=k, offset=v))
                print('deal failed will set msg offset {} partition {}'.format(v, k))
    finally:
        c.close()


if sys.argv[1] == 'test_for_seek':
    test_for_seek()
else:
    test_for_run()
The topic test-topic-vv has 9 partitions.
First I ran the producer to add some messages to the topic, then consumed them, but I got an exception.
screenshot: https://user-images.githubusercontent.com/12459874/194990350-8cd13128-f3fa-4a86-a93e-771af45f93f0.png
The latest message's offset for partition 8 should be 7382, but I got 7391.
Then I ran test_for_seek to check the offset actually recorded for the consumer group, and it was indeed 7382.
screenshot: https://user-images.githubusercontent.com/12459874/194990593-9b8431d0-ce07-4122-800d-f9b3c129f5f3.png
I also checked the broker's group offset record:
screenshot: https://user-images.githubusercontent.com/12459874/194990684-9d8ad773-a569-4cee-9d4c-0a898e8f8922.png
It was also 7382.
So what happened to the consumer when using seek to manage offsets? I hope someone can help me figure out the problem.
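For what it is worth, here is a small sketch (same confluent_kafka API as above) of how the group's committed offset can be cross-checked against the partition watermarks directly from code; the topic and partition number come from the question, the broker address is a placeholder.

from confluent_kafka import Consumer, TopicPartition

# Sketch for cross-checking offsets; broker address is a placeholder.
c = Consumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'test-topic-consumer-group',
    'enable.auto.commit': False,
})

tp = TopicPartition('test-topic-vv', 8)
committed = c.committed([tp], timeout=10)             # offset the group has committed to the broker
low, high = c.get_watermark_offsets(tp, timeout=10)   # first and next offsets present in the partition
print('committed:', committed[0].offset, 'low:', low, 'high:', high)
c.close()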
Check information:
confluent_kafka.version() == 1.9.2
confluent_kafka.libversion() == 1.9.2
Operating system: Ubuntu 16.04
Python 3.8
Kafka 2.11-1.1.1
Kafka Client : 0.11.0.0-cp1
Kafka Broker :
During a Kafka broker rolling restart, our application lost some messages while sending to the broker. I believe a rolling restart should not cause any message loss. These are the producer settings we are using (producer with asynchronous send(), not using a callback/future etc.):
val acksConfig: String = "all",
val retriesConfig: Int = Int.MAX_VALUE,
val retriesBackOffConfig: Int = 1000,
val batchSize: Int = 32768,
val lingerTime: Int = 1,
val maxBlockTime: Int = Int.MAX_VALUE,
val requestTimeOut: Int = 420000,
val bufferMemory: Int = 33_554_432,
val compressionType: String = "gzip",
val keySerializer: Class<StringSerializer> = StringSerializer::class.java,
val valueSerializer: Class<ByteArraySerializer> = ByteArraySerializer::class.java
I am seeing these warnings in the logs:
2019-03-19 17:30:59,224 [org.apache.kafka.clients.producer.internals.Sender] [kafka-producer-network-thread | producer-1] (Sender.java:511) WARN org.apache.kafka.clients.producer.internals.Sender - Got error produce response with correlation id 1105790 on topic-partition catapult_on_entitlement_updates_prod-67, retrying (2147483643 attempts left). Error: NOT_LEADER_FOR_PARTITION
But the log says retry attempts are left, so I am curious: why didn't it retry? Let me know if anyone has any idea.
Two things to note:
1. What is the replication factor of the topic you are producing to, and what is its required min.insync.replicas?
2. What do you mean by "producer lost some messages"? If the producer cannot successfully produce to min.insync.replicas brokers, it will throw an exception and fail (for synchronous production). It is up to the producer/client to retry in case of failure (synchronous or asynchronous production).
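The question's producer is the Kotlin/Java client, but to stay consistent with the Python examples elsewhere in this thread, here is a minimal confluent_kafka sketch of the second point: even with asynchronous produce, register a delivery callback so a message that finally cannot be delivered is at least detected instead of silently lost. The broker address and topic name are placeholders.

from confluent_kafka import Producer

# Minimal sketch; broker and topic are placeholders, not from the question.
p = Producer({
    'bootstrap.servers': 'localhost:9092',
    'acks': 'all',                # wait for all in-sync replicas
    'enable.idempotence': True,   # avoid duplicates/reordering across retries
    'retries': 2147483647,
})

def on_delivery(err, msg):
    # Invoked from poll()/flush(); err is set if the message was finally not delivered.
    if err is not None:
        print('delivery failed for key={}: {}'.format(msg.key(), err))

for i in range(100):
    p.produce('some-topic', value=str(i), callback=on_delivery)
    p.poll(0)   # serve delivery callbacks

p.flush()       # block until all outstanding messages are delivered or have failed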
I want to publish to a Kafka topic.
I am unable to do so; the program halts.
I am getting this error:
KafkaTimeoutError: Failed to update metadata after 60.0 secs.
def saveResults(response):
    entities_tweet = response["entities"]
    for entity in entities_tweet:
        try:
            for i in entity_dict:
                for j in entity_dict[i]:
                    if entity["text"] in j:
                        entity["tweet"] = response["tweet"]
                        entity["tweetId"] = response["tweetId"]
                        entity["timeStamp"] = response["timeStamp"]
                        # entity["userProfile"] = response["userProfile"]
                        future = producer.send('argentina-iceland-june-16-watson', bytes(entity))
                        print("Published.")
                    else:
                        print("All ignored.")
                        future = producer.send('argentina-iceland-june-16-watson', bytes(entity))
                        print("Published")
        except Exception as e:
            print(e)
        finally:
            producer.flush()
However, this is working:
from kafka import KafkaProducer
from kafka.errors import KafkaError
producer = KafkaProducer(bootstrap_servers=['broker1:1234'])
# Asynchronous by default
future = producer.send('my-topic', b'raw_bytes')
It looks like you're using an incorrect bootstrap server; it should be broker1:9092 instead of broker1:1234...
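As a quick way to see why metadata cannot be fetched, here is a sketch (same kafka-python API as the working snippet above, with placeholder broker and topic) that blocks on the future returned by send(); the KafkaTimeoutError or other KafkaError is then raised where it can be inspected instead of the program appearing to just halt.

from kafka import KafkaProducer
from kafka.errors import KafkaError

# Sketch only; replace broker and topic with your own values.
producer = KafkaProducer(bootstrap_servers=['broker1:9092'])

future = producer.send('my-topic', b'raw_bytes')
try:
    record_metadata = future.get(timeout=10)   # raises a KafkaError subclass on failure
    print(record_metadata.topic, record_metadata.partition, record_metadata.offset)
except KafkaError as e:
    # Metadata/connection problems (e.g. a wrong bootstrap server) surface here.
    print('send failed:', e)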
SimpleConsumer has been deprecated in Kafka, with org.apache.kafka.clients.consumer.KafkaConsumer as its replacement. However, KafkaConsumer doesn't have a send(...) function. How can I rewrite the code below using the new KafkaConsumer?
import scala.concurrent.duration._
import kafka.api.TopicMetadataRequest
import kafka.consumer.SimpleConsumer
....
val consumer = new SimpleConsumer(
  host = "127.0.0.1",
  port = 9092,
  soTimeout = 2.seconds.toMillis.toInt,
  bufferSize = 1024,
  clientId = "health-check")
// this will fail if Kafka is unavailable
consumer.send(new TopicMetadataRequest(Nil, 1))
You can get topic metadata with .partitionsFor(...) and .listTopics().
There is no direct replacement for that method; it depends on what you want to do.
If you need info about all partitions, the new API has consumer.partitionsFor(topic) for that.
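If the goal is just the same kind of availability check, here is how the equivalent looks with kafka-python, purely as an illustration of the metadata-based health check described above; for the JVM client the corresponding calls are the partitionsFor(...) and listTopics() methods already mentioned. The broker address is a placeholder.

from kafka import KafkaConsumer
from kafka.errors import KafkaError

# Illustrative health check with kafka-python; broker address is a placeholder.
try:
    consumer = KafkaConsumer(bootstrap_servers='127.0.0.1:9092',
                             client_id='health-check')
    topics = consumer.topics()   # fetches cluster metadata; fails if Kafka is unreachable
    print('broker reachable, topics:', topics)
    consumer.close()
except KafkaError as e:
    print('Kafka unavailable:', e)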
I have a system that pulls messages from a Kafka topic, and when it's unable to process messages because some external resource is unavailable, it shuts down the consumer, returns the message to the topic, and waits some time before starting the consumer again. The only problem is, shutting down doesn't work. Here's what I see in my logs:
2014-09-30 08:24:10,918 - com.example.kafka.KafkaConsumer [info] - [application-akka.actor.workflow-context-8] Shutting down kafka consumer for topic new-problem-reports
2014-09-30 08:24:10,927 - clients.kafka.ProblemReportObserver [info] - [application-akka.actor.workflow-context-8] Consumer shutdown
2014-09-30 08:24:11,946 - clients.kafka.ProblemReportObserver [warn] - [application-akka.actor.workflow-context-8] Sending 7410-1412090624000 back to the queue
2014-09-30 08:24:12,021 - clients.kafka.ProblemReportObserver [debug] - [kafka-akka.actor.kafka-consumer-worker-context-9] Message from partition 0: key=7410-1412090624000, msg=7410-1412090624000
There are a few layers at work here, but the important code is:
In KafkaConsumer.scala:
protected def consumer: ConsumerConnector = Consumer.create(config.asKafkaConfig)

def shutdown() = {
  logger.info(s"Shutting down kafka consumer for topic ${config.topic}")
  consumer.shutdown()
}
In the routine that observes messages:
(processor ? ProblemReportRequest(problemReportKey)).map {
  case e: ConnectivityInterruption =>
    val backoff = 10.seconds
    logger.warn(s"Can't connect to essential services, pausing for $backoff", e)
    stop()
    // XXX: Shutdown isn't instantaneous, so returning has to happen after a delay.
    // Unfortunately, there's still a race condition here, plus there's a chance the
    // system will be shut down before the message has been returned.
    system.scheduler.scheduleOnce(100 millis) { returnMessage(message) }
    system.scheduler.scheduleOnce(backoff) { start() }
    false
  case e: Exception => returnMessage(message, e)
  case _ => true
}.recover { case e => returnMessage(message, e) }
And the stop method:
def stop() = {
  if (consumerRunning.get()) {
    consumer.shutdown()
    consumerRunning.compareAndSet(true, false)
    logger.info("Consumer shutdown")
  } else {
    logger.info("Consumer is already shutdown")
  }
  !consumerRunning.get()
}
Is this a bug, or am I doing it wrong?
Because your consumer is a def, every reference to it creates a new Kafka consumer instance, so consumer.shutdown() builds a fresh instance and shuts that one down while the original keeps consuming. Make consumer a val instead (protected val consumer: ConsumerConnector = Consumer.create(config.asKafkaConfig)) so the same instance is reused and actually shut down.