How can I display all messages that are in a Kafka topic?
When I execute this code, the consumer only reads what the producer wrote during the current DAG run; whatever was written to the topic by previous DAG runs is not shown.
import json
import logging
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow_provider_kafka.operators.consume_from_topic import ConsumeFromTopicOperator
from airflow_provider_kafka.operators.produce_to_topic import ProduceToTopicOperator

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2021, 7, 20),
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

fruits_test = ["Apple", "Pear", "Peach", "Banana"]

def producer_function():
    for i in fruits_test:
        yield (json.dumps(i), json.dumps(i + i))

consumer_logger = logging.getLogger("airflow")

def consumer_function(message, prefix=None):
    key = json.loads(message.key())
    value = json.loads(message.value())
    consumer_logger.info(f"{prefix} {message.topic()} # {message.offset()}; {key} : {value}")
    return
with DAG(
    "kafka_DAG",
    default_args=default_args,
    description="KafkaOperators",
    schedule_interval=None,
    start_date=datetime(2021, 1, 1),
    catchup=False,
    tags=["Test_DAG"],
) as dag:

    t1 = ProduceToTopicOperator(
        task_id="produce_to_topic",
        topic="topictest",
        producer_function=producer_function,
        kafka_config={"bootstrap.servers": ":9092"},
    )

    t2 = ConsumeFromTopicOperator(
        task_id="consume_from_topic",
        topics=["topictest"],
        apply_function=consumer_function,
        apply_function_kwargs={"prefix": "consumed:::"},
        consumer_config={
            "bootstrap.servers": ":9092",
            "group.id": "test-consumer-group",
            "enable.auto.commit": False,
            "auto.offset.reset": "earliest",
        },
        commit_cadence="end_of_batch",
        max_messages=10,
        max_batch_size=2,
    )
task_id="consume_from_topic",
topics=["topictest"],
apply_function=consumer_function,
apply_function_kwargs={"prefix": "consumed:::"},
consumer_config={
"bootstrap.servers": ":9092",
"group.id": "test-consumer-group",
"enable.auto.commit": False,
"auto.offset.reset": "earliest",
},
commit_cadence="never",
max_messages=10,
max_batch_size=2,
)
You just need to make sure that the consumer isn't committing the offset ever. If you set "commit_cadence" to never, this consumer will run from the beginning of the log every time it executes.
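If you want to double-check what is actually stored in the topic outside of Airflow, here is a minimal sketch using the confluent-kafka Python client. It assumes the same topic and broker as in the DAG (the "localhost:9092" address and the throwaway "topic-dump" group id are illustrative assumptions); because the group has never committed offsets and auto.offset.reset is "earliest", it reads the whole log from the beginning:

import json

from confluent_kafka import Consumer

# "topic-dump" is a hypothetical, throwaway group id; any group that has
# never committed offsets will start from the earliest offset.
consumer = Consumer({
    "bootstrap.servers": "localhost:9092",   # assumed broker address
    "group.id": "topic-dump",
    "enable.auto.commit": False,
    "auto.offset.reset": "earliest",
})
consumer.subscribe(["topictest"])

try:
    while True:
        msg = consumer.poll(timeout=5.0)
        if msg is None:        # no more messages arrived within the timeout
            break
        if msg.error():
            raise Exception(msg.error())
        print(msg.topic(), msg.offset(), json.loads(msg.key()), json.loads(msg.value()))
finally:
    consumer.close()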
Right now I want to integrate node-rdkafka into our service, but I keep hitting this error: Broker: Unknown member.
The same issue on GitHub is https://github.com/confluentinc/confluent-kafka-dotnet/issues/1464, where they say the consumer is reusing the same group id for retries or delays, but I didn't find any retry or delay logic in my code.
There is also https://github.com/confluentinc/confluent-kafka-python/issues/1004, but I have rechecked all consumer group ids and they are unique.
The config of the node-rdkafka producer is as follows:
this.producer = new Producer({
    "client.id": this.cliendID,
    "metadata.broker.list": this.brokerList,
    "compression.codec": "lz4",
    "retry.backoff.ms": 200,
    "socket.keepalive.enable": true,
    "queue.buffering.max.messages": 100000,
    "queue.buffering.max.ms": 1000,
    "batch.num.messages": 1000000,
    "transaction.timeout.ms": 2000,
    "enable.idempotence": false,
    "max.in.flight.requests.per.connection": 1,
    "debug": this.debug,
    "dr_cb": true,
    "retries": 0,
    "log_cb": (_: any) => console.log(`log_cb =>`, _),
    "sasl.username": this.saslUsername,
    "sasl.password": this.saslPassword,
    "sasl.mechanism": this.saslMechanism,
    "security.protocol": this.securityProtocol
}, {
    "acks": -1
})
The config of the node-rdkafka consumer is as follows:
this.consumer = new KafkaConsumer({
    "group.id": this.groupID,
    "metadata.broker.list": this.brokerList,
    "sasl.username": this.saslUsername,
    "sasl.password": this.saslPassword,
    "enable.auto.commit": false,
    "auto.commit.interval.ms": 2000,
    "session.timeout.ms": 45000,
    "max.poll.interval.ms": 300000,
    "heartbeat.interval.ms": 3000,
    "api.version.request.timeout.ms": 10000,
    "max.in.flight.requests.per.connection": 1,
    "debug": this.debug,
    "sasl.mechanism": this.saslMechanism,
    "log.connection.close": true,
    "log.queue": true,
    "log_level": 7,
    "log.thread.name": true,
    "isolation.level": "read_committed",
    "ssl.ca.location": "/etc/ssl/certs/",
    "log_cb": (_: any) => console.log(`log_cb =>`, _),
    "security.protocol": this.securityProtocol
}, {})
await new Promise(resolve => {
    this.consumer?.connect()
    this.consumer?.on('ready', () => {
        try {
            this.consumer?.subscribe(subscriptions)
            this.consumer?.consume()
            console.log('[SUCCESS] Subscribe Event => all event')
        } catch (err) {
            console.log('[FAILED] Subscribe => all event')
            console.log(err)
        }
        resolve(this.consumer)
    }).on('data', async (data) => {
        this.topicFunctionMap[data.topic]({
            partition: data.partition,
            topic: data.topic,
            message: {
                key: data.key,
                offset: data.offset.toString(),
                size: data.size,
                value: data.value,
                timestamp: data.timestamp?.toString()
            }
        } as ISubsCallbackParam)
        this.consumer?.commitSync({
            topic: data.topic,
            offset: data.offset,
            partition: data.partition
        })
    })
})
Using this configuration, the consumer is able to receive events, but it doesn't last for long: after roughly two hours it randomly throws this error.
I am not sure whether it is caused by the manual commit or by our handler taking too long; I have tried both async and sync commits, so the commitSync call itself should not depend on how long our handler runs.
Let's say it is because our handler takes too long and that gets the consumer kicked out of the group. That seems to be the suspect, because I also found an additional error: Broker: Specified group generation id is not valid
source: https://github.com/confluentinc/confluent-kafka-dotnet/issues/1155
It says I need to increase the session timeout, so I tried increasing it to "session.timeout.ms": 300000 (5 minutes) while keeping "heartbeat.interval.ms": 3000; I found in a GitHub issue that the heartbeat interval should be at most one third of the session timeout, so I figured 3 seconds would be fine.
With "session.timeout.ms": 300000 and "heartbeat.interval.ms": 3000 the consumer is able to consume and lasts much longer, but the problems are:
- the first time with this config, messages arrive within about 0-2 seconds
- after a while, messages are still received, but it takes 1-10 seconds for each one to arrive
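Since node-rdkafka wraps librdkafka, these are the same settings the clients in the linked issues use. Purely as a point of reference (a sketch with illustrative values, not your code), this is how the relationship between the three relevant knobs is usually expressed, shown here with the confluent-kafka Python client:

from confluent_kafka import Consumer

# Illustrative values only. Rule of thumb: heartbeat.interval.ms should be
# at most session.timeout.ms / 3, and max.poll.interval.ms must exceed the
# longest time your handler can spend between poll() calls; exceeding it is
# what gets a member kicked from the group, which then surfaces as
# "Unknown member" / "generation id is not valid" on the next commit.
consumer = Consumer({
    "bootstrap.servers": "broker1:9092",   # hypothetical broker list
    "group.id": "my-service-group",        # hypothetical group id
    "enable.auto.commit": False,
    "session.timeout.ms": 45000,           # liveness via background heartbeats
    "heartbeat.interval.ms": 3000,         # well under session.timeout.ms / 3
    "max.poll.interval.ms": 300000,        # max gap between poll() calls
})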
The detailed error:
received event => onCustomerServiceRegister
[COMMIT_ERR] LibrdKafkaError: Broker: Unknown member
at Function.createLibrdkafkaError [as create] (/src/app/node_modules/node-rdkafka/lib/error.js:454:10)
at KafkaConsumer.Client._errorWrap (/src/app/node_modules/node-rdkafka/lib/client.js:481:29)
at KafkaConsumer.commitSync (/src/app/node_modules/node-rdkafka/lib/kafka-consumer.js:560:8)
at KafkaRDConnect.<anonymous> (/src/app/dist/events/connectors/kafkaRD.js:240:110)
at step (/src/app/dist/events/connectors/kafkaRD.js:53:23)
at Object.next (/src/app/dist/events/connectors/kafkaRD.js:34:53)
at /src/app/dist/events/connectors/kafkaRD.js:28:71
at new Promise (<anonymous>)
at __awaiter (/src/app/dist/events/connectors/kafkaRD.js:24:12)
at KafkaConsumer.<anonymous> (/src/app/dist/events/connectors/kafkaRD.js:213:72)
at KafkaConsumer.emit (node:events:376:20)
at KafkaConsumer.EventEmitter.emit (node:domain:470:12)
at /src/app/node_modules/node-rdkafka/lib/kafka-consumer.js:488:12 {
First of all, I'm using VS Code, and Airflow 2.0.1 is on a remote server, so I'm SSH'ing onto the server. All the files are on the remote server.
In my dag.py I have
from airflow import DAG
from airflow.utils.dates import days_ago
from datetime import timedelta
from airflow.operators.python import PythonVirtualenvOperator

# test1, test2 and req (the callables and the requirements list) are defined
# elsewhere in the real file and are omitted here.

default_args = {
    'owner': 'me',
    'depends_on_past': False,
}

with DAG(
    'test',
    default_args=default_args,
    description='Test ',
    schedule_interval=timedelta(days=1),
    start_date=days_ago(1),
) as dag:

    t1 = PythonVirtualenvOperator(
        task_id="test1",
        python_version="3.7",
        python_callable=test1,
        requirements=req,
    )

    t2 = PythonVirtualenvOperator(
        task_id="test2",
        python_version="3.7",
        python_callable=test2,
        requirements=req,
    )

    t1 >> t2

if __name__ == "__main__":
    from airflow.utils.state import State

    dag.clear(dag_run_state=State.NONE)
    dag.run()
and my launch.json in VS Code:
{ "version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"env":{
"AIRFLOW__CORE__EXECUTOR":"DebugExecutor"
}
}
]
}
When I then open dag.py and press debug, it stops on dag.clear(dag_run_state=State.NONE) and throws (sqlite3.OperationalError) no such table: task_instance.
If I run airflow db check I get
[user]$ airflow db check
[2021-05-26 14:15:35,741] {cli_action_loggers.py:105} WARNING - Failed to log action with (sqlite3.OperationalError) no such table: log
[SQL: INSERT INTO log (dttm, dag_id, task_id, event, execution_date, owner, extra) VALUES (?, ?, ?, ?, ?, ?, ?)]
[parameters: ('2021-05-26 12:15:35.737300', None, None, 'cli_check', None, 'user', '{"host_name": "server", "full_command": "[\'/usr/local/bin/airflow\', \'db\', \'check\']"}')]
(Background on this error at: http://sqlalche.me/e/13/e3q8)
[2021-05-26 14:15:35,746] {db.py:762} INFO - Connection successful.
I'm using SQLite as the DB, and Airflow runs my DAGs (apart from the one I'm trying to debug) without issue, so I assume the DB tables are set up correctly?
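One way to narrow this down (a diagnostic sketch, not a fix) is to check which SQLite file the debugged process actually resolves: if the VS Code debug session starts without the same AIRFLOW_HOME or working directory as your normal Airflow install, it will point at a fresh, never-initialised metadata DB, which would explain the missing task_instance and log tables.

# Run this under the same VS Code debug configuration (or temporarily at the
# top of dag.py) and compare the result with the sql_alchemy_conn value in
# the airflow.cfg your working scheduler uses.
import os

from airflow.configuration import conf

print("AIRFLOW_HOME =", os.environ.get("AIRFLOW_HOME"))
print("sql_alchemy_conn =", conf.get("core", "sql_alchemy_conn"))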
I have a DAG which queries a Postgres database, and I am using PostgresOperator.
However, when passing the parameter I am getting the error below.
psycopg2.ProgrammingError: column "132" does not exist
LINE 1: ...d,derived_tstamp FROM atomic.events WHERE event_name = "132"
A snapshot of my DAG is below:
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": airflow.utils.dates.days_ago(1),
    "email": ["airflow@airflow.com"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=1),
}

dag = DAG(
    "PostgresTest",
    default_args=default_args,
    schedule_interval='3,33 * * * *',
    template_searchpath=['/root/airflow/sql/'],
)

dailyOperator = PostgresOperator(
    task_id='Refresh_DailyScore',
    postgres_conn_id='postgress_sophi',
    params={"e_name": '"132"'},
    sql='atomTest.sql',
    dag=dag)
A snapshot of atomTest.sql:
SELECT domain_userid,derived_tstamp FROM atomic.events WHERE event_name = {{ params.e_name }}
I have been hitting my head all day trying to understand why Airflow treats the 132 value as a column.
Please suggest.
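For context on why the error mentions a column: in PostgreSQL, double quotes delimit identifiers (column or table names) while single quotes delimit string literals, so "132" is parsed as a column name. A minimal sketch of one way around this, using psycopg2-style bind parameters via the operator's parameters argument instead of Jinja params (the connection id and SQL file are taken from the question; the placeholder name e_name is kept purely for illustration), could look like:

# atomTest.sql -- let psycopg2 bind the value, so quoting is handled for you:
# SELECT domain_userid, derived_tstamp FROM atomic.events WHERE event_name = %(e_name)s

dailyOperator = PostgresOperator(
    task_id='Refresh_DailyScore',
    postgres_conn_id='postgress_sophi',
    sql='atomTest.sql',
    parameters={"e_name": "132"},  # bound as a string literal, not an identifier
    dag=dag)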
I am new to Cloud Composer and I want to execute one PostgreSQL SELECT query using the gcp_cloud_sql hook in Cloud Composer's Airflow. I tried CloudSqlQueryOperator, but it doesn't work with SELECT queries.
I want to create DAGs based on the results I get from this SELECT query. However, I am not able to create even a simple connection for this SELECT query.
import os
from datetime import date, datetime, timedelta

from six.moves.urllib.parse import quote_plus

import airflow
from airflow import models
from airflow.contrib.operators.gcp_sql_operator import (
    CloudSqlQueryOperator
)

GCP_PROJECT_ID = "adtech-dev"
GCP_REGION = "<my cluster zone>"
GCSQL_POSTGRES_INSTANCE_NAME_QUERY = "testpostgres"
GCSQL_POSTGRES_DATABASE_NAME = ""
GCSQL_POSTGRES_USER = "<PostgreSQL User Name>"
GCSQL_POSTGRES_PASSWORD = "**********"
GCSQL_POSTGRES_PUBLIC_IP = "0.0.0.0"
GCSQL_POSTGRES_PUBLIC_PORT = "5432"

rule_query = "select r.id from rules r where r.id = 1"

postgres_kwargs = dict(
    user=quote_plus(GCSQL_POSTGRES_USER),
    password=quote_plus(GCSQL_POSTGRES_PASSWORD),
    public_port=GCSQL_POSTGRES_PUBLIC_PORT,
    public_ip=quote_plus(GCSQL_POSTGRES_PUBLIC_IP),
    project_id=quote_plus(GCP_PROJECT_ID),
    location=quote_plus(GCP_REGION),
    instance=quote_plus(GCSQL_POSTGRES_INSTANCE_NAME_QUERY),
    database=quote_plus(GCSQL_POSTGRES_DATABASE_NAME),
)

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2018, 5, 31),
    'email': ['aniruddha.dwivedi@xyz.com'],
    'email_on_failure': True,
    'email_on_retry': False,
    'depends_on_past': False,
    'catchup': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=10),
}

os.environ['AIRFLOW_CONN_PROXY_POSTGRES_TCP'] = \
    "gcpcloudsql://{user}:{password}@{public_ip}:{public_port}/{database}?" \
    "database_type=postgres&" \
    "project_id={project_id}&" \
    "location={location}&" \
    "instance={instance}&" \
    "use_proxy=True&" \
    "sql_proxy_use_tcp=True".format(**postgres_kwargs)

connection_names = [
    "proxy_postgres_tcp"
]

tasks = []

with models.DAG(
    dag_id='example_gcp_sql_query',
    default_args=default_args,
    schedule_interval=None
) as dag:
    prev_task = None

    for connection_name in connection_names:
        task = CloudSqlQueryOperator(
            gcp_cloudsql_conn_id=connection_name,
            task_id="example_gcp_sql_task_" + connection_name,
            sql=rule_query
        )
        tasks.append(task)
        if prev_task:
            prev_task >> task
        prev_task = task
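CloudSqlQueryOperator is meant for DML/DDL and, as the question observes, it does not hand the result rows back to you, so it cannot drive DAG generation by itself. A minimal sketch of one common workaround, assuming you define a plain Postgres connection (here hypothetically named rules_db) that is reachable from the Composer workers, for example through the Cloud SQL proxy, is to fetch the rows with PostgresHook inside a PythonOperator:

from airflow.hooks.postgres_hook import PostgresHook
from airflow.operators.python_operator import PythonOperator

def fetch_rule_ids(**_):
    # "rules_db" is a hypothetical Airflow connection pointing at the
    # Cloud SQL instance (directly or via the Cloud SQL proxy).
    hook = PostgresHook(postgres_conn_id="rules_db")
    rows = hook.get_records("select r.id from rules r where r.id = 1")
    return [row[0] for row in rows]  # pushed to XCom for downstream use

fetch_rules = PythonOperator(
    task_id="fetch_rule_ids",
    python_callable=fetch_rule_ids,
    provide_context=True,
    dag=dag,
)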
I have defined an Airflow sample task where I wanted to run a PrestoDB query followed by a Spark job to perform a simple word count example. Here is the DAG I defined:
import logging
from datetime import timedelta
from operator import add

from pandas import DataFrame

import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.hooks.presto_hook import PrestoHook

default_args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(1),
    'depends_on_past': False,
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'presto_dag',
    default_args=default_args,
    description='A simple tutorial DAG with PrestoDB and Spark',
    # Run the DAG once per day
    schedule_interval='@daily',
)

def talk_to_presto():
    ph = PrestoHook(host='presto.myhost.com', port=9988)
    # Query PrestoDB
    query = "show catalogs"
    # Fetch Data
    data = ph.get_records(query)
    logging.info(data)
    return data

def submit_to_spark():
    # conf = SparkConf().setAppName("PySpark App").setMaster("http://sparkhost.com:18080/")
    # sc = SparkContext(conf)
    # data = sc.parallelize(list("Hello World"))
    # counts = data.map(lambda x: (x, 1)).reduceByKey(add).sortBy(lambda x: x[1], ascending=False).collect()
    # for (word, count) in counts:
    #     print("{}: {}".format(word, count))
    # sc.stop()
    return "Hello"

presto_task = PythonOperator(
    task_id='talk_to_presto',
    provide_context=True,
    python_callable=talk_to_presto,
    dag=dag,
)

spark_task = PythonOperator(
    task_id='submit_to_spark',
    provide_context=True,
    python_callable=submit_to_spark,
    dag=dag,
)

presto_task >> spark_task
When I submit the DAG, about 20 DAG instances stay in the running state, but they never complete and no logs are generated, at least for the PrestoDB query. I am able to run the same PrestoDB query correctly from Airflow's Data Profiling > Ad-Hoc Query section.
I have intentionally commented out the PySpark code, as it wasn't running and is not the focus of this question.
I have two questions:
Why aren't the tasks completing, and why do they stay in the running state?
What am I doing wrong with the PrestoHook, given that the query isn't running?
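Not a definitive answer, but one thing worth checking on the second question: PrestoHook is a DbApiHook and, as far as I know, it looks up an Airflow connection via presto_conn_id rather than accepting host/port keyword arguments, so the values passed in talk_to_presto may be silently ignored in favour of the default presto_default connection. A minimal sketch, assuming you create a Presto connection in the Airflow UI named, hypothetically, presto_myhost with the host and port from the question:

import logging

from airflow.hooks.presto_hook import PrestoHook

def talk_to_presto():
    # "presto_myhost" is a hypothetical connection id created under
    # Admin > Connections with host presto.myhost.com and port 9988.
    ph = PrestoHook(presto_conn_id="presto_myhost")
    data = ph.get_records("show catalogs")
    logging.info(data)
    return data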