sla_miss_callback to send email on missed task SLA in Apache Airflow

I have a DAG A that is triggered by a parent DAG B, so DAG A doesn't have any schedule interval defined.
1. I would like to set up an sla_miss_callback on one of the tasks in DAG A.
2. I would like to get an e-mail notification whenever the task misses its SLA.
I have tried the methods I found on Google and Stack Overflow, but the e-mail is not being triggered as expected.
Sharing the sample code I have used for testing:
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import timedelta, datetime
import logging

def print_sla_miss(**kwargs):
    logging.info("SLA missed")

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2021, 1, 1),
    'email': 'sample@xxx.com',
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0
}

with DAG('sla_test', schedule_interval=None, max_active_runs=1, catchup=False,
         sla_miss_callback=print_sla_miss, default_args=default_args) as dag:
    sleep = BashOperator(
        task_id='timeout',
        sla=timedelta(seconds=5),
        bash_command='sleep 15',
        retries=0,
        dag=dag,
    )
Thanks in advance.

SLAs are only evaluated on scheduled DAG runs. Since you have schedule_interval=None, the SLA you set is not being evaluated for this DAG.
If there is a certain amount of time within which you expect the triggered DAG to finish, you could set that SLA on the sensor task in the parent DAG that checks when the child DAG is finished, as sketched below.
Another possible workaround is to set up a Slack notification for when the child DAG finishes entirely, or when a certain task starts/finishes, so you can evaluate whether it has been running for too long.
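For illustration, here is a minimal sketch of the first suggestion: putting the SLA on a sensor in the scheduled parent DAG. The DAG ids (dag_b, dag_a), the 30-minute SLA, and the e-mail address are assumptions, and the imports target Airflow 1.10.x; depending on how the execution dates of the parent and child runs line up, you may also need execution_delta or execution_date_fn on the sensor.
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.dagrun_operator import TriggerDagRunOperator
from airflow.sensors.external_task_sensor import ExternalTaskSensor

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2021, 1, 1),
    'email': ['sample@xxx.com'],
    'email_on_failure': True,
}

with DAG('dag_b', schedule_interval='@daily', catchup=False,
         default_args=default_args) as dag:
    trigger_dag_a = TriggerDagRunOperator(
        task_id='trigger_dag_a',
        trigger_dag_id='dag_a',
    )

    # The SLA lives on the scheduled parent DAG, so it actually gets evaluated:
    # if this sensor has not succeeded within 30 minutes of the scheduled run
    # time, an SLA miss is recorded and the usual SLA e-mail is sent.
    wait_for_dag_a = ExternalTaskSensor(
        task_id='wait_for_dag_a',
        external_dag_id='dag_a',
        external_task_id=None,   # wait for the whole child DAG run
        sla=timedelta(minutes=30),
        poke_interval=60,
    )

    trigger_dag_a >> wait_for_dag_a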

To achieve my requirement, I created a separate DAG that checks the task's run status every 5 minutes and sends an e-mail notification based on that status, as shown below. To do this, I push the execution date of my main DAG into an Airflow Variable.
# importing operators and modules
from airflow import DAG
from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.email_operator import EmailOperator
from airflow.api.common.experimental.get_task_instance import get_task_instance
from airflow.models import Variable
from datetime import datetime, timedelta, timezone
import dateutil

# setting default arguments
default_args = {
    'owner': 'test',
    'depends_on_past': False,
    'start_date': datetime(2021, 1, 1),
    'email': ['abc@example.com'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0
}

# getting the current status of the task in the main DAG
exec_date = dateutil.parser.parse(Variable.get('main_dag_execution_date'))
ti = get_task_instance('main_dag', 'task_to_check', exec_date)
state = ti.current_state()
start_date = ti.start_date
end_date = ti.end_date
print("start_date", start_date, " end_date", end_date, " execution_date", exec_date)

# deciding the action based on the status of the task
def check_task_status(**kwargs):
    if state == 'running' and datetime.now(timezone.utc) > start_date + timedelta(minutes=10):
        return 'breach_mail'
    elif state == 'failed':
        return 'failure_mail'
    else:
        return 'other_state'

# print statement when the task is not in a breached or failed state
def print_current_state(**context):
    if start_date is None:
        print("task is in wait state")
    else:
        print("task is in " + state + " state")

with DAG('sla_check', schedule_interval='0-59/5 9-23 * * *', max_active_runs=1,
         catchup=False, default_args=default_args) as dag:
    check_task_status = BranchPythonOperator(task_id='check_task_status',
                                             python_callable=check_task_status,
                                             provide_context=True,
                                             dag=dag)
    breach_mail = EmailOperator(task_id='breach_mail', to='abc@example.com',
                                subject='SLA for task breached',
                                html_content="<p>Hi,<br><br>task running beyond SLA<br>", dag=dag)
    failure_mail = EmailOperator(task_id='failure_mail', to='abc@example.com',
                                 subject='task failed',
                                 html_content="<p>Hi,<br><br>task failed. Please check.<br>", dag=dag)
    other_state = PythonOperator(task_id='other_state', python_callable=print_current_state,
                                 provide_context=True,
                                 dag=dag)

    check_task_status >> breach_mail
    check_task_status >> failure_mail
    check_task_status >> other_state
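One piece not shown above is how the main DAG publishes its execution date into the main_dag_execution_date Variable that sla_check reads. A hypothetical sketch of such a task (the task id and the main_dag DAG object are assumptions) could look like this:
from airflow.models import Variable
from airflow.operators.python_operator import PythonOperator

def publish_execution_date(**context):
    # Store the execution date as an ISO-8601 string so dateutil can parse it back.
    Variable.set('main_dag_execution_date', context['execution_date'].isoformat())

publish_execution_date_task = PythonOperator(
    task_id='publish_execution_date',
    python_callable=publish_execution_date,
    provide_context=True,
    dag=main_dag,  # the DAG object of 'main_dag', assumed to be defined in that file
)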

Related

How to use on_failure_callback in Airflow 1.10.10 + Composer?

I wish to get an email notification when a single Airflow operator fails. I need this because the failure of some tasks must not mark the entire pipeline as failed.
To simulate the error, I set the source bucket to a nonexistent bucket.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Stefano Giostra"
__credits__ = "Stefano Giostra"
__maintainer__ = "Stefano Giostra"
__version__ = "0.9.3"
__status__ = "Dev"

from airflow.models import Variable, DAG
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
# from lib.bb_utils import *
import logging
from airflow.utils import dates
from datetime import timedelta
from functools import partial
from lib.bb_utils import load_json_file
from airflow.utils.email import send_email

ROOT_PATH = '/home/airflow/gcs/dags'
logger = logging.getLogger("dag_demo_2")


def notify_email(context, config):  # **kwargs
    """Send custom email alerts."""
    alerting_email_address = config.get('email_address')
    print("---> notify_email -------------------")
    print(context)
    print(f"-->{alerting_email_address}")
    print("<------------------------------------")
    # print(context['dag'])
    # email title.
    # title = "Airflow alert: {task_name} Failed".format(context)
    #
    # # email contents
    # body = """
    # Hi, <br><br>
    # There's been an error in the {task_name} job.<br>
    # <br>
    # Forever yours,<br>
    # Airflow bot <br>
    # """.format(**contextDict)
    # for dest in dest_email:
    #     send_email(dest, title, body)

# ----------------------------------------------------------------------------------------------------------------------
# Data dictionary with the keys required by Airflow DAGs
my_email = 'abc@xyz.com'
default_args = {
    "owner": 'SG',
    "depends_on_past": False,
    "start_date": dates.days_ago(1),
    "end_date": None,
    "email_on_failure": True,
    "email_on_retry": False,
    "email": [my_email],
    "retries": 2,
    "retry_delay": timedelta(minutes=5),
    "max_active_runs": 1,
    "on_failure_callback": partial(notify_email, config={'email_address': my_email})
}

dag_name = 'SG-DagDemo-Once'
with DAG(dag_id=dag_name, default_args=default_args, schedule_interval="@once") as ldag:
    project = Variable.get("PROJECT")
    source_bucket = 'sg-dev'
    source_object = 'covid19_italy/national_trends_2.csv'
    bq_dataset = "covid19_italy"
    bq_table_name = "national_trends"
    bq_task_id = f'gcs_to_bq_load_{bq_table_name}'
    schema_fields = load_json_file(f"{ROOT_PATH}/source/{bq_dataset}/{bq_table_name}_tabschema.json")
    t = GoogleCloudStorageToBigQueryOperator(
        dag=ldag,
        task_id=bq_task_id,
        bucket=source_bucket,
        source_objects=[source_object],
        destination_project_dataset_table="{0}.{1}.{2}".format(project, bq_dataset, bq_table_name),
        schema_fields=schema_fields,
        source_format='CSV',
        skip_leading_rows=1,
        write_disposition="WRITE_TRUNCATE"
    )
To invoke notify_email() on a failure, it is enough to adjust default_args with:
"on_failure_callback": notify_email
and then pass default_args in the DAG constructor:
with DAG(dag_id='SG-DagDemo-Once', default_args=default_args) as dag:
You can try something like the following to call the function notify_email() on operator failures; each operator will call the same function (example taken from gcs_to_bq):
import airflow
from airflow import DAG
from airflow.operators import bash_operator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

# notify_email is the callback function defined in the question above
args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(1),
    'on_failure_callback': notify_email
}

dag_name = 'SG-DagDemo-Once'
with DAG(dag_id=dag_name, default_args=args, schedule_interval=None) as dag:
    create_test_dataset = bash_operator.BashOperator(
        task_id='create_airflow_test_dataset',
        bash_command='bq mk airflow_test')
    # [START howto_operator_gcs_to_bq]
    load_csv = GoogleCloudStorageToBigQueryOperator(
        task_id='gcs_to_bq_example',
        bucket='cloud-samples-data',
        source_objects=['bigquery/us-states/us-states.csv'],
        destination_project_dataset_table='airflow_test.gcs_to_bq_table',
        schema_fields=[
            {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'post_abbr', 'type': 'STRING', 'mode': 'NULLABLE'},
        ],
        write_disposition='WRITE_TRUNCATE')
    # [END howto_operator_gcs_to_bq]
    delete_test_dataset = bash_operator.BashOperator(
        task_id='delete_airflow_test_dataset',
        bash_command='bq rm -r -f -d airflow_test')

    create_test_dataset >> load_csv >> delete_test_dataset
You can simulate an error by changing a piece of configuration on each operator, and you will need to complete the e-mail-sending logic in notify_email().
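As a rough guide, here is a hedged sketch of what the completed notify_email() body might look like, using Airflow's built-in send_email helper. It assumes SMTP (or SendGrid in Composer) is already configured for the environment; the message wording is arbitrary.
from airflow.utils.email import send_email


def notify_email(context, config):
    """Send a custom e-mail alert for the failed task instance."""
    alerting_email_address = config.get('email_address')
    ti = context['task_instance']

    title = f"Airflow alert: {ti.dag_id}.{ti.task_id} failed"
    body = (
        f"Hi,<br><br>"
        f"There's been an error in the {ti.task_id} task of DAG {ti.dag_id}.<br>"
        f"Log URL: {ti.log_url}<br><br>"
        f"Forever yours,<br>Airflow bot"
    )
    send_email(alerting_email_address, title, body)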

"Dag Seems to be missing" error in a Cloud Composer Airflow Dynamic DAG

I have a dynamic Airflow DAG in Google Cloud Composer that gets created, is listed in the web server, and runs (backfill) without error.
However, there are issues:
1. When clicking on the DAG in the web UI, it says "DAG seems to be missing"
2. Can't see the Graph view/Tree view because of the error above
3. Can't manually trigger the DAG because of the error above
I've been trying to fix this for a couple of days... any hint will be helpful. Thank you!
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from google.cloud import storage
from airflow.models import Variable
import json

args = {
    'owner': 'xxx',
    'start_date': '2020-11-5',
    'provide_context': True
}

dag = DAG(
    dag_id='dynamic',
    default_args=args
)


def return_bucket_files(bucket_name='xxxxx', **kwargs):
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blobs = bucket.list_blobs()
    file_list = [blob.name for blob in blobs]
    return file_list


def dynamic_gcs_to_gbq_etl(file, **kwargs):
    mapping = json.loads(Variable.get("xxxxx"))
    database = mapping[0][file]
    table = mapping[1][file]
    task = GoogleCloudStorageToBigQueryOperator(
        task_id=f'gcs_load_{file}_to_gbq',
        bucket='xxxxxxx',
        source_objects=[f'{file}'],
        destination_project_dataset_table=f'xxx.{database}.{table}',
        write_disposition="WRITE_TRUNCATE",
        autodetect=True,
        skip_leading_rows=1,
        source_format='CSV',
        dag=dag)
    return task


start_task = DummyOperator(
    task_id='start',
    dag=dag
)

end_task = DummyOperator(
    task_id='end',
    dag=dag)

push_bucket_files = PythonOperator(
    task_id="return_bucket_files",
    provide_context=True,
    python_callable=return_bucket_files,
    dag=dag)

for file in return_bucket_files():
    gcs_load_task = dynamic_gcs_to_gbq_etl(file)
    start_task >> push_bucket_files >> gcs_load_task >> end_task
This issue means that the web server is failing to fill its DAG bag on its side; the problem is most likely not with your DAG specifically.
My suggestion right now would be to try restarting the web server (for example via the installation of some dummy package).
Similar issues have been reported in this post, as well as here.
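If you want to double-check that the DAG file itself parses cleanly (supporting the point that the problem is on the web server side), a minimal sketch is to load it into a DagBag and inspect the import errors; the dag_folder path below is an assumption for a Composer environment.
from airflow.models import DagBag

dag_bag = DagBag(dag_folder='/home/airflow/gcs/dags', include_examples=False)

print("Import errors:", dag_bag.import_errors)  # an empty dict means every file parsed
print("Parsed DAG ids:", list(dag_bag.dags))    # 'dynamic' should appear here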

Defining Global Airflow Variables Using Kwargs Passed From POST Json

I'm creating a DAG that needs the ability to set global variables using kwargs passed in from the POST JSON used to trigger the job. So far, I have attempted it this way:
import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta

DAG_Name = 'dag_test'

DEFAULT_ARGS = {
    'owner': '...',
    'depends_on_past': False,
    'email': ['...'],
    'email_on_failure': True,
    'start_date': datetime(2020, 8, 31)
}

dag = DAG(DAG_Name, default_args=DEFAULT_ARGS, dagrun_timeout=timedelta(hours=2))

snap_date = ''
output_loc = ''
recast = ''


def define_param(**kwargs):
    global snap_date
    global output_loc
    global recast
    snap_date = str(kwargs['dag_run'].conf['snap_date'])
    output_loc = kwargs['dag_run'].conf['output_loc']
    recast = str(kwargs['dag_run'].conf['recast'])


DEFINE_PARAMETERS = PythonOperator(
    task_id='DEFINE_PARAMETERS',
    python_callable=define_param,
    provide_context=True,
    dag=dag)
But this does not work. How would I use kwargs to set global dag variables?
Use Variable.set, as it makes the actual update to the database and handles the session and serialization for you if needed.
Variable.set("snap_date", "2019-09-17")
Ref: https://github.com/apache/airflow/blob/1.10.1/airflow/models.py#L4558-L4569
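For illustration, a minimal sketch of that approach applied to the DAG above: the first task persists the values from dag_run.conf as Airflow Variables, and any later task reads them back with Variable.get(). The Variable names and the second task are assumptions; dag is the DAG object from the question.
from airflow.models import Variable
from airflow.operators.python_operator import PythonOperator


def define_param(**kwargs):
    conf = kwargs['dag_run'].conf or {}
    # Persist the POSTed values so other tasks (and other processes) can see them.
    Variable.set("snap_date", str(conf.get('snap_date', '')))
    Variable.set("output_loc", conf.get('output_loc', ''))
    Variable.set("recast", str(conf.get('recast', '')))


def use_param(**kwargs):
    # Any downstream task can read the stored values back.
    snap_date = Variable.get("snap_date")
    output_loc = Variable.get("output_loc")
    print(snap_date, output_loc)


DEFINE_PARAMETERS = PythonOperator(
    task_id='DEFINE_PARAMETERS',
    python_callable=define_param,
    provide_context=True,
    dag=dag)

USE_PARAMETERS = PythonOperator(
    task_id='USE_PARAMETERS',
    python_callable=use_param,
    provide_context=True,
    dag=dag)

DEFINE_PARAMETERS >> USE_PARAMETERS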

Apache Airflow - trigger/schedule DAG rerun on completion (File Sensor)

Good Morning.
I'm trying to set up a DAG to:
1. Watch/sense for a file to hit a network folder
2. Process the file
3. Archive the file
Using online tutorials and Stack Overflow, I have been able to come up with the following DAG and operators that successfully achieve the objectives; however, I would like the DAG to be rescheduled or rerun on completion so it starts watching/sensing for another file.
I attempted to set max_active_runs: 1 and schedule_interval: timedelta(seconds=5); this does reschedule the DAG, but it starts queuing tasks and locks the file.
Any ideas on how I could rerun the DAG after the archive_task are welcome.
Thanks
DAG CODE
from airflow import DAG
from airflow.operators import PythonOperator, OmegaFileSensor, ArchiveFileOperator
from datetime import datetime, timedelta
from airflow.models import Variable

default_args = {
    'owner': 'glsam',
    'depends_on_past': False,
    'start_date': datetime.now(),
    'provide_context': True,
    'retries': 100,
    'retry_delay': timedelta(seconds=30),
    'max_active_runs': 1,
    'schedule_interval': timedelta(seconds=5),
}

dag = DAG('test_sensing_for_a_file', default_args=default_args)

filepath = Variable.get("soucePath_Test")
filepattern = Variable.get("filePattern_Test")
archivepath = Variable.get("archivePath_Test")

sensor_task = OmegaFileSensor(
    task_id='file_sensor_task',
    filepath=filepath,
    filepattern=filepattern,
    poke_interval=3,
    dag=dag)


def process_file(**context):
    file_to_process = context['task_instance'].xcom_pull(
        key='file_name', task_ids='file_sensor_task')
    file = open(filepath + file_to_process, 'w')
    file.write('This is a test\n')
    file.write('of processing the file')
    file.close()


proccess_task = PythonOperator(
    task_id='process_the_file',
    python_callable=process_file,
    provide_context=True,
    dag=dag
)

archive_task = ArchiveFileOperator(
    task_id='archive_file',
    filepath=filepath,
    archivepath=archivepath,
    dag=dag)

sensor_task >> proccess_task >> archive_task
FILE SENSOR OPERATOR
import os
import re
from datetime import datetime
from airflow.models import BaseOperator
from airflow.plugins_manager import AirflowPlugin
from airflow.utils.decorators import apply_defaults
from airflow.operators.sensors import BaseSensorOperator


class ArchiveFileOperator(BaseOperator):
    @apply_defaults
    def __init__(self, filepath, archivepath, *args, **kwargs):
        super(ArchiveFileOperator, self).__init__(*args, **kwargs)
        self.filepath = filepath
        self.archivepath = archivepath

    def execute(self, context):
        file_name = context['task_instance'].xcom_pull(
            'file_sensor_task', key='file_name')
        os.rename(self.filepath + file_name, self.archivepath + file_name)


class OmegaFileSensor(BaseSensorOperator):
    @apply_defaults
    def __init__(self, filepath, filepattern, *args, **kwargs):
        super(OmegaFileSensor, self).__init__(*args, **kwargs)
        self.filepath = filepath
        self.filepattern = filepattern

    def poke(self, context):
        full_path = self.filepath
        file_pattern = re.compile(self.filepattern)
        directory = os.listdir(full_path)
        for files in directory:
            if re.match(file_pattern, files):
                context['task_instance'].xcom_push('file_name', files)
                return True
        return False


class OmegaPlugin(AirflowPlugin):
    name = "omega_plugin"
    operators = [OmegaFileSensor, ArchiveFileOperator]
Dmitri's method worked perfectly.
I also found in my reading that setting schedule_interval=None and then using the TriggerDagRunOperator works equally well:
trigger = TriggerDagRunOperator(
    task_id='trigger_dag_RBCPV99_rerun',
    trigger_dag_id="RBCPV99_v2",
    dag=dag)

sensor_task >> proccess_task >> archive_task >> trigger
Set schedule_interval=None and use the airflow trigger_dag command from a BashOperator to launch the next execution at the completion of the previous one.
trigger_next = BashOperator(
    task_id="trigger_next",
    bash_command="airflow trigger_dag 'your_dag_id'",
    dag=dag)

sensor_task >> proccess_task >> archive_task >> trigger_next
You can start your first run manually with the same airflow trigger_dag command, and then the trigger_next task will automatically trigger the next one. We have been using this in production for many months now, and it runs perfectly.

Celery broadcast task not working

I've tried to make a broadcast task, but only one of my workers receives it per call. Would you please help me? (I'm using RabbitMQ and node-celery.)
from kombu import Exchange, Queue
from kombu.common import Broadcast

default_exchange = Exchange('celery', type='direct')

celery.conf.update(
    CELERY_RESULT_BACKEND="amqp",
    CELERY_RESULT_SERIALIZER='json',
    CELERY_QUEUES=(
        Queue('celery', default_exchange, routing_key='celery'),
        Broadcast('broadcast_tasks'),
    ),
    CELERY_ROUTES=(
        {'my_tasks.sample_broadcast_task': {
            'queue': 'broadcast_tasks',
        }},
        {'my_tasks.sample_normal_task': {
            'queue': 'celery',
            'exchange': 'celery',
            'exchange_type': 'direct',
            'routing_key': 'celery',
        }}
    ),
)
I've also tested the following configuration, but it's not working either:
celery.conf.update(
    CELERY_RESULT_BACKEND="amqp",
    CELERY_RESULT_SERIALIZER='json',
    CELERY_QUEUES=(
        Queue('celery', Exchange('celery'), routing_key='celery'),
        Broadcast('broadcast'),
    ),
)


@celery.task(ignore_result=True, queue='broadcast',
             options=dict(queue='broadcast'))
def sample_broadcast_task():
    print "test"
EDIT
After changing how the worker is run by adding -Q broadcast, I now face this error:
PreconditionFailed: Exchange.declare: (406) PRECONDITION_FAILED - inequivalent arg 'type' for exchange 'broadcast' in vhost '/': received 'direct' but current is 'fanout'
After trying many, many things, I finally found a solution. This works for me
(Celery 3.1.24 (Cipater) and Python 2.7.12).
WORKER - tasks.py :
from celery import Celery
import celery_config
from kombu.common import Broadcast, Queue, Exchange

app = Celery()
app.config_from_object(celery_config)


@app.task
def print_prout(x):
    print x
    return x
WORKER - celery_config.py :
# coding=utf-8
from kombu.common import Broadcast, Queue, Exchange

BROKER_URL = 'amqp://login:pass@172.17.0.1//'
CELERY_RESULT_BACKEND = 'redis://:login@172.17.0.1'
CELERY_TIMEZONE = 'Europe/Paris'
CELERY_ENABLE_UTC = True
CELERY_TASK_SERIALIZER = 'pickle'
CELERY_RESULT_SERIALIZER = 'pickle'
CELERY_ACCEPT_CONTENT = ['pickle', 'json', 'msgpack', 'yaml']
CELERY_DISABLE_RATE_LIMITS = True
CELERY_ALWAYS_EAGER = False
CELERY_QUEUES = (Broadcast('broadcast_tasks'), )
The worker is launched with:
celery -A celery_worker.tasks worker --loglevel=info --concurrency=1 -n worker_name_1
On the client (another Docker container, in my case):
from celery import Celery
from celery_worker import tasks
result = tasks.print_prout.apply_async(['prout'], queue='broadcast_tasks')
print result.get()
The next step for me is to retrieve and display the results returned by all the workers; print result.get() seems to return only the result of the last worker.
That does not seem straightforward (see: Have Celery broadcast return results from all workers).
According to your description:
"I've tried to make a broadcast task, but only one of my workers receives it per call"
you may be using a direct-type exchange. Try this:
from celery import Celery
from kombu.common import Broadcast

BROKER_URL = 'amqp://guest:guest@localhost:5672//'


class CeleryConf:
    # List of modules to import when celery starts.
    CELERY_ACCEPT_CONTENT = ['json']
    CELERY_IMPORTS = ('main.tasks')
    CELERY_QUEUES = (Broadcast('q1'),)
    CELERY_ROUTES = {
        'tasks.sampletask': {'queue': 'q1'}
    }


celeryapp = Celery('celeryapp', broker=BROKER_URL)
celeryapp.config_from_object(CeleryConf())


@celeryapp.task
def sampletask(form):
    print form
To send the message, do
d = sampletask.apply_async(['4c5b678350fc643'], serializer="json", queue='q1')