I have defined a sample Airflow DAG in which I want to run a PrestoDB query followed by a Spark job that performs a simple word count. Here is the DAG I defined:
from pandas import DataFrame
import logging
from datetime import timedelta
from operator import add
import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.hooks.presto_hook import PrestoHook
default_args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(1),
    'depends_on_past': False,
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'presto_dag',
    default_args=default_args,
    description='A simple tutorial DAG with PrestoDB and Spark',
    # Run the DAG once per day
    schedule_interval='@daily',
)
def talk_to_presto():
    ph = PrestoHook(host='presto.myhost.com', port=9988)
    # Query PrestoDB
    query = "show catalogs"
    # Fetch Data
    data = ph.get_records(query)
    logging.info(data)
    return data

def submit_to_spark():
    # conf = SparkConf().setAppName("PySpark App").setMaster("http://sparkhost.com:18080/")
    # sc = SparkContext(conf)
    # data = sc.parallelize(list("Hello World"))
    # counts = data.map(lambda x: (x, 1)).reduceByKey(add).sortBy(lambda x: x[1], ascending=False).collect()
    # for (word, count) in counts:
    #     print("{}: {}".format(word, count))
    # sc.stop()
    return "Hello"
presto_task = PythonOperator(
    task_id='talk_to_presto',
    provide_context=True,
    python_callable=talk_to_presto,
    dag=dag,
)

spark_task = PythonOperator(
    task_id='submit_to_spark',
    provide_context=True,
    python_callable=submit_to_spark,
    dag=dag,
)
presto_task >> spark_task
When I trigger the DAG, about 20 DAG instances stay in the running state, but they never complete and no logs are generated, at least for the PrestoDB query. I am able to run the same PrestoDB query correctly from Airflow's Data Profiling > Ad-Hoc Query section.
I have intentionally commented out the PySpark code, as it wasn't running and is not the focus of this question.
I have two questions:
Why don't the tasks complete, and why do they stay in the running state?
What am I doing wrong with the PrestoHook, given that the query isn't running?
I have a simple Glue PySpark job that connects to a MongoDB source through a Glue catalog table, extracts data from MongoDB collections, and writes JSON output to S3 using a Glue DynamicFrame.
The MongoDB data here is deeply nested, with structs and arrays. Since it is a NoSQL database, the source schema is not fixed, and nested columns may vary from document to document.
However, the job fails with the error below.
ERROR: py4j.protocol.Py4JJavaError: An error occurred while calling o75.pyWriteDynamicFrame.: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1.0 (TID 6, 10.3.29.22, executor 1): com.mongodb.spark.exceptions.MongoTypeConversionException: Cannot cast STRING into a IntegerType (value: BsonString{value=''})
Since the job fails because of a datatype mismatch, I have tried all the solutions I could find, such as using resolveChoice(). Because the error is for a property with an 'int' datatype, I tried casting every property with an 'int' type to 'string'.
I also tried the code with DropNullFields, writing with a Spark DataFrame, ApplyMapping, without using the catalog table (from_options directly from the Mongo collection), and with and without repartitioning.
All of these attempts are left commented in the code for reference.
CODE SNIPPET
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
print("Started")
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "<catalog_db_name>", table_name = "<catalog_table_name>", additional_options = {"database": "<mongo_database_name>", "collection": "<mongo_db_collection>"}, transformation_ctx = "datasource0")
# Code to read data directly from mongo database
# datasource0 = glueContext.create_dynamic_frame_from_options(connection_type = "mongodb", connection_options = { "uri": "<connection_string>", "database": "<mongo_db_name>", "collection": "<mongo_collection>", "username": "<db_username>", "password": "<db_password>"})
# Code sample for resolveChoice (converts all the 'int' datatypes to 'string')
# resolve_dyf = datasource0.resolveChoice(specs = [("nested.property", "cast:string"),("nested.further[].property", "cast:string")])
# Code sample to dropnullfields
# dyf_dropNullfields = DropNullFields.apply(frame = resolve_dyf, transformation_ctx = "dyf_dropNullfields")
data_sink0 = datasource0.repartition(1)
print("Repartition done")
# Code sample to sink using spark's write method
# data_sink0.write.format("json").option("header","true").save("s3://<s3_folder_path>")
datasink1 = glueContext.write_dynamic_frame.from_options(frame = data_sink0, connection_type = "s3", connection_options = {"path": "s3://<S3_folder_path>"}, format = "json", transformation_ctx = "datasink1")
print("Data Sink complete")
job.commit()
NOTE
I am not exactly sure why this is happening, because the issue is intermittent: sometimes the job works perfectly, but at other times it fails, which is quite confusing.
Any help will be highly appreciated.
I was facing the same problem. A simple solution is to increase the sample size from 1,000 (the default for MongoDB) to 100,000. Here is a sample config for reference.
read_config = {
    "uri": documentdb_write_uri,
    "database": "your_db",
    "collection": "your_collection",
    "username": "user",
    "password": "password",
    "partitioner": "MongoSamplePartitioner",
    "sampleSize": "100000",
    "partitionerOptions.partitionSizeMB": "1000",
    "partitionerOptions.partitionKey": "_id"
}
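For context, here is a minimal usage sketch: it reuses the read_config above (with its documentdb_write_uri placeholder) and the glueContext already created in the question's script, and it assumes the Glue MongoDB connector forwards these connection options (including sampleSize) to the underlying Spark connector.
# Sketch: plug the read_config above into the direct-from-Mongo read shown in the
# question; "glueContext" is the GlueContext already created in that script.
datasource0 = glueContext.create_dynamic_frame_from_options(
    connection_type="mongodb",
    connection_options=read_config,
    transformation_ctx="datasource0",
)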
I do not know whether I am lacking Airflow scheduler knowledge or whether this is a potential bug in Airflow.
The situation is as follows:
My DAG's start date is set to "start_date": airflow.utils.dates.days_ago(1).
I uploaded the DAG to the folder where Airflow scans for DAGs.
I then turned the DAG on (it was 'off' by default).
The tasks in the pipeline immediately go into 'up_for_retry', and you do not really see what had been tried before.
Airflow version info: 1.10.14, running on Kubernetes in Azure.
We use the Celery executor with Redis.
The task instance details are listed below:
Task Instance Details
Dependencies Blocking Task From Getting Scheduled:
- Task Instance State: Task is in the 'up_for_retry' state which is not a valid state for execution. The task must be cleared in order to be run.
- Not In Retry Period: Task is not ready for retry yet but will be retried automatically. Current date is 2021-05-17T09:06:57.239015+00:00 and task will be retried at 2021-05-17T09:09:50.662150+00:00.
Am I missing something needed to judge whether this is a bug or expected behaviour?
In addition, below is the DAG definition, as requested.
import airflow
from airflow import DAG
from airflow.contrib.operators.databricks_operator import DatabricksSubmitRunOperator
from airflow.models import Variable
dag_args = {
    "owner": "our_project_team_name",
    "retries": 1,
    "email": ["ouremail_address_replaced_by_this_string"],
    "email_on_failure": True,
    "email_on_retry": True,
    "depends_on_past": False,
    "start_date": airflow.utils.dates.days_ago(1),
}
# Implement cluster reuse on Databricks, pick from light, medium, heavy cluster type based on workloads
clusters = Variable.get("our_project_team_namejob_cluster_config", deserialize_json=True)
databricks_connection = "our_company_databricks"
adl_connection = "our_company_wasb"
pipeline_name = "process_our_data_from_boomi"
dag = DAG(dag_id=pipeline_name, default_args=dag_args, schedule_interval="0 3 * * *")
notebook_dir = "/Shared/our_data_name"
lib_path_sub = ""
lib_name_dev_plus_branch = ""
atlas_library = {
    "whl": f"dbfs:/python-wheels/atlas{lib_path_sub}/atlas_library-0{lib_name_dev_plus_branch}-py3-none-any.whl"
}

create_our_data_name_source_data_from_boomi_notebook_params = {
    "existing_cluster_id": clusters["our_cluster_name"],
    "notebook_task": {
        "notebook_path": f"{notebook_dir}/create_our_data_name_source_data_from_boomi",
        "base_parameters": {"Extraction_date": "{{ ds_nodash }}"},
    },
}

create_our_data_name_standardized_table_from_source_xml_notebook_params = {
    "existing_cluster_id": clusters["our_cluster_name"],
    "notebook_task": {
        "notebook_path": f"{notebook_dir}/create_our_data_name_standardized_table_from_source_xml",
        "base_parameters": {"Extraction_date": "{{ ds_nodash }}"},
    },
}

create_our_data_name_enriched_table_from_standardized_notebook_params = {
    "existing_cluster_id": clusters["our_cluster_name"],
    "notebook_task": {
        "notebook_path": f"{notebook_dir}/create_our_data_name_enriched",
        "base_parameters": {"Extraction_date": "{{ ds_nodash }}"},
    },
}

layer_1_task = DatabricksSubmitRunOperator(
    task_id="Load_our_data_name_to_source",
    databricks_conn_id=databricks_connection,
    dag=dag,
    json=create_our_data_name_source_data_from_boomi_notebook_params,
    libraries=[atlas_library],
)

layer_2_task = DatabricksSubmitRunOperator(
    task_id="Load_our_data_name_to_standardized",
    databricks_conn_id=databricks_connection,
    dag=dag,
    json=create_our_data_name_standardized_table_from_source_xml_notebook_params,
    libraries=[
        {"maven": {"coordinates": "com.databricks:spark-xml_2.11:0.5.0"}},
        {"pypi": {"package": "inflection"}},
        atlas_library,
    ],
)

layer_3_task = DatabricksSubmitRunOperator(
    task_id="Load_our_data_name_to_enriched",
    databricks_conn_id=databricks_connection,
    dag=dag,
    json=create_our_data_name_enriched_table_from_standardized_notebook_params,
    libraries=[atlas_library],
)
layer_1_task >> layer_2_task >> layer_3_task
After getting some help from @AnandVidvat about trying an experiment with retries=0, and a friend's help to change the operator to either DummyOperator or PythonOperator, I can confirm that the issue has nothing to do with the Databricks operator or Airflow version 1.10.x, i.e. it is not an Airflow bug.
So, in summary: when a DAG has a meaningful operator, my setup fails on the first execution without producing any task log, and then works OK on the retry (the task log hides the fact that it had been retried, because the failure left no logs).
In order to reduce the total run time, the workaround/patch, until the real cause is found, is to set retry_delay to 10 seconds (the default is 5 minutes, which makes the DAG run unnecessarily long).
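Concretely, the workaround is just a tweak to the DAG's default arguments, roughly like this (only the retry settings matter here; the other keys stay as in the original DAG):
from datetime import timedelta

dag_args = {
    "owner": "our_project_team_name",
    "retries": 1,
    # Workaround: retry the spurious first failure quickly instead of
    # waiting out the 5-minute default retry_delay.
    "retry_delay": timedelta(seconds=10),
}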
The next step is to figure out what is causing this first failure, by checking logs on the scheduler or worker pods in our current setup (Azure K8s, PostgreSQL, Redis, Celery executor).
P.S. I used the DAG below for testing and reached this conclusion.
import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
import time
from pprint import pprint
dag_args = {
    "owner": "min_test",
    "retries": 1,
    "email": ["c243d70b.domain.onmicrosoft.com@emea.teams.ms"],
    "email_on_failure": True,
    "email_on_retry": True,
    "depends_on_past": False,
    "start_date": airflow.utils.dates.days_ago(1),
}

pipeline_name = "min_test_debug_airflow_baseline_PythonOperator_1_retry"

dag = DAG(
    dag_id=pipeline_name,
    default_args=dag_args,
    schedule_interval="0 3 * * *",
    tags=["min_test_airflow"],
)
def my_sleeping_function(random_base):
    """This is a function that will run within the DAG execution"""
    time.sleep(random_base)

def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return "Whatever you return gets printed in the logs"

run_this = PythonOperator(
    task_id="print_the_context",
    provide_context=True,
    python_callable=print_context,
    dag=dag,
)

# Generate 3 sleeping tasks, sleeping 0, 0.1, and 0.2 seconds respectively
for i in range(3):
    task = PythonOperator(
        task_id="sleep_for_" + str(i),
        python_callable=my_sleeping_function,
        op_kwargs={"random_base": float(i) / 10},
        dag=dag,
    )
    task.set_upstream(run_this)
Summary of my DAG:
I am using the SSHOperator to SSH into an EC2 instance and run a JAR file that connects to multiple DBs. I've declared the Airflow connections in my DAG file and am able to pass the variables into the EC2 instance. As you can see below, I'm passing the credentials as properties on the Java command.
Airflow version - 1.10.7
Package installed - apache-airflow[crypto]
from airflow import DAG
from datetime import datetime, timedelta
from airflow.contrib.hooks.ssh_hook import SSHHook
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.hooks.base_hook import BaseHook
from airflow.models.connection import Connection
ssh_hook = SSHHook(ssh_conn_id='ssh_to_ec2')
ssh_hook.no_host_key_check = True
redshift_connection = BaseHook.get_connection("my_redshift")
rs_user = redshift_connection.login
rs_password = redshift_connection.password
mongo_connection = BaseHook.get_connection("my_mongo")
mongo_user = mongo_connection.login
mongo_password = mongo_connection.password
default_args = {
    'owner': 'AIRFLOW',
    'start_date': datetime(2020, 4, 1, 0, 0),
    'email': [],
    'retries': 1,
}

dag = DAG('connect_to_redshift', default_args=default_args)

t00_00 = SSHOperator(
    task_id='ssh_and_connect_db',
    ssh_hook=ssh_hook,
    command="java "
            "-Drs_user={rs_user} -Drs_pass={rs_pass} "
            "-Dmongo_user={mongo_user} -Dmongo_pass={mongo_pass} "
            "-jar /home/airflow/root.jar".format(rs_user=rs_user, rs_pass=rs_password,
                                                 mongo_user=mongo_user, mongo_pass=mongo_password),
    dag=dag)
t00_00
Problem
The values for rs_pass and mongo_pass are exposed in the Rendered Template and in the Airflow logs, which is not good. I would like a solution that hides all of this sensitive information from the logs and the rendered template when using the SSHOperator.
So far I've tried reducing the log verbosity to ERROR in airflow.cfg, but the values still show up in the Rendered Template.
Please enlighten me.
Thanks
So, I am trying to create a table in my Redshift DB using Airflow. My connection works, and I tested it with a SQL command, but when I change the SQL to a CREATE TABLE statement, the task runs successfully yet the table doesn't show up in my Redshift DB.
Here is my code:
from datetime import datetime, timedelta
from airflow import DAG
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.operators.python_operator import PythonOperator
default_args = {
    'owner': 'james_c',
    'depends_on_past': False,
    'start_date': datetime(2019, 4, 1),
    'email': ['myemail@aol.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=1)
}

def get_activated_sources():
    request = "CREATE TABLE if not exists schema1.db1.tb1 (vendor_id varchar(50) PRIMARY KEY, vendor_name VARCHAR(255) NOT NULL);"
    pg_hook = PostgresHook(postgres_conn_id="postgres_default", schema='schema1')
    connection = pg_hook.get_conn()
    cursor = connection.cursor()
    cursor.execute(request)
    # cursor.fetchall()
    cursor.close()
    connection.close()

with DAG('create_sample_table_dagg', description='testing my redshift connection',
         default_args=default_args, schedule_interval='@once', catchup=False) as dag:
    hook_task = PythonOperator(task_id='hook_task', python_callable=get_activated_sources)
Any ideas/suggestions as to why it's running and completing but not actually creating the table in Redshift?
Your code is fine, you just need to write:
connection.commit()
under
cursor.execute(request)
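In context, a minimal sketch of the corrected function from the question with only the commit added (connection id, schema, and DDL are taken from the question as-is):
from airflow.hooks.postgres_hook import PostgresHook

def get_activated_sources():
    request = "CREATE TABLE if not exists schema1.db1.tb1 (vendor_id varchar(50) PRIMARY KEY, vendor_name VARCHAR(255) NOT NULL);"
    pg_hook = PostgresHook(postgres_conn_id="postgres_default", schema='schema1')
    connection = pg_hook.get_conn()
    cursor = connection.cursor()
    cursor.execute(request)
    connection.commit()  # the DDL is not persisted until the transaction is committed
    cursor.close()
    connection.close()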
With:
default_args = {
    ...
    'retries': 1,
    'retry_delay': timedelta(seconds=1),
    ...
}
I can get a task that fails to retry several times, but how can I make the whole DAG start again when a task fails?
Automatically, of course...
You can run a second "Fail Check" DAG that uses the provide_session util to query for any task instances where the task_id matches what you want and the state is failed. Then you'll optionally want to clear downstream tasks as well and set the state of the relevant DagRun to running.
from datetime import datetime, timedelta
from sqlalchemy import and_
import json
from airflow import DAG
from airflow.models import TaskInstance, DagRun
from airflow.utils.db import provide_session
from airflow.operators.python_operator import PythonOperator
default_args = {'start_date': datetime(2018, 6, 11),
                'retries': 2,
                'retry_delay': timedelta(minutes=2),
                'email': [],
                'email_on_failure': True}

dag = DAG('__RESET__FAILED_TASKS',
          default_args=default_args,
          schedule_interval='@daily',
          catchup=False)
@provide_session
def check_py(session=None, **kwargs):
    relevant_task_id = 'relevant_task_id'
    obj = (session
           .query(TaskInstance)
           .filter(and_(TaskInstance.task_id == relevant_task_id,
                        TaskInstance.state == 'failed'))
           .all())
    if obj is None:
        raise KeyError('No failed Task Instances of {} exist.'.format(relevant_task_id))
    else:
        # Clear the relevant tasks.
        (session
         .query(TaskInstance)
         .filter(and_(TaskInstance.task_id == relevant_task_id,
                      TaskInstance.state == 'failed'))
         .delete())
        # Clear downstream tasks and set relevant DAG state to RUNNING
        for _ in obj:
            _ = json.loads(_.val)
            # OPTIONAL: Clear downstream tasks in the specified Dag Run.
            for task in _['downstream_tasks']:
                (session
                 .query(TaskInstance)
                 .filter(and_(TaskInstance.task_id == task,
                              TaskInstance.dag_id == _['dag_id'],
                              TaskInstance.execution_date == datetime.strptime(_['ts'],
                                                                               "%Y-%m-%dT%H:%M:%S")))
                 .delete())
            # Set the Dag Run state to "running"
            dag_run = (session
                       .query(DagRun)
                       .filter(and_(DagRun.dag_id == _['dag_id'],
                                    DagRun.execution_date == datetime.strptime(_['ts'],
                                                                               "%Y-%m-%dT%H:%M:%S")))
                       .first())
            dag_run.set_state('running')

with dag:
    run_check = PythonOperator(task_id='run_check',
                               python_callable=check_py,
                               provide_context=True)
    run_check
The canonical solution to this in Airflow is to create a SubDagOperator that wraps all the other tasks in the DAG, and apply the retry to that.
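A rough sketch of that pattern, assuming a small factory function that builds the inner DAG; the DAG and task names here are illustrative, not from the question:
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.subdag_operator import SubDagOperator

default_args = {"start_date": datetime(2021, 1, 1)}

def build_subdag(parent_dag_id, child_id, args):
    # All the "real" tasks live inside this nested DAG.
    subdag = DAG(dag_id="{}.{}".format(parent_dag_id, child_id),
                 default_args=args,
                 schedule_interval="@daily")  # keep in sync with the parent DAG
    with subdag:
        DummyOperator(task_id="task_a") >> DummyOperator(task_id="task_b")
    return subdag

dag = DAG("parent_dag", default_args=default_args, schedule_interval="@daily")

# The retries argument applies to the wrapping task rather than to each inner task.
wrap_all_tasks = SubDagOperator(task_id="all_tasks",
                                subdag=build_subdag("parent_dag", "all_tasks", default_args),
                                retries=2,
                                dag=dag)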
You could potentially use the on_failure_callback feature to call a Python or Bash script that restarts the DAG. There is currently no feature provided by Airflow to automatically restart a DAG upon task failure.
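A hedged sketch of that idea, shelling out to the Airflow 1.10 CLI from an on_failure_callback to queue a fresh run of the same DAG; the callback and DAG names are illustrative, and this is not an Airflow-provided feature:
import subprocess
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

def restart_dag(context):
    # Airflow passes the task context to the callback on failure;
    # re-trigger the same DAG via the 1.10-era CLI.
    dag_id = context["dag"].dag_id
    subprocess.run(["airflow", "trigger_dag", dag_id], check=False)

dag = DAG("self_restarting_dag",
          default_args={"start_date": datetime(2021, 1, 1)},
          schedule_interval="@daily")

# Attach the callback to whichever task's failure should restart the DAG.
some_task = DummyOperator(task_id="some_task",
                          on_failure_callback=restart_dag,
                          dag=dag)
Note that a task that keeps failing would re-trigger the DAG indefinitely, so in practice some guard (for example a counter kept in an Airflow Variable) would be needed.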