I'm using the following Airflow version inside my Docker container and I am currently having an issue with a broken DAG:
FROM apache/airflow:2.3.4-python3.9
I have other DAGs running with the same 'request_cpu' argument that are perfectly functional, so I'm not sure what the issue could be. This is the error:
Broken DAG: [/home/airflow/airflow/dags/my_project.py] Traceback (most recent call last):
File "/home/airflow/.local/lib/python3.10/site-packages/airflow/models/baseoperator.py", line 858, in __init__
self.resources = coerce_resources(resources)
File "/home/airflow/.local/lib/python3.10/site-packages/airflow/models/baseoperator.py", line 133, in coerce_resources
return Resources(**resources)
TypeError: Resources.__init__() got an unexpected keyword argument 'request_cpu'
This is my current DAG configuration:
# DAG configuration
DAG_ID = "my_project_id"
DAG_DESCRIPTION = "description"
DAG_IMAGE = image
default_args = {
"owner": "airflow",
"depends_on_past": False,
"max_active_tasks": 1,
"max_active_runs": 1,
"email_on_failure": True,
"email": ["my#mail.com"],
"retries": 0,
"email_on_retry": False,
"image_pull_policy": "Always",
}
# Define desired resources.
compute_resources = {
# Cpu: 500m milliCPU is about half cpu, other values, 1, 2, 4... for full cpu allocation
"request_cpu": "500m",
# Memory: Mi for Megabytes or Gi for Gigabytes
"request_memory": "512Mi",
"limit_cpu": "500m",
"limit_memory": "1Gi",
}
with DAG(
DAG_ID,
default_args=default_args,
start_date=datetime(2022, 5, 9),
schedule_interval="0 21 */16 * *", # Every 16 days or twice per month
max_active_runs=1,
max_active_tasks=1,
catchup=False,
description=DAG_DESCRIPTION,
tags=["my tags"],
) as dag:
# AWS credentials
creds = tools.get_config_params(key="AWS-keys")
my_task = KubernetesPodOperator(
namespace="airflow",
image=DAG_IMAGE,
image_pull_secrets=[k8s.V1LocalObjectReference("docker-registry")],
container_resources=compute_resources,
env_vars={
"AWS_ACCESS_KEY_ID": creds["access_key"],
"AWS_SECRET_ACCESS_KEY": creds["secret_access_key"],
"EXECUTION_DATE": "{{ execution_date }}",
},
cmds=["python3", "my_project.py"],
is_delete_operator_pod=True,
in_cluster=False,
name="my-project-name",
task_id="my-task",
config_file=os.path.expanduser("~") + "/.kube/config",
get_logs=True,
resources=compute_resources,
)
First, resources is deprecated, so you should use only container_resources.
Also, container_resources expects a V1ResourceRequirements object, not a dict. You should do:
from kubernetes.client import models as k8s
compute_resources = k8s.V1ResourceRequirements(
    requests={
        'memory': '512Mi',
        'cpu': '500m',
    },
    limits={
        'memory': '1Gi',
        'cpu': '500m',
    },
)
Then
my_task = KubernetesPodOperator(..., container_resources=compute_resources)
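Applied to the DAG in the question, a minimal sketch of the corrected task (reusing the question's own arguments, dropping the deprecated resources kwarg; the remaining imports, credentials and env_vars are assumed to stay as they were):

from kubernetes.client import models as k8s

compute_resources = k8s.V1ResourceRequirements(
    requests={"cpu": "500m", "memory": "512Mi"},
    limits={"cpu": "500m", "memory": "1Gi"},
)

my_task = KubernetesPodOperator(
    namespace="airflow",
    image=DAG_IMAGE,
    image_pull_secrets=[k8s.V1LocalObjectReference("docker-registry")],
    container_resources=compute_resources,  # a V1ResourceRequirements object, not a dict
    cmds=["python3", "my_project.py"],
    name="my-project-name",
    task_id="my-task",
    get_logs=True,
    # the deprecated resources=... kwarg is removed; it was the source of the TypeError
)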
I do not know whether I lack Airflow scheduler knowledge or whether this is a potential bug in Airflow.
The situation is like this:
my DAG's start date is set to "start_date": airflow.utils.dates.days_ago(1),
I uploaded the DAG to the folder where Airflow scans for DAGs,
I then turned the DAG on (it was 'off' by default),
and the tasks in the pipeline immediately go into 'up_for_retry', and you do not really see what had been tried before.
Airflow version info: 1.10.14, running on Kubernetes in Azure, using the Celery executor with Redis.
The task instance details are listed below:
Task Instance Details
Dependencies Blocking Task From Getting Scheduled
Dependency Reason
Task Instance State Task is in the 'up_for_retry' state which is not a valid state for execution. The task must be cleared in order to be run.
Not In Retry Period Task is not ready for retry yet but will be retried automatically. Current date is 2021-05-17T09:06:57.239015+00:00 and task will be retried at 2021-05-17T09:09:50.662150+00:00.
Am I missing something needed to judge whether this is a bug or expected behaviour?
In addition, below is the DAG definition as requested.
import airflow
from airflow import DAG
from airflow.contrib.operators.databricks_operator import DatabricksSubmitRunOperator
from airflow.models import Variable
dag_args = {
"owner": "our_project_team_name",
"retries": 1,
"email": ["ouremail_address_replaced_by_this_string"],
"email_on_failure": True,
"email_on_retry": True,
"depends_on_past": False,
"start_date": airflow.utils.dates.days_ago(1),
}
# Implement cluster reuse on Databricks, pick from light, medium, heavy cluster type based on workloads
clusters = Variable.get("our_project_team_namejob_cluster_config", deserialize_json=True)
databricks_connection = "our_company_databricks"
adl_connection = "our_company_wasb"
pipeline_name = "process_our_data_from_boomi"
dag = DAG(dag_id=pipeline_name, default_args=dag_args, schedule_interval="0 3 * * *")
notebook_dir = "/Shared/our_data_name"
lib_path_sub = ""
lib_name_dev_plus_branch = ""
atlas_library = {
"whl": f"dbfs:/python-wheels/atlas{lib_path_sub}/atlas_library-0{lib_name_dev_plus_branch}-py3-none-any.whl"
}
create_our_data_name_source_data_from_boomi_notebook_params = {
"existing_cluster_id": clusters["our_cluster_name"],
"notebook_task": {
"notebook_path": f"{notebook_dir}/create_our_data_name_source_data_from_boomi",
"base_parameters": {"Extraction_date": "{{ ds_nodash }}"},
},
}
create_our_data_name_standardized_table_from_source_xml_notebook_params = {
"existing_cluster_id": clusters["our_cluster_name"],
"notebook_task": {
"notebook_path": f"{notebook_dir}/create_our_data_name_standardized_table_from_source_xml",
"base_parameters": {"Extraction_date": "{{ ds_nodash }}"},
},
}
create_our_data_name_enriched_table_from_standardized_notebook_params = {
"existing_cluster_id": clusters["our_cluster_name"],
"notebook_task": {
"notebook_path": f"{notebook_dir}/create_our_data_name_enriched",
"base_parameters": {"Extraction_date": "{{ ds_nodash }}"},
},
}
layer_1_task = DatabricksSubmitRunOperator(
task_id="Load_our_data_name_to_source",
databricks_conn_id=databricks_connection,
dag=dag,
json=create_our_data_name_source_data_from_boomi_notebook_params,
libraries=[atlas_library],
)
layer_2_task = DatabricksSubmitRunOperator(
task_id="Load_our_data_name_to_standardized",
databricks_conn_id=databricks_connection,
dag=dag,
json=create_our_data_name_standardized_table_from_source_xml_notebook_params,
libraries=[
{"maven": {"coordinates": "com.databricks:spark-xml_2.11:0.5.0"}},
{"pypi": {"package": "inflection"}},
atlas_library,
],
)
layer_3_task = DatabricksSubmitRunOperator(
task_id="Load_our_data_name_to_enriched",
databricks_conn_id=databricks_connection,
dag=dag,
json=create_our_data_name_enriched_table_from_standardized_notebook_params,
libraries=[atlas_library],
)
layer_1_task >> layer_2_task >> layer_3_task
After getting some help from @AnandVidvat with a retries=0 experiment, and some friends' help changing the operator to either DummyOperator or PythonOperator, I can confirm that the issue has nothing to do with the DatabricksOperator or Airflow version 1.10.x, i.e. it is not an Airflow bug.
So, in summary: when a DAG has a meaningful operator, my setup fails on the first execution without producing any task log, and then works OK during the retry (the task log hides the fact that it was retried, because the failure left no logs).
In order to reduce the total run time, the workaround/patch, until the real cause is found, is to set retry_delay to 10 seconds (the default is 5 minutes, which makes the DAG run unnecessarily long).
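A minimal sketch of that patch, assuming the standard default_args mechanism (only retries and retry_delay matter here; the other keys stay as in the DAG above):

import airflow
from datetime import timedelta

dag_args = {
    "owner": "our_project_team_name",
    "retries": 1,
    # shorten the retry delay from the 5-minute default so the automatic
    # retry after the silent first failure happens almost immediately
    "retry_delay": timedelta(seconds=10),
    "depends_on_past": False,
    "start_date": airflow.utils.dates.days_ago(1),
}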
The next step is to figure out what is causing this first-run failure by checking logs on the scheduler or worker pods in our current setup (Azure K8s, PostgreSQL, Redis, Celery executor).
P.S. I used the DAG below for testing and reached this conclusion.
import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
import time
from pprint import pprint
dag_args = {
"owner": "min_test",
"retries": 1,
"email": ["c243d70b.domain.onmicrosoft.com#emea.teams.ms"],
"email_on_failure": True,
"email_on_retry": True,
"depends_on_past": False,
"start_date": airflow.utils.dates.days_ago(1),
}
pipeline_name = "min_test_debug_airflow_baseline_PythonOperator_1_retry"
dag = DAG(
dag_id=pipeline_name,
default_args=dag_args,
schedule_interval="0 3 * * *",
tags=["min_test_airflow"],
)
def my_sleeping_function(random_base):
"""This is a function that will run within the DAG execution"""
time.sleep(random_base)
def print_context(ds, **kwargs):
pprint(kwargs)
print(ds)
return "Whatever you return gets printed in the logs"
run_this = PythonOperator(
task_id="print_the_context",
provide_context=True,
python_callable=print_context,
dag=dag,
)
# Generate 3 sleeping tasks, sleeping 0.0, 0.1, and 0.2 seconds respectively
for i in range(3):
task = PythonOperator(
task_id="sleep_for_" + str(i),
python_callable=my_sleeping_function,
op_kwargs={"random_base": float(i) / 10},
dag=dag,
)
task.set_upstream(run_this)
I'm trying to get dask-kubernetes to work with my GKE account. The maddening thing is that it worked. But now it doesn't. I set up a cluster fine. The nodes get created fine as well. They run for 60 seconds and then time out with the following message (as shown with kubectl logs podname):
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/bin/dask-worker", line 8, in <module>
sys.exit(go())
File "/opt/conda/lib/python3.8/site-packages/distributed/cli/dask_worker.py", line 446, in go
main()
File "/opt/conda/lib/python3.8/site-packages/click/core.py", line 829, in __call__
return self.main(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/click/core.py", line 782, in main
rv = self.invoke(ctx)
File "/opt/conda/lib/python3.8/site-packages/click/core.py", line 1066, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/opt/conda/lib/python3.8/site-packages/click/core.py", line 610, in invoke
return callback(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/distributed/cli/dask_worker.py", line 432, in main
loop.run_sync(run)
File "/opt/conda/lib/python3.8/site-packages/tornado/ioloop.py", line 532, in run_sync
return future_cell[0].result()
File "/opt/conda/lib/python3.8/site-packages/distributed/cli/dask_worker.py", line 426, in run
await asyncio.gather(*nannies)
File "/opt/conda/lib/python3.8/asyncio/tasks.py", line 684, in _wrap_awaitable
return (yield from awaitable.__await__())
File "/opt/conda/lib/python3.8/site-packages/distributed/core.py", line 284, in _
raise TimeoutError(
asyncio.exceptions.TimeoutError: Nanny failed to start in 60 seconds
Which, I assume, means that the workers can't connect to the scheduler, which runs on my laptop? However, I don't understand why; the port seems to be open.
from dask_kubernetes import KubeCluster
from dask.distributed import Client
import dask.array as da
if __name__ == '__main__':
cluster = KubeCluster.from_yaml('worker-spec-2.yml')
cluster.scale(1)
client = Client(cluster)
array = da.ones((1000, 1000, 1000))
print(array.mean().compute())
And the worker-spec-2.yml contains the following:
kind: Pod
metadata:
  labels:
    foo: bar
spec:
  restartPolicy: Never
  containers:
  - image: daskdev/dask:latest
    imagePullPolicy: IfNotPresent
    args: [dask-worker, --nthreads, '1', --no-dashboard, --memory-limit, 1GB, --death-timeout, '60']
    name: easyvvuq
    env:
      - name: EXTRA_PIP_PACKAGES
        value: git+https://github.com/dask/distributed
    resources:
      limits:
        cpu: "1"
        memory: 2G
      requests:
        cpu: 500m
        memory: 2G
Again, this or something similar has worked for me. I may have changed something in the worker-spec.yml but that is about it.
My question would be: how do I go about diagnosing this? I am not a Kubernetes expert by any means.
I am using the https://github.com/helm/charts/tree/master/stable/airflow Helm chart and building a v1.10.8 puckel/docker-airflow image with Kubernetes installed on it, and I am using that image in the Helm chart.
But I keep getting:
File "/usr/local/bin/airflow", line 37, in <module>
args.func(args)
File "/usr/local/lib/python3.7/site-packages/airflow/bin/cli.py", line 1140, in initdb
db.initdb(settings.RBAC)
File "/usr/local/lib/python3.7/site-packages/airflow/utils/db.py", line 332, in initdb
dagbag = models.DagBag()
File "/usr/local/lib/python3.7/site-packages/airflow/models/dagbag.py", line 95, in __init__
executor = get_default_executor()
File "/usr/local/lib/python3.7/site-packages/airflow/executors/__init__.py", line 48, in get_default_executor
DEFAULT_EXECUTOR = _get_executor(executor_name)
File "/usr/local/lib/python3.7/site-packages/airflow/executors/__init__.py", line 87, in _get_executor
return KubernetesExecutor()
File "/usr/local/lib/python3.7/site-packages/airflow/contrib/executors/kubernetes_executor.py", line 702, in __init__
self.kube_config = KubeConfig()
File "/usr/local/lib/python3.7/site-packages/airflow/contrib/executors/kubernetes_executor.py", line 283, in __init__
self.kube_client_request_args = json.loads(kube_client_request_args)
File "/usr/local/lib/python3.7/json/__init__.py", line 348, in loads
return _default_decoder.decode(s)
File "/usr/local/lib/python3.7/json/decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/usr/local/lib/python3.7/json/decoder.py", line 353, in raw_decode
obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
In my scheduler, as various sources advise, I tried setting:
AIRFLOW__KUBERNETES__KUBE_CLIENT_REQUEST_ARGS: {"_request_timeout" : [60,60] }
in my Helm values. That also didn't work. Does anyone have any ideas what I am missing?
Here's my values.yaml:
airflow:
  image:
    repository: airflow-docker-local
    tag: 1.10.8
  executor: Kubernetes
  service:
    type: LoadBalancer
  config:
    AIRFLOW__KUBERNETES__WORKER_CONTAINER_REPOSITORY: airflow-docker-local
    AIRFLOW__KUBERNETES__WORKER_CONTAINER_TAG: 1.10.8
    AIRFLOW__KUBERNETES__WORKER_CONTAINER_IMAGE_PULL_POLICY: Never
    AIRFLOW__KUBERNETES__WORKER_SERVICE_ACCOUNT_NAME: airflow
    AIRFLOW__KUBERNETES__DAGS_VOLUME_CLAIM: airflow
    AIRFLOW__KUBERNETES__NAMESPACE: airflow
    AIRFLOW__KUBERNETES__KUBE_CLIENT_REQUEST_ARGS: {"_request_timeout" : [60,60] }
    AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://postgres:airflow@airflow-postgresql:5432/airflow
persistence:
  enabled: true
  existingClaim: ''
workers:
  enabled: false
postgresql:
  enabled: true
redis:
  enabled: false
EDIT:
Various attempts to set the environment variable in the Helm values.yaml didn't work. After that I added (pay attention to the double and single quotes):
ENV AIRFLOW__KUBERNETES__KUBE_CLIENT_REQUEST_ARGS='{"_request_timeout" : [60,60] }'
to the Dockerfile here: https://github.com/puckel/docker-airflow/blob/1.10.9/Dockerfile#L19
After that my airflow-scheduler pod starts, but then I keep getting the following error on my scheduler pod:
Process KubernetesJobWatcher-9:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/urllib3/contrib/pyopenssl.py", line 313, in recv_into
    return self.connection.recv_into(*args, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/OpenSSL/SSL.py", line 1840, in recv_into
    self._raise_ssl_error(self._ssl, result)
  File "/usr/local/lib/python3.7/site-packages/OpenSSL/SSL.py", line 1646, in _raise_ssl_error
    raise WantReadError()
OpenSSL.SSL.WantReadError
For the Helm value, the template uses a loop that places the airflow.config map into double quotes ("). This means any " in a value needs to be escaped for the output templated YAML to be valid.
airflow:
  config:
    AIRFLOW__KUBERNETES__KUBE_CLIENT_REQUEST_ARGS: '{\"_request_timeout\":60}'
That deploys and runs (but I haven't completed an end-to-end test).
According to this GitHub issue, the Python scheduler SSL timeout may not be a problem, as the watcher starts again after the 60-second connection timeout.
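To see why the escaping matters: per the traceback above, the scheduler eventually calls json.loads on whatever string ends up in the rendered config (kubernetes_executor.py, line 283). A minimal sketch of the two outcomes; the exact rendered string is an assumption, only the parse behaviour is shown:

import json

# If the value reaches the scheduler without double-quoted property names
# (e.g. the quotes were swallowed during templating), the parse fails with
# exactly the error from the question:
try:
    json.loads("{'_request_timeout' : [60,60] }")
except json.JSONDecodeError as exc:
    print(exc)  # Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

# A value that survives templating as valid JSON parses cleanly:
print(json.loads('{"_request_timeout": 60}'))  # {'_request_timeout': 60}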
I have a DAG which queries the Postgres database, and I am using the PostgresOperator.
However, when passing the parameter I am getting the error below.
psycopg2.ProgrammingError: column "132" does not exist
LINE 1: ...d,derived_tstamp FROM atomic.events WHERE event_name = "132"
A snapshot of my DAG is below:
default_args = {
"owner": "airflow",
"depends_on_past": False,
"start_date": airflow.utils.dates.days_ago(1),
"email": ["airflow#airflow.com"],
"email_on_failure": False,
"email_on_retry": False,
"retries": 1,
"retry_delay": timedelta(minutes=1),
}
dag = DAG("PostgresTest", default_args=default_args, schedule_interval='3,33 * * * *',template_searchpath = ['/root/airflow/sql/'])
dailyOperator = PostgresOperator(
task_id='Refresh_DailyScore',
postgres_conn_id='postgress_sophi',
params={"e_name":'"132"'},
sql='atomTest.sql',
dag=dag)
Snapshot of atomTest.sql:
SELECT domain_userid,derived_tstamp FROM atomic.events WHERE event_name = {{ params.e_name }}
I have been hitting my head the whole day trying to understand why Airflow treats the 132 value as a column.
Please suggest.
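For what it's worth, the quoting is the likely culprit: in Postgres, double quotes denote an identifier (a column name) while single quotes denote a string literal, so the rendered SQL WHERE event_name = "132" asks for a column named 132. A hypothetical fix, keeping the same operator and template, is to pass the value single-quoted:

dailyOperator = PostgresOperator(
    task_id='Refresh_DailyScore',
    postgres_conn_id='postgress_sophi',
    # single quotes make Postgres treat 132 as a string literal,
    # not as a (non-existent) column named "132"
    params={"e_name": "'132'"},
    sql='atomTest.sql',
    dag=dag)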
I've created a class-based periodic task using djcelery to send emails to the client. The task performs the action and sends the email when it is called from the shell, but when it runs via the crontab schedule I get a KeyError for "schedules.tasks.run". I have added the following settings and created the tasks:
settings.py
import os
from datetime import timedelta  # used by CELERYBEAT_SCHEDULE below

import djcelery
djcelery.setup_loader()
BROKER_URL = 'django://'
BROKER_HOST = "localhost"
BROKER_PORT = 5672
BROKER_USER = "guest"
BROKER_PASSWORD = "guest"
BROKER_VHOST = "/"
CELERYBEAT_SCHEDULER = 'djcelery.schedulers.DatabaseScheduler'
CELERY_RESULT_BACKEND = 'djcelery.backends.database:DatabaseBackend'
CELERYBEAT_SCHEDULE = {
"runs-every-30-seconds": {
"task": "schedules.tasks.EndingDrawslotScheduler.run",
"schedule": timedelta(seconds=30),
"args": (16, 16)
},
}
app.conf.timezone = 'UTC'
INSTALLED_APPS = ('djcelery',
'kombu.transport.django',)
Error-Info:
The full contents of the message body was:
{'utc': True, 'callbacks': None, 'id': '6ad19ff8-9825-4d54-a8b2-0a8322fc9fb1',
'args': [], 'taskset': None, 'retries': 0, 'timelimit': (None, None),
'kwargs': {}, 'expires': None, 'errbacks': None, 'chord': None, 'task':
'schedules.tasks.run', 'eta': None} (262b)
Traceback (most recent call last):
File "/home/s/proj/env/lib/python3.5/site-packages/celery/worker/consumer.py", line 465, in on_task_received strategies[type_](message, body,
KeyError: 'schedules.tasks.run'