PostgresOperator in Airflow: error when passing a parameter - postgresql

I have a DAG which queries a Postgres database, and I am using PostgresOperator;
however, when passing the parameter I get the error below.
psycopg2.ProgrammingError: column "132" does not exist
LINE 1: ...d,derived_tstamp FROM atomic.events WHERE event_name = "132"
A snapshot of my DAG is below:
import airflow
from datetime import timedelta

from airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator  # Airflow 1.10.x path; on 2.x: airflow.providers.postgres.operators.postgres

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": airflow.utils.dates.days_ago(1),
    "email": ["airflow@airflow.com"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=1),
}

dag = DAG(
    "PostgresTest",
    default_args=default_args,
    schedule_interval="3,33 * * * *",
    template_searchpath=["/root/airflow/sql/"],
)
dailyOperator = PostgresOperator(
    task_id="Refresh_DailyScore",
    postgres_conn_id="postgress_sophi",
    params={"e_name": '"132"'},
    sql="atomTest.sql",
    dag=dag,
)
Snapshot of atomTest.sql:
SELECT domain_userid,derived_tstamp FROM atomic.events WHERE event_name = {{ params.e_name }}
I have been hitting my head against this all day trying to understand why Airflow treats the value 132 as a column.
Please suggest.
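For context: Postgres parses double-quoted tokens as identifiers (column names) and single-quoted tokens as string literals, so the rendered statement WHERE event_name = "132" asks Postgres for a column named 132. A minimal sketch of two possible fixes, assuming event_name should be compared against the string '132' (the inline sql here stands in for atomTest.sql, for illustration only):

# Option 1: single-quote the placeholder in atomTest.sql and drop the extra quotes:
#     ... WHERE event_name = '{{ params.e_name }}'
# with params={"e_name": "132"}.

# Option 2: skip Jinja for the value and let psycopg2 bind it server-side
# through the operator's parameters argument:
dailyOperator = PostgresOperator(
    task_id="Refresh_DailyScore",
    postgres_conn_id="postgress_sophi",
    sql=(
        "SELECT domain_userid, derived_tstamp FROM atomic.events "
        "WHERE event_name = %(e_name)s"
    ),
    parameters={"e_name": "132"},
    dag=dag,
)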

Related

airflow message kafka topic

How can I display all messages that are in a Kafka topic?
When I execute this code, the consumer reads what the producer wrote during the current DAG run, but it does not show me what was already recorded in the topic by previous runs of the DAG.
import json
import logging
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow_provider_kafka.operators.consume_from_topic import ConsumeFromTopicOperator
from airflow_provider_kafka.operators.produce_to_topic import ProduceToTopicOperator
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2021, 7, 20),
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}
fruits_test = ["Apple", "Pear", "Peach", "Banana"]

def producer_function():
    for i in fruits_test:
        yield (json.dumps(i), json.dumps(i + i))

consumer_logger = logging.getLogger("airflow")

def consumer_function(message, prefix=None):
    key = json.loads(message.key())
    value = json.loads(message.value())
    consumer_logger.info(f"{prefix} {message.topic()} @ {message.offset()}; {key} : {value}")
    return
with DAG(
    "kafka_DAG",
    default_args=default_args,
    description="KafkaOperators",
    schedule_interval=None,
    start_date=datetime(2021, 1, 1),
    catchup=False,
    tags=["Test_DAG"],
) as dag:
    t1 = ProduceToTopicOperator(
        task_id="produce_to_topic",
        topic="topictest",
        producer_function=producer_function,
        kafka_config={"bootstrap.servers": ":9092"},
    )
    t2 = ConsumeFromTopicOperator(
        task_id="consume_from_topic",
        topics=["topictest"],
        apply_function=consumer_function,
        apply_function_kwargs={"prefix": "consumed:::"},
        consumer_config={
            "bootstrap.servers": ":9092",
            "group.id": "test-consumer-group",
            "enable.auto.commit": False,
            "auto.offset.reset": "earliest",
        },
        commit_cadence="end_of_batch",
        max_messages=10,
        max_batch_size=2,
    )
task_id="consume_from_topic",
topics=["topictest"],
apply_function=consumer_function,
apply_function_kwargs={"prefix": "consumed:::"},
consumer_config={
"bootstrap.servers": ":9092",
"group.id": "test-consumer-group",
"enable.auto.commit": False,
"auto.offset.reset": "earliest",
},
commit_cadence="never",
max_messages=10,
max_batch_size=2,
)
You just need to make sure that the consumer isn't committing the offset ever. If you set "commit_cadence" to never, this consumer will run from the beginning of the log every time it executes.
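For reference, auto.offset.reset only applies when the consumer group has no committed offset, which is why never committing matters here; a summary sketch of how the relevant settings above work together (nothing new beyond the code already shown):

consumer_config = {
    "bootstrap.servers": ":9092",
    "group.id": "test-consumer-group",
    "enable.auto.commit": False,      # the Kafka client never auto-commits
    "auto.offset.reset": "earliest",  # with no committed offset, start at offset 0
}
# Combined with commit_cadence="never", no offset is ever stored for this
# group.id, so every DAG run re-reads the topic from the beginning.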

Airflow - KubernetesPodOperator - Broken DAG: unexpected keyword argument 'request_cpu'

I'm using the following Airflow version inside my Docker container, and I am currently having an issue with a broken DAG:
FROM apache/airflow:2.3.4-python3.9
I have other DAGs running with the same 'request_cpu' argument that are perfectly functional, so I'm not sure what the issue could be:
Broken DAG: [/home/airflow/airflow/dags/my_project.py] Traceback (most recent call last):
  File "/home/airflow/.local/lib/python3.10/site-packages/airflow/models/baseoperator.py", line 858, in __init__
    self.resources = coerce_resources(resources)
  File "/home/airflow/.local/lib/python3.10/site-packages/airflow/models/baseoperator.py", line 133, in coerce_resources
    return Resources(**resources)
TypeError: Resources.__init__() got an unexpected keyword argument 'request_cpu'
This is my current DAG configuration:
# DAG configuration
DAG_ID = "my_project_id"
DAG_DESCRIPTION = "description"
DAG_IMAGE = image

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "max_active_tasks": 1,
    "max_active_runs": 1,
    "email_on_failure": True,
    "email": ["my@mail.com"],
    "retries": 0,
    "email_on_retry": False,
    "image_pull_policy": "Always",
}

# Define desired resources.
compute_resources = {
    # CPU: 500m (milliCPU) is about half a CPU; other values (1, 2, 4...) allocate full CPUs
    "request_cpu": "500m",
    # Memory: Mi for mebibytes or Gi for gibibytes
    "request_memory": "512Mi",
    "limit_cpu": "500m",
    "limit_memory": "1Gi",
}
with DAG(
    DAG_ID,
    default_args=default_args,
    start_date=datetime(2022, 5, 9),
    schedule_interval="0 21 */16 * *",  # every 16 days, i.e. roughly twice per month
    max_active_runs=1,
    max_active_tasks=1,
    catchup=False,
    description=DAG_DESCRIPTION,
    tags=["my tags"],
) as dag:
    # AWS credentials
    creds = tools.get_config_params(key="AWS-keys")
    my_task = KubernetesPodOperator(
        namespace="airflow",
        image=DAG_IMAGE,
        image_pull_secrets=[k8s.V1LocalObjectReference("docker-registry")],
        container_resources=compute_resources,
        env_vars={
            "AWS_ACCESS_KEY_ID": creds["access_key"],
            "AWS_SECRET_ACCESS_KEY": creds["secret_access_key"],
            "EXECUTION_DATE": "{{ execution_date }}",
        },
        cmds=["python3", "my_project.py"],
        is_delete_operator_pod=True,
        in_cluster=False,
        name="my-project-name",
        task_id="my-task",
        config_file=os.path.expanduser("~") + "/.kube/config",
        get_logs=True,
        resources=compute_resources,
    )
First, resources is deprecated, so you should use only container_resources.
Second, container_resources expects a V1ResourceRequirements object, not a dict. You should do:
from kubernetes.client import models as k8s

compute_resources = k8s.V1ResourceRequirements(
    requests={
        'memory': '512Mi',
        'cpu': '500m',
    },
    limits={
        'memory': '1Gi',
        'cpu': '500m',
    },
)
Then:
my_task = KubernetesPodOperator(..., container_resources=compute_resources)

Airflow Debug throws "(sqlite3.OperationalError) no such table: task_instance" when debugging remote from VScode

First of all, I'm using VS Code, and Airflow 2.0.1 is on a remote server, so I'm SSH'ing onto the server. All the files are on the remote server.
In my dag.py I have
from airflow import DAG
from airflow.utils.dates import days_ago
from datetime import timedelta
from airflow.operators.python import PythonVirtualenvOperator

default_args = {
    'owner': 'me',
    'depends_on_past': False,
}
with DAG(
    'test',
    default_args=default_args,
    description='Test',
    schedule_interval=timedelta(days=1),
    start_date=days_ago(1),
) as dag:
    t1 = PythonVirtualenvOperator(
        task_id="test1",
        python_version="3.7",
        python_callable=test1,
        requirements=req,
    )
    t2 = PythonVirtualenvOperator(
        task_id="test2",
        python_version="3.7",
        python_callable=test2,
        requirements=req,
    )
    t1 >> t2

if __name__ == "__main__":
    from airflow.utils.state import State
    dag.clear(dag_run_state=State.NONE)
    dag.run()
and my launch.json in VS Code:
{ "version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"env":{
"AIRFLOW__CORE__EXECUTOR":"DebugExecutor"
}
}
]
}
When I then open dag.py and press debug, it stops on dag.clear(dag_run_state=State.NONE) and throws (sqlite3.OperationalError) no such table: task_instance.
If I run airflow db check I get
[user]$ airflow db check
[2021-05-26 14:15:35,741] {cli_action_loggers.py:105} WARNING - Failed to log action with (sqlite3.OperationalError) no such table: log
[SQL: INSERT INTO log (dttm, dag_id, task_id, event, execution_date, owner, extra) VALUES (?, ?, ?, ?, ?, ?, ?)]
[parameters: ('2021-05-26 12:15:35.737300', None, None, 'cli_check', None, 'user', '{"host_name": "server", "full_command": "[\'/usr/local/bin/airflow\', \'db\', \'check\']"}')]
(Background on this error at: http://sqlalche.me/e/13/e3q8)
[2021-05-26 14:15:35,746] {db.py:762} INFO - Connection successful.
I'm using SQLite as the DB, and Airflow runs my DAGs (apart from the one I'm trying to debug) without an issue, so I assume the DB tables are set up correctly?
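One way to check which database the debugged process actually resolves (a diagnostic sketch, assuming it runs under the same interpreter and environment that VS Code launches):

from airflow.configuration import conf

# Prints the SQLAlchemy URL the current process sees; if this is not the SQLite
# file the scheduler initialized (e.g. because AIRFLOW_HOME differs in the debug
# environment), the DebugExecutor session hits an empty database, which would
# explain the missing task_instance and log tables.
print(conf.get("core", "sql_alchemy_conn"))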

airflow dag - task is immediately put into 'up_for_retry' state ('start_date' is 1 day ago)

I do not know if I lack Airflow scheduler knowledge or if this is a potential bug in Airflow.
The situation is like this:
- my DAG's start date is set to "start_date": airflow.utils.dates.days_ago(1)
- I uploaded the DAG to the folder where Airflow scans for DAGs
- I then turned the DAG on (it was 'off' by default)
- the tasks in the pipeline immediately go into 'up_for_retry', and you cannot really see what had been tried before
Airflow version info: 1.10.14, running on Kubernetes in Azure, with the Celery executor and Redis.
The task instance details are listed below:

Task Instance Details
Dependencies Blocking Task From Getting Scheduled

Dependency: Task Instance State
Reason: Task is in the 'up_for_retry' state which is not a valid state for execution. The task must be cleared in order to be run.

Dependency: Not In Retry Period
Reason: Task is not ready for retry yet but will be retried automatically. Current date is 2021-05-17T09:06:57.239015+00:00 and task will be retried at 2021-05-17T09:09:50.662150+00:00.
Am I missing something needed to judge whether this is a bug or expected behavior?
In addition, the DAG definition is below, as requested.
import airflow
from airflow import DAG
from airflow.contrib.operators.databricks_operator import DatabricksSubmitRunOperator
from airflow.models import Variable

dag_args = {
    "owner": "our_project_team_name",
    "retries": 1,
    "email": ["ouremail_address_replaced_by_this_string"],
    "email_on_failure": True,
    "email_on_retry": True,
    "depends_on_past": False,
    "start_date": airflow.utils.dates.days_ago(1),
}

# Implement cluster reuse on Databricks; pick from light, medium, heavy cluster types based on workload
clusters = Variable.get("our_project_team_namejob_cluster_config", deserialize_json=True)

databricks_connection = "our_company_databricks"
adl_connection = "our_company_wasb"

pipeline_name = "process_our_data_from_boomi"
dag = DAG(dag_id=pipeline_name, default_args=dag_args, schedule_interval="0 3 * * *")

notebook_dir = "/Shared/our_data_name"
lib_path_sub = ""
lib_name_dev_plus_branch = ""
atlas_library = {
    "whl": f"dbfs:/python-wheels/atlas{lib_path_sub}/atlas_library-0{lib_name_dev_plus_branch}-py3-none-any.whl"
}

create_our_data_name_source_data_from_boomi_notebook_params = {
    "existing_cluster_id": clusters["our_cluster_name"],
    "notebook_task": {
        "notebook_path": f"{notebook_dir}/create_our_data_name_source_data_from_boomi",
        "base_parameters": {"Extraction_date": "{{ ds_nodash }}"},
    },
}

create_our_data_name_standardized_table_from_source_xml_notebook_params = {
    "existing_cluster_id": clusters["our_cluster_name"],
    "notebook_task": {
        "notebook_path": f"{notebook_dir}/create_our_data_name_standardized_table_from_source_xml",
        "base_parameters": {"Extraction_date": "{{ ds_nodash }}"},
    },
}

create_our_data_name_enriched_table_from_standardized_notebook_params = {
    "existing_cluster_id": clusters["our_cluster_name"],
    "notebook_task": {
        "notebook_path": f"{notebook_dir}/create_our_data_name_enriched",
        "base_parameters": {"Extraction_date": "{{ ds_nodash }}"},
    },
}

layer_1_task = DatabricksSubmitRunOperator(
    task_id="Load_our_data_name_to_source",
    databricks_conn_id=databricks_connection,
    dag=dag,
    json=create_our_data_name_source_data_from_boomi_notebook_params,
    libraries=[atlas_library],
)

layer_2_task = DatabricksSubmitRunOperator(
    task_id="Load_our_data_name_to_standardized",
    databricks_conn_id=databricks_connection,
    dag=dag,
    json=create_our_data_name_standardized_table_from_source_xml_notebook_params,
    libraries=[
        {"maven": {"coordinates": "com.databricks:spark-xml_2.11:0.5.0"}},
        {"pypi": {"package": "inflection"}},
        atlas_library,
    ],
)

layer_3_task = DatabricksSubmitRunOperator(
    task_id="Load_our_data_name_to_enriched",
    databricks_conn_id=databricks_connection,
    dag=dag,
    json=create_our_data_name_enriched_table_from_standardized_notebook_params,
    libraries=[atlas_library],
)

layer_1_task >> layer_2_task >> layer_3_task
After getting some help from @AnandVidvat with a retries=0 experiment, and a friend's help swapping the operator for either DummyOperator or PythonOperator, I can confirm that the issue has nothing to do with DatabricksSubmitRunOperator or Airflow version 1.10.x, i.e. it is not an Airflow bug.
In summary: whenever a DAG has a meaningful operator, my setup fails on the first execution without producing any task log, and the retry then works OK (the task log hides the fact that a retry happened, because the failure left no logs).
To reduce the total run time, the workaround/patch until the real cause is found is to set retry_delay to 10 seconds (the default is 5 minutes, which makes the DAG run unnecessarily long).
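For the record, the workaround is a one-line change in the default args (same DAG as above, only retry_delay shortened):

from datetime import timedelta

dag_args = {
    # ... other args unchanged ...
    "retries": 1,
    "retry_delay": timedelta(seconds=10),  # default is timedelta(minutes=5)
}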
The next step is to figure out what causes this first failure by checking logs on the scheduler or worker pods in our current setup (Azure K8s, PostgreSQL, Redis, Celery executor).
P.S. I used the DAG below for testing and reached this conclusion.
import time
from pprint import pprint

import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag_args = {
    "owner": "min_test",
    "retries": 1,
    "email": ["c243d70b.domain.onmicrosoft.com@emea.teams.ms"],
    "email_on_failure": True,
    "email_on_retry": True,
    "depends_on_past": False,
    "start_date": airflow.utils.dates.days_ago(1),
}

pipeline_name = "min_test_debug_airflow_baseline_PythonOperator_1_retry"
dag = DAG(
    dag_id=pipeline_name,
    default_args=dag_args,
    schedule_interval="0 3 * * *",
    tags=["min_test_airflow"],
)

def my_sleeping_function(random_base):
    """This is a function that will run within the DAG execution"""
    time.sleep(random_base)

def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return "Whatever you return gets printed in the logs"

run_this = PythonOperator(
    task_id="print_the_context",
    provide_context=True,
    python_callable=print_context,
    dag=dag,
)

# Generate 3 sleeping tasks, sleeping 0.0, 0.1 and 0.2 seconds respectively
for i in range(3):
    task = PythonOperator(
        task_id="sleep_for_" + str(i),
        python_callable=my_sleeping_function,
        op_kwargs={"random_base": float(i) / 10},
        dag=dag,
    )
    task.set_upstream(run_this)

How to execute PostgreSQL SELECT query using cloud sql in cloud composer's airflow?

I am new to Cloud Composer and I want to execute one PostgreSQL SELECT query using the gcp_cloud_sql hook in Cloud Composer's Airflow. I tried CloudSqlQueryOperator but it doesn't work with SELECT queries.
I want to create DAGs based on the results I get from this SELECT query. However, I am not able to create even a simple connection for this SELECT query.
import os  # needed for os.environ below
from datetime import date, datetime, timedelta

from six.moves.urllib.parse import quote_plus

import airflow
from airflow import models
from airflow.contrib.operators.gcp_sql_operator import CloudSqlQueryOperator
GCP_PROJECT_ID = "adtech-dev"
GCP_REGION = "<my cluster zone>"
GCSQL_POSTGRES_INSTANCE_NAME_QUERY = "testpostgres"
GCSQL_POSTGRES_DATABASE_NAME = ""
GCSQL_POSTGRES_USER = "<PostgreSQL User Name>"
GCSQL_POSTGRES_PASSWORD = "**********"
GCSQL_POSTGRES_PUBLIC_IP = "0.0.0.0"
GCSQL_POSTGRES_PUBLIC_PORT = "5432"
rule_query = "select r.id from rules r where r.id = 1"
postgres_kwargs = dict(
    user=quote_plus(GCSQL_POSTGRES_USER),
    password=quote_plus(GCSQL_POSTGRES_PASSWORD),
    public_port=GCSQL_POSTGRES_PUBLIC_PORT,
    public_ip=quote_plus(GCSQL_POSTGRES_PUBLIC_IP),
    project_id=quote_plus(GCP_PROJECT_ID),
    location=quote_plus(GCP_REGION),
    instance=quote_plus(GCSQL_POSTGRES_INSTANCE_NAME_QUERY),
    database=quote_plus(GCSQL_POSTGRES_DATABASE_NAME),
)
default_args = {
    'owner': 'airflow',
    'start_date': datetime(2018, 5, 31),
    'email': ['aniruddha.dwivedi@xyz.com'],
    'email_on_failure': True,
    'email_on_retry': False,
    'depends_on_past': False,
    'catchup': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=10),
}
os.environ['AIRFLOW_CONN_PROXY_POSTGRES_TCP'] = \
    "gcpcloudsql://{user}:{password}@{public_ip}:{public_port}/{database}?" \
    "database_type=postgres&" \
    "project_id={project_id}&" \
    "location={location}&" \
    "instance={instance}&" \
    "use_proxy=True&" \
    "sql_proxy_use_tcp=True".format(**postgres_kwargs)
connection_names = ["proxy_postgres_tcp"]

tasks = []

with models.DAG(
    dag_id='example_gcp_sql_query',
    default_args=default_args,
    schedule_interval=None,
) as dag:
    prev_task = None
    for connection_name in connection_names:
        task = CloudSqlQueryOperator(
            gcp_cloudsql_conn_id=connection_name,
            task_id="example_gcp_sql_task_" + connection_name,
            sql=rule_query,
        )
        tasks.append(task)
        if prev_task:
            prev_task >> task
        prev_task = task
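If the goal is only to fetch rows at parse time to decide which DAGs to build, one alternative sketch (assuming a plain Postgres connection is reachable, e.g. through the Cloud SQL proxy, and registered under the hypothetical connection id my_postgres_conn) uses the generic hook API instead of CloudSqlQueryOperator:

from airflow.hooks.postgres_hook import PostgresHook

# get_records() returns the result set as a list of row tuples; the connection
# id here is a placeholder, not a Composer-provided default.
hook = PostgresHook(postgres_conn_id="my_postgres_conn")
rule_ids = [row[0] for row in hook.get_records(rule_query)]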