I am adding a task to an airflow dag as follows:
examples_task = KubernetesPodOperator(
task_id='examples_generation',
dag=dag,
namespace='test',
image='test_amazon_image',
name='pipe-labelled-examples-generation-tf-record-operator',
env={
'GOOGLE_APPLICATION_CREDENTIALS': Variable.get('google_cloud_credentials')
},
arguments=[
"--assets_path", Variable.get('assets_path'),
"--folder_source", Variable.get('folder_source'),
"--folder_destination", Variable.get('folder_destination'),
"--gcs_folder_destination", Variable.get('gcs_folder_destination'),
"--aws_region", Variable.get('aws_region'),
"--s3_endpoint", Variable.get('s3_endpoint')
],
get_logs=True)
I thought I could paste the service account json file as a variable and call it but this doesn't work and airflow/google documentation isn't clear. How do you do this?
One solution is to pass the JSON in as an argument:
examples_task = KubernetesPodOperator(
task_id='examples_generation',
dag=dag,
namespace='test',
image='test_amazon_image',
name='pipe-labelled-examples-generation-tf-record-operator',
arguments=[
"--folder_source", Variable.get('folder_source'),
"--folder_destination", Variable.get('folder_destination'),
"--gcs_folder_destination", Variable.get('gcs_folder_destination'),
"--aws_region", Variable.get('aws_region'),
"--s3_endpoint", Variable.get('s3_endpoint')
"--gcs_credentials", Variable.get('google_cloud_credentials')
],
get_logs=True)
then, in the container's CLI code, rebuild the credentials from the passed JSON:
import json
from google.cloud import storage
from google.oauth2 import service_account
credentials = service_account.Credentials.from_service_account_info(json.loads(gcs_credentials))
client = storage.Client(project='project_id', credentials=credentials)
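How gcs_credentials gets populated depends on how the container parses its arguments; a minimal sketch, assuming the entrypoint uses argparse and a --gcs_credentials flag matching the operator arguments above:
import argparse
import json
from google.cloud import storage
from google.oauth2 import service_account

parser = argparse.ArgumentParser()
parser.add_argument('--gcs_credentials', help='service account JSON passed in by the DAG')
# declare the other flags (--folder_source, --folder_destination, ...) here as well
args, _ = parser.parse_known_args()

# build the client from the JSON string instead of relying on GOOGLE_APPLICATION_CREDENTIALS
credentials = service_account.Credentials.from_service_account_info(json.loads(args.gcs_credentials))
client = storage.Client(project='project_id', credentials=credentials)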
In the OpenSearch L2 construct, if you enable fine-grained access control, a Secret in Secrets Manager is created for you (accessible via the masterUserPassword attribute).
I want to use this generated password within a CloudFormationInit later on, but I'm not sure how.
from aws_cdk import Stack
from aws_cdk import aws_ec2 as ec2
from aws_cdk import aws_iam as iam
from aws_cdk import aws_opensearchservice as opensearch
from aws_cdk import aws_s3 as s3
from constructs import Construct
class OpensearchStack(Stack):
def __init__(
self,
scope: Construct,
construct_id: str,
**kwargs,
) -> None:
super().__init__(scope, construct_id, **kwargs)
vpc = ec2.Vpc(self, "generatorVpc", max_azs=2)
bucket = s3.Bucket(self, "My Bucket")
domain = opensearch.Domain(self,"OpensearchDomain",
version=opensearch.EngineVersion.OPENSEARCH_1_3,
vpc=vpc,
fine_grained_access_control=opensearch.AdvancedSecurityOptions(
master_user_name="osadmin",
),
)
instance = ec2.Instance(self, "Instance",
vpc=vpc,
instance_type=ec2.InstanceType.of(
instance_class=ec2.InstanceClass.M5,
instance_size=ec2.InstanceSize.LARGE,
),
machine_image=ec2.MachineImage.latest_amazon_linux(
generation=ec2.AmazonLinuxGeneration.AMAZON_LINUX_2,
),
init=ec2.CloudFormationInit.from_elements(
ec2.InitFile.from_string(
file_name="/home/ec2-user/logstash-8.4.0/config/my_conf.conf",
owner="ec2-user",
mode="00755",
content=f"""input {{
s3 {{
bucket => "{bucket.bucket_name}"
region => "{self.region}"
}}
}}
output {{
opensearch {{
hosts => ["{domain.domain_endpoint}:443"]
user => "{domain.master_user_password.secrets_manager("What secret id do I put here?", json_field="username")}"
password => "{domain.master_user_password.secrets_manager("What secret id do I put here?", json_field="password")}"
ecs_compatibility => disabled
}}
}}
""",
)
)
)
Since SecretValue doesn't have a secretId property, I'm not sure how I can determine the Secret ID/Arn of the masterUserPassword.
Is there a better way to get the generated credentials inside my logstash config?
The username value is easy, as you are explicitly setting it to osadmin. To get the password reference, call the to_string method on the Domain's master_user_password attribute, which is a SecretValue:
domain.master_user_password.to_string()
In the synthesized template, this gets turned into a CloudFormation dynamic reference to the secret's password. The actual password is not known to the template. It will be resolved cloud-side at deploy time.
The SecretValue.secrets_manager static method also synthesizes the same kind of dynamic reference. However, you can't use it here: the method requires the secret ID, which is not exposed if the Domain construct generates the secret for you.
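A minimal sketch of the change inside the question's stack (reusing the domain variable from the question; only the credential lines in the InitFile content change):
# resolves to a CloudFormation dynamic reference at deploy time, never the literal password
password_ref = domain.master_user_password.to_string()
content = f"""output {{
  opensearch {{
    hosts => ["{domain.domain_endpoint}:443"]
    user => "osadmin"
    password => "{password_ref}"
  }}
}}"""
# pass this string as the content= of InitFile.from_string as in the question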
I ended up adding commands to the CloudFormationInit that pull the OpenSearch credentials from Secrets Manager and do a find-and-replace, which worked:
from aws_cdk import aws_ec2 as ec2
from aws_cdk import aws_opensearchservice as opensearch
from aws_cdk import aws_s3 as s3
from aws_cdk import aws_secretsmanager as secretsmanager
from aws_cdk import Stack
from constructs import Construct
class OpensearchStack(Stack):
def __init__(
self,
scope: Construct,
construct_id: str,
**kwargs,
) -> None:
super().__init__(scope, construct_id, **kwargs)
vpc = ec2.Vpc(self, "generatorVpc", max_azs=2)
bucket = s3.Bucket(self, "My Bucket")
domain = opensearch.Domain(self,"OpensearchDomain",
version=opensearch.EngineVersion.OPENSEARCH_1_3,
vpc=vpc,
fine_grained_access_control=opensearch.AdvancedSecurityOptions(
master_user_name="osadmin",
),
)
# Get the domain secret
domain_secret: secretsmanager.Secret = domain.node.find_child("MasterUser")
instance = ec2.Instance(self, "Instance",
vpc=vpc,
instance_type=ec2.InstanceType.of(
instance_class=ec2.InstanceClass.M5,
instance_size=ec2.InstanceSize.LARGE,
),
machine_image=ec2.MachineImage.latest_amazon_linux(
generation=ec2.AmazonLinuxGeneration.AMAZON_LINUX_2,
),
init=ec2.CloudFormationInit.from_elements(
ec2.InitFile.from_string(
file_name="/home/ec2-user/logstash-8.4.0/config/my_conf.conf",
owner="ec2-user",
mode="00755",
content=f"""input {{
s3 {{
bucket => "{bucket.bucket_name}"
region => "{self.region}"
}}
}}
output {{
opensearch {{
hosts => ["{domain.domain_endpoint}:443"]
user => "REPLACE_WITH_USERNAME"
password => "REPLACE_WITH_PASSWORD"
ecs_compatibility => disabled
}}
}}
""",
),
ec2.InitPackage.yum("jq"), # install jq
ec2.InitCommand.shell_command(
shell_command=(
f"aws configure set region {self.region} && "
# save secret value to variable
f"OS_SECRET=$(aws secretsmanager get-secret-value --secret-id {domain_secret.secret_arn} "
"--query SecretString) && "
# Pull values from json string
"OS_USER=$(echo $OS_SECRET | jq -r '. | fromjson | .username') && "
"OS_PASS=$(echo $OS_SECRET | jq -r '. | fromjson | .password') && "
# Find and replace
"sed -i \"s/REPLACE_WITH_USERNAME/$OS_USER/g\" /home/ec2-user/logstash-8.4.0/config/my_conf.conf && "
"sed -i \"s/REPLACE_WITH_PASSWORD/$OS_PASS/g\" /home/ec2-user/logstash-8.4.0/config/my_conf.conf"
),
),
)
)
# Don't forget to grant the instance read access to the secret
domain_secret.grant_read(instance.role)
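Since the generated secret is reachable via find_child, another option (an untested sketch, assuming the secret's JSON carries username and password fields, as the jq commands above imply) is to skip the sed step and embed dynamic references directly in the file content, which also avoids the boot-time CLI call:
# both resolve cloud-side at deploy time, so the template never contains the plaintext values
os_user = domain_secret.secret_value_from_json("username").to_string()
os_pass = domain_secret.secret_value_from_json("password").to_string()
# then interpolate os_user/os_pass into the InitFile content instead of the REPLACE_WITH_* markers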
There is an Airflow operator, GCSToLocalFilesystemOperator, that copies ONE file from a GCS bucket to the local filesystem. It supports only a single file, so it is not possible to copy many files for a given prefix.
There is a reverse operator, LocalFilesystemToGCSOperator, that lets you copy many files from the local filesystem to the bucket simply by using a star in the path ("/*").
Do you know the best way to copy files by prefix from a bucket to the local filesystem in Airflow? Am I missing something, or is it just not implemented for some reason?
The solution I have come up with so far is compressing the files before putting them in the bucket, downloading them as one file with Airflow, and unzipping locally with a BashOperator. I'm wondering if there is a better way.
I was able to successfully copy multiple files for a given prefix from a GCS bucket to the (mapped) local filesystem in Airflow using the approach below.
import datetime
from airflow import models
from airflow.operators import bash
from airflow.providers.google.cloud.hooks.gcs import GCSHook
from airflow.operators import python
YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)
BUCKET_NAME = 'qpalzm-bucket'
GCS_FILES = ['luffy.jpg', 'zoro.jpg']
LOCAL_PATH = '/home/airflow/gcs/data'
PREFIX = 'testfolder'
default_args = {
'owner': 'Composer Example',
'depends_on_past': False,
'email': [''],
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': datetime.timedelta(minutes=5),
'start_date': YESTERDAY,
}
#
with models.DAG(
'multi_copy_gcs_to_local',
catchup=False,
default_args=default_args,
schedule_interval=datetime.timedelta(days=1)) as dag:
def multi_copy(**kwargs):
hook = GCSHook()
for gcs_file in GCS_FILES:
#initialize file name and the local directory where it will be copied
filename = f'{LOCAL_PATH}/{gcs_file}'
#check if PREFIX is available and initialize the gcs file to be copied
if PREFIX:
object_name = f'{PREFIX}/{gcs_file}'
else:
object_name = f'{gcs_file}'
#perform gcs hook download
hook.download(
bucket_name = BUCKET_NAME,
object_name = object_name,
filename = filename
)
#execute multi_copy method
multi_copy_op = python.PythonOperator(
task_id='multi_gcs_to_local',
provide_context=True,
python_callable=multi_copy,
)
multi_copy_op
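If the file names are not known in advance, the same hook can also list objects by prefix instead of relying on a hard-coded GCS_FILES list. A rough sketch using GCSHook.list, reusing the constants from the DAG above:
def multi_copy_by_prefix(**kwargs):
    hook = GCSHook()
    # list every object under the prefix and download each one into LOCAL_PATH
    for object_name in hook.list(bucket_name=BUCKET_NAME, prefix=PREFIX):
        if object_name.endswith('/'):
            # skip "directory" placeholder objects
            continue
        hook.download(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            filename=f"{LOCAL_PATH}/{object_name.split('/')[-1]}",
        )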
I'm confused by Airflow's KubernetesPodOperator: how do I pass my load_users_into_table() function, which takes a conn_id parameter stored as an Airflow Connection, into the Pod?
The official doc proposes putting the conn_id in a Secret, but I don't understand how to pass it to my load_users_into_table() function after that.
https://airflow.apache.org/docs/stable/kubernetes.html
the function (task) to be executed in the pod:
def load_users_into_table(postgres_hook, schema, path):
gdf = read_csv(path)
gdf.to_sql('users', con=postgres_hook.get_sqlalchemy_engine(), schema=schema)
the dag:
_pg_hook = PostgresHook(postgres_conn_id = _conn_id)
with dag:
test = KubernetesPodOperator(
namespace=namespace,
image=image_name,
cmds=["python", "-c"],
arguments=[load_users_into_table],
labels={"dag-id": dag.dag_id},
name="airflow-test-pod",
task_id="task-1",
is_delete_operator_pod=True,
in_cluster=in_cluster,
get_logs=True,
config_file=config_file,
executor_config={
"KubernetesExecutor": {"request_memory": "512Mi",
"limit_memory": "1024Mi",
"request_cpu": "1",
"limit_cpu": "2"}
}
)
Assuming you want to run with the KubernetesPodOperator, you can use argparse and add arguments to the Docker CMD. Something along these lines should do the job:
import argparse
def f(arg):
print(arg)
parser = argparse.ArgumentParser()
parser.add_argument('--foo', help='foo help')
args = parser.parse_args()
if __name__ == '__main__':
f(args.foo)
Dockerfile:
FROM python:3
COPY main.py main.py
CMD ["python", "main.py", "--foo", "somebar"]
There are other ways to solve this, such as using Secrets, ConfigMaps, or even Airflow Variables, but this should get you moving forward.
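For reference, a hedged sketch of how the DAG side could then supply that flag (reusing namespace and image_name from the question and assuming the image is built from the Dockerfile above; in practice you would pass whatever pieces of the Airflow Connection the pod actually needs):
test = KubernetesPodOperator(
    namespace=namespace,
    image=image_name,
    # override the baked-in CMD so the value comes from the DAG at runtime
    cmds=["python", "main.py"],
    arguments=["--foo", "somebar"],
    name="airflow-test-pod",
    task_id="task-1",
    get_logs=True,
)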
Summary of my DAG:
I am using the SSHOperator to SSH into an EC2 instance and run a JAR file which connects to multiple DBs. I've declared the Airflow Connections in my DAG file and am able to pass the variables into the EC2 instance. As you can see below, I'm passing the properties into the Java command.
Airflow version - airflow-1-10.7
Package installed - apache-airflow[crypto]
from airflow import DAG
from datetime import datetime, timedelta
from airflow.contrib.hooks.ssh_hook import SSHHook
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.hooks.base_hook import BaseHook
from airflow.models.connection import Connection
ssh_hook = SSHHook(ssh_conn_id='ssh_to_ec2')
ssh_hook.no_host_key_check = True
redshift_connection = BaseHook.get_connection("my_redshift")
rs_user = redshift_connection.login
rs_password = redshift_connection.password
mongo_connection = BaseHook.get_connection("my_mongo")
mongo_user = mongo_connection.login
mongo_password = mongo_connection.password
default_args = {
'owner': 'AIRFLOW',
'start_date': datetime(2020, 4, 1, 0, 0),
'email': [],
'retries': 1,
}
dag = DAG('connect_to_redshift', default_args=default_args)
t00_00 = SSHOperator(
task_id='ssh_and_connect_db',
ssh_hook=ssh_hook,
command="java "
"-Drs_user={rs_user} -Drs_pass={rs_pass} "
"-Dmongo_user={mongo_user} -Dmongo_pass={mongo_pass} "
"-jar /home/airflow/root.jar".format(rs_user=rs_user,rs_pass=rs_pass,mongo_user=mongo_user,mongo_pass=mongo_pass),
dag=dag)
t00_00
Problem
The values for rs_pass and mongo_pass are exposed in the Rendered Template and the Airflow logs, which is not good. I would like a solution that hides all of this sensitive information from the logs and the Rendered Template when using the SSHOperator.
So far I've tried reducing the log verbosity to ERROR in airflow.cfg, but the values still show up in the Rendered Template.
Please enlighten me.
Thanks
We are running a self-managed Airflow 1.10.2 with the KubernetesExecutor on a GKE cluster in GCP. All built-in operators are working fine so far, except the KubernetesPodOperator, which we would like to use for running our custom Docker images. It seems that the Airflow worker pods don't have the privileges to start other pods inside the Kubernetes cluster. The DAG just does not seem to do anything after being triggered. This is what we found in the logs initially:
FileNotFoundError: [Errno 2] No such file or directory: '/root/.kube/config'
Next try: the in_cluster=True parameter on the KubernetesPodOperator does not seem to help. After that, we tried this parameter in airflow.cfg, section [kubernetes]:
gcp_service_account_keys = kubernetes-executor-private-key:/var/tmp/private/kubernetes_executor_private_key.json
and the error message was now TypeError: a bytes-like object is required, not 'str'
This is the parameter definition from github:
# GCP Service Account Keys to be provided to tasks run on Kubernetes Executors
# Should be supplied in the format: key-name-1:key-path-1,key-name-2:key-path-2
gcp_service_account_keys =
Already tried using various kinds of parentheses and quotes here, no success.
DAG code:
from datetime import datetime, timedelta
from airflow import DAG
from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator
from airflow.operators.dummy_operator import DummyOperator
default_args = {
'owner': 'xxx',
'depends_on_past': False,
'start_date': datetime.utcnow(),
'email': ['airflow@example.com'],
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=5)
}
dag = DAG(
'kubernetes_sample', default_args=default_args, schedule_interval=timedelta(minutes=10))
start = DummyOperator(task_id='run_this_first', dag=dag)
passing = KubernetesPodOperator(namespace='default',
image="Python:3.6",
cmds=["Python","-c"],
arguments=["print('hello world')"],
labels={"foo": "bar"},
name="passing-test",
task_id="passing-task",
in_cluster=True,
get_logs=True,
dag=dag
)
failing = KubernetesPodOperator(namespace='default',
image="ubuntu:1604",
cmds=["Python","-c"],
arguments=["print('hello world')"],
labels={"foo": "bar"},
in_cluster=True,
name="fail",
task_id="failing-task",
get_logs=True,
dag=dag
)
passing.set_upstream(start)
failing.set_upstream(start)
Is anyone facing the same problem? Am I missing something here?