How do I bind a Celery task to a specific queue? - celery

I want to write a task that is only executable from within a given queue - if somebody tries to pass a different queue into the routing_key parameter of apply_async I want to raise an exception. How do I do this?

You could write your own task that would check to make sure a valid routing key is being passed in when apply_async is being called. You can also apply this to queues. Set up routes and queues in your config:
import celery
from kombu import Queue, Exchange
app = celery.Celery('app')
app.conf.CELERY_QUEUES = (
Queue('add', Exchange('default'), routing_key='good'),
)
app.conf.CELERY_ROUTES = {
'app.add': {
'queue': 'add',
'routing_key': 'good'
}
}
Now, create your own Task class that will perform the check on the routing key. You'll need to override apply_async:
class RouteCheckerTask(celery.Task):
abstract = True
def apply_async(self, args=None, kwargs=None, task_id=None, producer=None,
link=None, link_error=None, **options):
app = self._get_app()
routing_key = options.get('routing_key', None)
if routing_key:
valid_routes = [v['routing_key'] for k, v in app.conf.CELERY_ROUTES.items()]
is_valid = routing_key in valid_routes
if not is_valid:
raise NotImplementedError('{} is not a valid routing key. Options are: {}'.format(routing_key, valid_routes))
if app.conf.CELERY_ALWAYS_EAGER:
return self.apply(args, kwargs, task_id=task_id or uuid(), link=link, link_error=link_error, **options)
# add 'self' if this is a "task_method".
if self.__self__ is not None:
args = args if isinstance(args, tuple) else tuple(args or ())
args = (self.__self__, ) + args
return app.send_task(
self.name, args, kwargs, task_id=task_id, producer=producer,
link=link, link_error=link_error, result_cls=self.AsyncResult,
**dict(self._get_exec_options(), **options)
)
Base your tasks from this one and call apply_async normally:
#app.task(base=RouteCheckerTask)
def add(x, y):
return x + y
# Fails
add.apply_async([1, 2], routing_key='bad')
# Passes
add.apply_async([1, 2], routing_key='good')

Related

Airflow - How to push xcom from ecs operator?

In my airflow dag, I have an ecs_operator task followed by python operator task. I want to push some messages from ECS task to python task using xcom feature of airflow. I tried the option do_xcom_push=True with no result. Find below sample dag.
dag = DAG(
dag_name, default_args=default_args, schedule_interval=None)
start = DummyOperator(task_id = 'start'
,dag =dag)
end = DummyOperator(task_id = 'end'
,dag =dag)
ecs_operator_args = {
'launch_type': 'FARGATE',
'task_definition': 'task-def:2',
'cluster': 'cluster-name',
'region_name': 'region',
'network_configuration': {
'awsvpcConfiguration':
{}
}
}
ecs_task = ECSOperator(
task_id='x_com_test'
,**ecs_operator_args
,do_xcom_push=True
,params={'my_param': 'Parameter-1'}
,dag=dag)
def pull_function(**kwargs):
ti = kwargs['ti']
msg = ti.xcom_pull(task_ids='x_com_test',key='the_message')
print("received message: '%s'" % msg)
pull_task = PythonOperator(
task_id='pull_task',
python_callable=pull_function,
provide_context=True,
dag=dag)
start >> ecs_task >> pull_task >> end
You need to setup a cloudwatch log group for the container.
ECSOperator needs to be extended to support pushing to xcom:
from collections import deque
from airflow.utils import apply_defaults
from airflow.contrib.operators.ecs_operator import ECSOperator
class MyECSOperator(ECSOperator):
#apply_defaults
def __init__(self, xcom_push=False, **kwargs):
super(CLECSOperator, self).__init__(**kwargs)
self.xcom_push_flag = xcom_push
def execute(self, context):
super().execute(context)
if self.xcom_push_flag:
return self._last_log_event()
def _last_log_event(self):
if self.awslogs_group and self.awslogs_stream_prefix:
task_id = self.arn.split("/")[-1]
stream_name = "{}/{}".format(self.awslogs_stream_prefix, task_id)
events = self.get_logs_hook().get_log_events(self.awslogs_group, stream_name)
last_event = deque(events, maxlen=1).pop()
return last_event["message"]
dag = DAG(
dag_name, default_args=default_args, schedule_interval=None)
start = DummyOperator(task_id = 'start'
,dag =dag)
end = DummyOperator(task_id = 'end'
,dag =dag)
ecs_operator_args = {
'launch_type': 'FARGATE',
'task_definition': 'task-def:2',
'cluster': 'cluster-name',
'region_name': 'region',
'awslogs_group': '/aws/ecs/myLogGroup',
'awslogs_stream_prefix': 'myStreamPrefix',
'network_configuration': {
'awsvpcConfiguration':
{}
}
}
ecs_task = MyECSOperator(
task_id='x_com_test'
,**ecs_operator_args
,xcom_push=True
,params={'my_param': 'Parameter-1'}
,dag=dag)
def pull_function(**kwargs):
ti = kwargs['ti']
msg = ti.xcom_pull(task_ids='x_com_test',key='return_value')
print("received message: '%s'" % msg)
pull_task = PythonOperator(
task_id='pull_task',
python_callable=pull_function,
provide_context=True,
dag=dag)
start >> ecs_task >> pull_task >> end
ecs_task will take the last event from the log group before finishing executing, and push it to xcom.
Apache-AWS has a new commit that pretty much implements what #Бојан-Аџиевски mentioned above, so you don't need to write your custom ECSOperator. Available as of version 1.1.0
All you gotta do is to provide the do_xcom_push=True when calling the ECSOperator and provide the correct awslogs_group and awslogs_stream_prefix.
Make sure your awslogs_stream_prefix follows the following format:
prefix-name/container-name
As this is what ECS directs logs to.

How to schedule a chain task in celery

I want to run a complex task scheduled by beat. Let us assume the default add/mul tasks are defined.
#app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
sender.add_periodic_task(
crontab(),
add.s(2,3) | mul.s(2)
)
But this will return an error in the worker:
NotImplementedError: chain is not a real task
How can I schedule a non trivial task with celery beat?
One way to do that is to schedule your tasks chain in beat_schedule in your celeryconfig, using link option, celery_tasks here is a module name where your tasks are defined
from celery.schedules import crontab
from celery import signature
beat_schedule = {
'chained': {
'task': 'celery_tasks.add',
'schedule': crontab(),
'options': {
'queue': 'default',
'link': signature('celery_tasks.mul',
args=(),
kwargs={},
options={
'link': signature('celery_tasks.another_task',
args=(),
kwargs={},
queue='default')
},
queue='default')
},
'args': ()
}
}
for add chained periodic tasks you can use an #app.task when declare your chain and then, add this new task on add_periodic_task() method. Example:
#app.on_after_finalize.connect ->i use this because it`s declared on task.py
def setup_periodic_tasks(sender, **kwargs):
sender.add_periodic_task(timedelta(minutes=10), chian_st22.s(),name='test')
#app.task
def chian_st22(): -> i create the task with chain
cadena = chain(st22.s(), mailer.s()).apply_async()
#app.task
def mailer(data):
clase = CheckAlert()
mail = clase.envio_mail(data)
return mail
#app.task
def st22():
clase = CheckAlert()
st = clase.check_st22_dumps()
return st

airflow http callback sensor

Our airflow implementation sends out http requests to get services to do tasks. We want those services to let airflow know when they complete their task, so we are sending a callback url to the service which they will call when their task is complete. I can't seem to find a callback sensor, however. How do people handle this normally?
There is no such thing as a callback or webhook sensor in Airflow. The sensor definition follows as taken from the documentation:
Sensors are a certain type of operator that will keep running until a certain criterion is met. Examples include a specific file landing in HDFS or S3, a partition appearing in Hive, or a specific time of the day. Sensors are derived from BaseSensorOperator and run a poke method at a specified poke_interval until it returns True.
This means that a sensor is an operator that performs polling behavior on external systems. In that sense, your external services should have a way of keeping state for each executed task - either internally or externally - so that a polling sensor can check on that state.
This way you can use for example the airflow.operators.HttpSensor that polls an HTTP endpoint until a condition is met. Or even better, write your own custom sensor that gives you the opportunity to do more complex processing and keep state.
Otherwise, if the service outputs data in a storage system you can use a sensor that polls a database for example. I believe you get the idea.
I'm attaching a custom operator example that I've written for integrating with the Apache Livy API. The sensor does two things: a) submits a Spark job through the REST API and b) waits for the job to be completed.
The operator extends the SimpleHttpOperator and at the same time implements the HttpSensor thus combining both functionalities.
class LivyBatchOperator(SimpleHttpOperator):
"""
Submits a new Spark batch job through
the Apache Livy REST API.
"""
template_fields = ('args',)
ui_color = '#f4a460'
#apply_defaults
def __init__(self,
name,
className,
file,
executorMemory='1g',
driverMemory='512m',
driverCores=1,
executorCores=1,
numExecutors=1,
args=[],
conf={},
timeout=120,
http_conn_id='apache_livy',
*arguments, **kwargs):
"""
If xcom_push is True, response of an HTTP request will also
be pushed to an XCom.
"""
super(LivyBatchOperator, self).__init__(
endpoint='batches', *arguments, **kwargs)
self.http_conn_id = http_conn_id
self.method = 'POST'
self.endpoint = 'batches'
self.name = name
self.className = className
self.file = file
self.executorMemory = executorMemory
self.driverMemory = driverMemory
self.driverCores = driverCores
self.executorCores = executorCores
self.numExecutors = numExecutors
self.args = args
self.conf = conf
self.timeout = timeout
self.poke_interval = 10
def execute(self, context):
"""
Executes the task
"""
payload = {
"name": self.name,
"className": self.className,
"executorMemory": self.executorMemory,
"driverMemory": self.driverMemory,
"driverCores": self.driverCores,
"executorCores": self.executorCores,
"numExecutors": self.numExecutors,
"file": self.file,
"args": self.args,
"conf": self.conf
}
print payload
headers = {
'X-Requested-By': 'airflow',
'Content-Type': 'application/json'
}
http = HttpHook(self.method, http_conn_id=self.http_conn_id)
self.log.info("Submitting batch through Apache Livy API")
response = http.run(self.endpoint,
json.dumps(payload),
headers,
self.extra_options)
# parse the JSON response
obj = json.loads(response.content)
# get the new batch Id
self.batch_id = obj['id']
log.info('Batch successfully submitted with Id %s', self.batch_id)
# start polling the batch status
started_at = datetime.utcnow()
while not self.poke(context):
if (datetime.utcnow() - started_at).total_seconds() > self.timeout:
raise AirflowSensorTimeout('Snap. Time is OUT.')
sleep(self.poke_interval)
self.log.info("Batch %s has finished", self.batch_id)
def poke(self, context):
'''
Function that the sensors defined while deriving this class should
override.
'''
http = HttpHook(method='GET', http_conn_id=self.http_conn_id)
self.log.info("Calling Apache Livy API to get batch status")
# call the API endpoint
endpoint = 'batches/' + str(self.batch_id)
response = http.run(endpoint)
# parse the JSON response
obj = json.loads(response.content)
# get the current state of the batch
state = obj['state']
# check the batch state
if (state == 'starting') or (state == 'running'):
# if state is 'starting' or 'running'
# signal a new polling cycle
self.log.info('Batch %s has not finished yet (%s)',
self.batch_id, state)
return False
elif state == 'success':
# if state is 'success' exit
return True
else:
# for all other states
# raise an exception and
# terminate the task
raise AirflowException(
'Batch ' + str(self.batch_id) + ' failed (' + state + ')')
Hope this will help you a bit.

falcon python example with celery

import falcon
import json
from tasks import add
from waitress import serve
class tasksresource:
def on_get(self, req, resp):
"""Handles GET requests"""
self.result = add.delay(1, 2)
self.context = {'ID': self.result.id, 'final result': self.result.ready()}
resp.body = json.dumps(self.context)
api = falcon.API()
api.add_route('/result', tasksresource())
# api.add_route('/result/task', taskresult())
if __name__ == '__main__':
serve(api, host='127.1.0.1', port=5555)
how do i get the Get the task id from json payload ( post data)
and add a route to it
Here a small example. Structure of files:
/project
__init__.py
app.py # routes, falcon etc.
tasks.py # celery
example.py # script for demonstration how it works
app.py:
import json
import falcon
from tasks import add
from celery.result import AsyncResult
class StartTask(object):
def on_get(self, req, resp):
# start task
task = add.delay(4, 4)
resp.status = falcon.HTTP_200
# return task_id to client
result = {'task_id': task.id}
resp.body = json.dumps(result)
class TaskStatus(object):
def on_get(self, req, resp, task_id):
# get result of task by task_id and generate content to client
task_result = AsyncResult(task_id)
result = {'status': task_result.status, 'result': task_result.result}
resp.status = falcon.HTTP_200
resp.body = json.dumps(result)
app = falcon.API()
# registration of routes
app.add_route('/start_task', StartTask())
app.add_route('/task_status/{task_id}', TaskStatus())
tasks.py:
from time import sleep
import celery
app = celery.Celery('tasks', broker='redis://localhost:6379/0', backend='redis://localhost:6379/0')
#app.task
def add(x, y):
"""
:param int x:
:param int y:
:return: int
"""
# sleep just for demonstration
sleep(5)
return x + y
Now we need to start celery application. Go to project folder and run:
celery -A tasks worker --loglevel=info
After this we need to start Falcon application. Go to project folder and run:
gunicorn app:app
Ok. Everything is ready.
example.py is small client side which can help to understand:
from time import sleep
import requests
# start new task
task_info = requests.get('http://127.0.0.1:8000/start_task')
task_info = task_info.json()
while True:
# check status of task by task_id while task is working
result = requests.get('http://127.0.0.1:8000/task_status/' + task_info['task_id'])
task_status = result.json()
print task_status
if task_status['status'] == 'SUCCESS' and task_status['result']:
print 'Task with id = %s is finished' % task_info['task_id']
print 'Result: %s' % task_status['result']
break
# sleep and check status one more time
sleep(1)
Just call python ./example.py and you should see something like this:
{u'status': u'PENDING', u'result': None}
{u'status': u'PENDING', u'result': None}
{u'status': u'PENDING', u'result': None}
{u'status': u'PENDING', u'result': None}
{u'status': u'PENDING', u'result': None}
{u'status': u'SUCCESS', u'result': 8}
Task with id = 76542904-6c22-4536-99d9-87efd66d9fe7 is finished
Result: 8
Hope this helps you.
The above example by Danila Ganchar is great and very helpful. I'm using celery version 4.3.0 with Python 3, and one of the errors I received from using the example above is on this line:
task_result = AsyncResult(task_id)
The error I would receive is:
AttributeError: 'DisabledBackend' object has no attribute '_get_task_meta_for'
This may be a recent change, but result.AsyncResult (or just AsyncResult in this example because he imported it from celery.result) doesn't know the backend you are using. There are 2 solutions to solving this problem:
1) You can take the AsyncResult of the actual task itself add.AsyncResult(task_id) because the add task already has the backend defined through the #app.task decorator. The downside to this in this example is you want to be able to get the result for any task by just passing in the task_id via the Falcon endpoint, so this is limited
2) The preferred method is to just pass in the app parameter to the AsyncResult function:
task = result.AsyncResult(id, app=app)
Hope this helps!

Kubernetes API call equivalent to 'kubectl apply'

I try to use the master api to update resources.
In 1.2 to update a deployment resource I'm doing kubectl apply -f new updateddeployment.yaml
How to do the same action with the api?
I checked the code in pkg/kubectl/cmd/apply.go and I think the following lines of code shows what's behind the scene when you run kubectl apply -f:
// Compute a three way strategic merge patch to send to server.
patch, err := strategicpatch.CreateThreeWayMergePatch(original, modified, current,
versionedObject, true)
helper := resource.NewHelper(info.Client, info.Mapping)
_, err = helper.Patch(info.Namespace, info.Name, api.StrategicMergePatchType, patch)
And here is the code helper.Patch:
func (m *Helper) Patch(namespace, name string, pt api.PatchType, data []byte) (runtime.Object, error) {
return m.RESTClient.Patch(pt).
NamespaceIfScoped(namespace, m.NamespaceScoped).
Resource(m.Resource).
Name(name).
Body(data).
Do().
Get()
}
This API is not really convincingly designed, since it forces us to reimplement such basic stuff at the client side...
Anyway, here is my attempt to reinvent the hexagonal wheel in Python...
Python module kube_apply
Usage is like kube_apply.fromYaml(myStuff)
can read strings or opened file streams (via lib Yaml)
handles yaml files with several concatenated objects
implementation is rather braindead and first attempts
to insert the resource. If this fails, it tries a patch,
and if this also fails, it deletes the resource and
inserts it anew.
File: kube_apply.py
#!/usr/bin/python3
# coding: utf-8
# __________ ________________________________________________ #
# kube_apply - apply Yaml similar to kubectl apply -f file.yaml #
# #
# (C) 2019 Hermann Vosseler <Ichthyostega#web.de> #
# This is OpenSource software; licensed under Apache License v2+ #
# ############################################################### #
'''
Utility for the official Kubernetes python client: apply Yaml data.
While still limited to some degree, this utility attempts to provide
functionality similar to `kubectl apply -f`
- load and parse Yaml
- try to figure out the object type and API to use
- figure out if the resource already exists, in which case
it needs to be patched or replaced alltogether.
- otherwise just create a new resource.
Based on inspiration from `kubernetes/utils/create_from_yaml.py`
#since: 2/2019
#author: Ichthyostega
'''
import re
import yaml
import logging
import kubernetes.client
def runUsageExample():
''' demonstrate usage by creating a simple Pod through default client
'''
logging.basicConfig(level=logging.DEBUG)
#
# KUBECONFIG = '/path/to/special/kubecfg.yaml'
# import kubernetes.config
# client = kubernetes.config.new_client_from_config(config_file=KUBECONFIG)
# # --or alternatively--
# kubernetes.config.load_kube_config(config_file=KUBECONFIG)
fromYaml('''
kind: Pod
apiVersion: v1
metadata:
name: dummy-pod
labels:
blow: job
spec:
containers:
- name: sleepr
image: busybox
command:
- /bin/sh
- -c
- sleep 24000
''')
def fromYaml(rawData, client=None, **kwargs):
''' invoke the K8s API to create or replace an object given as YAML spec.
#param rawData: either a string or an opened input stream with a
YAML formatted spec, as you'd use for `kubectl apply -f`
#param client: (optional) preconfigured client environment to use for invocation
#param kwargs: (optional) further arguments to pass to the create/replace call
#return: response object from Kubernetes API call
'''
for obj in yaml.load_all(rawData):
createOrUpdateOrReplace(obj, client, **kwargs)
def createOrUpdateOrReplace(obj, client=None, **kwargs):
''' invoke the K8s API to create or replace a kubernetes object.
The first attempt is to create(insert) this object; when this is rejected because
of an existing object with same name, we attempt to patch this existing object.
As a last resort, if even the patch is rejected, we *delete* the existing object
and recreate from scratch.
#param obj: complete object specification, including API version and metadata.
#param client: (optional) preconfigured client environment to use for invocation
#param kwargs: (optional) further arguments to pass to the create/replace call
#return: response object from Kubernetes API call
'''
k8sApi = findK8sApi(obj, client)
try:
res = invokeApi(k8sApi, 'create', obj, **kwargs)
logging.debug('K8s: %s created -> uid=%s', describe(obj), res.metadata.uid)
except kubernetes.client.rest.ApiException as apiEx:
if apiEx.reason != 'Conflict': raise
try:
# asking for forgiveness...
res = invokeApi(k8sApi, 'patch', obj, **kwargs)
logging.debug('K8s: %s PATCHED -> uid=%s', describe(obj), res.metadata.uid)
except kubernetes.client.rest.ApiException as apiEx:
if apiEx.reason != 'Unprocessable Entity': raise
try:
# second attempt... delete the existing object and re-insert
logging.debug('K8s: replacing %s FAILED. Attempting deletion and recreation...', describe(obj))
res = invokeApi(k8sApi, 'delete', obj, **kwargs)
logging.debug('K8s: %s DELETED...', describe(obj))
res = invokeApi(k8sApi, 'create', obj, **kwargs)
logging.debug('K8s: %s CREATED -> uid=%s', describe(obj), res.metadata.uid)
except Exception as ex:
message = 'K8s: FAILURE updating %s. Exception: %s' % (describe(obj), ex)
logging.error(message)
raise RuntimeError(message)
return res
def patchObject(obj, client=None, **kwargs):
k8sApi = findK8sApi(obj, client)
try:
res = invokeApi(k8sApi, 'patch', obj, **kwargs)
logging.debug('K8s: %s PATCHED -> uid=%s', describe(obj), res.metadata.uid)
return res
except kubernetes.client.rest.ApiException as apiEx:
if apiEx.reason == 'Unprocessable Entity':
message = 'K8s: patch for %s rejected. Exception: %s' % (describe(obj), apiEx)
logging.error(message)
raise RuntimeError(message)
else:
raise
def deleteObject(obj, client=None, **kwargs):
k8sApi = findK8sApi(obj, client)
try:
res = invokeApi(k8sApi, 'delete', obj, **kwargs)
logging.debug('K8s: %s DELETED. uid was: %s', describe(obj), res.details and res.details.uid or '?')
return True
except kubernetes.client.rest.ApiException as apiEx:
if apiEx.reason == 'Not Found':
logging.warning('K8s: %s does not exist (anymore).', describe(obj))
return False
else:
message = 'K8s: deleting %s FAILED. Exception: %s' % (describe(obj), apiEx)
logging.error(message)
raise RuntimeError(message)
def findK8sApi(obj, client=None):
''' Investigate the object spec and lookup the corresponding API object
#param client: (optional) preconfigured client environment to use for invocation
#return: a client instance wired to the apriopriate API
'''
grp, _, ver = obj['apiVersion'].partition('/')
if ver == '':
ver = grp
grp = 'core'
# Strip 'k8s.io', camel-case-join dot separated parts. rbac.authorization.k8s.io -> RbacAuthorzation
grp = ''.join(part.capitalize() for part in grp.rsplit('.k8s.io', 1)[0].split('.'))
ver = ver.capitalize()
k8sApi = '%s%sApi' % (grp, ver)
return getattr(kubernetes.client, k8sApi)(client)
def invokeApi(k8sApi, action, obj, **args):
''' find a suitalbe function and perform the actual API invocation.
#param k8sApi: client object for the invocation, wired to correct API version
#param action: either 'create' (to inject a new objet) or 'replace','patch','delete'
#param obj: the full object spec to be passed into the API invocation
#param args: (optional) extraneous arguments to pass
#return: response object from Kubernetes API call
'''
# transform ActionType from Yaml into action_type for swagger API
kind = camel2snake(obj['kind'])
# determine namespace to place the object in, supply default
try: namespace = obj['metadata']['namespace']
except: namespace = 'default'
functionName = '%s_%s' %(action,kind)
if hasattr(k8sApi, functionName):
# namespace agnostic API
function = getattr(k8sApi, functionName)
else:
functionName = '%s_namespaced_%s' %(action,kind)
function = getattr(k8sApi, functionName)
args['namespace'] = namespace
if not 'create' in functionName:
args['name'] = obj['metadata']['name']
if 'delete' in functionName:
from kubernetes.client.models.v1_delete_options import V1DeleteOptions
obj = V1DeleteOptions()
return function(body=obj, **args)
def describe(obj):
return "%s '%s'" % (obj['kind'], obj['metadata']['name'])
def camel2snake(string):
string = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', string)
string = re.sub('([a-z0-9])([A-Z])', r'\1_\2', string).lower()
return string
if __name__=='__main__':
runUsageExample()
You could install the kubectl binary and invoke it from within your Python program a la:
exec(f"kubectl apply -f - <<EOF{yaml_manifests}EOF --prune")
Once server-side apply is ready this problem should get a bit easier as there will effectively be a k8s API endpoint you can hit (though still doesn't sound like it will be resource-agnostic, i.e. you will still have to PATCH /api/v1/some-k8s-resource specifically, whereas with kubectl apply you can input some heterogenous list of resources).