How mature is Chronos? Is it a viable alternative to a scheduler like celery-beat?
Right now our scheduling implements a periodic "heartbeat" task that checks for "outstanding" events and fires them if they are overdue. We are using python-dateutil's rrule for defining this.
We are looking at alternatives to this approach, and Chronos seems a very attractive alternative: 1) it would remove the need for a heartbeat schedule task, 2) it supports RESTful submission of events in ISO 8601 format, 3) it has a useful interface for management, and 4) it scales.
The crucial requirement is that scheduling needs to be configurable on the fly from the web interface. This is why we can't use celerybeat's built-in scheduling out of the box.
Are we going to shoot ourselves in the foot by switching over to Chronos?
This SO question has solutions to your dynamic periodic task problem. The one below is not the accepted answer at the moment:
from djcelery.models import PeriodicTask, IntervalSchedule
from datetime import datetime
class TaskScheduler(models.Model):
    """Handle on a djcelery ``PeriodicTask`` that can be controlled at runtime.

    Each row points at one ``PeriodicTask``; the methods below create,
    pause, resume or remove that periodic task.
    """

    periodic_task = models.ForeignKey(PeriodicTask)

    @staticmethod
    def schedule_every(task_name, period, every, args=None, kwargs=None):
        """Schedule the task named ``task_name`` to run every ``every`` ``period``.

        Example::

            TaskScheduler.schedule_every('mycustomtask', 'seconds', 30, [1, 2, 3])

        schedules your custom task to run every 30 seconds with the
        arguments 1, 2 and 3 passed to the actual task.

        Args:
            task_name: name of the task to run.
            period: one of 'days', 'hours', 'minutes', 'seconds'.
            every: number of ``period`` units between runs.
            args: optional positional arguments stored on the periodic task.
            kwargs: optional keyword arguments stored on the periodic task.

        Returns:
            The newly created ``TaskScheduler`` row.

        Raises:
            ValueError: if ``period`` is not one of the permitted values.
        """
        permissible_periods = ['days', 'hours', 'minutes', 'seconds']
        if period not in permissible_periods:
            raise ValueError('Invalid period specified')

        # Create some (timestamped) name for the periodic task. ``datetime``
        # is the class itself here (``from datetime import datetime``), so
        # the call is ``datetime.now()``, not ``datetime.datetime.now()``.
        ptask_name = "%s_%s" % (task_name, datetime.now())

        # Reuse an interval schedule like that if one exists already,
        # otherwise create a brand new one.
        interval_schedules = IntervalSchedule.objects.filter(period=period, every=every)
        if interval_schedules:
            interval_schedule = interval_schedules[0]
        else:
            interval_schedule = IntervalSchedule()
            interval_schedule.every = every  # should check to make sure this is a positive int
            interval_schedule.period = period
            interval_schedule.save()

        ptask = PeriodicTask(name=ptask_name, task=task_name, interval=interval_schedule)
        # NOTE(review): args/kwargs are stored as given; confirm whether the
        # PeriodicTask fields expect JSON-encoded text instead.
        if args:
            ptask.args = args
        if kwargs:
            ptask.kwargs = kwargs
        ptask.save()
        return TaskScheduler.objects.create(periodic_task=ptask)

    def stop(self):
        """Pause the task by disabling its periodic task."""
        ptask = self.periodic_task
        ptask.enabled = False
        ptask.save()

    def start(self):
        """Start (re-enable) the task."""
        ptask = self.periodic_task
        ptask.enabled = True
        ptask.save()

    def terminate(self):
        """Stop the task, then delete this row and its periodic task."""
        self.stop()
        # Grab the related object before deleting ourselves so it can still
        # be deleted afterwards.
        ptask = self.periodic_task
        self.delete()
        ptask.delete()
I haven't used djcelery yet, but it supposedly has an admin interface for dynamic periodic tasks.
Related
I noticed that for scheduled tasks the execution date is set in the past, according to the following explanation:
Airflow was developed as a solution for ETL needs. In the ETL world,
you typically summarize data. So, if I want to summarize data for
2016-02-19, I would do it at 2016-02-20 midnight GMT, which would be
right after all data for 2016-02-19 becomes available.
however, when a dag triggers another dag the execution time is set to now().
Is there a way to have the triggered dags with the same execution time of triggering dag? Of course, I can rewrite the template and use yesterday_ds, however, this is a tricky solution.
The following class expands on TriggerDagRunOperator to allow passing the execution date as a string that then gets converted back into a datetime. It's a bit hacky but it is the only way I found to get the job done.
from datetime import datetime
import logging
from airflow import settings
from airflow.utils.state import State
from airflow.models import DagBag
from airflow.operators.dagrun_operator import TriggerDagRunOperator, DagRunOrder
class MMTTriggerDagRunOperator(TriggerDagRunOperator):
    """
    MMT-patched for passing explicit execution date
    (otherwise it's hard to hook the datetime.now() date).
    Use when you want to explicitly set the execution date on the target DAG
    from the controller DAG.

    Adapted from Paul Elliot's solution on airflow-dev mailing list archives:
    http://mail-archives.apache.org/mod_mbox/airflow-dev/201711.mbox/%3cCAJuWvXgLfipPmMhkbf63puPGfi_ezj8vHYWoSHpBXysXhF_oZQ@mail.gmail.com%3e

    Parameters
    ------------------
    execution_date: str
        the custom execution date (jinja'd); after rendering it must match
        the format ``%Y-%m-%d %H:%M:%S``

    Usage Example:
    -------------------
    my_dag_trigger_operator = MMTTriggerDagRunOperator(
        execution_date="{{execution_date}}",
        task_id='my_dag_trigger_operator',
        trigger_dag_id='my_target_dag_id',
        python_callable=lambda context, dro: dro if random.getrandbits(1) else None,
        params={},
        dag=my_controller_dag
    )
    """
    template_fields = ('execution_date',)

    def __init__(
        self, trigger_dag_id, python_callable, execution_date,
        *args, **kwargs
    ):
        self.execution_date = execution_date
        super(MMTTriggerDagRunOperator, self).__init__(
            trigger_dag_id=trigger_dag_id, python_callable=python_callable,
            *args, **kwargs
        )

    def execute(self, context):
        """Create a run of the target DAG at the configured execution date.

        ``python_callable`` is called with ``(context, dro)``; the DAG run is
        only created when it returns a truthy ``DagRunOrder``.

        Raises:
            ValueError: if ``self.execution_date`` does not match
                ``%Y-%m-%d %H:%M:%S``.
        """
        run_id_dt = datetime.strptime(self.execution_date, '%Y-%m-%d %H:%M:%S')
        dro = DagRunOrder(run_id='trig__' + run_id_dt.isoformat())
        dro = self.python_callable(context, dro)
        if not dro:
            logging.info("Criteria not met, moving on")
            return

        session = settings.Session()
        try:
            dbag = DagBag(settings.DAGS_FOLDER)
            trigger_dag = dbag.get_dag(self.trigger_dag_id)
            dr = trigger_dag.create_dagrun(
                run_id=dro.run_id,
                state=State.RUNNING,
                # Hand over the parsed datetime rather than the raw string so
                # the value does not depend on backend string coercion.
                execution_date=run_id_dt,
                conf=dro.payload,
                external_trigger=True,
            )
            logging.info("Creating DagRun {}".format(dr))
            session.add(dr)
            session.commit()
        finally:
            # Always release the session, even if creating the run failed.
            session.close()
There is an issue you may run into when using this and not setting execution_date=now(): your operator will throw a mysql error if you try to start a dag with an identical execution_date twice. This is because the execution_date and dag_id are used to create the row index and rows with identical indexes cannot be inserted.
I can't think of a reason you would ever want to run two identical dags with the same execution_date in production anyway, but it is something I ran into while testing and you should not be alarmed by it. Simply clear the old job or use a different datetime.
The TriggerDagRunOperator now has an execution_date parameter to set the execution date of the triggered run.
Unfortunately the parameter is not in the template fields.
If it is added to the template fields (or if you override the operator and change the template_fields value), it will be possible to use it like this:
# The execution date is handed over as a template string; it is only rendered
# when `execution_date` is one of the operator's template fields.
my_trigger_task = TriggerDagRunOperator(
    task_id='my_trigger_task',
    trigger_dag_id="triggered_dag_id",
    python_callable=conditionally_trigger,
    execution_date='{{execution_date}}',
    dag=dag,
)
It has not been released yet but you can see the sources here:
https://github.com/apache/incubator-airflow/blob/master/airflow/operators/dagrun_operator.py
The commit that did the change was:
https://github.com/apache/incubator-airflow/commit/089c996fbd9ecb0014dbefedff232e8699ce6283#diff-41f9029188bd5e500dec9804fed26fb4
I improved the MMTTriggerDagRunOperator a bit. The function checks whether the dag_run already exists; if it is found, it restarts the dag using Airflow's clear function. This allows us to create a dependency between dags, because being able to pass the execution date on to the triggered dag opens a whole universe of amazing possibilities. I wonder why this is not the default behavior in Airflow.
def execute(self, context):
    """Create the target DAG run for the configured date, or clear it if it exists.

    ``python_callable`` is called with ``(context, dro)``; nothing happens
    unless it returns a truthy ``DagRunOrder``. When the target DAG has no
    run for the execution date a new one is created, otherwise the existing
    run is cleared so that it restarts.

    Raises:
        ValueError: if ``self.execution_date`` does not match
            ``%Y-%m-%d %H:%M:%S``.
    """
    run_id_dt = datetime.strptime(self.execution_date, '%Y-%m-%d %H:%M:%S')
    dro = DagRunOrder(run_id='trig__' + run_id_dt.isoformat())
    dro = self.python_callable(context, dro)
    if not dro:
        logging.info("Criteria not met, moving on")
        return

    session = settings.Session()
    try:
        dbag = DagBag(settings.DAGS_FOLDER)
        trigger_dag = dbag.get_dag(self.trigger_dag_id)
        # Use the parsed datetime (not the raw template string) for every
        # date-typed argument below.
        if not trigger_dag.get_dagrun(run_id_dt):
            dr = trigger_dag.create_dagrun(
                run_id=dro.run_id,
                state=State.RUNNING,
                execution_date=run_id_dt,
                conf=dro.payload,
                external_trigger=True,
            )
            logging.info("Creating DagRun {}".format(dr))
            session.add(dr)
            session.commit()
        else:
            trigger_dag.clear(
                start_date=run_id_dt,
                end_date=run_id_dt,
                only_failed=False,
                only_running=False,
                confirm_prompt=False,
                reset_dag_runs=True,
                include_subdags=False,
                dry_run=False,
            )
            logging.info("Cleared DagRun {}".format(trigger_dag))
    finally:
        # Always release the session, even if the create/clear step failed.
        session.close()
There is a function available in the experimental API section of airflow that allows you to trigger a dag with a specific execution date. https://github.com/apache/incubator-airflow/blob/master/airflow/api/common/experimental/trigger_dag.py
You can call this function as a part of PythonOperator and achieve the objective.
So it will look like
from airflow.api.common.experimental.trigger_dag import trigger_dag
trigger_operator = PythonOperator(
    task_id='YOUR_TASK_ID',
    python_callable=trigger_dag,
    op_args=['dag_id'],
    # NOTE: datetime.now() is evaluated once, when this statement runs and the
    # operator is constructed -- not when the task itself executes.
    op_kwargs={'execution_date': datetime.now()},
)
I want to accomplish something like this:
# Sketch of the desired workflow (illustrative only): launch one processing
# task per slowly generated datum, then run a combining step after all of them.
results = []
for _ in range(N):
    results.append(tasks.process_data.apply_async(generate_data_slowly()))
# `celery.collect(...).then(...)` is the wished-for API shape, shown as an example.
celery.collect(results).then(tasks.combine_processed_data())
ie launch asynchronous tasks over a long period of time, then schedule a dependent task that will only be executed once all earlier tasks are complete.
I've looked at things like chain and chord, but it seems like they only work if you can construct your task graph completely upfront.
For anyone interested, I ended up using this snippet:
@app.task(bind=True, max_retries=None)
def wait_for(self, task_id_or_ids):
    """Retry with exponential backoff until the given task(s) are ready.

    Args:
        task_id_or_ids: a single task id, or an iterable of task ids.

    Raises:
        celery.exceptions.Retry: (via ``self.retry``) while at least one of
            the tasks is not ready yet.
    """
    try:
        ready = app.AsyncResult(task_id_or_ids).ready()
    except TypeError:
        # Not a single id: treat the argument as a collection of ids.
        ready = all(app.AsyncResult(task_id).ready()
                    for task_id in task_id_or_ids)
    if not ready:
        # Back off 1s, 2s, 4s, ... ; `raise` makes it explicit that
        # execution stops here.
        raise self.retry(countdown=2 ** self.request.retries)
And writing the workflow something like this:
# Launch one generate -> process chain per item and remember the id of each
# launched chain's result.
task_ids = []
for i in range(N):
    task = generate_data_slowly.si(i) | process_data.si(i)
    task_ids.append(task.delay().task_id)

# Build a signature with `.si(...)` instead of calling `wait_for(...)`
# directly: a direct call would run the task body inline and there would be
# nothing to chain with `|`.
final_task = wait_for.si(task_ids) | combine_processed_data.si()
final_task.delay()
That way you would be running your tasks synchronously.
The solution depends entirely on how and where data are collected. Roughly, given that generate_data_slowly and tasks.process_data are synchronized, a better approach would be to join both in one task (or a chain) and to group them.
chord will allow you to add a callback to that group.
The simplest example would be:
from celery import chord
@app.task
def getnprocess_data():
    """Generate one piece of data and return the result of processing it."""
    data = generate_data_slowly()
    return whatever_process_data_does(data)


# One signature per item forms the chord header (the group of tasks).
header = [getnprocess_data.s() for _ in range(N)]
# The callback is attached to that group.
callback = combine_processed_data.s()
# Launch the chord and block until the callback's result is available.
chord(header)(callback).get()
I need to schedule a task for the first day of each month. Up until now, I have been using this:
// Starts after a 0-microsecond delay and then repeats every 30 days: a fixed
// interval, not a calendar-aware "first day of the month".
// Presumably delivers the "update" message to schedulerActor on each tick.
system.scheduler.schedule(0.microseconds, 30.days, schedulerActor, "update")
But as you may have guessed, this sometimes ends up running the task twice in a month (March) or not at all (February). Is there a better way to schedule the task for the first day of each month using the Akka Scheduler?
Built-in Akka scheduler is more a delayer than a scheduler. I would recommend using akka-quartz-scheduler. This module allows you to actually schedule tasks to run when you want.
The usage is simple. Some config:
akka {
  quartz {
    schedules {
      # Schedule name; the code refers to the schedule by this key.
      YourScheduleName {
        description = "A cron job that fires off every first of the month at 5AM"
        # Cron expression implementing the description above.
        expression = "0 0 5 1 1/1 ? *"
      }
    }
  }
}
And then in the code:
// Message object passed to the scheduler together with the receiving actor.
case object Tick
val yourActor = system.actorOf(Props[YourActor])
// "YourScheduleName" is the name of the schedule to use; presumably it must
// match a key under akka.quartz.schedules in the configuration.
QuartzSchedulerExtension(system).schedule("YourScheduleName", yourActor, Tick)
Technically I can install cron on the machine and curl the url, but I'm trying to avoid that. Any way to accomplish this?
Reason I want to avoid cron is so I can easily change the schedule or stop it completely without also ssh'ing into the machine to do so.
Take a look at: https://github.com/enragedginger/akka-quartz-scheduler.
Refer to http://quartz-scheduler.org/api/2.1.7/org/quartz/CronExpression.html for valid CronExpressions and examples.
An example taken from the docs:
An example schedule called Every-30-Seconds which, aptly, fires-off every 30 seconds:
akka {
  quartz {
    schedules {
      # Schedule name used to refer to this entry.
      Every30Seconds {
        description = "A cron job that fires off every 30 seconds"
        # Cron expression implementing the description above.
        expression = "*/30 * * ? * *"
        # Names a calendar that is not defined in this snippet.
        calendar = "OnlyBusinessHours"
      }
    }
  }
}
You can integrate this into your Play! application (probably in your Global application)
You can use the Akka scheduler.
// Obtain the scheduler of the application's actor system.
val scheduler = Akka.system(app).scheduler
// First argument: initial delay (none here); second argument: repeat interval.
scheduler.schedule(0 seconds, 1 hour) {
  // run this block every hour
}
The first parameter is a delay, so if you wanted to delay to a specific time you could easily calculate the target time with some simple date arithmetic.
Check out https://github.com/philcali/cronish
Some example code from README.md:
// Define a task whose body is the block to run.
val payroll = task {
  println("You have just been paid... Finally!")
}
// Yes... that's how you run it
payroll executes "every last Friday in every month"
// A job with an attached human-readable description.
val greetings = job (println("Hello there")) describedAs "General Greetings"
// give a delayed start
val delayed = greetings runs "every day at 7:30" in 5.seconds
// give an exact time to start
val exact = greetings runs "every day at noon" starting now + 1.week
// resets a job to its definition
val reseted = exact.reset()
reseted starting now + 1.day
I want to create the following flow using celery configuration\api:
Send TaskA(argB) Only if celery queue has no TaskA(argB) already pending
Is it possible? how?
You can make your job aware of other tasks by some sort of memoization. If you use a cache control key (redis, memcached, /tmp, whatever is handy), you can make execution depend on that key. I'm using redis as an example.
from redis import Redis
@app.task
def run_only_one_instance(params):
    """Run ``perform_task()`` only when no other instance holds the sentinel.

    A Redis counter serves as the sentinel: the invocation that raises it to
    1 is the legitimate running task; any other invocation is a duplicate
    and does nothing. The counter is decremented again once the invocation
    is done, whether or not it failed.

    Args:
        params: accepted for the task signature; not used by this function.
    """
    import logging

    sentinel_key = "run_only_one_instance_sentinel"
    client = Redis()
    # Increment outside the ``try`` so that the decrement below only ever
    # undoes an increment that actually happened.
    sentinel = client.incr(sentinel_key)
    try:
        if sentinel == 1:
            # I am the legitimate running task
            perform_task()
        # else: duplicate invocation -- do something else here if needed
    except Exception:
        # Best effort, as before: the error is not propagated, but it is no
        # longer swallowed silently. Re-raise here if the caller should see it.
        logging.getLogger(__name__).exception("run_only_one_instance failed")
    finally:
        # decrement the counter exactly once to ensure tasks can run
        client.decr(sentinel_key)
I cannot think of a way but to
Retrieve all executing and scheduled tasks via celery inspect
Iterate through them to see if your task is there.
check this SO question to see how the first point is done.
good luck
I don't know whether this will help you more than the other answers, but here is my approach, following the same idea given by srj. I needed a way to prevent my server from queueing tasks with the same id, so I wrote a general function to help me.
def is_task_active_or_registered(app, task_id):
    """Return True if ``task_id`` is among the active or scheduled tasks.

    Args:
        app: object exposing ``control.inspect()``, whose result provides
            ``active()`` and ``scheduled()``; each returns a mapping of
            worker name to a list of task dicts, or ``None``.
        task_id: id of the task to look for.

    Returns:
        bool: True when a task with that id is found, False otherwise.
    """
    inspector = app.control.inspect()
    active_dict = inspector.active()
    scheduled_dict = inspector.scheduled()

    tasks_ids_set = set()
    for per_worker in (active_dict, scheduled_dict):
        # A reply may be None (e.g. no worker answered), and a worker may be
        # present in only one of the two replies, so iterate each reply's
        # own values instead of a merged key set.
        for tasks in (per_worker or {}).values():
            for task in tasks:
                # Entries may carry the id at the top level or nested under
                # a "request" key; accept both shapes.
                info = task.get('request') or task
                if 'id' in info:
                    tasks_ids_set.add(info['id'])
    return task_id in tasks_ids_set
So, I use it like this:
In the context where my celery-app object is available, I define:
def check_task_can_not_run(task_id):
    """Tell whether ``task_id`` must not be launched.

    Returns whatever ``is_task_active_or_registered`` reports for the
    ``celery`` app object and the given id.
    """
    already_known = is_task_active_or_registered(task_id=task_id, app=celery)
    return already_known
And so, from my client request, I call this check_task_can_not_run(...) and block task from being launched in case of True.
I was facing a similar problem. Beat was creating duplicates in my queue. I wanted to use expires, but this feature isn't working properly: https://github.com/celery/celery/issues/4300.
So here is a scheduler which checks whether a task has already been enqueued (based on task name).
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import json
from heapq import heappop, heappush
from celery.beat import event_t
from celery.schedules import schedstate
from django_celery_beat.schedulers import DatabaseScheduler
from typing import List, Optional
from typing import TYPE_CHECKING
from your_project import celery_app
if TYPE_CHECKING:
from celery.beat import ScheduleEntry
def is_task_in_queue(task, queue_name=None):
    # type: (str, Optional[str]) -> bool
    """Report whether a task named ``task`` is waiting in a queue.

    Only ``queue_name`` is inspected when it is given; otherwise every queue
    key of ``celery_app.amqp.queues`` is checked in turn, stopping at the
    first queue that contains the task.
    """
    if queue_name:
        candidate_queues = [queue_name]
    else:
        candidate_queues = celery_app.amqp.queues.keys()
    return any(task in get_celery_queue_tasks(name) for name in candidate_queues)
def get_celery_queue_tasks(queue_name):
    # type: (str) -> List[str]
    """List the distinct task names found in queue ``queue_name``.

    Every message is read with ``lrange`` on the broker connection's client
    (presumably a Redis list per queue -- confirm against the broker in use),
    parsed as JSON, and its ``headers.task`` value collected. Names are
    returned in first-seen order without duplicates.
    """
    with celery_app.pool.acquire(block=True) as conn:
        raw_messages = conn.default_channel.client.lrange(queue_name, 0, -1)

    unique_names = []
    for raw in raw_messages:
        name = json.loads(raw)['headers']['task']
        if name in unique_names:
            continue
        unique_names.append(name)
    return unique_names
class SmartScheduler(DatabaseScheduler):
    """
    Scheduler that avoids enqueueing a task which is already present in a queue.
    """

    def is_due(self, entry):
        # type: (ScheduleEntry) -> schedstate
        """Decide whether ``entry`` should run, suppressing queued duplicates.

        The entry's own verdict is returned unchanged unless the entry is due
        and a task with the same name is already queued. In that case the
        result is always "not due", and the next check time is capped at
        ``self.max_interval``.
        """
        due, next_run = entry.is_due()

        # Nothing to suppress unless the entry is due AND already queued.
        if not (due and is_task_in_queue(entry.task)):
            return schedstate(due, next_run)

        heap = self._heap
        if not heap:
            return schedstate(False, self.max_interval)

        head = heap[0]
        popped = heappop(heap)
        if popped is head:
            # Head is unchanged: reserve the entry's next occurrence and push
            # it back instead of letting the current one run.
            following = self.reserve(entry)
            rescheduled = event_t(self._when(following, next_run), head[1], following)
            heappush(heap, rescheduled)
        else:
            # Heap changed underneath us: restore what was popped and wake up
            # no later than that event.
            heappush(heap, popped)
            next_run = min(popped[0], next_run)

        return schedstate(False, min(next_run, self.max_interval))