Unregistered task in Celery - celery

tasks.py:
from celery import Celery
from django.http import HttpResponse
from anyjson import serialize
celery = Celery('tasks', broker='amqp://guest#localhost//')
##celery.task
def add(request):
x = int(request.GET['x'])
y = int(request.GET['y'])
result = x+y
response = {'status': 'success', 'retval': result}
return HttpResponse(serialize(response), mimetype='application/json')
after starting the worker,i tried http dispatcher:
from celery.task.http import URL
res = URL('http://localhost/add').get_async(x=10, y=10)
res.state
'PENDING'
and got error like,
[2013-04-03 06:39:54,791: ERROR/MainProcess] Received unregistered task of type u'celery.task.http.HttpDispatchTask'.
The message has been ignored and discarded.
Did you remember to import the module containing this task?
Or maybe you are using relative imports?
More: http://docs.celeryq.org/en/latest/userguide/tasks.html#names
The full contents of the message body was:
{u'utc': True, u'chord': None, u'args': [u'http://localhost/add', u'GET'], u'retries': 0, u'expires': None, u'task': u'celery.task.http.HttpDispatchTask', u'callbacks': None, u'errbacks': None, u'taskset': None, u'kwargs': {u'y': 10, u'x': 10}, u'eta': None, u'id': u'29f83cc9-ba5a-4008-9d2d-6f7bb24b0cfc'} (288b)
Traceback (most recent call last):
File "/usr/local/gdp/python2.7.2/lib/python2.7/site-packages/celery-3.0.13-py2.7.egg/celery/worker/consumer.py", line 435, in on_task_received
strategies[name](message, body, message.ack_log_error)
KeyError: u'celery.task.http.HttpDispatchTask'

Apologies if I'm stating the obvious, but the task decorator ##celery.task is commented out.

Related

Python multiprocessing, can't pickle thread.lock (pymongo.Cursor)

First, let me assure you I read all the relevant answers and they don't work for me.
I am using multiprocessing Pool to parallelize my data creation. I am using Mongodb 5.0 and pymongo client.
As you can see I am initializing the mongo client in the worker as suggested by the available answers but still I get a :
TypeError: cannot pickle '_thread.lock' object
Exception ignored in: <function CommandCursor.__del__ at 0x7f96f6fff160>
Is there a way I can use multiprocessing with pymongo.Cursor ??
Any help will be appreciated
This is the function that calls the Pool
def get_all_valid_events(
event_criteria:str,
all_listings:List[str],
earnings:List[Dict[str,Any]],
days_around_earnings=0,
debug=False,
poolsize=10,
chunk_size=100,
lookback=30,
lookahead = 0
):
start = time.perf_counter()
listings = Manager().list(all_listings.copy())
valid_events = []
if debug:
for i in range(ceil(len(listings)/chunk_size)):
valid_events += get_valid_event_dates_by_listing(event_criteria,listings[i*chunk_size:(i+1)*chunk_size] , earnings, days_around_earnings,debug)
else:
payload = list()
for i in range(ceil(len(listings)/chunk_size)):
payload.append(
[
event_criteria,
listings[i*chunk_size:(i+1)*chunk_size],
earnings,
days_around_earnings,
debug,
lookback,
lookahead
]
)
with ThreadPool(poolsize) as pool:
valid_events = pool.starmap(get_valid_event_dates_by_listing, payload)
print(f"getting all valid true events took {time.perf_counter() - start} sec")
return valid_events
And this is the worker function:
def get_valid_event_dates_by_listing(
event_criteria:str,
listings:List[str],
earnings_list,
days_around_earnings=0,
debug=False,
lookback=30,
lookahead=0
) -> List[Tuple[Tuple[str, datetime], int]]:
#TODO: generalize event filter
start = time.perf_counter()
client = MongoClient()
db = client['stock_signals']
cursor_candles_by_listing = db.candles.find(
{'listing': {'$in': listings}},
{'_id':0, 'listing':1, 'date':1,'position':1, 'PD_BBANDS_6_lower':1, 'close':1, 'PD_BBANDS_6_upper':1}
)
candles = list(cursor_candles_by_listing)
df = pd.DataFrame(candles).dropna()
minimum_position_dict = dict(df.groupby('listing').min()['position']) # We need the minimum position by listing to filter only events that have lookback
# Filter only the dates that satisfy the criteria
lte_previous_bb_6_lower = df['close'] <= df[f"{event_criteria}_lower"].shift()
gte_previous_bb_6_upper = df['close'] >= df[f"{event_criteria}_upper"].shift()
potential_true_events_df = df[lte_previous_bb_6_lower | gte_previous_bb_6_upper]
potential_false_events_df = df.drop(potential_true_events_df.index)
potential_true_event_dates = potential_true_events_df[['listing', 'date', 'position']].values
actual_true_event_dates = earning_helpers.filter_event_dates_by_earnings_and_position(potential_true_event_dates, earnings_list, minimum_position_dict ,days_around_earning=days_around_earnings, lookback=lookback)
true_event_dates = [((event_date[0], event_date[1], event_date[2]), 1) for event_date in actual_true_event_dates]
potential_false_event_dates = potential_false_events_df[['listing', 'date', 'position']].values
actual_false_event_dates = _random_false_events_from_listing_df(potential_false_event_dates, len(actual_true_event_dates), earnings_list, minimum_position_dict, days_around_earnings,lookback)
false_events_dates = [((event_date[0], event_date[1], event_date[2]), 0) for event_date in actual_false_event_dates]
all_event_dates = true_event_dates + false_events_dates
shuffle(all_event_dates)
print(f"getting a true sequence for listing took {time.perf_counter() - start} sec")
return all_event_dates
And this is my main
from utils import event_helpers, earning_helpers
from utils.queries import get_candle_listing
if __name__ == "__main__":
all_listings = get_candle_listing.get_listings()
earnigns = earning_helpers.get_all_earnings_dates()
res = event_helpers.get_all_valid_events('PD_BBANDS_6', all_listings, earnigns, 2, chunk_size=100)
Full Stack Trace
File "test_multiprocess.py", line 8, in <module>
res = event_helpers.get_all_valid_events('PD_BBANDS_6', all_listings, earnigns, 2, chunk_size=100)
File "/media/data/projects/ml/signal_platform/utils/event_helpers.py", line 53, in get_all_valid_events
valid_events = pool.starmap(get_valid_event_dates_by_listing, payload)
File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/pool.py", line 372, in starmap
return self._map_async(func, iterable, starmapstar, chunksize).get()
File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/pool.py", line 771, in get
raise self._value
File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/pool.py", line 537, in _handle_tasks
put(task)
File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
TypeError: cannot pickle '_thread.lock' object
Exception ignored in: <function CommandCursor.__del__ at 0x7f46e91e21f0>
Traceback (most recent call last):
File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/command_cursor.py", line 68, in __del__
File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/command_cursor.py", line 83, in __die
File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/mongo_client.py", line 1696, in _cleanup_cursor
File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/client_session.py", line 466, in _end_session
File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/client_session.py", line 871, in in_transaction
File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/client_session.py", line 362, in active
AttributeError: 'NoneType' object has no attribute 'STARTING'
Update: 01-23
I tried using the multiprocess library using dill but it didn't help

Chaining an iterable task result to a group, then chaining the results of this group to a chord

Related: How to chain a Celery task that returns a list into a group? and Chain a celery task's results into a distributed group, but they lack a MWE, to my understanding do not fully cover my needs, and are maybe outdated.
I need to dynamically pass the output of a task to a parallel group of tasks, then pass the results of this group to a final task. I would like to make this a single "meta task".
Here's my attempt so far:
# chaintasks.py
import random
from typing import List, Iterable, Callable
from celery import Celery, signature, group, chord
app = Celery(
"chaintasks",
backend="redis://localhost:6379",
broker="pyamqp://guest#localhost//",
)
app.conf.update(task_track_started=True, result_persistent=True)
#app.task
def select_items(items: List[str]):
print("In select_items, received", items)
selection = tuple(set(random.choices(items, k=random.randint(1, len(items)))))
print("In select_items, sending", selection)
return selection
#app.task
def process_item(item: str):
print("In process_item, received", item)
return item + "_processed"
#app.task
def group_items(items: List[str]):
print("In group_items, received", items)
return ":".join(items)
#app.task
def dynamic_map_group(iterable: Iterable, callback: Callable):
# Map a callback over an iterator and return as a group
# Credit to https://stackoverflow.com/a/13569873/5902284
callback = signature(callback)
return group(callback.clone((arg,)) for arg in iterable).delay()
#app.task
def dynamic_map_chord(iterable: Iterable, callback: Callable):
callback = signature(callback)
return chord(callback.clone((arg,)) for arg in iterable).delay()
Now, to try this out, let's spin up a celery worker and use docker to for rabbitmq and redis
$ docker run -p 5672:5672 rabbitmq
$ docker run -p 6379:6379 redis
$ celery -A chaintasks:app worker -E
Now, let's try to illustrate what I need:
items = ["item1", "item2", "item3", "item4"]
print(select_items.s(items).apply_async().get())
This outputs: ['item3'], great.
meta_task1 = select_items.s(items) | dynamic_map_group.s(process_item.s())
meta_task1_result = meta_task1.apply_async().get()
print(meta_task1_result)
It gets trickier here: the output is [['b916f5d3-4367-46c5-896f-f2d2e2e59a00', None], [[['3f78d31b-e374-484e-b05b-40c7a2038081', None], None], [['bce50d49-466a-43e9-b6ad-1b78b973b50f', None], None]]].
I am not sure what the Nones, mean, but I guess the UUIDs represents the subtasks of my meta task. But how do I get their results from here?
I tried:
subtasks_ids = [i[0][0] for i in meta_task1_result[1]]
processed_items = [app.AsyncResult(i).get() for i in subtasks_ids]
but this just hangs. What am I doing wrong here? I'm expecting an output looking like ['processed_item1', 'processed_item3']
What about trying to chord the output of select_items to group_items?
meta_task2 = select_items.s(items) | dynamic_map_chord.s(group_items.s())
print(meta_task2.apply_async().get())
Ouch, this raises an AttributeError that I don't really understand.
Traceback (most recent call last):
File "chaintasks.py", line 70, in <module>
print(meta_task2.apply_async().get())
File "/site-packages/celery/result.py", line 223, in get
return self.backend.wait_for_pending(
File "/site-packages/celery/backends/asynchronous.py", line 201, in wait_for_pending
return result.maybe_throw(callback=callback, propagate=propagate)
File "/site-packages/celery/result.py", line 335, in maybe_throw
self.throw(value, self._to_remote_traceback(tb))
File "/site-packages/celery/result.py", line 328, in throw
self.on_ready.throw(*args, **kwargs)
File "/site-packages/vine/promises.py", line 234, in throw
reraise(type(exc), exc, tb)
File "/site-packages/vine/utils.py", line 30, in reraise
raise value
AttributeError: 'NoneType' object has no attribute 'clone'
Process finished with exit code 1
And finally, what I really am trying to do is something like:
ultimate_meta_task = (
select_items.s(items)
| dynamic_map_group.s(process_item.s())
| dynamic_map_chord.s(group_items.s())
)
print(ultimate_meta_task.apply_async().get())
but this leads to:
Traceback (most recent call last):
File "chaintasks.py", line 77, in <module>
print(ultimate_meta_task.apply_async().get())
File "/site-packages/celery/result.py", line 223, in get
return self.backend.wait_for_pending(
File "/site-packages/celery/backends/asynchronous.py", line 199, in wait_for_pending
for _ in self._wait_for_pending(result, **kwargs):
File "/site-packages/celery/backends/asynchronous.py", line 265, in _wait_for_pending
for _ in self.drain_events_until(
File "/site-packages/celery/backends/asynchronous.py", line 58, in drain_events_until
on_interval()
File "/site-packages/vine/promises.py", line 160, in __call__
return self.throw()
File "/site-packages/vine/promises.py", line 157, in __call__
retval = fun(*final_args, **final_kwargs)
File "/site-packages/celery/result.py", line 236, in _maybe_reraise_parent_error
node.maybe_throw()
File "/site-packages/celery/result.py", line 335, in maybe_throw
self.throw(value, self._to_remote_traceback(tb))
File "/site-packages/celery/result.py", line 328, in throw
self.on_ready.throw(*args, **kwargs)
File "/site-packages/vine/promises.py", line 234, in throw
reraise(type(exc), exc, tb)
File "/site-packages/vine/utils.py", line 30, in reraise
raise value
kombu.exceptions.EncodeError: TypeError('Object of type GroupResult is not JSON serializable')
Process finished with exit code 1
What I try to achieve here, is to get an output looking like 'processed_item1:processed_item3'. Is this even doable using celery? Any help here is appreciated.

Error creating universal sentence encoder embeddings using beam & tf transform

I have a simple beam pipline that takes some text and gets embeddings using universal sentence encoder with tf transform. Very similar to the demo made using tf 1.
import tensorflow as tf
import apache_beam as beam
import tensorflow_transform.beam as tft_beam
import tensorflow_transform.coders as tft_coders
from apache_beam.options.pipeline_options import PipelineOptions
import tempfile
model = None
def embed_text(text):
import tensorflow_hub as hub
global model
if model is None:
model = hub.load(
'https://tfhub.dev/google/universal-sentence-encoder/4')
embedding = model(text)
return embedding
def get_metadata():
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import dataset_metadata
metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
'id': dataset_schema.ColumnSchema(
tf.string, [], dataset_schema.FixedColumnRepresentation()),
'text': dataset_schema.ColumnSchema(
tf.string, [], dataset_schema.FixedColumnRepresentation())
}))
return metadata
def preprocess_fn(input_features):
text_integerized = embed_text(input_features['text'])
output_features = {
'id': input_features['id'],
'embedding': text_integerized
}
return output_features
def run(pipeline_options, known_args):
argv = None # if None, uses sys.argv
pipeline_options = PipelineOptions(argv)
pipeline = beam.Pipeline(options=pipeline_options)
with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
articles = (
pipeline
| beam.Create([
{'id':'01','text':'To be, or not to be: that is the question: '},
{'id':'02','text':"Whether 'tis nobler in the mind to suffer "},
{'id':'03','text':'The slings and arrows of outrageous fortune, '},
{'id':'04','text':'Or to take arms against a sea of troubles, '},
]))
articles_dataset = (articles, get_metadata())
transformed_dataset, transform_fn = (
articles_dataset
| 'Extract embeddings' >> tft_beam.AnalyzeAndTransformDataset(preprocess_fn)
)
transformed_data, transformed_metadata = transformed_dataset
_ = (
transformed_data | 'Write embeddings to TFRecords' >> beam.io.tfrecordio.WriteToTFRecord(
file_path_prefix='{0}'.format(known_args.output_dir),
file_name_suffix='.tfrecords',
coder=tft_coders.example_proto_coder.ExampleProtoCoder(
transformed_metadata.schema),
num_shards=1
)
)
result = pipeline.run()
result.wait_until_finished()
python 3.6.8, tf==2.0, tf_transform==0.15, apache-beam[gcp]==0.16 (I tried various compatible combos from https://github.com/tensorflow/transform)
I am getting an error when tf_transform calls the graph analyser:
...
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/beam/impl.py", line 462, in process
lambda: self._make_graph_state(saved_model_dir))
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tfx_bsl/beam/shared.py", line 221, in acquire
return _shared_map.acquire(self._key, constructor_fn)
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tfx_bsl/beam/shared.py", line 184, in acquire
result = control_block.acquire(constructor_fn)
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tfx_bsl/beam/shared.py", line 87, in acquire
result = constructor_fn()
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/beam/impl.py", line 462, in <lambda>
lambda: self._make_graph_state(saved_model_dir))
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/beam/impl.py", line 438, in _make_graph_state
self._exclude_outputs, self._tf_config)
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/beam/impl.py", line 357, in __init__
tensor_inputs = graph_tools.get_dependent_inputs(graph, inputs, fetches)
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/graph_tools.py", line 686, in get_dependent_inputs
sink_tensors_ready)
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/graph_tools.py", line 499, in __init__
table_init_op, graph_analyzer_for_table_init, translate_path_fn)
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/graph_tools.py", line 560, in _get_table_init_op_source_info
if table_init_op.type not in _TABLE_INIT_OP_TYPES:
AttributeError: 'Tensor' object has no attribute 'type' [while running 'Extract embeddings/TransformDataset/Transform']
Exception ignored in: <bound method CapturableResourceDeleter.__del__ of <tensorflow.python.training.tracking.tracking.CapturableResourceDeleter object at 0x14152fbe0>>
Traceback (most recent call last):
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_core/python/training/tracking/tracking.py", line 190, in __del__
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3872, in as_default
File "/Users/justingrace/.pyenv/versions/3.6.8/lib/python3.6/contextlib.py", line 159, in helper
TypeError: 'NoneType' object is not callable
It appears like the graph analyser is expecting a list of operations with a type attribute but it is receiving a tensor. I can't grasp why this error is occuring other than a bug in the graph analyzer or a compatibility issue with tfx_bsl (there seem to be issues with pyarrow 0.14 so I have downgraded to 0.13)
Output of pip freeze:
absl-py==0.8.1
annoy==1.12.0
apache-beam==2.16.0
appnope==0.1.0
astor==0.8.1
astunparse==1.6.3
attrs==19.1.0
avro-python3==1.9.1
backcall==0.1.0
bleach==3.1.0
cachetools==3.1.1
certifi==2019.11.28
chardet==3.0.4
crcmod==1.7
cymem==1.31.2
cytoolz==0.9.0.1
decorator==4.4.1
defusedxml==0.6.0
dill==0.3.0
docopt==0.6.2
en-core-web-lg==2.0.0
en-coref-lg==3.0.0
en-ner-trained==2.0.0
entrypoints==0.3
fastavro==0.21.24
fasteners==0.15
flashtext==2.7
future==0.18.2
fuzzywuzzy==0.16.0
gast==0.2.2
google-api-core==1.16.0
google-apitools==0.5.28
google-auth==1.11.0
google-auth-oauthlib==0.4.1
google-cloud-bigquery==1.17.1
google-cloud-bigtable==1.0.0
google-cloud-core==1.3.0
google-cloud-datastore==1.7.4
google-cloud-pubsub==1.0.2
google-pasta==0.1.8
google-resumable-media==0.4.1
googleapis-common-protos==1.51.0
grpc-google-iam-v1==0.12.3
grpcio==1.24.0
h5py==2.10.0
hdfs==2.5.8
httplib2==0.12.0
idna==2.8
importlib-metadata==1.5.0
ipykernel==5.1.4
ipython==7.12.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.16.0
Jinja2==2.11.1
jsonpickle==1.2
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==5.3.4
jupyter-console==6.1.0
jupyter-core==4.6.2
Keras-Applications==1.0.8
Keras-Preprocessing==1.1.0
lxml==4.2.1
Markdown==3.2.1
MarkupSafe==1.1.1
mistune==0.8.4
mock==2.0.0
monotonic==1.5
more-itertools==8.2.0
msgpack==0.6.2
msgpack-numpy==0.4.4
murmurhash==0.28.0
nbconvert==5.6.1
nbformat==5.0.4
networkx==2.1
nltk==3.4.5
notebook==6.0.3
numpy==1.18.1
oauth2client==3.0.0
oauthlib==3.1.0
opt-einsum==3.1.0
packaging==20.1
pandas==0.23.0
pandocfilters==1.4.2
parso==0.6.1
pathlib2==2.3.5
pbr==5.4.4
pexpect==4.8.0
pickleshare==0.7.5
plac==0.9.6
pluggy==0.13.1
preshed==1.0.1
prometheus-client==0.7.1
prompt-toolkit==3.0.3
proto-google-cloud-datastore-v1==0.90.4
protobuf==3.11.3
psutil==5.6.7
ptyprocess==0.6.0
py==1.8.1
pyahocorasick==1.4.0
pyarrow==0.13.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
pydot==1.4.1
Pygments==2.5.2
PyHamcrest==1.9.0
pymongo==3.10.1
pyparsing==2.4.6
pyrsistent==0.15.7
pytest==5.3.5
python-dateutil==2.8.0
python-Levenshtein==0.12.0
pytz==2019.3
PyYAML==3.13
pyzmq==18.1.1
qtconsole==4.6.0
regex==2017.4.5
repoze.lru==0.7
requests==2.22.0
requests-oauthlib==1.3.0
rsa==4.0
scikit-learn==0.19.1
scipy==1.4.1
Send2Trash==1.5.0
six==1.14.0
spacy==2.0.12
tb-nightly==2.2.0a20200217
tensorboard==2.0.2
tensorflow==2.0.0
tensorflow-estimator==2.0.1
tensorflow-hub==0.6.0
tensorflow-metadata==0.15.2
tensorflow-serving-api==2.1.0
tensorflow-transform==0.15.0
termcolor==1.1.0
terminado==0.8.3
testpath==0.4.4
textblob==0.15.1
tf-estimator-nightly==2.1.0.dev2020012309
tf-nightly==2.2.0.dev20200217
tfx-bsl==0.15.0
thinc==6.10.3
toolz==0.10.0
tornado==6.0.3
tqdm==4.23.3
traitlets==4.3.3
typing==3.7.4.1
typing-extensions==3.7.4.1
ujson==1.35
Unidecode==1.0.22
urllib3==1.25.8
wcwidth==0.1.8
webencodings==0.5.1
Werkzeug==1.0.0
Whoosh==2.7.4
widgetsnbextension==3.5.1
wrapt==1.11.2
zipp==2.2.0
This could be an underlying issue according to this github post. Try using an updated version of tensorflow (2.1.0), or maybe even an updated version of your keras packages.

Hyperopt mongotrials issue with Pickle: AttributeError: 'module' object has no attribute

I'm trying to use Hyperopt parallel search with MongoDB, and encountered some issues with Mongotrials, which have been discussed here. I've tried all their methods, and I am still unable to find solutions to my specific problem. The specific model I'm trying to minimize is RadomForestRegressor from sklearn.
I've followed this tutorial. And I'm able to print out the calculated "fmin" with no issue.
Here are my steps so far:
1) Activate a virtual environment called "tensorflow" (I've installed all my libraries there)
2) Start MongoDB:
(tensorflow) bash-3.2$ mongod --dbpath . --port 1234 --directoryperdb --journal --nohttpinterface
3) Initiate workers:
(tensorflow) bash-3.2$ hyperopt-mongo-worker --mongo=localhost:1234/foo_db --poll-interval=0.1
4) Run my python code, and my python code is as follows:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from hyperopt.mongoexp import MongoTrials
# Preprocessing data
train_xg = pd.read_csv('train.csv')
n_train = len(train_xg)
print "Whole data set size: ", n_train
# Creating columns for features, and categorical features
features_col = [x for x in train_xg.columns if x not in ['id', 'loss', 'log_loss']]
cat_features_col = [x for x in train_xg.select_dtypes(include=['object']).columns if x not in ['id', 'loss', 'log_loss']]
for c in range(len(cat_features_col)):
train_xg[cat_features_col[c]] = train_xg[cat_features_col[c]].astype('category').cat.codes
# Use this to train random forest regressor
train_xg_x = np.array(train_xg[features_col])
train_xg_y = np.array(train_xg['loss'])
space_rf = { 'min_samples_leaf': hp.choice('min_samples_leaf', range(1,100)) }
trials = MongoTrials('mongo://localhost:1234/foo_db/jobs', exp_key='exp1')
def minMe(params):
# Hyperopt tuning for hyperparameters
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from hyperopt import STATUS_OK
try:
import dill as pickle
print('Went with dill')
except ImportError:
import pickle
def hyperopt_rf(params):
rf = RandomForestRegressor(**params)
return cross_val_score(rf, train_xg_x, train_xg_y).mean()
acc = hyperopt_rf(params)
print 'new acc:', acc, 'params: ', params
return {'loss': -acc, 'status': STATUS_OK}
best = fmin(fn=minMe, space=space_rf, trials=trials, algo=tpe.suggest, max_evals=100)
print "Best: ", best
5) After I run the above Python code, I get the following errors:
INFO:hyperopt.mongoexp:Error while unpickling. Try installing dill via "pip install dill" for enhanced pickling support.
INFO:hyperopt.mongoexp:job exception: 'module' object has no attribute 'minMe'
Traceback (most recent call last):
File "/Users/WernerChao/tensorflow/bin/hyperopt-mongo-worker", line 6, in <module>
sys.exit(hyperopt.mongoexp.main_worker())
File "/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1302, in main_worker
return main_worker_helper(options, args)
File "/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1249, in main_worker_helper
mworker.run_one(reserve_timeout=float(options.reserve_timeout))
File "/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1064, in run_one
domain = pickle.loads(blob)
AttributeError: 'module' object has no attribute 'minMe'
INFO:hyperopt.mongoexp:PROTOCOL mongo
INFO:hyperopt.mongoexp:USERNAME None
INFO:hyperopt.mongoexp:HOSTNAME localhost
INFO:hyperopt.mongoexp:PORT 1234
INFO:hyperopt.mongoexp:PATH /foo_db/jobs
INFO:hyperopt.mongoexp:DB foo_db
INFO:hyperopt.mongoexp:COLLECTION jobs
INFO:hyperopt.mongoexp:PASS None
INFO:hyperopt.mongoexp:Error while unpickling. Try installing dill via "pip install dill" for enhanced pickling support.
INFO:hyperopt.mongoexp:job exception: 'module' object has no attribute 'minMe'
Traceback (most recent call last):
File "/Users/WernerChao/tensorflow/bin/hyperopt-mongo-worker", line 6, in <module>
sys.exit(hyperopt.mongoexp.main_worker())
File "/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1302, in main_worker
return main_worker_helper(options, args)
File "/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1249, in main_worker_helper
mworker.run_one(reserve_timeout=float(options.reserve_timeout))
File "/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1064, in run_one
domain = pickle.loads(blob)
AttributeError: 'module' object has no attribute 'minMe'
INFO:hyperopt.mongoexp:PROTOCOL mongo
INFO:hyperopt.mongoexp:USERNAME None
INFO:hyperopt.mongoexp:HOSTNAME localhost
INFO:hyperopt.mongoexp:PORT 1234
INFO:hyperopt.mongoexp:PATH /foo_db/jobs
INFO:hyperopt.mongoexp:DB foo_db
INFO:hyperopt.mongoexp:COLLECTION jobs
INFO:hyperopt.mongoexp:PASS None
INFO:hyperopt.mongoexp:Error while unpickling. Try installing dill via "pip install dill" for enhanced pickling support.
INFO:hyperopt.mongoexp:job exception: 'module' object has no attribute 'minMe'
Traceback (most recent call last):
File "/Users/WernerChao/tensorflow/bin/hyperopt-mongo-worker", line 6, in <module>
sys.exit(hyperopt.mongoexp.main_worker())
File "/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1302, in main_worker
return main_worker_helper(options, args)
File "/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1249, in main_worker_helper
mworker.run_one(reserve_timeout=float(options.reserve_timeout))
File "/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1064, in run_one
domain = pickle.loads(blob)
AttributeError: 'module' object has no attribute 'minMe'
INFO:hyperopt.mongoexp:PROTOCOL mongo
INFO:hyperopt.mongoexp:USERNAME None
INFO:hyperopt.mongoexp:HOSTNAME localhost
INFO:hyperopt.mongoexp:PORT 1234
INFO:hyperopt.mongoexp:PATH /foo_db/jobs
INFO:hyperopt.mongoexp:DB foo_db
INFO:hyperopt.mongoexp:COLLECTION jobs
INFO:hyperopt.mongoexp:PASS None
INFO:hyperopt.mongoexp:no job found, sleeping for 0.7s
INFO:hyperopt.mongoexp:Error while unpickling. Try installing dill via "pip install dill" for enhanced pickling support.
INFO:hyperopt.mongoexp:job exception: 'module' object has no attribute 'minMe'
Traceback (most recent call last):
File "/Users/WernerChao/tensorflow/bin/hyperopt-mongo-worker", line 6, in <module>
sys.exit(hyperopt.mongoexp.main_worker())
File "/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1302, in main_worker
return main_worker_helper(options, args)
File "/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1249, in main_worker_helper
mworker.run_one(reserve_timeout=float(options.reserve_timeout))
File "/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1064, in run_one
domain = pickle.loads(blob)
AttributeError: 'module' object has no attribute 'minMe'
INFO:hyperopt.mongoexp:exiting with N=9223372036854775803 after 4 consecutive exceptions
6) Then Mongo workers would shut off.
Things I've tried:
install "dill" as the error suggested -> didn't work
Put global imports into the objective function so it can pickle -> didn't work
Put try except with "dill" or "pickle" as import -> didn't work
Does anyone have similar issues? I'm running out of ideas to try, and have been working on this for 2 days in vain. I think I am missing something really simple here, just can't seem to find it.
What am I missing?
Any suggestion is welcomed please!
Had the same problem in python 3.5. Installing Dill didn't help, nor dir setting workdir in MongoTrials or hyperopt-mongo-worker cli. hyperopt-mongo-worker doesn't seem to have access to __main__ where the function was defined:
AttributeError: Can't get attribute 'minMe' on <module '__main__' from ...hyperopt-mongo-worker
As #jaikumarm suggested, I circumvented the problem by writing a module file with all the required functions. However, instead of soft-linking it into the bin directory, I extended the PYTHONPATH before running hyperopt-mongo-worker:
export PYTHONPATH="${PYTHONPATH}:<dir_with_the_module.py>"
hyperopt-mongo-worker ...
That way, the hyperopt-monogo-worker is able to import the module containing minMe.
I fought with this for several days before coming up with a workable solution. there are two problems:
1. the mongo worker spawns off a separate process to run the optimizer so any context from your original python file is lost and unavailable for this new process.
2. the imports on this new process happen in the context of the hyperopt-mongo-worker scipy, which is in your case will be /Users/WernerChao/tensorflow/bin/.
So my solution is to make this new optimizer function completely self sufficient
optimizer.py
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
# Preprocessing data
train_xg = pd.read_csv('train.csv')
n_train = len(train_xg)
print "Whole data set size: ", n_train
# Creating columns for features, and categorical features
features_col = [x for x in train_xg.columns if x not in ['id', 'loss', 'log_loss']]
cat_features_col = [x for x in train_xg.select_dtypes(include=['object']).columns if x not in ['id', 'loss', 'log_loss']]
for c in range(len(cat_features_col)):
train_xg[cat_features_col[c]] = train_xg[cat_features_col[c]].astype('category').cat.codes
# Use this to train random forest regressor
train_xg_x = np.array(train_xg[features_col])
train_xg_y = np.array(train_xg['loss'])
def minMe(params):
# Hyperopt tuning for hyperparameters
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from hyperopt import STATUS_OK
try:
import dill as pickle
print('Went with dill')
except ImportError:
import pickle
def hyperopt_rf(params):
rf = RandomForestRegressor(**params)
return cross_val_score(rf, train_xg_x, train_xg_y).mean()
acc = hyperopt_rf(params)
print 'new acc:', acc, 'params: ', params
return {'loss': -acc, 'status': STATUS_OK}
wrapper.py
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from hyperopt.mongoexp import MongoTrials
import optimizer
space_rf = { 'min_samples_leaf': hp.choice('min_samples_leaf', range(1,100)) }
best = fmin(fn=optimizer.minMe, space=space_rf, trials=trials, algo=tpe.suggest, max_evals=100)
print "Best: ", best
trials = MongoTrials('mongo://localhost:1234/foo_db/jobs', exp_key='exp1')
Once you have this code link the optimizer.py to the bin folder
ln -s /Users/WernerChao/Git/test/optimizer.py /Users/WernerChao/tensorflow/bin/
now run the wrapper.py and then the mongo worker it should be able to import the optimizer from its local context and run the minMe function.
Try to install Dill in the Python environment of your tensorflow (or possibly the worker):
/Users/WernerChao/tensorflow/lib/python2.7/site-packages/hyperopt
Your aim is to get rid of the hyperopt error message:
hyperopt.mongoexp:Error while unpickling. Try installing dill via "pip install dill" for enhanced pickling support.
This is because the Python by default cannot marshal a function. It requires dill library to extend Python's pickling module for serialising/de-serialising Python objects. In your case, it failed to serialise your function minMe().
I made a separate file which calculates the loss and copied it to /anaconda2/bin/
and
/anaconda2/lib/python2.7/site-packages/hyperopt
it is working fine.
This was my Traceback
Traceback (most recent call last):
File "/home/greatskull/anaconda2/bin/hyperopt-mongo-worker", line 6, in <module>
sys.exit(hyperopt.mongoexp.main_worker())
File "/home/greatskull/anaconda2/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1302, in main_worker
return main_worker_helper(options, args)
File "/home/greatskull/anaconda2/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1249, in main_worker_helper
mworker.run_one(reserve_timeout=float(options.reserve_timeout))
File "/home/greatskull/anaconda2/lib/python2.7/site-packages/hyperopt/mongoexp.py", line 1073, in run_one
with temp_dir(workdir, erase_created_workdir), working_dir(workdir):
File "/home/greatskull/anaconda2/lib/python2.7/contextlib.py", line 17, in __enter__
return self.gen.next()
File "/home/greatskull/anaconda2/lib/python2.7/site-packages/hyperopt/utils.py", line 229, in temp_dir
os.makedirs(dir)
File "/home/greatskull/anaconda2/lib/python2.7/os.py", line 150, in makedirs
makedirs(head, mode)
File "/home/greatskull/anaconda2/lib/python2.7/os.py", line 157, in makedirs
mkdir(name, mode)

python: sending a mail, fails when inside a "with" block

I am wondering why this code
test = smtplib.SMTP('smtp.gmail.com', 587)
test.ehlo()
test.starttls()
test.ehlo()
test.login('address','passw')
test.sendmail(sender, recipients, composed)
test.close()
works, but when written like this
with smtplib.SMTP('smtp.gmail.com', 587) as s:
s.ehlo()
s.starttls()
s.ehlo()
s.login('address','passw')
s.sendmail(sender, recipients, composed)
s.close()
it fails with the message
Unable to send the email. Error: <class 'AttributeError'>
Traceback (most recent call last):
File "py_script.py", line 100, in <module>
with smtplib.SMTP('smtp.gmail.com', 587) as s:
AttributeError: __exit__
Why is this happening? (python3 on a raspberry pi)
Thx
You are not using Python 3.3 or up. In your version of Python, smtplib.SMTP() is not a context manager and cannot be using in a with statement.
The traceback is directly caused because there is no __exit__ method, a requirement for context managers.
From the smptlib.SMTP() documentation:
Changed in version 3.3: Support for the with statement was added.
You can wrap the object in a context manager with #contextlib.contextmanager:
from contextlib import contextmanager
from smtplib import SMTPResponseException, SMTPServerDisconnected
#contextmanager
def quitting_smtp_cm(smtp):
try:
yield smtp
finally:
try:
code, message = smtp.docmd("QUIT")
if code != 221:
raise SMTPResponseException(code, message)
except SMTPServerDisconnected:
pass
finally:
smtp.close()
This uses the same exit behaviour as was added in Python 3.3. Use it like this:
with quitting_smtp_cm(smtplib.SMTP('smtp.gmail.com', 587)) as s:
s.ehlo()
s.starttls()
s.ehlo()
s.login('address','passw')
s.sendmail(sender, recipients, composed)
Note that it'll close the connection for you.