Using pathlib.Path with spark.read.parquet - pyspark

Is it possible to use pathlib.Path objects with spark.read.parquet and other pyspark.sql.DataFrameReader methods?
It doesn't work by default:
>>> from pathlib import Path
>>> basedir = Path("/data")
>>> spark.read.parquet(basedir / "name.parquet")
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-5-cec8ced1bc5d> in <module>
----> 1 spark.read.parquet(basedir / "name.parquet")
<... a long traceback ...>
/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_command_part(parameter, python_proxy_pool)
296 command_part += ";" + interface
297 else:
--> 298 command_part = REFERENCE_TYPE + parameter._get_object_id()
299
300 command_part += "\n"
AttributeError: 'PosixPath' object has no attribute '_get_object_id'
I tried to write py4j type converter:
class PathConverter(object):
def can_convert(self, object):
return isinstance(object, Path)
def convert(self, object, gateway_client):
JavaString = JavaClass("java.lang.String", gateway_client)
return JavaString(str(object))
register_input_converter(PathConverter())
But it looks like I misunderstood some string conversion related concepts/specifics, because jvm.java.lang.String("string") in py4j returns the python str object:
>>> spark.read.parquet(basedir / "name.parquet")
<... a long traceback ...>
/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1306
1307 for temp_arg in temp_args:
-> 1308 temp_arg._detach()
AttributeError: 'str' object has no attribute '_detach'

I have only one ugly solution for now:
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index fa3e829a88..7441a8ba8c 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
## -298,7 +298,7 ## class DataFrameReader(OptionUtils):
modifiedAfter=modifiedAfter, datetimeRebaseMode=datetimeRebaseMode,
int96RebaseMode=int96RebaseMode)
- return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
+ return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths, converter=str)))
def text(self, paths, wholetext=False, lineSep=None, pathGlobFilter=None,
recursiveFileLookup=None, modifiedBefore=None,
Also, looking through the readwriter.py source code it feels safe enough to monkeypatch its version of _to_seq:
from pyspark.sql import readwriter
def converter(x):
if isinstance(x, PurePath):
return str(x)
return x
readwriter._to_seq = partial(readwriter._to_seq, converter=converter)
Or maybe more correct and full workaround would be to monkeypatch the reader/writer methods directly:
#wraps(readwriter.DataFrameWriter.parquet)
def parquet(self, path, mode=None, partitionBy=None, compression=None):
return parquet.__wrapped__(self, str(path), mode=mode,
partitionBy=partitionBy,
compression=compression)
readwriter.DataFrameWriter.parquet = parquet

Related

How to do Semantic segmentation with detectron2

I'm using Detectron2 to do instance segmentation as in the tutorial. Below is the code:
from detectron2.config import CfgNode
import detectron2.data.transforms as T
from detectron2.data import build_detection_train_loader, DatasetMapper
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
transform_list = [
# T.Resize(shape=(200,300)),
T.RandomRotation(angle=90.0),
# T.RandomContrast(intensity_min=0.75, intensity_max=1.25),
# T.RandomBrightness(intensity_min=0.75, intensity_max=1.25),
# T.RandomSaturation(intensity_min=0.75, intensity_max=1.25),
# T.RandomLighting(scale=0.1),
T.RandomFlip(),
# T.RandomCrop(crop_type="absolute", crop_size=(180, 270))
]
# custom_mapper = get_custom_mapper(transfrom_list)
custom_mapper = DatasetMapper(
cfg,
is_train=True,
augmentations=transform_list,
use_instance_mask=True,
instance_mask_format="bitmask",
)
class CustomTrainer(DefaultTrainer):
#classmethod
def build_test_loader(cls, cfg: CfgNode, dataset_name):
return build_detection_test_loader(cfg, dataset_name, mapper=custom_mapper)
#classmethod
def build_train_loader(cls, cfg: CfgNode):
return build_detection_train_loader(cfg, mapper=custom_mapper)
cfg.INPUT.MASK_FORMAT = 'bitmask'
cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS = False
trainer = CustomTrainer(cfg)
# trainer = DefaultTrainer(cfg)
# trainer.resume_or_load(resume=False)
# trainer.train()
However, in this case I don't care about instances and more like I want to do semantic segmentation but there is no tutorial or examples to do that nor I'm seeing a semantic model I can start with. Misc/semantic_R_50_FPN_1x.yaml throws an error saying there is no pretrained model available.
So instead I'm trying to use the SemSegEvaluator instead of COCO evaluator to give me metrics around semantic rather than instances. Below is the code:
from detectron2.evaluation import COCOEvaluator, inference_on_dataset, SemSegEvaluator
from detectron2.data import build_detection_test_loader
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.4
# evaluator = COCOEvaluator(val_dataset_name, output_dir=os.path.join(cfg.OUTPUT_DIR, 'val'), use_fast_impl=False, tasks=['segm'])
evaluator = SemSegEvaluator(val_dataset_name, output_dir=os.path.join(cfg.OUTPUT_DIR, 'val'))
val_loader = build_detection_test_loader(cfg, val_dataset_name)
eval_result = inference_on_dataset(predictor.model, val_loader, evaluator)
print(eval_result)
However, this is failing with the following error:
[12/20 16:29:02 d2.data.datasets.coco]: Loaded 50 imagesss abdul in COCO format from /content/gdrive/MyDrive/SolarDetection/datasets/train8//val/labels.json
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-10-61bd5aaec8ea> in <module>
3 cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.4
4 # evaluator = COCOEvaluator(val_dataset_name, output_dir=os.path.join(cfg.OUTPUT_DIR, 'val'), use_fast_impl=False, tasks=['segm'])
----> 5 evaluator = SemSegEvaluator(val_dataset_name, output_dir=os.path.join(cfg.OUTPUT_DIR, 'val'))
6 val_loader = build_detection_test_loader(cfg, val_dataset_name)
7 # ipdb.set_trace(context=6)
1 frames
/content/gdrive/MyDrive/repos/detectron2/detectron2/evaluation/sem_seg_evaluation.py in <dictcomp>(.0)
69
70 self.input_file_to_gt_file = {
---> 71 dataset_record["file_name"]: dataset_record["sem_seg_file_name"]
72 for dataset_record in DatasetCatalog.get(dataset_name)
73 }
KeyError: 'sem_seg_file_name'
Any idea or hint how I can setup and use the SemSegEvaluator?

Python multiprocessing, can't pickle thread.lock (pymongo.Cursor)

First, let me assure you I read all the relevant answers and they don't work for me.
I am using multiprocessing Pool to parallelize my data creation. I am using Mongodb 5.0 and pymongo client.
As you can see I am initializing the mongo client in the worker as suggested by the available answers but still I get a :
TypeError: cannot pickle '_thread.lock' object
Exception ignored in: <function CommandCursor.__del__ at 0x7f96f6fff160>
Is there a way I can use multiprocessing with pymongo.Cursor ??
Any help will be appreciated
This is the function that calls the Pool
def get_all_valid_events(
event_criteria:str,
all_listings:List[str],
earnings:List[Dict[str,Any]],
days_around_earnings=0,
debug=False,
poolsize=10,
chunk_size=100,
lookback=30,
lookahead = 0
):
start = time.perf_counter()
listings = Manager().list(all_listings.copy())
valid_events = []
if debug:
for i in range(ceil(len(listings)/chunk_size)):
valid_events += get_valid_event_dates_by_listing(event_criteria,listings[i*chunk_size:(i+1)*chunk_size] , earnings, days_around_earnings,debug)
else:
payload = list()
for i in range(ceil(len(listings)/chunk_size)):
payload.append(
[
event_criteria,
listings[i*chunk_size:(i+1)*chunk_size],
earnings,
days_around_earnings,
debug,
lookback,
lookahead
]
)
with ThreadPool(poolsize) as pool:
valid_events = pool.starmap(get_valid_event_dates_by_listing, payload)
print(f"getting all valid true events took {time.perf_counter() - start} sec")
return valid_events
And this is the worker function:
def get_valid_event_dates_by_listing(
event_criteria:str,
listings:List[str],
earnings_list,
days_around_earnings=0,
debug=False,
lookback=30,
lookahead=0
) -> List[Tuple[Tuple[str, datetime], int]]:
#TODO: generalize event filter
start = time.perf_counter()
client = MongoClient()
db = client['stock_signals']
cursor_candles_by_listing = db.candles.find(
{'listing': {'$in': listings}},
{'_id':0, 'listing':1, 'date':1,'position':1, 'PD_BBANDS_6_lower':1, 'close':1, 'PD_BBANDS_6_upper':1}
)
candles = list(cursor_candles_by_listing)
df = pd.DataFrame(candles).dropna()
minimum_position_dict = dict(df.groupby('listing').min()['position']) # We need the minimum position by listing to filter only events that have lookback
# Filter only the dates that satisfy the criteria
lte_previous_bb_6_lower = df['close'] <= df[f"{event_criteria}_lower"].shift()
gte_previous_bb_6_upper = df['close'] >= df[f"{event_criteria}_upper"].shift()
potential_true_events_df = df[lte_previous_bb_6_lower | gte_previous_bb_6_upper]
potential_false_events_df = df.drop(potential_true_events_df.index)
potential_true_event_dates = potential_true_events_df[['listing', 'date', 'position']].values
actual_true_event_dates = earning_helpers.filter_event_dates_by_earnings_and_position(potential_true_event_dates, earnings_list, minimum_position_dict ,days_around_earning=days_around_earnings, lookback=lookback)
true_event_dates = [((event_date[0], event_date[1], event_date[2]), 1) for event_date in actual_true_event_dates]
potential_false_event_dates = potential_false_events_df[['listing', 'date', 'position']].values
actual_false_event_dates = _random_false_events_from_listing_df(potential_false_event_dates, len(actual_true_event_dates), earnings_list, minimum_position_dict, days_around_earnings,lookback)
false_events_dates = [((event_date[0], event_date[1], event_date[2]), 0) for event_date in actual_false_event_dates]
all_event_dates = true_event_dates + false_events_dates
shuffle(all_event_dates)
print(f"getting a true sequence for listing took {time.perf_counter() - start} sec")
return all_event_dates
And this is my main
from utils import event_helpers, earning_helpers
from utils.queries import get_candle_listing
if __name__ == "__main__":
all_listings = get_candle_listing.get_listings()
earnigns = earning_helpers.get_all_earnings_dates()
res = event_helpers.get_all_valid_events('PD_BBANDS_6', all_listings, earnigns, 2, chunk_size=100)
Full Stack Trace
File "test_multiprocess.py", line 8, in <module>
res = event_helpers.get_all_valid_events('PD_BBANDS_6', all_listings, earnigns, 2, chunk_size=100)
File "/media/data/projects/ml/signal_platform/utils/event_helpers.py", line 53, in get_all_valid_events
valid_events = pool.starmap(get_valid_event_dates_by_listing, payload)
File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/pool.py", line 372, in starmap
return self._map_async(func, iterable, starmapstar, chunksize).get()
File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/pool.py", line 771, in get
raise self._value
File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/pool.py", line 537, in _handle_tasks
put(task)
File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "/home/froy001/.asdf/installs/python/3.8.12/lib/python3.8/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
TypeError: cannot pickle '_thread.lock' object
Exception ignored in: <function CommandCursor.__del__ at 0x7f46e91e21f0>
Traceback (most recent call last):
File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/command_cursor.py", line 68, in __del__
File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/command_cursor.py", line 83, in __die
File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/mongo_client.py", line 1696, in _cleanup_cursor
File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/client_session.py", line 466, in _end_session
File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/client_session.py", line 871, in in_transaction
File "/home/froy001/.cache/pypoetry/virtualenvs/signal-platform-31MTNyCe-py3.8/lib/python3.8/site-packages/pymongo/client_session.py", line 362, in active
AttributeError: 'NoneType' object has no attribute 'STARTING'
Update: 01-23
I tried using the multiprocess library using dill but it didn't help

Error creating universal sentence encoder embeddings using beam & tf transform

I have a simple beam pipline that takes some text and gets embeddings using universal sentence encoder with tf transform. Very similar to the demo made using tf 1.
import tensorflow as tf
import apache_beam as beam
import tensorflow_transform.beam as tft_beam
import tensorflow_transform.coders as tft_coders
from apache_beam.options.pipeline_options import PipelineOptions
import tempfile
model = None
def embed_text(text):
import tensorflow_hub as hub
global model
if model is None:
model = hub.load(
'https://tfhub.dev/google/universal-sentence-encoder/4')
embedding = model(text)
return embedding
def get_metadata():
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import dataset_metadata
metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
'id': dataset_schema.ColumnSchema(
tf.string, [], dataset_schema.FixedColumnRepresentation()),
'text': dataset_schema.ColumnSchema(
tf.string, [], dataset_schema.FixedColumnRepresentation())
}))
return metadata
def preprocess_fn(input_features):
text_integerized = embed_text(input_features['text'])
output_features = {
'id': input_features['id'],
'embedding': text_integerized
}
return output_features
def run(pipeline_options, known_args):
argv = None # if None, uses sys.argv
pipeline_options = PipelineOptions(argv)
pipeline = beam.Pipeline(options=pipeline_options)
with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
articles = (
pipeline
| beam.Create([
{'id':'01','text':'To be, or not to be: that is the question: '},
{'id':'02','text':"Whether 'tis nobler in the mind to suffer "},
{'id':'03','text':'The slings and arrows of outrageous fortune, '},
{'id':'04','text':'Or to take arms against a sea of troubles, '},
]))
articles_dataset = (articles, get_metadata())
transformed_dataset, transform_fn = (
articles_dataset
| 'Extract embeddings' >> tft_beam.AnalyzeAndTransformDataset(preprocess_fn)
)
transformed_data, transformed_metadata = transformed_dataset
_ = (
transformed_data | 'Write embeddings to TFRecords' >> beam.io.tfrecordio.WriteToTFRecord(
file_path_prefix='{0}'.format(known_args.output_dir),
file_name_suffix='.tfrecords',
coder=tft_coders.example_proto_coder.ExampleProtoCoder(
transformed_metadata.schema),
num_shards=1
)
)
result = pipeline.run()
result.wait_until_finished()
python 3.6.8, tf==2.0, tf_transform==0.15, apache-beam[gcp]==0.16 (I tried various compatible combos from https://github.com/tensorflow/transform)
I am getting an error when tf_transform calls the graph analyser:
...
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/beam/impl.py", line 462, in process
lambda: self._make_graph_state(saved_model_dir))
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tfx_bsl/beam/shared.py", line 221, in acquire
return _shared_map.acquire(self._key, constructor_fn)
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tfx_bsl/beam/shared.py", line 184, in acquire
result = control_block.acquire(constructor_fn)
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tfx_bsl/beam/shared.py", line 87, in acquire
result = constructor_fn()
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/beam/impl.py", line 462, in <lambda>
lambda: self._make_graph_state(saved_model_dir))
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/beam/impl.py", line 438, in _make_graph_state
self._exclude_outputs, self._tf_config)
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/beam/impl.py", line 357, in __init__
tensor_inputs = graph_tools.get_dependent_inputs(graph, inputs, fetches)
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/graph_tools.py", line 686, in get_dependent_inputs
sink_tensors_ready)
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/graph_tools.py", line 499, in __init__
table_init_op, graph_analyzer_for_table_init, translate_path_fn)
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_transform/graph_tools.py", line 560, in _get_table_init_op_source_info
if table_init_op.type not in _TABLE_INIT_OP_TYPES:
AttributeError: 'Tensor' object has no attribute 'type' [while running 'Extract embeddings/TransformDataset/Transform']
Exception ignored in: <bound method CapturableResourceDeleter.__del__ of <tensorflow.python.training.tracking.tracking.CapturableResourceDeleter object at 0x14152fbe0>>
Traceback (most recent call last):
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_core/python/training/tracking/tracking.py", line 190, in __del__
File "/Users/justingrace/.pyenv/versions/hlx36/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3872, in as_default
File "/Users/justingrace/.pyenv/versions/3.6.8/lib/python3.6/contextlib.py", line 159, in helper
TypeError: 'NoneType' object is not callable
It appears like the graph analyser is expecting a list of operations with a type attribute but it is receiving a tensor. I can't grasp why this error is occuring other than a bug in the graph analyzer or a compatibility issue with tfx_bsl (there seem to be issues with pyarrow 0.14 so I have downgraded to 0.13)
Output of pip freeze:
absl-py==0.8.1
annoy==1.12.0
apache-beam==2.16.0
appnope==0.1.0
astor==0.8.1
astunparse==1.6.3
attrs==19.1.0
avro-python3==1.9.1
backcall==0.1.0
bleach==3.1.0
cachetools==3.1.1
certifi==2019.11.28
chardet==3.0.4
crcmod==1.7
cymem==1.31.2
cytoolz==0.9.0.1
decorator==4.4.1
defusedxml==0.6.0
dill==0.3.0
docopt==0.6.2
en-core-web-lg==2.0.0
en-coref-lg==3.0.0
en-ner-trained==2.0.0
entrypoints==0.3
fastavro==0.21.24
fasteners==0.15
flashtext==2.7
future==0.18.2
fuzzywuzzy==0.16.0
gast==0.2.2
google-api-core==1.16.0
google-apitools==0.5.28
google-auth==1.11.0
google-auth-oauthlib==0.4.1
google-cloud-bigquery==1.17.1
google-cloud-bigtable==1.0.0
google-cloud-core==1.3.0
google-cloud-datastore==1.7.4
google-cloud-pubsub==1.0.2
google-pasta==0.1.8
google-resumable-media==0.4.1
googleapis-common-protos==1.51.0
grpc-google-iam-v1==0.12.3
grpcio==1.24.0
h5py==2.10.0
hdfs==2.5.8
httplib2==0.12.0
idna==2.8
importlib-metadata==1.5.0
ipykernel==5.1.4
ipython==7.12.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.16.0
Jinja2==2.11.1
jsonpickle==1.2
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==5.3.4
jupyter-console==6.1.0
jupyter-core==4.6.2
Keras-Applications==1.0.8
Keras-Preprocessing==1.1.0
lxml==4.2.1
Markdown==3.2.1
MarkupSafe==1.1.1
mistune==0.8.4
mock==2.0.0
monotonic==1.5
more-itertools==8.2.0
msgpack==0.6.2
msgpack-numpy==0.4.4
murmurhash==0.28.0
nbconvert==5.6.1
nbformat==5.0.4
networkx==2.1
nltk==3.4.5
notebook==6.0.3
numpy==1.18.1
oauth2client==3.0.0
oauthlib==3.1.0
opt-einsum==3.1.0
packaging==20.1
pandas==0.23.0
pandocfilters==1.4.2
parso==0.6.1
pathlib2==2.3.5
pbr==5.4.4
pexpect==4.8.0
pickleshare==0.7.5
plac==0.9.6
pluggy==0.13.1
preshed==1.0.1
prometheus-client==0.7.1
prompt-toolkit==3.0.3
proto-google-cloud-datastore-v1==0.90.4
protobuf==3.11.3
psutil==5.6.7
ptyprocess==0.6.0
py==1.8.1
pyahocorasick==1.4.0
pyarrow==0.13.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
pydot==1.4.1
Pygments==2.5.2
PyHamcrest==1.9.0
pymongo==3.10.1
pyparsing==2.4.6
pyrsistent==0.15.7
pytest==5.3.5
python-dateutil==2.8.0
python-Levenshtein==0.12.0
pytz==2019.3
PyYAML==3.13
pyzmq==18.1.1
qtconsole==4.6.0
regex==2017.4.5
repoze.lru==0.7
requests==2.22.0
requests-oauthlib==1.3.0
rsa==4.0
scikit-learn==0.19.1
scipy==1.4.1
Send2Trash==1.5.0
six==1.14.0
spacy==2.0.12
tb-nightly==2.2.0a20200217
tensorboard==2.0.2
tensorflow==2.0.0
tensorflow-estimator==2.0.1
tensorflow-hub==0.6.0
tensorflow-metadata==0.15.2
tensorflow-serving-api==2.1.0
tensorflow-transform==0.15.0
termcolor==1.1.0
terminado==0.8.3
testpath==0.4.4
textblob==0.15.1
tf-estimator-nightly==2.1.0.dev2020012309
tf-nightly==2.2.0.dev20200217
tfx-bsl==0.15.0
thinc==6.10.3
toolz==0.10.0
tornado==6.0.3
tqdm==4.23.3
traitlets==4.3.3
typing==3.7.4.1
typing-extensions==3.7.4.1
ujson==1.35
Unidecode==1.0.22
urllib3==1.25.8
wcwidth==0.1.8
webencodings==0.5.1
Werkzeug==1.0.0
Whoosh==2.7.4
widgetsnbextension==3.5.1
wrapt==1.11.2
zipp==2.2.0
This could be an underlying issue according to this github post. Try using an updated version of tensorflow (2.1.0), or maybe even an updated version of your keras packages.

Made Locust to login to a Web Application

I want locust to be able to login to my web application and start to click in the links inside the web application.
With this code I just get activity for the front page with the login and i don't get any notification from inside the application.
Code:
import random
from locust import HttpLocust, TaskSet, task
from pyquery import PyQuery
class WalkPages(TaskSet):
def on_start(self):
self.client.post("/", {
"UserName": "my#email.com",
"Password": "2Password!",
"submit": "Sign In"
})
self.index_page()
#task(10)
def index_page(self):
r = self.client.get("/Dashboard.mvc")
pq = PyQuery(r.content)
link_elements = pq("a")
self.urls_on_current_page = []
for l in link_elements:
if "href" in l.attrib:
self.urls_on_current_page.append(l.attrib["href"])
#task(30)
def load_page(self):
url = random.choice(self.urls_on_current_page)
r = self.client.get(url)
class AwesomeUser(HttpLocust):
task_set = WalkPages
host = "https://myenv.beta.webapp.com"
min_wait = 20 * 1000
max_wait = 60 * 1000
I get the follow msg in the terminal after the first round.
[2015-02-13 12:08:43,740] webapp-qa/ERROR/stderr: Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 267, in run
self.execute_next_task()
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 293, in execute_next_task
self.execute_task(task["callable"], *task["args"], **task["kwargs"])
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 305, in execute_task
task(self, *args, **kwargs)
File "/home/webapp/LoadTest/locustfile.py", line 31, in load_page
url = random.choice(self.urls_on_current_page)
File "/usr/lib/python2.7/random.py", line 273, in choice
return seq[int(self.random() * len(seq))] # raises IndexError if seq is empty
IndexError: list index out of range
[2015-02-13 12:08:43,752] webapp-qa/ERROR/stderr: Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 267, in run
self.execute_next_task()
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 293, in execute_next_task
self.execute_task(task["callable"], *task["args"], **task["kwargs"])
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 305, in execute_task
task(self, *args, **kwargs)
File "/home/webapp/LoadTest/locustfile.py", line 31, in load_page
url = random.choice(self.urls_on_current_page)
File "/usr/lib/python2.7/random.py", line 273, in choice
return seq[int(self.random() * len(seq))] # raises IndexError if seq is empty
IndexError: list index out of range
[2015-02-13 12:08:43,775] webapp-qa/ERROR/stderr: Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 267, in run
self.execute_next_task()
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 293, in execute_next_task
self.execute_task(task["callable"], *task["args"], **task["kwargs"])
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 305, in execute_task
task(self, *args, **kwargs)
File "/home/webapp/LoadTest/locustfile.py", line 31, in load_page
url = random.choice(self.urls_on_current_page)
File "/usr/lib/python2.7/random.py", line 273, in choice
return seq[int(self.random() * len(seq))] # raises IndexError if seq is empty
IndexError: list index out of range
Your list may be empty.
#task(30)
def load_page(self):
if self.urls_on_current_page:
url = random.choice(self.urls_on_current_page)
r = self.client.get(url)
I takes time but someone may need this. My findings in your code: login requests seems not correct (check mine if correct), you cannot reach a variable defined inside of a function from another function, giving task(10) is not suitable for data setter function. Set urls_on_current_page as a class variable to serve for other class members. See my code and comment:
import random
from locust import HttpLocust, TaskSet, task
from pyquery import PyQuery
class WalkPages(TaskSet):
# define variable here to access them from inside the functions
urls_on_current_page = []
def login(self):
self.client.post("/login", data = {"UserName": "mesutgunes#email.com", "Password": "password"})
def get_urls(self):
r = self.client.get("/Dashboard.mvc")
pq = PyQuery(r.content)
link_elements = pq("a")
for link in link_elements:
if key in link.attrib and "http" not in link.attrib[key]:
# there maybe external link on the page
self.urls_on_current_page.append(link.attrib[key])
def on_start(self):
self.login()
self.get_urls()
#task(30)
def load_page(self):
url = random.choice(self.urls_on_current_page)
r = self.client.get(url)
class AwesomeUser(HttpLocust):
task_set = WalkPages
host = "https://myenv.beta.webapp.com"
min_wait = 20 * 1000
max_wait = 60 * 1000

AttributeError when using lettuce 'world'

I have two files:
steps.py:
from lettuce import *
from splinter.browser import Browser
#before.harvest
def set_browser():
world.browser = Browser('webdriver.chrome')
#step(u'Given I visit "([^"]*)"')
def given_i_visit(step, url):
world.browser.visit(url)
test.feature:
Feature: Do some basic tests
Scenario: Check whether the website is accessable
Given I visit "/"
Running lettuce against them returns this:
Feature: Do some basic tests # features/test.feature:1
Scenario: Check whether the website is accessable # features/test.feature:2
Given I visit "/" # features/steps.py:8
Traceback (most recent call last):
File "/..../site-packages/lettuce/core.py", line 125, in __call__
ret = self.function(self.step, *args, **kw)
File "/..../test/features/steps.py", line 9, in given_i_visit
world.browser.visit(url)
AttributeError: 'thread._local' object has no attribute 'browser'
1 feature (0 passed)
1 scenario (0 passed)
1 step (1 failed, 0 passed)
Any ideas on what could be going wrong?
Although not in the documentation. place the terrain.py file in the same directory as your steps and features files. Initialized the world attribute with any value and you should be ok.
The problem is that the before.harvest takes some data, so the right code will be the following:
from lettuce import *
from splinter import Browser
#before.harvest
def set_browser(data):
world.browser = Browser('webdriver.chrome')
#step(u'Given I visit "([^"]*)"')
def given_i_visit(step, url):
world.browser.visit(url)
hope it helps!