How to get vocabulary from WordEmbeddingsModel in sparknlp - pyspark

I need to create an embedding matrix from the embeddings generated by WordEmbeddingsModel in sparknlp. So far I have this code:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
# Spark NLP pipeline: text column -> document -> tokens -> word embeddings
# -> finished embedding vectors.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)
# Pretrained "w2v_cc_300d" embeddings, requested for language code "sq".
embeddings = (
    WordEmbeddingsModel.pretrained("w2v_cc_300d", "sq")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)
embeddingsFinisher = (
    EmbeddingsFinisher()
    .setInputCols("embeddings")
    .setOutputCols("finished_embeddings")
    .setOutputAsVector(True)
)
pipeline = Pipeline(
    stages=[document, tokenizer, embeddings, embeddingsFinisher],
)
# Fitting returns the model object whose attributes are listed below.
model = pipeline.fit(spark_train_df)
In this case, the model has an annotator WordEmbeddingsModel but this annotator doesn't have the getVocab method to fetch the vocabulary. How can I retrieve the vocabulary if the list of attributes and methods available for the model is:
dir(model)
['__abstractmethods__',
'__class__',
'__class_getitem__',
'__delattr__',
'__dict__',
'__dir__',
'__doc__',
'__eq__',
'__format__',
'__ge__',
'__getattribute__',
'__gt__',
'__hash__',
'__init__',
'__init_subclass__',
'__le__',
'__lt__',
'__module__',
'__ne__',
'__new__',
'__orig_bases__',
'__parameters__',
'__reduce__',
'__reduce_ex__',
'__repr__',
'__setattr__',
'__sizeof__',
'__slots__',
'__str__',
'__subclasshook__',
'__weakref__',
'_abc_impl',
'_copyValues',
'_copy_params',
'_defaultParamMap',
'_dummy',
'_from_java',
'_is_protocol',
'_paramMap',
'_params',
'_randomUID',
'_resetUid',
'_resolveParam',
'_set',
'_setDefault',
'_shouldOwn',
'_testOwnParam',
'_to_java',
'_transform',
'clear',
'copy',
'explainParam',
'explainParams',
'extractParamMap',
'getOrDefault',
'getParam',
'hasDefault',
'hasParam',
'isDefined',
'isSet',
'load',
'params',
'read',
'save',
'set',
'stages',
'transform',
'uid',
'write']

Related

Voila not clearing output/displaying new output

I am trying to get an ipywidget button to change colour when clicked. I figured a way to do it as below
import ipywidgets as ipw
from ipywidgets import *
from IPython.display import display, HTML, clear_output
from IPython.display import display, HTML
# Initial 'danger'-styled button and the 'success'-styled button meant to
# take its place after a click, each wrapped in its own HBox.
# NOTE: the variable name HBox shadows the HBox class pulled in by
# `from ipywidgets import *`; the name is kept to match the discussion.
pdf_btn = ipw.Button(description='Run PDF', button_style='danger',
                     layout=Layout(width='150px', height='30px'))
new_btn = ipw.Button(description='PDF done', button_style='success',
                     layout=Layout(width='150px', height='30px'))
HBox = ipw.HBox([pdf_btn])
HBox1 = ipw.HBox([new_btn])


def clear(b):
    """Click handler: clear the cell output, then display the 'done' box.

    `b` is the button instance passed in by `on_click`; it is not used.
    """
    clear_output()
    display(HBox1)


pdf_btn.on_click(clear)
display(HBox)
When inputting this code in Jupyter Notebook and rendering through Voila nothing happens.
Any ideas why? And any suggestions?
You can just update the button style maybe? Like so:
import ipywidgets as ipw
from ipywidgets import *
from IPython.display import display, HTML, clear_output
from IPython.display import display, HTML
# A single button that is restyled in place instead of being replaced.
pdf_btn = ipw.Button(description='Run PDF', button_style='danger',
                     layout=Layout(width='150px', height='30px'))
HBox = ipw.HBox([pdf_btn])


def clear(b):
    """Click handler: switch the existing button to its 'done' appearance.

    `b` is the button instance passed in by `on_click`; it is not used.
    """
    pdf_btn.button_style = "success"
    pdf_btn.description = "PDF done"


pdf_btn.on_click(clear)
display(HBox)
Note that your code fails in JupyterLab.
Voila is much more similar to JupyterLab's rendering machinery than the classic interface. For now, it is best to use JupyterLab in conjunction with developing for Voila. (Note that this will change soon as the underlying machinery for what is now the traditional Jupyter notebook interface will soon run on machinery more in line with JupyterLab, see Build Jupyter Notebook v7 off of JupyterLab components. A lot of the old code / approaches will then consistently not work and either interface will be as suitable for developing with Voila in mind.)
When your code fails in JupyterLab it logs that to the console, similar to as shown here. I suspect you could follow Jason's advice, what I reference in my reply, which is here, if you actually want the other button there. (I guess I should say if you actually want the other HBox there since for some reason you have each button in a separate HBox in your code.) In other words, you'd need to add in how to handle the output from HBox1 correctly as part of the main output. Yours is going to the JupyterLab Log console as it is now.
So one option is to add HBox1 in as a displayed item, yet hide it initially:
import ipywidgets as ipw
from ipywidgets import *
from IPython.display import display, HTML, clear_output
from IPython.display import display, HTML
# Both boxes are displayed up front; the second one starts out hidden and
# the click handler swaps which of the two is shown.
pdf_btn = ipw.Button(description='Run PDF', button_style='danger',
                     layout=Layout(width='150px', height='30px'))
new_btn = ipw.Button(description='PDF done', button_style='success',
                     layout=Layout(width='150px', height='30px'))
HBox = ipw.HBox([pdf_btn])
HBox1 = ipw.HBox([new_btn])
HBox1.layout.visibility = 'hidden'  # based on https://github.com/jupyter-widgets/ipywidgets/issues/674#issuecomment-234321603


def clear(b):
    """Click handler: reveal the 'done' box and hide the original box.

    `b` is the button instance passed in by `on_click`; it is not used.
    """
    HBox1.layout.visibility = 'visible'
    HBox.layout.display = 'none'  # based on https://stackoverflow.com/a/54134344/8508004


pdf_btn.on_click(clear)
display(HBox)
display(HBox1)
You could also make an overarching vertical box for the items and then toggle which one it contains, similar to here. This is the 'cleanest' option I'll list here, other than just having one button & changing the button itself:
import ipywidgets as ipw
from ipywidgets import *
from IPython.display import display, HTML, clear_output
from IPython.display import display, HTML
# One outer VBox is displayed; the click handler replaces its children so
# only one of the two inner boxes is ever part of the output.
pdf_btn = ipw.Button(description='Run PDF', button_style='danger',
                     layout=Layout(width='150px', height='30px'))
new_btn = ipw.Button(description='PDF done', button_style='success',
                     layout=Layout(width='150px', height='30px'))
HBox = ipw.HBox([pdf_btn])
HBox1 = ipw.HBox([new_btn])
vb = VBox(children=[HBox])


def clear(b):
    """Click handler: make the 'done' box the only child of the VBox.

    `b` is the button instance passed in by `on_click`; it is not used.
    """
    vb.children = [HBox1]


pdf_btn.on_click(clear)
display(vb)
('cleanest' in regards to it not creating extra space in the output area.)
Or combine the vertical box approach with the approach of toggling off via the layout settings:
import ipywidgets as ipw
from ipywidgets import *
from IPython.display import display, HTML, clear_output
from IPython.display import display, HTML
# Both inner boxes live in one outer VBox; the second starts out hidden and
# the click handler toggles the layout settings of the two boxes.
pdf_btn = ipw.Button(description='Run PDF', button_style='danger',
                     layout=Layout(width='150px', height='30px'))
new_btn = ipw.Button(description='PDF done', button_style='success',
                     layout=Layout(width='150px', height='30px'))
HBox = ipw.HBox([pdf_btn])
HBox1 = ipw.HBox([new_btn])
HBox1.layout.visibility = 'hidden'  # based on https://github.com/jupyter-widgets/ipywidgets/issues/674#issuecomment-234321603
vb = VBox(children=[HBox, HBox1])


def clear(b):
    """Click handler: reveal the 'done' box and hide the original box.

    `b` is the button instance passed in by `on_click`; it is not used.
    """
    HBox1.layout.visibility = 'visible'
    HBox.layout.display = 'none'  # based on https://stackoverflow.com/a/54134344/8508004


pdf_btn.on_click(clear)
display(vb)
And if you want to fully leverage Jupyter's display system further, combine it with an overarching out so you can use print(), too:
import ipywidgets as ipw
from ipywidgets import *
from IPython.display import display, HTML, clear_output
from IPython.display import display, HTML
# Output widget that captures everything displayed or printed inside a
# `with out:` block.
out = widgets.Output()
pdf_btn = ipw.Button(description='Run PDF', button_style='danger',
                     layout=Layout(width='150px', height='30px'))
new_btn = ipw.Button(description='PDF done', button_style='success',
                     layout=Layout(width='150px', height='30px'))
HBox = ipw.HBox([pdf_btn])
HBox1 = ipw.HBox([new_btn])
vb = VBox(children=[HBox])


def clear(b):
    """Click handler: swap in the 'done' box and print a message into `out`.

    `b` is the button instance passed in by `on_click`; it is not used.
    """
    vb.children = [HBox1]
    with out:
        print('Enjoy!')


pdf_btn.on_click(clear)
with out:
    display(vb)
# Bare final expression so the notebook cell renders the Output widget.
out

How to use on_failure_callback in Airflow 1.10.10 + Composer?

I wish to get an email notification when a single Airflow Operator fails. I need that because the failure of some tasks mustn't set the entire pipeline as failed.
To simulate the error, I set the source bucket to a bucket that does not exist.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Stefano Giostra"
__credits__ = "Stefano Giostra"
__maintainer__ = "Stefano Giostra"
__version__ = "0.9.3"
__status__ = "Dev"
from airflow.models import Variable, DAG
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
# from lib.bb_utils import *
import logging
from airflow.utils import dates
from datetime import timedelta
from functools import partial
from lib.bb_utils import load_json_file
from airflow.utils.email import send_email
ROOT_PATH = '/home/airflow/gcs/dags'
logger = logging.getLogger("dag_demo_2")
def notify_email(context, config=None):
    """Failure callback that logs the task context and the alert address.

    Args:
        context: The context object Airflow passes to the callback.
        config: Optional mapping; its 'email_address' entry is the alert
            recipient. Defaults to None so the function can also be
            registered directly (``"on_failure_callback": notify_email``)
            without wrapping it in ``functools.partial``.

    The function only prints diagnostics for now; no email is sent.
    """
    alerting_email_address = (config or {}).get('email_address')
    print("---> notify_email -------------------")
    print(context)
    print(f"-->{alerting_email_address}")
    print("<------------------------------------")
    # TODO: build the alert title and HTML body from `context` and send them
    # with send_email(alerting_email_address, title, body).
# ----------------------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------------
# Data dictionary with the keys required by Airflow DAGs.
my_email = 'abc@xyz.com'
default_args = {
    "owner": 'SG',
    "depends_on_past": False,
    "start_date": dates.days_ago(1),
    "end_date": None,
    # Boolean switch; the recipients are taken from "email" below.
    "email_on_failure": True,
    "email_on_retry": False,
    "email": [my_email],
    # NOTE(review): with retries configured, the failure callback is expected
    # to run only after the last retry has failed - confirm against the
    # Airflow version in use.
    "retries": 2,
    "retry_delay": timedelta(minutes=5),
    # NOTE(review): max_active_runs is a DAG constructor argument; placed in
    # default_args it is handed to the operators instead - verify that this
    # is what is intended.
    "max_active_runs": 1,
    # Bind the alert address so the callback can be called with the context only.
    "on_failure_callback": partial(notify_email, config={'email_address': my_email}),
}
dag_name = 'SG-DagDemo-Once'
with DAG(dag_id=dag_name, default_args=default_args, schedule_interval="@once") as ldag:
    project = Variable.get("PROJECT")
    source_bucket = 'sg-dev'
    source_object = 'covid19_italy/national_trends_2.csv'
    bq_dataset = "covid19_italy"
    bq_table_name = "national_trends"
    bq_task_id = f'gcs_to_bq_load_{bq_table_name}'
    schema_fields = load_json_file(f"{ROOT_PATH}/source/{bq_dataset}/{bq_table_name}_tabschema.json")
    # Load the CSV object from GCS into <project>.<dataset>.<table>,
    # replacing the table contents (WRITE_TRUNCATE).
    t = GoogleCloudStorageToBigQueryOperator(
        dag=ldag,
        task_id=bq_task_id,
        bucket=source_bucket,
        source_objects=[source_object],
        destination_project_dataset_table="{0}.{1}.{2}".format(project, bq_dataset, bq_table_name),
        schema_fields=schema_fields,
        source_format='CSV',
        skip_leading_rows=1,
        write_disposition="WRITE_TRUNCATE"
    )
To invoke notify_email() on a failure, it will be enough if you adjust default_args with:
"on_failure_callback": notify_email
then default_args should be passed in the DAG creation statement:
with DAG(dag_id='SG-DagDemo-Once', default_args=default_args) as dag:
You can try something like the following to call the function notify_email() on operator failures; each operator will call the same function (example taken from gcs_to_bq):
# Task defaults shared by every operator in the DAG. The callback is passed
# as-is, so notify_email must be callable with the context alone.
args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(1),
    'on_failure_callback': notify_email
}
dag_name = 'SG-DagDemo-Once'
with DAG(dag_id=dag_name, default_args=args, schedule_interval=None) as dag:
    create_test_dataset = bash_operator.BashOperator(
        task_id='create_airflow_test_dataset',
        bash_command='bq mk airflow_test')
    # [START howto_operator_gcs_to_bq]
    load_csv = GoogleCloudStorageToBigQueryOperator(
        task_id='gcs_to_bq_example',
        bucket='cloud-samples-data',
        source_objects=['bigquery/us-states/us-states.csv'],
        destination_project_dataset_table='airflow_test.gcs_to_bq_table',
        schema_fields=[
            {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'post_abbr', 'type': 'STRING', 'mode': 'NULLABLE'},
        ],
        write_disposition='WRITE_TRUNCATE')
    # [END howto_operator_gcs_to_bq]
    delete_test_dataset = bash_operator.BashOperator(
        task_id='delete_airflow_test_dataset',
        bash_command='bq rm -r -f -d airflow_test')
    # Run order: create the dataset, load the CSV, then delete the dataset.
    create_test_dataset >> load_csv >> delete_test_dataset
You can simulate an error by changing a piece of configuration on each operator. And you will need to complete the configuration for sending the email in notify_email().

django rest framework - return custom data for a model serializer field

I have the following serializer:
class ServerSimpleConfigSerializer(mixins.GetCSConfigMixin, serializers.ModelSerializer):
    """Serializer for CS16Server that declares individual config settings.

    The mp_*/rcon/sv_* fields are declared explicitly on the serializer;
    'name' and 'game_config' are read-only.
    """

    mp_autoteambalance = serializers.BooleanField(label='Auto Team Balance', default=True, required=False)
    mp_friendlyfire = serializers.BooleanField(label='Friendly Fire', default=False, required=False)
    mp_autokick = serializers.BooleanField(label='Auto team-killer banning and idle client kicking', default=True, required=False)
    # hostname = serializers.CharField(label='Hostname', max_length=75, required=True)
    rcon_password = serializers.CharField(label='RCON Password', max_length=75, required=True)
    sv_password = serializers.CharField(label='Server Password', max_length=75, required=False)
    mp_startmoney = serializers.IntegerField(label='Start Money', required=False, validators=[MinValueValidator(800), MaxValueValidator(16000)])
    mp_roundtime = serializers.FloatField(label='Round Time', required=False)
    mp_timelimit = serializers.IntegerField(label='Map Time Limit', required=False)
    # Path to a JSON file under the project root; presumably consumed by
    # GetCSConfigMixin - verify against the mixin.
    fpath = os.path.join(settings.ROOT_DIR, "cs16/tmp/server.json")

    class Meta:
        model = CS16Server
        fields = ('name', 'game_config', 'mp_autoteambalance', 'mp_friendlyfire',
                  'mp_autokick', 'hostname', 'rcon_password', 'sv_password',
                  'mp_startmoney', 'mp_roundtime', 'mp_timelimit')
        read_only_fields = ('name', 'game_config',)
The model has the following fields:
name, game_config (big text) and hostname
How can I return the above defined fields for the serializer, although they are not present on the model ?
I would like to set some custom values for each field & return them as a JSON.
Is that possible ?
Actually the values for the above defined fields are found in "game_config" field.
I would like to parse those values & return them & I would not want to put them as separate fields in the model.
Parse game_config, obtain a pair of: (field0, val0) ... (fieldN, valN) and in the serializer,
set those values for the serializer fields.
For now I only get the following response:
{
"name": "Chronos",
"game_config": "hostname \"A New Gameservers.com Server is Born\"\nrcon_password \"\"\nsv_password \"1410271\"\nsv_contact email#domain.com\nsv_region 255\nsv_filterban 1\nsv_logbans 0\nsv_unlag 1\nmp_startmoney 800\nmp_chattime 30\nmp_footsteps 1\nsv_footsteps 1\nmp_logdetail 0\nmp_logmessages 0\nmp_timelimit 30\nmp_autokick 1\nmp_autoteambalance 1\nmp_flashlight 0\nmp_forcerespawn 0\nmp_forcechasecam 0\nmp_freezetime 0\nmp_friendlyfire 0\nmp_hostagepenalty 0\nmp_limitteams 0\nmp_roundtime 5\nmp_tkpunish 1\nsv_voiceenable 1\nsv_voicecodec voice_speex\nsv_voicequality 3\nsv_alltalk 0\nsv_restartround 1\nsv_maxspeed 320\nsv_proxies 1\nallow_spectators 1\nsv_allowupload 1\npausable 0\ndecalfrequency 40\nmp_falldamage 0\nsv_cheats 0\nsv_lan 0\nsv_maxrate 20000\nsv_minrate 4000\nexec listip.cfg",
"mp_autoteambalance": true,
"mp_friendlyfire": false,
"mp_autokick": true,
"hostname": "none"
}

is there some documentation on all methods of PixBuf, especially replace_data? (gtk3)

I was wondering where I can find the documentation from all of the methods that are implemented in PixBuf (found via dir, python3):
['__class__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__gdoc__', '__ge__', '__getattribute__', '__gpointer__', '__grefcount__', '__gsignals__', '__gt__', '__gtype__', '__hash__', '__info__', '__init__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_force_floating', '_ref', '_ref_sink', '_unref', '_unsupported_data_method', '_unsupported_method', 'add_alpha', 'apply_embedded_orientation', 'bind_property', 'bind_property_full', 'chain', 'compat_control', 'composite', 'composite_color', 'composite_color_simple', 'connect', 'connect_after', 'connect_data', 'connect_object', 'connect_object_after', 'copy', 'copy_area', 'deserialize', 'disconnect', 'disconnect_by_func', 'emit', 'emit_stop_by_name', 'equal', 'fill', 'find_property', 'flip', 'force_floating', 'freeze_notify', 'from_pixdata', 'g_type_instance', 'get_bits_per_sample', 'get_byte_length', 'get_colorspace', 'get_data', 'get_file_info', 'get_file_info_async', 'get_file_info_finish', 'get_formats', 'get_has_alpha', 'get_height', 'get_n_channels', 'get_option', 'get_options', 'get_pixels', 'get_properties', 'get_property', 'get_qdata', 'get_rowstride', 'get_width', 'handler_block', 'handler_block_by_func', 'handler_disconnect', 'handler_is_connected', 'handler_unblock', 'handler_unblock_by_func', 'hash', 'install_properties', 'install_property', 'interface_find_property', 'interface_install_property', 'interface_list_properties', 'is_floating', 'list_properties', 'load', 'load_async', 'load_finish', 'new', 'new_for_string', 'new_from_bytes', 'new_from_data', 'new_from_file', 'new_from_file_at_scale', 'new_from_file_at_size', 'new_from_inline', 'new_from_resource', 'new_from_resource_at_scale', 'new_from_stream', 'new_from_stream_async', 'new_from_stream_at_scale', 'new_from_stream_at_scale_async', 'new_from_stream_finish', 
'new_from_xpm_data', 'new_subpixbuf', 'notify', 'notify_by_pspec', 'override_property', 'props', 'qdata', 'read_pixel_bytes', 'read_pixels', 'ref', 'ref_count', 'ref_sink', 'replace_data', 'replace_qdata', 'rotate_simple', 'run_dispose', 'saturate_and_pixelate', 'save_to_bufferv', 'save_to_callbackv', 'save_to_stream_finish', 'savev', 'scale', 'scale_simple', 'serialize', 'set_data', 'set_properties', 'set_property', 'steal_data', 'steal_qdata', 'stop_emission', 'stop_emission_by_name', 'thaw_notify', 'to_string', 'unref', 'watch_closure', 'weak_ref']
I am particularly interested in replace_data, as I need to update the reference to the data in the PixBuf (would that be possible?).
Any idea on where I can find that documentation?
The main GdkPixbuf documentation is https://developer.gnome.org/gdk-pixbuf/stable/
Python-specific documentation is https://lazka.github.io/pgi-docs/#GdkPixbuf-2.0

How to discover undocumented mitmdump inline script parameters?

I am trying to parse different elements of the request and response header with inline scripting and mitmdump. Some features are undocumented. I will post the lessons learned in reply to this question.
Why not use the official documentation?
http://mitmproxy.org/doc/scripting/inlinescripts.html
The canonical API documentation is the code, which you can browse locally or in our GitHub repo. You can view the API documentation using pydoc (which is installed with Python by default), like this:
pydoc libmproxy.protocol.http.HTTPRequest
gives better output.
Using dir() in an inline script shows all the variables that you can use for parsing.
def response(context, flow):
    """Inline-script response hook: dump available attributes and cookies.

    Prints dir() of the flow, its request and its response, then one
    "<request host>:<TAB><cookie>" line per Set-Cookie header value.

    The parenthesised single-argument print form is used so the script
    parses under both Python 2 and Python 3.
    """
    print(dir(flow))
    print(dir(flow.request))
    print(dir(flow.response))
    for cookie in flow.response.headers["Set-Cookie"]:
        print("%s:\t%s" % (flow.request.host, cookie))
Results for dir(flow)
['__class__', '__delattr__', '__dict__', '__doc__', '__eq__', '__format__', '__getattribute__', '__hash__',
'__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__',
'__str__', '__subclasshook__', '__weakref__', '_backup', '_stateobject_attributes',
'_stateobject_long_attributes', 'accept_intercept', 'backup', 'client_conn', 'copy', 'error', 'from_state',
'get_state', 'id', 'intercept', 'intercepting', 'kill', 'live', 'load_state', 'match', 'modified', 'replace',
'reply', 'request', 'response', 'revert', 'server_conn', 'type']
Results for dir(flow.request)
['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__',
'__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__',
'__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_assemble_first_line', '_assemble_head',
'_assemble_headers', '_stateobject_attributes', '_stateobject_long_attributes', 'anticache', 'anticomp',
'assemble', 'constrain_encoding', 'content', 'copy', 'decode', 'encode', 'form_in', 'form_out', 'from_state',
'from_stream', 'get_cookies', 'get_decoded_content', 'get_form_urlencoded', 'get_path_components',
'get_query', 'get_state', 'headers', 'host', 'httpversion', 'is_replay', 'load_state', 'method', 'path',
'port', 'pretty_host', 'pretty_url', 'replace', 'scheme', 'set_form_urlencoded', 'set_path_components',
'set_query', 'size', 'stickyauth', 'stickycookie', 'timestamp_end', 'timestamp_start', 'update_host_header',
'url']
Results for dir(flow.response)
['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__',
'__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__',
'__str__', '__subclasshook__', '__weakref__', '_assemble_first_line', '_assemble_head', '_assemble_headers',
'_refresh_cookie', '_stateobject_attributes', '_stateobject_long_attributes', 'assemble', 'code', 'content',
'copy', 'decode', 'encode', 'from_state', 'from_stream', 'get_cookies', 'get_decoded_content', 'get_state',
'headers', 'httpversion', 'is_replay', 'load_state', 'msg', 'refresh', 'replace', 'size', 'stream',
'timestamp_end', 'timestamp_start']