Retrieve data when clicking a button - class

Trying to receive input from WCT_Control into WCT_DataPull
Cant figure out how to get the data into WCT_DataPull to perform an action with it. I think I am going about this backwards, but I also think I have been staring at it too long.
Essentially, the user enters the information necessary into a GUI to connect to a specific SQL table (predetermined) and then saves the data in the table and outputs it as a csv file backup.
I want the user to click the submit button and that creates the backup. However, at this point when I click the button, it will store all the data in the variables (If I put a print statement in I see the correct values), but I cant seem to figure out how to get the variables to WCT_DataPull, where the backup creation action is performed.
WCT_Control
from PyQt5.QtWidgets import *
from WCT_View import Ui_MainWindow
class Controller(QMainWindow, Ui_MainWindow):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.setupUi(self)
self.run.clicked.connect(lambda : self.submit())
def submit(self):
self.run.clicked.connect()
server = self.server_entry.text()
database = self.data_entry.text()
station = self.station_entry.text()
app = self.app_entry.text()
backup_name = self.filename_entry.text()
self.server_entry.setText('')
self.data_entry.setText('')
self.station_entry.setText('')
self.app_entry.setText('')
self.filename_entry.setText('')
return server, database, station, app, backup_name
WCT_DataPull
from WCT_Control import *
import pyodbc
import csv
pull_data = Controller()
def write_bak():
driver = 'ODBC Driver 17 for SQL Server'
serv, data, stat, app, bak_name = pull_data.submit()
conn = pyodbc.connect('DRIVER={0};SERVER={1};DATABASE={2};Trusted_Connection=yes'.format(driver, serv, data))
cursor = conn.cursor()
rows = cursor.execute("""
select DnsName, PackageName, Code, Value from WorkstationApplicationSettings
where DnsName=? and PackageName=?
""", stat, app).fetchall()
for row in rows:
print(row.PackageName,':', row.Code, ':', row.Value)
with open(bak_name, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerows(rows)

So you just have to do the things in opposite way, instead of using PYQT5 in WCT script, use the WCT function in PYQT5 script
WCT_Control
from PyQt5.QtWidgets import *
from WCT_DataPull import write_bak
class Controller(QMainWindow, Ui_MainWindow):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.setupUi(self)
self.run.clicked.connect(lambda : self.submit())
def submit(self):
self.run.clicked.connect()
server = self.server_entry.text()
database = self.data_entry.text()
station = self.station_entry.text()
app = self.app_entry.text()
backup_name = self.filename_entry.text()
self.server_entry.setText('')
self.data_entry.setText('')
self.station_entry.setText('')
self.app_entry.setText('')
self.filename_entry.setText('')
write_bak(serv, data, stat, app, bak_name)
WCT_DataPull
import pyodbc
import csv
def write_bak(serv, data, stat, app, bak_name):
driver = 'ODBC Driver 17 for SQL Server'
conn = pyodbc.connect('DRIVER={0};SERVER={1};DATABASE={2};Trusted_Connection=yes'.format(driver, serv, data))
cursor = conn.cursor()
rows = cursor.execute("""
select DnsName, PackageName, Code, Value from WorkstationApplicationSettings
where DnsName=? and PackageName=?
""", stat, app).fetchall()
for row in rows:
print(row.PackageName,':', row.Code, ':', row.Value)
with open(bak_name, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerows(rows)
Also Make sure to write the code to run the WCT_control by using Qapplication

Related

Trouble with applying a UDF on a column in Pyspark Dataframe

My goal is to clean the Data in a column in a Pyspark DF. I have written a function for cleaning .
def preprocess(text):
text = text.lower()
text=text.strip()
text=re.compile('<.*?>').sub('', text)
text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
text = re.sub('\s+', ' ', text)
text = re.sub(r'\[[0-9]*\]',' ',text)
text=re.sub(r'[^\w\s]', '', text.lower().strip())
text = re.sub(r'\d',' ',text)
text = re.sub(r'\s+',' ',text)
return text
#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
text = [i for i in text.split() if not i in stop_words]
return text
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
if tag.startswith('J'):
return wordnet.ADJ
elif tag.startswith('V'):
return wordnet.VERB
elif tag.startswith('N'):
return wordnet.NOUN
elif tag.startswith('R'):
return wordnet.ADV
else:
return wordnet.NOUN
# Tokenize the sentence
def lemmatizer(string):
word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
return " ".join(a)
#Final Function
def finalpreprocess(string):
return lemmatizer(' '.join(remove_stopwords(preprocess(string))))
The functions seems to work fine when I test it . When I do
text = 'Ram and Bheem are buddies. They (both) like <b>running</b>. They got better at it over the weekend'
print(finalpreprocess(text))
I see the exact result I want.
ram bheem buddy like run get well weekend
How ever when I try to apply this function finalpreprocess() to a column in pyspark dataframe . I am getting errors.
Here is what I did.
udf_txt_clean = udf(lambda x: finalpreprocess(x),StringType())
df.withColumn("cleaned_text",lem(col("reason"))).select("reason","cleaned_text").show(10,False)
Then I am getting the error :
Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 473, in dumps
return cloudpickle.dumps(obj, pickle_protocol)
File "/databricks/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 73, in dumps
cp.dump(obj)
File "/databricks/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 563, in dump
return Pickler.dump(self, obj)
TypeError: cannot pickle '_thread.RLock' object
PicklingError: Could not serialize object: TypeError: cannot pickle '_thread.RLock' object
So far here is what I did. In my finalpreprocess(), I am using three different functions preprocess(),remove_stopwords(), lemmatizer() . I changed my udf_txt_clean accordingly . Like
udf_txt_clean = udf(lambda x: preprocess(x),StringType())
udf_txt_clean = udf(lambda x: remove_stopwords(x),StringType())
These two run fine But -
udf_txt_clean = udf(lambda x: lemmatizer (x),StringType())
is the one that is giving me the error. I am not able to understand why this function is giving the error but not the other two. From my limited understating I see that its having trouble trying to pickle this function but I am not able to understand why its trying to pickle this in the first place or if there is a work around for it.
It would help if the example were more reproducible next time. It took a bit to re-create this. No worries, though,I have a solution here.
First, cloudpickle is the mechanism of Spark to move a function from drivers to workers. So functions are pickled and then sent to the workers for execution. So something you are using can't be pickled. In order to quickly test, you can just use:
import cloudpickle
cloudpickle.dumps(x)
where x is something that you are testing if it's cloudpickle-able. In this case, I tried a couple of times and found wordnet not to be serializable. You can test with:
cloudpickle.dumps(wordnet)
and it will reproduce the issue. You can get around this by importing the stuff that can't be pickled inside your function. Here is an end-to-end example for you.
import re
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType,IntegerType,StringType
def preprocess(text):
text = text.lower()
text=text.strip()
text=re.compile('<.*?>').sub('', text)
text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
text = re.sub('\s+', ' ', text)
text = re.sub(r'\[[0-9]*\]',' ',text)
text=re.sub(r'[^\w\s]', '', text.lower().strip())
text = re.sub(r'\d',' ',text)
text = re.sub(r'\s+',' ',text)
return text
#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
text = [i for i in text.split() if not i in stop_words]
return text
def lemmatizer(string):
from nltk.corpus import wordnet
def get_wordnet_pos(tag):
if tag.startswith('J'):
return wordnet.ADJ
elif tag.startswith('V'):
return wordnet.VERB
elif tag.startswith('N'):
return wordnet.NOUN
elif tag.startswith('R'):
return wordnet.ADV
else:
return wordnet.NOUN
word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
return " ".join(a)
#Final Function
def finalpreprocess(string):
return lemmatizer(' '.join(remove_stopwords(preprocess(string))))
spark = SparkSession.builder.getOrCreate()
text = 'Ram and Bheem are buddies. They (both) like <b>running</b>. They got better at it over the weekend'
test = pd.DataFrame({"test": [text]})
sdf = spark.createDataFrame(test)
udf_txt_clean = udf(lambda x: finalpreprocess(x),StringType())
sdf.withColumn("cleaned_text",udf_txt_clean(col("test"))).select("test","cleaned_text").show(10,False)

Local module PyExcelAuto is not imported in behave's steps definition file datasteps.py, giving error Behave(project main/parent folder) is not module

I have created a new step defintion file datasteps.py for data.feature
datasteps.py:
*from Behave.features.PyExcel.PyExcelAuto import PyExcelAuto
pywin = None
from behave import *
#given('excel "{filename}" & "{sheetname}"')
def step_impl(context,filename, sheetname):
global pywin
pywin = PyExcelAuto()
pywin.setSheetName(filename,sheetname)
pywin.setDictionaryForColIndex()
#when('add some data in excelsheet "{sheetname}"')
def step_impl(context,sheetname ):
global pywin
colCount = pywin.getColCount()
rowCount = pywin.getRowCount()
print ("row & col counts are :{row} & {col}".format(rowCount, colCount))
#then(u'verify data is added in excelsheet "{sheetname}"')
def step_impl(context,sheetname):
pass
#then(u'close browser')
def step_impl(context):
pass*
data.feature:
*Feature: Data to Read
Scenario: Data is to be read from Excel
Given excel "data.xlsx" & "dummy"
When add some data in excelsheet "dummy"
Then verify data is added in excelsheet "dummy"
And close browser*
However, when trying to import PyExcelAuto from Behave.features.PyExcel.PyExcelAuto in datasteps.py getting error message "ModuleNotFoundError: No module named 'Behave'"

Dash app connections to AWS postgres DB VERY SLOW

I've created a live-updating dash app connected to a public facing AWS Postgres database. I've put db connection within my callback so it updates, but I find that it takes a long long time to retrieve data and create the graph, such that if the interval time is reduced to 10 seconds or less, no graph loads at all. I've tried to store the data in dcc.store but the initial load still takes a very long time. My abbreviated code is written below. I'm assuming the lag time is from the engine connecting to the database, because I am only reading a few rows and columns. Is there anyway to speed this up?
import plotly.graph_objs as go
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
from plotly.subplots import make_subplots
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import declarative_base
from sqlalchemy import Column, Integer, String, func, Date, ARRAY
from sqlalchemy.orm import sessionmaker
app = dash.Dash(__name__, external_stylesheets=[BS], suppress_callback_exceptions=True, update_title=None)
server=app.server
app.layout = html.Div([
dcc.Store(id='time', storage_type='session'),
dcc.Store(id='blood_pressure', storage_type='session'),
html.Div(dcc.Graph(id='live-graph', animate=False), className='w-100'),
html.Div(id= "testing"),
dcc.Interval(
id='graph-update-BP',
interval=30000,
n_intervals=0
)]), width={"size": 10, "offset": 0.5}),
#app.callback(
dash.dependencies.Output('live-graph', 'figure'),
dash.dependencies.Output('blood_pressure', 'data'),
dash.dependencies.Output('time', 'data'),
[dash.dependencies.Input('graph-update-BP', 'n_intervals')],
Input('live-graph', 'relayoutData'),
)
def update_graph_scatter_1(n):
trace = []
blood_pressure = []
time = []
engine = create_engine("postgresql://username:password#address:5432/xxxxx", echo=True, future=True)
Session = sessionmaker(bind=engine)
session = Session()
Base = automap_base()
Base.prepare(engine, reflect=True)
User = Base.classes.users
Datex = Base.classes.data
for instance in session.query(Datex).filter(Datex.user_id == 3).filter(Datex.date_time == 'Monday,Apr:26'):
blood_pressure.append([instance.systolic, instance.mean, instance.diastolic])
time.append(instance.time)
for i in range(0, len(blood_pressure)):
trace.append(go.Box(y=blood_pressure[i],
x=time[i],
line=dict(color='#6a92ff'),
hoverinfo='all'))
fig = make_subplots(rows=1, cols=1)
def append_trace():
for i in range(0, len(trace)):
fig.append_trace(trace[i], 1, 1)
append_trace()
return fig, blood_pressure, hr,
You can increase performance in your app in the following ways:
Non-programming methods:
If your app is deployed on AWS, ensure your app is connecting to your database over private IP. This reduces the number of networks your data has to traverse and will result in significantly lower latency.
Ensure your virtual machine has enough RAM. (If you're loading 2GB of data to a machine with 1GB available RAM, you're going to see the IO hit disk before loading to your program.)
Programming methods:
Modularize connecting to your database, and only do it once. This decreases the overhead required to reserve resources and authenticate connecting to the database
import os
class DbConnection:
"""Use this class to connect to your database within a dashapp"""
def __init__(self, **kwargs):
self.DB_URI = os.environ.get('DB_URI', kwargs.get('DB_URI'))
self.echo = kwargs.get('echo', True)
self.future = kwargs.get('future', True)
# Now create the engine
self.engine = create_engine(self.DB_URI, echo=self.echo, future=self.self)
# Make the session maker
self.session_maker = sessionmaker(bind=self.engine)
#property
def session(self):
"""Return a session as a property"""
return self.session_maker()
# -------------------------------------------
# In your app, instantiate the database connection
# and map your base
my_db_connection = DbConnection() # provide kwargs as needed
session = my_db_connection.session # necessary to assign property to a variable
# Map the classes
Base = automap_base()
Base.prepare(my_db_connection.engine, reflect=True)
User = Base.classes.users
Datex = Base.classes.data
Cache frequently queried data. Unless your data is massive and dramatically varying, you should expect better performance from loading the data from disk (or RAM) on your machine, than over the network from your database.
from functools import lru_cache
#lru_cache()
def get_blood_pressure(session, user_id, date):
"""returns blood pressure for a given user for a given date"""
blood_pressure, time = [], []
query = session.query(Datex)\
.filter(Datex.user_id == 3)\
.filter(Datex.date_time == 'Monday,Apr:26')
# I like short variable names when interacting with db results
for rec in query:
time.append(rec.time)
blood_pressure.append([rec.systolic, rec.mean, rec.diastolic])
# finally
return blood_pressure, time
Putting them all together, your callback should be a lot quicker
def update_graph_scatter_1(n):
# I'm not sure how these variables will be assigned
# but you'll figure it out
blood_pressure, time = get_blood_pressure(session=session, user_id=user_id, date='Monday,Apr:26')
# Create new traces
for i in range(0, len(blood_pressure)):
trace.append(go.Box(
y=blood_pressure[i],
x=time[i],
line=dict(color='#6a92ff'),
hoverinfo='all'
))
# Add to subplots
fig = make_subplots(rows=1, cols=1)
for i in range(0, len(trace)):
fig.append_trace(trace[i], 1, 1)
return fig, blood_pressure, time
Lastly, it looks like you're recreating your graph objects each update. This is a heavy operation. I'd recommend updating the graph's data instead. I know this is possible, since I've done this in the past. But it looks like the solution is not-trivial, unfortunately. Perhaps an item for a later response or follow up Q.
Further reading: https://dash.plotly.com/performance

How do i implement a locustfile where each locust takes unique value from csv files for it's task?

enter code here
from locust import HttpLocust, TaskSet, task
class ExampleTask(TaskSet):
csvfile = open('failed.csv', 'r')
data = csvfile.readlines()
bakdata = list(data)
#task
def fun(self):
try:
value = self.data.pop().split(',')
print('------This is the value {}'.format(value[0]))
except IndexError:
self.data = list(self.bakdata)
class ExampleUser(HttpLocust):
host = 'https://www.google.com'
task_set = ExampleTask
Following my csv file:
516,True,success
517,True,success
518,True,success
519,True,success
520,True,success
521,True,success
522,True,success
523,True,success
524,True,success
525,True,success
526,True,success
527,True,success
528,True,success
529,True,success
530,True,success
531,True,success
532,True,success
533,True,success
534,True,success
535,True,success
536,True,success
537,True,success
538,True,success
539,True,success
540,True,success
541,True,success
542,True,success
543,True,success
544,True,success
545,True,success
546,True,success
547,True,success
548,True,success
549,True,success
550,True,success
551,True,success
552,True,success
553,True,success
554,True,success
555,True,success
556,True,success
557,True,success
558,True,success
559,True,success
Here after csv file end , locust does not takes unique value, it takes same value for all the users which is simulated.
I'm not 100% sure, but I think your problem is this line:
self.data = list(self.bakdata)
This will give each User instance a different copy of the list.
It should work if you change it to:
ExampleTask.data = list(self.bakdata)
Or you can use locust-plugins's CSVReader, see the example here:
https://github.com/SvenskaSpel/locust-plugins/blob/master/examples/csvreader_ex.py

SQLAlchemy Core CREATE TEMPORARY TABLE AS

I'm trying to use the PostgreSQL CREATE TEMPORARY TABLE foo AS SELECT... query in SQLAlchemy Core. I've looked through the docs but don't see a way to do this.
I have a SQLA statement object. How do I create a temporary table from its results?
This is what I came up with. Please tell me if this is the wrong way to do it.
from sqlalchemy.sql import Select
from sqlalchemy.ext.compiler import compiles
class CreateTableAs(Select):
"""Create a CREATE TABLE AS SELECT ... statement."""
def __init__(self, columns, new_table_name, is_temporary=False,
on_commit_delete_rows=False, on_commit_drop=False, *arg, **kw):
"""By default the table sticks around after the transaction. You can
change this behavior using the `on_commit_delete_rows` or
`on_commit_drop` arguments.
:param on_commit_delete_rows: All rows in the temporary table will be
deleted at the end of each transaction block.
:param on_commit_drop: The temporary table will be dropped at the end
of the transaction block.
"""
super(CreateTableAs, self).__init__(columns, *arg, **kw)
self.is_temporary = is_temporary
self.new_table_name = new_table_name
self.on_commit_delete_rows = on_commit_delete_rows
self.on_commit_drop = on_commit_drop
#compiles(CreateTableAs)
def s_create_table_as(element, compiler, **kw):
"""Compile the statement."""
text = compiler.visit_select(element)
spec = ['CREATE', 'TABLE', element.new_table_name, 'AS SELECT']
if element.is_temporary:
spec.insert(1, 'TEMPORARY')
on_commit = None
if element.on_commit_delete_rows:
on_commit = 'ON COMMIT DELETE ROWS'
elif element.on_commit_drop:
on_commit = 'ON COMMIT DROP'
if on_commit:
spec.insert(len(spec)-1, on_commit)
text = text.replace('SELECT', ' '.join(spec))
return text