I am writing an entire Pandas DataFrame as bytestream into a flat file and into a Mongo Database, e.g here
import logging
from io import BytesIO
import pandas as pd
import numpy as np
from pymongo import MongoClient
from uuid import uuid4
logger = logging.getLogger(__name__)
logging.basicConfig(
    format='%(asctime)s %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Keyword arguments passed to MongoClient for each deployment to exercise,
# keyed by a human-readable label that is only used for logging.
MONGODB_SETTINGS = {
    "Local": {
        'host': 'host.docker.internal',
        'port': 27017,
    },
}

if __name__ == '__main__':
    separator = 50 * "-"

    logger.info("Start")
    frame = pd.DataFrame(data=np.random.randn(300_000, 5), columns=list("ABCDE"))
    logger.info("Constructed Frame (object)")

    # Serialise the frame to Parquet bytes once; the same payload is written
    # to the flat file and to every configured database.
    name = str(uuid4())
    bytestream = frame.to_parquet()
    post = {"frame": bytestream, "name": name}
    logger.info("Dictionary constructed")

    with open('data/tmp/output', 'wb') as output_file:
        output_file.write(bytestream)
    logger.info("Bytestrem written to disk")

    for label, connection_kwargs in MONGODB_SETTINGS.items():
        logger.info(separator)
        logger.info(label)
        with MongoClient(**connection_kwargs) as client:
            collection = client.capture.frame
            logger.info("Start writinng into Database")
            collection.insert_one(post)
            logger.info("Object written to Database")

            # Read the frame back from the database and check the round trip.
            stored = collection.find_one({"name": name})
            with BytesIO(stored["frame"]) as buffer:
                frame_out = pd.read_parquet(buffer)
            logger.info("Object read from Database")
            pd.testing.assert_frame_equal(frame, frame_out)
    logger.info(separator)
It takes about 0.1 s to write the file, whereas it takes 3 s to write into the Mongo database. The database is hosted on my local computer and runs within the standard Mongo image. Am I missing something? Is that loss of speed normal?
With code as inefficient as that, yes.
Related
Trying to receive input from WCT_Control into WCT_DataPull
I can't figure out how to get the data into WCT_DataPull to perform an action with it. I think I am going about this backwards, but I also think I have been staring at it too long.
Essentially, the user enters the information necessary into a GUI to connect to a specific SQL table (predetermined) and then saves the data in the table and outputs it as a csv file backup.
I want the user to click the submit button, and that should create the backup. However, at this point, when I click the button it stores all the data in the variables (if I put a print statement in, I see the correct values), but I can't seem to figure out how to get the variables to WCT_DataPull, where the backup creation is performed.
WCT_Control
from PyQt5.QtWidgets import *
from WCT_View import Ui_MainWindow
class Controller(QMainWindow, Ui_MainWindow):
    """Main window that collects the connection details for a backup.

    The widgets used here (``run`` and the ``*_entry`` fields) are presumably
    created by ``Ui_MainWindow.setupUi`` -- confirm against WCT_View.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.setupUi(self)
        # Wire the button once, here. The lambda discards the ``checked``
        # flag that the ``clicked`` signal emits.
        self.run.clicked.connect(lambda: self.submit())

    def submit(self):
        """Read every form field, clear the form, and return the values.

        The original body also called ``self.run.clicked.connect()`` with no
        slot; ``connect`` requires one, and the signal is already connected
        in ``__init__``, so that call is gone.

        Returns:
            tuple: ``(server, database, station, app, backup_name)`` as the
            text that was in the corresponding entry widgets.
        """
        fields = (
            self.server_entry,
            self.data_entry,
            self.station_entry,
            self.app_entry,
            self.filename_entry,
        )
        # Capture all values before clearing anything.
        values = tuple(field.text() for field in fields)
        for field in fields:
            field.setText('')
        return values
WCT_DataPull
from WCT_Control import *
import pyodbc
import csv
pull_data = Controller()


def write_bak():
    """Print the matching settings rows and write them to a CSV backup.

    Server, database, station, application and output file name are taken
    from ``pull_data.submit()``.
    """
    driver = 'ODBC Driver 17 for SQL Server'
    serv, data, stat, app, bak_name = pull_data.submit()

    connection_string = 'DRIVER={0};SERVER={1};DATABASE={2};Trusted_Connection=yes'.format(driver, serv, data)
    conn = pyodbc.connect(connection_string)
    cursor = conn.cursor()

    # The query text is kept flush-left so the SQL sent to the server is
    # exactly the original string.
    result = cursor.execute("""
select DnsName, PackageName, Code, Value from WorkstationApplicationSettings
where DnsName=? and PackageName=?
""", stat, app)
    rows = result.fetchall()

    for record in rows:
        print(record.PackageName, ':', record.Code, ':', record.Value)

    with open(bak_name, 'w', newline='') as backup_file:
        csv.writer(backup_file).writerows(rows)
So you just have to do things the opposite way: instead of using PyQt5 inside the WCT_DataPull script, call the WCT_DataPull function from the PyQt5 script.
WCT_Control
from PyQt5.QtWidgets import *
from WCT_DataPull import write_bak
from WCT_View import Ui_MainWindow
class Controller(QMainWindow, Ui_MainWindow):
    """Main window that collects connection details and triggers the backup.

    The widgets used here (``run`` and the ``*_entry`` fields) are presumably
    created by ``Ui_MainWindow.setupUi`` -- confirm against WCT_View.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.setupUi(self)
        # Wire the button once, here. The lambda discards the ``checked``
        # flag that the ``clicked`` signal emits.
        self.run.clicked.connect(lambda: self.submit())

    def submit(self):
        """Read the form, clear it, and hand the values to ``write_bak``.

        Two defects of the earlier version are fixed: the argument-less
        ``self.run.clicked.connect()`` call (``connect`` requires a slot) is
        removed, and ``write_bak`` now receives the local variables that are
        actually defined in this method.
        """
        server = self.server_entry.text()
        database = self.data_entry.text()
        station = self.station_entry.text()
        app = self.app_entry.text()
        backup_name = self.filename_entry.text()

        for field in (
            self.server_entry,
            self.data_entry,
            self.station_entry,
            self.app_entry,
            self.filename_entry,
        ):
            field.setText('')

        write_bak(server, database, station, app, backup_name)
WCT_DataPull
import pyodbc
import csv
def write_bak(serv, data, stat, app, bak_name):
    """Print the matching settings rows and write them to a CSV backup.

    Args:
        serv: SQL Server host used in the connection string.
        data: Database name used in the connection string.
        stat: Value matched against the ``DnsName`` column.
        app: Value matched against the ``PackageName`` column.
        bak_name: Path of the CSV file to create (overwritten if present).
    """
    driver = 'ODBC Driver 17 for SQL Server'
    conn = pyodbc.connect(
        'DRIVER={0};SERVER={1};DATABASE={2};Trusted_Connection=yes'.format(driver, serv, data)
    )
    try:
        cursor = conn.cursor()
        # Values are bound as parameters, never interpolated into the SQL.
        rows = cursor.execute("""
select DnsName, PackageName, Code, Value from WorkstationApplicationSettings
where DnsName=? and PackageName=?
""", stat, app).fetchall()
    finally:
        # The original never closed the connection; release it even when the
        # query raises.
        conn.close()

    for row in rows:
        print(row.PackageName, ':', row.Code, ':', row.Value)

    with open(bak_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(rows)
Also make sure to write the code that runs WCT_Control using QApplication.
I've created a live-updating Dash app connected to a public-facing AWS Postgres database. I've put the db connection within my callback so it updates, but I find that it takes a very long time to retrieve data and create the graph, such that if the interval time is reduced to 10 seconds or less, no graph loads at all. I've tried to store the data in dcc.Store, but the initial load still takes a very long time. My abbreviated code is written below. I'm assuming the lag time is from the engine connecting to the database, because I am only reading a few rows and columns. Is there any way to speed this up?
import plotly.graph_objs as go
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
from plotly.subplots import make_subplots
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import declarative_base
from sqlalchemy import Column, Integer, String, func, Date, ARRAY
from sqlalchemy.orm import sessionmaker
# Dash application object and the WSGI server it wraps.
# NOTE(review): `BS` is not defined anywhere in this snippet -- presumably a
# stylesheet URL that was cut when the code was abbreviated.
app = dash.Dash(__name__, external_stylesheets=[BS], suppress_callback_exceptions=True, update_title=None)
server=app.server
# Page layout: two session-scoped stores, the graph, a placeholder div and the
# interval timer that drives the callback.
app.layout = html.Div([
dcc.Store(id='time', storage_type='session'),
dcc.Store(id='blood_pressure', storage_type='session'),
html.Div(dcc.Graph(id='live-graph', animate=False), className='w-100'),
html.Div(id= "testing"),
# Interval value is 30000 (milliseconds per the Dash docs, i.e. 30 s).
dcc.Interval(
id='graph-update-BP',
interval=30000,
n_intervals=0
# NOTE(review): the closing brackets and the trailing `width=...` below do not
# balance; they look like the tail of a wrapper component that was removed when
# the code was abbreviated.
)]), width={"size": 10, "offset": 0.5}),
@app.callback(
    Output('live-graph', 'figure'),
    Output('blood_pressure', 'data'),
    Output('time', 'data'),
    Input('graph-update-BP', 'n_intervals'),
    Input('live-graph', 'relayoutData'),
)
def update_graph_scatter_1(n, relayout_data=None):
    """Rebuild the blood-pressure box plot from the database.

    Args:
        n: Tick count of the ``graph-update-BP`` interval (unused).
        relayout_data: ``relayoutData`` of the graph (unused). It is declared
            because the callback registers two inputs, and defaulted so that
            one-argument calls keep working.

    Returns:
        tuple: ``(figure, blood_pressure, time)`` matching the three outputs.
    """
    # NOTE(review): the engine, session factory and reflected classes are
    # rebuilt on every tick, exactly as in the original snippet; this is the
    # part the surrounding discussion identifies as slow.
    # The URL separates credentials from host with '@' (it was garbled to '#').
    engine = create_engine("postgresql://username:password@address:5432/xxxxx", echo=True, future=True)
    Base = automap_base()
    Base.prepare(engine, reflect=True)
    Datex = Base.classes.data

    blood_pressure = []
    time = []
    Session = sessionmaker(bind=engine)
    # The context manager closes the session, which the original leaked.
    with Session() as session:
        query = (
            session.query(Datex)
            .filter(Datex.user_id == 3)
            .filter(Datex.date_time == 'Monday,Apr:26')
        )
        for instance in query:
            blood_pressure.append([instance.systolic, instance.mean, instance.diastolic])
            time.append(instance.time)

    fig = make_subplots(rows=1, cols=1)
    for readings, timestamp in zip(blood_pressure, time):
        box = go.Box(
            y=readings,
            x=timestamp,
            line=dict(color='#6a92ff'),
            hoverinfo='all',
        )
        fig.append_trace(box, 1, 1)

    # The original returned an undefined name `hr` for the 'time' store.
    return fig, blood_pressure, time
You can increase performance in your app in the following ways:
Non-programming methods:
If your app is deployed on AWS, ensure your app is connecting to your database over private IP. This reduces the number of networks your data has to traverse and will result in significantly lower latency.
Ensure your virtual machine has enough RAM. (If you're loading 2GB of data to a machine with 1GB available RAM, you're going to see the IO hit disk before loading to your program.)
Programming methods:
Modularize connecting to your database, and only do it once. This decreases the overhead required to reserve resources and authenticate connecting to the database
import os
class DbConnection:
    """Use this class to connect to your database within a dashapp.

    Keyword Args:
        DB_URI: Database URL; used only when the ``DB_URI`` environment
            variable is not set, because the environment takes precedence.
        echo: Passed to ``create_engine`` (default ``True``).
        future: Passed to ``create_engine`` (default ``True``).
    """

    def __init__(self, **kwargs):
        self.DB_URI = os.environ.get('DB_URI', kwargs.get('DB_URI'))
        self.echo = kwargs.get('echo', True)
        self.future = kwargs.get('future', True)

        # Create the engine once. The original passed `future=self.self`,
        # which raises AttributeError.
        self.engine = create_engine(self.DB_URI, echo=self.echo, future=self.future)

        # Session factory bound to that engine.
        self.session_maker = sessionmaker(bind=self.engine)

    @property
    def session(self):
        """Return a session; every access calls the session maker again."""
        return self.session_maker()
# -------------------------------------------
# In your app, instantiate the database connection
# and map your base
# This runs once at import time, so callbacks can reuse the engine and the
# mapped classes instead of rebuilding them.
my_db_connection = DbConnection() # provide kwargs as needed
# `session` is a property that calls the session maker on each access, so the
# result is bound to a name here and that one object is reused.
session = my_db_connection.session # necessary to assign property to a variable
# Map the classes
Base = automap_base()
# Reflect the schema through the shared engine.
Base.prepare(my_db_connection.engine, reflect=True)
# Mapped classes for the `users` and `data` tables.
User = Base.classes.users
Datex = Base.classes.data
Cache frequently queried data. Unless your data is massive and dramatically varying, you should expect better performance from loading the data from disk (or RAM) on your machine, than over the network from your database.
from functools import lru_cache
@lru_cache(maxsize=128)
def get_blood_pressure(session, user_id, date):
    """Return the blood-pressure readings of one user on one date.

    The original ignored ``user_id`` and ``date`` and always queried user 3
    on 'Monday,Apr:26'; the arguments are now used in the filter.

    Results are memoised per ``(session, user_id, date)``. A cached entry is
    never refreshed, and the same list objects are handed to every caller, so
    treat them as read-only.

    Args:
        session: Session used to run the query (must be hashable).
        user_id: Value compared with ``Datex.user_id``.
        date: Value compared with ``Datex.date_time``.

    Returns:
        tuple: ``(blood_pressure, time)`` where ``blood_pressure`` is a list
        of ``[systolic, mean, diastolic]`` lists and ``time`` is the list of
        the corresponding ``time`` values.
    """
    query = (
        session.query(Datex)
        .filter(Datex.user_id == user_id)
        .filter(Datex.date_time == date)
    )

    blood_pressure, time = [], []
    for rec in query:
        time.append(rec.time)
        blood_pressure.append([rec.systolic, rec.mean, rec.diastolic])
    return blood_pressure, time
Putting them all together, your callback should be a lot quicker
def update_graph_scatter_1(n):
    """Build the box-plot figure from the cached blood-pressure readings.

    Args:
        n: Interval tick count supplied by the callback (unused).

    Returns:
        tuple: ``(figure, blood_pressure, time)``.
    """
    # TODO: decide where the user id really comes from. 3 is the value that
    # was hard-coded in the query this callback replaces. The original left
    # `user_id` undefined, which raised NameError.
    user_id = 3
    blood_pressure, time = get_blood_pressure(session=session, user_id=user_id, date='Monday,Apr:26')

    # One box per reading, added to a single-cell subplot grid. The original
    # appended to a `trace` list that was never created.
    fig = make_subplots(rows=1, cols=1)
    for readings, timestamp in zip(blood_pressure, time):
        box = go.Box(
            y=readings,
            x=timestamp,
            line=dict(color='#6a92ff'),
            hoverinfo='all',
        )
        fig.append_trace(box, 1, 1)

    return fig, blood_pressure, time
Lastly, it looks like you're recreating your graph objects on each update. This is a heavy operation. I'd recommend updating the graph's data instead. I know this is possible, since I've done it in the past, but unfortunately the solution is non-trivial. Perhaps an item for a later response or a follow-up question.
Further reading: https://dash.plotly.com/performance
So i had a much bigger problem ,but i sorted that out. Now, i have this error command:
pygame 2.0.1 (SDL 2.0.14, Python 3.7.9)
Hello from the pygame community. https://www.pygame.org/contribute.html
Traceback (most recent call last):
File "c:/Users/danku/jarvis.py", line 16, in
engine = pyttsx3.init('sapi5')
AttributeError: module 'pyttsx3' has no attribute 'init'
i tried kinda everything(installed pygame,pypiwin32,pywintypes) but i cant figure it out. Here is my beloved code (dont laugh its jarvis code):
#alap
import pyttsx3
import datetime
import speech_recognition as sr
import wikipedia
import webbrowser
import os
import pywhatkit
import pyjokes
import subprocess
import pywintypes
import win32com.client
import pygame
# Text-to-speech engine shared by every helper below, using the 'sapi5' driver.
engine = pyttsx3.init('sapi5')


def speak(audio):
    """Queue *audio* on the shared engine and block until it has been spoken."""
    engine.say(audio)
    engine.runAndWait()
def time():
    """Speak the current local time formatted as ``HH:MM:SS``."""
    now = datetime.datetime.now()
    speak(now.strftime("%H:%M:%S"))
def date():
    """Speak today's day of the month, month number and year, in that order."""
    # Read the clock once: three separate now() calls could straddle midnight
    # and announce parts of two different dates.
    today = datetime.datetime.now()
    for part in (today.day, today.month, today.year):
        speak(part)
def wishme():
    """Greet the user: announce time and date, then a time-of-day greeting."""
    speak("Welcome back sir! All system are ready for work!")
    speak("the current time is")
    time()
    speak("The current date is")
    date()

    # Pick the greeting from the hour: 6-11 morning, 12-17 afternoon,
    # 18-23 evening, anything else night.
    hour = datetime.datetime.now().hour
    if 6 <= hour < 12:
        greeting = "Good morning sir!"
    elif 12 <= hour < 18:
        greeting = "Good afternoon sir!"
    elif 18 <= hour < 24:
        greeting = "Good evening sir!"
    else:
        greeting = "Good night sir!"
    speak(greeting)

    speak("Jarvis at your service. Please tell me how can i help you?")
def takeCommand():
    """Listen on the microphone and return the recognised phrase.

    Returns:
        The text returned by ``recognize_google``, or the string ``"none"``
        when recognition raises for any reason.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("listening...")
        recognizer.pause_threshold = 1
        audio = recognizer.listen(source)

    try:
        print("Recognizing...")
        query = recognizer.recognize_google(audio, language='en-US')
        print(query)
    except Exception as error:
        # Best effort: report the problem and ask the user to repeat.
        print(error)
        speak("Say that again")
        return "none"
    else:
        return query
if __name__ == "__main__":
    wishme()
    while True:
        query = takeCommand().lower()

        # Only commands that mention Wikipedia are handled; everything else
        # goes straight back to listening.
        if 'wikipedia' not in query:
            continue

        speak('Searching Wikipedia...')
        topic = query.replace("wikipedia", "")
        results = wikipedia.summary(topic, sentences=2)
        speak("According to Wikipedia")
        print(results)
        speak(results)
``
Also i'm using python 2.71, and latest of pip.
It is normally because you have named your Python file the same as the module you are importing and caused a circular reference. Try changing the name of your file. It should resolve the issue.
import boto3
import pandas as pd
import io
def lambda_handler(event, context):
    """Load the Excel object named in an S3 event into a DataFrame and print it.

    Args:
        event: S3 notification payload. Only ``event['Records'][0]`` is read.
            A falsy event is ignored.
        context: Lambda context object (unused).

    Returns:
        None.
    """
    if not event:
        return

    # Function-scope import so this block does not depend on a module-level
    # import that may not exist.
    from urllib.parse import unquote_plus

    s3_info = event['Records'][0]['s3']
    bucket = s3_info['bucket']['name']
    # Object keys arrive URL-encoded in S3 event notifications (a space is
    # sent as '+'), so decode before asking S3 for the object.
    key = unquote_plus(s3_info['object']['key'])

    s3_client = boto3.client('s3')
    file_obj = s3_client.get_object(Bucket=bucket, Key=key)
    file_content = file_obj['Body'].read()

    df = pd.read_excel(io.BytesIO(file_content))
    print(df)
I am trying to upload excel sheet data from s3 to amazon rds (Postgres). The above code is what I have to extract data from s3. How can I upload the data from here to postgres, Please Help.
from PIL import Image
from bson import Binary
# Image.open returns a PIL Image object, not the raw bytes of the file.
img = Image.open('test.jpg')
# This is the call that raises the TypeError reported below: Binary is given
# the Image object, while the error message says it needs `bytes`.
img = Binary(img)
throws an error stating TypeError : data must be an instance of bytes
Why does this happen? And how to resolve this to store the img to MongoDB?
As long as the document doesn't exceed 16MB standard bson is fine, otherwise gridfs should be used. The example below shows how an image can be inserted and read back from a mongodb.
insert_image.py
from pymongo import MongoClient
from PIL import Image
import io
# The client and the opened image both hold OS resources, so each is scoped
# with a context manager. The original left both open.
with MongoClient() as client:
    db = client.testdb
    images = db.images

    # Re-encode the picture into an in-memory JPEG.
    with Image.open("./image.jpg") as im:
        image_bytes = io.BytesIO()
        im.save(image_bytes, format='JPEG')

    # The raw bytes are stored under the 'data' key of the document.
    image = {
        'data': image_bytes.getvalue()
    }
    image_id = images.insert_one(image).inserted_id
read_image.py
from pymongo import MongoClient
from bson.binary import Binary
from PIL import Image
import io
import matplotlib.pyplot as plt
# Fetch one stored document, then release the connection.
with MongoClient() as client:
    db = client.testdb
    images = db.images
    image = images.find_one()

# find_one() returns None when the collection is empty; fail with a clear
# message rather than a TypeError on the subscript below.
if image is None:
    raise LookupError("no image document found in testdb.images")

# Decode the stored bytes back into a PIL image and display it.
pil_img = Image.open(io.BytesIO(image['data']))
plt.imshow(pil_img)
plt.show()
You need to convert the image into a Byte array. You can do this as follows,
import io  # needed for io.BytesIO below; the original snippet omitted it

from PIL import Image
from bson import Binary

img = Image.open('test.jpg')
# Serialise the image into an in-memory buffer (PNG-encoded) ...
imgByteArr = io.BytesIO()
img.save(imgByteArr, format='PNG')
# ... then keep only the raw bytes.
imgByteArr = imgByteArr.getvalue()
You can try to save imgByteArr into mongo
OR
You can convert image into string and then store it in mongo:
import base64

# Encode the file as base64 text. b64encode returns bytes, so decode to get
# a real string. The name `encoded` avoids shadowing the builtin `str`.
with open("test.jpg", "rb") as imageFile:
    encoded = base64.b64encode(imageFile.read()).decode('ascii')
# store encoded in mongo

# To get back image
# base64.b64decode replaces the Python-2-only str.decode('base64').
with open("test2.jpg", "wb") as fimage:
    fimage.write(base64.b64decode(encoded))