I am trying to develop a script that will run all the Spark SQL queries kept in a directory.
I have been able to do that in Python, but PySpark is a different game.
Below is the Python script I use to read and execute all the query files in a directory.
import sys, csv, sqlite3, codecs, unicodedata, string, glob, os
import psycopg2
conn = psycopg2.connect(database="xxx", user="xxxx", password="xxxx", host="localhost", port="5432")
cur = conn.cursor()
print("done")
with open("*.txt", "r") as ins:
    for line in ins:
        words = line.split('|')
        print(words)
        query = words[0]
        pmicode = words[1]
        print(query)
        cur = conn.cursor()
        cur.execute(query)
        conn.commit()
conn.close()
Is it possible to replicate this in PySpark?
Thanks,
Pankaj
I'm guessing you want PySpark to pull data from the Postgres database that you're using in this Python script.
If the current code in Python is something like:
import sys, csv, sqlite3, codecs, unicodedata, string, glob, os
import psycopg2

conn = psycopg2.connect(database="xxx", user="xxxx", password="xxxx", host="localhost", port="5432")
cur = conn.cursor()
print("done")

def runSQL(query):
    cur = conn.cursor()
    cur.execute(query)
    conn.commit()

with open("*.txt", "r") as ins:
    for line in ins:
        words = line.split('|')
        print(words)
        query = words[0]
        pmicode = words[1]
        print(query)
        runSQL(query)

conn.close()
The equivalent would be to use JDBC to connect and execute the commands through a sqlContext:
import sys, csv, sqlite3, codecs, unicodedata, string, glob, os

postgres_url = 'jdbc:postgresql://localhost:5432/database'
properties = {"user": "xxxx", "password": "xxxx"}
print("done")

def runSQL(query):
    # wrap the query in a subquery alias so it can be pushed down over JDBC
    return sqlContext.read.jdbc(
        url=postgres_url,
        table="( {0} ) TEMPDB_SPARK_DELINQ".format(query),
        properties=properties)

with open("*.txt", "r") as ins:
    for line in ins:
        words = line.split('|')
        print(words)
        query = words[0]
        pmicode = words[1]
        print(query)
        runSQL(query)
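One caveat in both versions: open("*.txt", "r") does not expand wildcards, so neither script actually walks a directory. A minimal sketch of the loop over the query files, reusing the runSQL above (the directory path is just a placeholder, and each line is assumed to be query|pmicode as in your script):

import glob

for path in glob.glob("/path/to/queries/*.txt"):   # hypothetical directory
    with open(path, "r") as ins:
        for line in ins:
            words = line.split('|')
            query = words[0]
            pmicode = words[1]
            df = runSQL(query)   # returns a Spark DataFrame
            df.show()            # or write/collect it, depending on what you need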
I'm new to Streamlit, and I wanted to build an interactive app that updates a CSV file when an "update" button is pressed.
I defined three editable columns ("A", "B", "C") from my CSV file (which has two other columns). For some reason, when I make my edits and press the button, only one of them ("A") is actually updated, while the other two are not.
Here is my code:
import argparse
import os

import pandas as pd
import streamlit as st
from st_aggrid import AgGrid, GridOptionsBuilder


def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("-d", "--data", type=str, required=False, default="data.csv")
    args = None
    try:
        args = parser.parse_args()
    except SystemExit as e:
        os._exit(e.code)
    st.set_page_config(layout="wide")
    grid_table = show_grid(args.data)
    st.sidebar.header("Options:")
    st.sidebar.button("Update CSV file", on_click=update, args=[grid_table, args.data])


def data_upload(data_path):
    df = pd.read_csv(data_path)
    return df


def show_grid(data_path):
    df = data_upload(data_path)
    gb = GridOptionsBuilder.from_dataframe(df)
    gb.configure_column("A", editable=True)
    gb.configure_column("B", editable=True)
    gb.configure_column("C", editable=True)
    grid_table = AgGrid(
        df,
        height=400,
        gridOptions=gb.build(),
        fit_columns_on_grid_load=True,
        allow_unsafe_jscode=True,
    )
    return grid_table


def update(grid_table, data_path):
    grid_table_df = pd.DataFrame(grid_table['data'])
    grid_table_df.to_csv(data_path, index=False)


if __name__ == '__main__':
    main()
I tried to update columns "B" and "C" and expected them to change, but only edits to column "A" seem to reach the CSV when I press the update button.
I'm trying to insert the rows of a DataFrame into a Postgres database and write the generated primary keys back into this DataFrame.
I'm doing this:
import pg8000
import pyspark.sql.functions as sf


def createConnexionRds():
    host = "..."
    database = "..."
    conn = pg8000.connect(
        user="...",
        host=host,
        database=database,
        password="...",
        ssl_context=True)
    return conn


def insertProcess(r):
    conn = createConnexionRds()
    insertResults = conn.run(r["tmp_query"])
    insertResult = "NOT_INSERTED"
    if len(insertResults) > 0:
        insertResult = insertResults[0][0]
    conn.commit()
    conn.close()
    return insertResult


def insertPerQuery(myDataframe):
    query = sf.lit("insert into tabAAA (colBBB) values ('valueCCC') returning idAAA")
    myDataframe = myDataframe.withColumn("tmp_query", query)
    myDataframe = myDataframe.drop("idAAA")
    rdd = myDataframe.rdd.map(
        lambda x: (*x, insertProcess(x))
    )
    myDataframe = myDataframe.withColumn("idAAA", sf.lit(""))
    myDataframe = sqlContext.createDataFrame(rdd, myDataframe.schema)
    myDataframe = myDataframe.drop("tmp_query")
    return myDataframe


# df and sqlContext come from the surrounding Glue job context
df = insertPerQuery(df)
# df.show(100, False)
The issue is that when I comment out df.show(...) (the last line), the inserts are not processed. And if I add a second df.show(), the inserts are duplicated.
This is for an AWS Glue job.
Thanks.
This is due to the lazy evaluation nature of Spark. The code only gets executed on the executors once you call an action, in this case .show(). Transformations such as rdd.map are merely recorded; nothing runs until an action needs the result, and every action that touches it re-runs the lineage, which is why the inserts happen zero times without the show() and twice with two of them.
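A minimal sketch of one way to make the inserts run once, reusing the insertPerQuery from the question: trigger the side-effecting map yourself with a single action and cache the result, so a later df.show() reads the cached rows instead of re-running the lambda (persist() reduces, but does not strictly guarantee, recomputation if partitions are evicted):

from pyspark import StorageLevel

df = insertPerQuery(df)
df = df.persist(StorageLevel.MEMORY_AND_DISK)  # keep the computed rows around
df.count()  # an action: forces the map with insertProcess() to run now
# later calls such as df.show(100, False) reuse the cached result

For side effects like database inserts, a more robust pattern is usually to perform them in foreachPartition (one connection per partition) and write the returned ids out explicitly, rather than relying on caching.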
I have the following code:
def main(args: Array[String]) {
  var dvfFiles: String = "g:/data/gouv/dvf/raw"
  var q: String = ""
  //q = "SELECT distinct DateMutation, NVoie, IndVoie, Voie, Valeur, CodeTypeLocal, TypeLocal, Commune FROM mutations WHERE Commune = 'ICI' and Valeur > 100000 and CodeTypeLocal in (1, 2) order by Valeur desc"
  args.sliding(2, 2).toList.collect {
    case Array("--sfiles", argFiles: String) => dvfFiles = argFiles
    case Array("--squery", argQ: String) => q = argQ
  }
  println(s"files from: ${dvfFiles}")
If I run the following command:
G:\dev\fromGit\dvf\spark>spark-submit .\target\scala-2.11\dfvqueryer_2.11-1.0.jar \
--squery "SELECT distinct DateMutation, NVoie, IndVoie, Voie, Valeur, CodeTypeLocal, \
TypeLocal, Commune FROM mutations WHERE (Commune = 'ICI') and (Valeur > 100000) and (CodeTypeLocal in (1, 2)) order by Valeur desc"
I got the following result:
== SQL ==
SELECT distinct DateMutation, NVoie, IndVoie, Voie, Valeur, CodeTypeLocal, TypeLocal, Commune FROM mutations WHERE (Commune = 'ICI') and (Valeur and (CodeTypeLocal in (1, 2)) order by Valeur desc
----------------------------------------------------------------------------------------------^^^
The ^^^ points at the FROM.
I also notice that the > 100000 after Valeur is missing.
The query itself is correct, because if I uncomment the //q = ... line, package the code and submit it, everything works fine.
It seems that part of the query is being swallowed while the arguments are read. One solution to this problem would be to send the entire SELECT query as a single argument and load it into a string value. In that form it can be passed straight to the sql function to run your query. Below is how you can build out the function:
//The Package Tree
package stack.overFlow

//Call all needed packages
import org.apache.spark.sql.{DataFrame, SparkSession, Column, SQLContext}
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql

//Object Name
object demoCode {
  def main(args: Array[String]) {
    //Build the contexts
    var spark = SparkSession.builder.enableHiveSupport().getOrCreate()
    var sc = spark.sparkContext
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._

    //Set the query as a string for argument 1
    val commandQuery: String = args(0)

    //Pass query to the sql function
    val inputDF = spark.sql(commandQuery)
  }
}
Once the code compiles you will need two things: (1) the JAR, and (2) the package tree and class for running the function. When running both of those with --class, all you need to do is include a space and pass through the SQL query (quoted, so it arrives as a single argument) so that at run time it is loaded into the Spark session.
spark-submit --class stack.overFlow.demoCode /home/user/demo_code/target/demoCode-compilation-jar.jar \
"SELECT distinct DateMutation, NVoie, IndVoie, Voie, Valeur, CodeTypeLocal, TypeLocal, Commune FROM mutations WHERE (Commune = 'ICI') and (Valeur > 100000) and (CodeTypeLocal in (1, 2)) order by Valeur desc"
Would this help your use-case or do you need it to be in another format?
I have a SnowflakeApi class in Python which just works as a wrapper on top of the SnowflakeConnection class. My SnowflakeApi is:
import logging
import os
from snowflake.connector import connect


class SnowflakeApi(object):
    """
    Wrapper to handle snowflake connection
    """

    def __init__(self, account, warehouse, database, user, pwd):
        """
        Handles snowflake connection. Connection must be closed once it is no longer needed
        :param account:
        :param warehouse:
        :param database:
        """
        self.__acct = self._account_url(account)
        self.__wh = warehouse
        self.__db = database
        self.__connection = None
        self.__user = user
        self.__pwd = pwd

    def __create_connection(self):
        try:
            # set the proxy here
            conn = connect(
                account=self.__acct
                , user=self.__user
                , password=self.__pwd
                , warehouse=self.__wh
                , database=self.__db
            )
            return conn
        except:
            raise Exception(
                "Unable to connect to snowflake for user: '{0}', warehouse: '{1}', database: '{2}'".format(
                    self.__user, self.__wh, self.__db))

    def get_connection(self):
        """
        Gets a snowflake connection. If the connection has already been initialised it is returned
        otherwise a new connection is created
        :param credentials_func: method to get database credentials.
        :return:
        """
        try:
            if self.__connection is None:
                self.__connection = self.__create_connection()
            return self.__connection
        except:
            raise Exception("Unable to initalise Snowflake connection")

    def close_connection(self):
        """
        Closes snowflake connection.
        :return:
        """
        self.__connection.close()
The namespace for SnowflakeApi is connection.snowflake_connection.SnowflakeApi (i.e. I have snowflake_connection.py in a folder called connections).
I want to write unit tests for this class using pytest and unittest.mock. The problem is that I want to mock 'connect' so that a MagicMock object is returned and no database call is made. So far I have tried:
monkeypatch.setattr(connections.snowflake_connection,"connect",return_value = "")
I also changed my original class to just import snowflake. I then created a mock object and used monkeypatch.setattr(snowflake_connection, "snowflake", my_mock_snowflake). That didn't work either.
In short, I have tried a couple of other things but nothing has worked. All I want to do is mock the Snowflake connection so that no actual database call is made.
Here is another way, where we mock the Snowflake connector, cursor and fetchall using Python's mock and patch.
import mock
import unittest
from datetime import datetime, timedelta

import feed_daily_report


class TestFeedDailyReport(unittest.TestCase):

    @mock.patch('snowflake.connector.connect')
    def test_compare_partner(self, mock_snowflake_connector):
        tod = datetime.now()
        delta = timedelta(days=8)
        date_8_days_ago = tod - delta
        query_result = [('partner_1', date_8_days_ago)]
        mock_con = mock_snowflake_connector.return_value
        mock_cur = mock_con.cursor.return_value
        mock_cur.fetchall.return_value = query_result
        result = feed_daily_report.main()
        assert result == True
An example using unittest.mock and patching the connection:
from unittest import TestCase
from unittest.mock import patch

from connection.snowflake_connection import SnowflakeApi


class TestSnowFlakeApi(TestCase):

    @patch('connection.snowflake_connection.connect')
    def test_get_connection(self, mock_connect):
        api = SnowflakeApi('the_account',
                           'the_warehouse',
                           'the_database',
                           'the_user',
                           'the_pwd')
        api.get_connection()
        mock_connect.assert_called_once_with(account='account_url',  # Will be the output of self._account_url()
                                             user='the_user',
                                             password='the_pwd',
                                             warehouse='the_warehouse',
                                             database='the_database')
If you're testing other classes that use your SnowflakeApi wrapper, then you should use the same approach, but patch SnowflakeApi itself in those tests.
from unittest import TestCase
from unittest.mock import patch

from package.module import SomeClassThatUsesSnowFlakeApi


class TestSomeClassThatUsesSnowFlakeApi(TestCase):

    @patch('package.module.SnowflakeApi')
    def test_some_func(self, mock_api):
        instance = SomeClassThatUsesSnowFlakeApi()
        instance.do_something()
        mock_api.assert_called_once_with(...)
        mock_api.return_value.get_connection.assert_called_once_with()
Also note that if you're using Python 2, you will need to pip install mock and then use from mock import patch.
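A minimal sketch of an import that works on both versions (assuming mock is installed on Python 2):

try:
    from unittest.mock import patch  # Python 3
except ImportError:
    from mock import patch  # Python 2: pip install mock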
Using stubbing and dependency injection
from ... import SnowflakeApi


def some_func(*args, api=None, **kwargs):
    api = api or SnowflakeApi(...)
    conn = api.get_connection()
    # Do some work
    return result
Your test
from unittest.mock import MagicMock


class SnowflakeApiStub(SnowflakeApi):
    def __init__(self):
        # bypass super constructor; the parent stores its connection under the
        # name-mangled attribute _SnowflakeApi__connection, so set that directly
        self._SnowflakeApi__connection = MagicMock()


def test_some_func():
    stub = SnowflakeApiStub()
    mock_connection = stub._SnowflakeApi__connection
    mock_cursor = mock_connection.cursor.return_value
    expect = ...
    actual = some_func(api=stub)
    assert expect == actual
    assert mock_cursor.execute.called
An example using cursor, execute, and fetchone.
import snowflake.connector


class AlongSamePolly:
    def __init__(self, conn):
        self.conn = conn

    def row_count(self):
        cur = self.conn.cursor()
        query = cur.execute('select count(*) from schema.table;')
        return query.fetchone()[0]  # fetchone() returns (12345,)


# I like to dependency inject the snowflake connection object in my classes.
# This lets me use Snowflake Python Connector's built in context manager to
# rollback any errors and automatically close connections. Then you don't have
# try/except/finally blocks everywhere in your code.
#
if __name__ == '__main__':
    with snowflake.connector.connect(user='user', password='password') as con:
        same = AlongSamePolly(con)
        print(same.row_count())
        # => 12345
In the unit tests you mock out the expected method calls - cursor(), execute() and fetchone() - and define the return values to follow the chain of defined mocks.
import unittest
from unittest import mock

from along_same_polly import AlongSamePolly


class TestAlongSamePolly(unittest.TestCase):

    def test_row_count(self):
        with mock.patch('snowflake.connector.connect') as mock_snowflake_conn:
            mock_query = mock.Mock()
            mock_query.fetchone.return_value = (123,)
            mock_cur = mock.Mock()
            mock_cur.execute.return_value = mock_query
            mock_snowflake_conn.cursor.return_value = mock_cur
            same = AlongSamePolly(mock_snowflake_conn)
            self.assertEqual(same.row_count(), 123)


if __name__ == '__main__':
    unittest.main()
The following solution worked for me.
import snowflake.connector


def test_connect(env_var_setup, monkeypatch):
    monkeypatch.setattr(snowflake.connector.connection.SnowflakeConnection,
                        "connect", mocked_sf_connect)
    # calling snowflake connector method
    file_job_map(env_var_setup).connect()


# mocked connection
def mocked_sf_connect(self, **kwargs):
    print("Connection Successfully Established")
    return True
I deal with large DataFrames containing both numbers and text. Obviously I could store each column/row as a separate document in my MongoDB, but I would like to avoid that hassle when loading the data.
I thought about using GridFS, which is more or less abstracted away by the MongoEngine FileField. In the meantime I came up with a solution that works for me:
import pandas as pd
from mongoengine import Document, StringField, FileField
from io import BytesIO


class Frame(Document):
    name = StringField(required=True, max_length=200, unique=True)
    data = FileField()

    @property
    def frame(self):
        str_data = BytesIO(self.data.read()).read().decode()
        try:
            return pd.read_json(str_data, typ="frame")
        except ValueError:
            return pd.read_json(str_data, typ="series")

    def __str__(self):
        return "{name}: \n{frame}".format(name=self.name, frame=self.frame)

    def put(self, frame):
        if self.data:
            self.data.replace(frame.to_json().encode())
        else:
            self.data.new_file()
            self.data.write(frame.to_json().encode())
            self.data.close()
        self.save()


if __name__ == '__main__':
    from pydata.config import connect_production

    connect_production()
    frame = pd.DataFrame(data=[[1, 2, 4], [3, 4, 6]], columns=["A", "B", "C"], index=["X", "Y"])
    f = Frame.objects(name="history").update_one(name="history", upsert=True)
    f = Frame.objects(name="history").first()
    f.put(frame=frame)
    print(f)
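For completeness, reading the data back later is just a matter of fetching the document and using the frame property defined above (a small sketch; connect_production() is the same connection helper used in the snippet):

from pydata.config import connect_production

connect_production()
stored = Frame.objects(name="history").first()
df = stored.frame   # decoded back into a pandas DataFrame by the property
print(df.head())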