Writing a PySpark DataFrame into Redshift

I have established the connection between PySpark and Redshift using the following code.
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker
import psycopg2
DATABASE = "d"
USER = "user1"
PASSWORD = "1234"
HOST = "sparkvalidation.crv9zfdiseqm.us-west-2.redshift.amazonaws.com"
PORT = "5439"
SCHEMA = "public"
connection_string = "redshift+psycopg2://%s:%s@%s:%s/%s" % (USER, PASSWORD, HOST, PORT, DATABASE)
engine = sa.create_engine(connection_string)
session = sessionmaker()
session.configure(bind=engine)
s = session()
SetPath = "SET search_path TO %s" % SCHEMA
s.execute(SetPath)
Now how can I write a pyspark dataframe to Redshift?

If you use Databricks, you could write something like this:
dataframe.write \
.format("com.databricks.spark.redshift") \
.option("url", connection_string) \
.option("dbtable", "target") \
.option("tempdir", "s3a://your_s3_tmp_bucket/tmp_data") \
.mode("error") \
.save()
Note that you need an S3 bucket for the temporary directory, as is usually the case when copying data into Redshift.
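Note also that the url option expects a JDBC-style Redshift URL rather than the SQLAlchemy string built in the question, and the connector needs a way to authorize the COPY from the temp bucket. A minimal sketch, assuming a connector version that supports the aws_iam_role option; the endpoint, credentials, bucket and role ARN are placeholders to replace with your own:

jdbc_url = ("jdbc:redshift://sparkvalidation.crv9zfdiseqm.us-west-2.redshift.amazonaws.com:5439/d"
            "?user=user1&password=1234")

dataframe.write \
    .format("com.databricks.spark.redshift") \
    .option("url", jdbc_url) \
    .option("dbtable", "public.target") \
    .option("tempdir", "s3a://your_s3_tmp_bucket/tmp_data") \
    .option("aws_iam_role", "arn:aws:iam::123456789012:role/my-redshift-copy-role") \
    .mode("error") \
    .save()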

Related

Spark Job taking long time to append data to S3

I'm running a Spark job on EMR, trying to convert a large zipped file (15 GB) to Parquet, but it is taking too long to write to S3.
I'm using r5 instances for the master (1 instance) and the core nodes (3 instances).
Here is my code.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date
def main():
    spark = SparkSession \
        .builder \
        .appName("csv-to-parquet-convertor") \
        .config("spark.sql.catalogImplementation", "hive") \
        .config("hive.metastore.connect.retries", 3) \
        .config("hive.metastore.client.factory.class", "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory") \
        .enableHiveSupport().getOrCreate()

    tgt_filename = 'SOME_Prefix'
    src_path = 'SOURCE_S3_PATH'
    tgt_path = 'TARGET_BUCKET' + tgt_filename

    df = spark.read.csv(src_path, header=True)
    partitioned_df = df.repartition(50)
    partitioned_df.write.mode('append').parquet(path=tgt_path)
    spark.stop()

if __name__ == "__main__":
    main()
Any suggestion will be much appreciated.
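As a side note on where the time may be going (a sketch, not part of the original post): if the 15 GB input is a single gzip-compressed file, it is not splittable, so the initial read runs as one task no matter how many cores the cluster has, and repartition(50) only parallelizes the work after that read. Separating the read from the S3 write shows which phase dominates; the paths below are placeholders:

import time
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("csv-to-parquet-profiling").getOrCreate()
src_path = "s3://source-bucket/big_file.csv.gz"   # placeholder
tgt_path = "s3://target-bucket/some_prefix/"      # placeholder

t0 = time.time()
# Materialize the repartitioned data once so the write below does not repeat the read.
part_df = spark.read.csv(src_path, header=True).repartition(50).cache()
rows = part_df.count()
print("read + repartition took %.0fs for %d rows" % (time.time() - t0, rows))

t1 = time.time()
part_df.write.mode("append").parquet(tgt_path)
print("parquet write to S3 took %.0fs" % (time.time() - t1))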

PySpark error occurred while calling o50.load. : com.mysql.cj.jdbc.exceptions.CommunicationsException: Communications link failure

I need help; I am struggling with a PySpark JDBC error in my Jupyter notebook. I'm connecting from my local machine to a remote database URL. My code is below. So far I have:
1. Installed the SQL connector
2. Installed MySQL Workbench to test out the connection
3. Installed Jupyter and did the setup
I'm honestly not sure what library or installs I'm missing. Do I need to download a local MySQL server?
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
#connection to oc_customer_reward (find store credits customers gain), change db and port
df_storecredit = sqlContext.read \
.format("jdbc") \
.option("url", "jdbc:mysql://link") \
.option("driver", "com.mysql.jdbc.Driver") \
.option("dbtable", "(SELECT customer_id, order_id, date_added as 'timestamp', points from oc_customer_reward) as orders") \
.option("user", "") \
.option("password", "") \
.load()
df_storecredit.printSchema()
#add project_id column
from pyspark.sql.functions import lit
df_projectid = df_storecredit.withColumn('project_id', lit(''))
df_projectid.printSchema()
#Save the dataframe to the table.
db_properties = {"user": "","password": "","driver": "org.postgresql.Driver"}
df_projectid.write.jdbc(url='jdbc:postgresql://link',table='dwh_storecredits',mode='append',properties=db_properties)
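For reference (this is not a diagnosis of the Communications link failure itself, which usually points at the host, port or firewall rather than a missing library): the MySQL and PostgreSQL JDBC drivers referenced above have to be on Spark's classpath. One way to supply them when building the session is spark.jars.packages; the Maven coordinates and connection values below are assumptions to adjust for your setup:

from pyspark.sql import SparkSession

# Pull both JDBC drivers from Maven Central when the session starts
# (coordinates and versions are illustrative).
spark = SparkSession.builder \
    .appName("jdbc-drivers-example") \
    .config("spark.jars.packages",
            "mysql:mysql-connector-java:8.0.28,org.postgresql:postgresql:42.3.3") \
    .getOrCreate()

df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:mysql://your-host:3306/your_db") \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .option("dbtable", "oc_customer_reward") \
    .option("user", "your_user") \
    .option("password", "your_password") \
    .load()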

pyspark - micro-batch streaming a delta table as a source to perform a merge against another delta table - foreachBatch is not getting invoked

I have created a delta table and now I'm trying to merge data into that table using foreachBatch(). I've followed this example. I am running this code on a Dataproc 1.5.x image in Google Cloud.
Spark version 2.4.7
Delta version 0.6.0
My code looks as follows:
from delta.tables import *
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("streaming_merge") \
    .master("local[*]") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()
# Function to upsert `microBatchOutputDF` into the Delta table using MERGE
def mergeToDelta(microBatchOutputDF, batchId):
    (deltaTable.alias("accnt").merge(
        microBatchOutputDF.alias("updates"),
        "accnt.acct_nbr = updates.acct_nbr")
        .whenMatchedDelete(condition="updates.cdc_ind='D'")
        .whenMatchedUpdateAll(condition="updates.cdc_ind='U'")
        .whenNotMatchedInsertAll(condition="updates.cdc_ind!='D'")
        .execute()
    )
deltaTable = DeltaTable.forPath(spark, "gs:<<path_for_the_target_delta_table>>")
# Define the source extract
SourceDF = (
    spark.readStream
        .format("delta")
        .load("gs://<<path_for_the_source_delta_location>>")
)
# Start the query to continuously upsert into target tables in update mode
SourceDF.writeStream \
    .format("delta") \
    .outputMode("update") \
    .foreachBatch(mergeToDelta) \
    .option("checkpointLocation", "gs:<<path_for_the_checkpoint_location>>") \
    .trigger(once=True) \
    .start()
This code runs without any problems, but there is no data written to the delta table; I suspect foreachBatch is not getting invoked. Does anyone know what I'm doing wrong?
After adding awaitTermination, the streaming query started working: it picked up the latest data from the source and performed the merge on the delta target table.
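For reference, a sketch of the corrected query start implied by that fix, keeping the placeholder paths from the question:

query = SourceDF.writeStream \
    .format("delta") \
    .outputMode("update") \
    .foreachBatch(mergeToDelta) \
    .option("checkpointLocation", "gs:<<path_for_the_checkpoint_location>>") \
    .trigger(once=True) \
    .start()

# Block until the trigger-once run completes; without this the driver can exit
# before any micro-batch is processed, which is why foreachBatch never fired.
query.awaitTermination()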

PySpark saving is not working when called from inside a foreach

I am building a pipeline that receives messages from Azure Event Hubs and saves them into Databricks delta tables.
All my tests with static data went well, see the code below:
body = 'A|B|C|D\n"False"|"253435564"|"14"|"2019-06-25 04:56:21.713"\n"True"|"253435564"|"13"|"2019-06-25 04:56:21.713"\n'
tableLocation = "/delta/tables/myTableName"
spark = SparkSession.builder.appName("CSV converter").getOrCreate()
csvData = spark.sparkContext.parallelize(body.split('\n'))
df = spark.read \
.option("header", True) \
.option("delimiter","|") \
.option("quote", "\"") \
.option("nullValue", "\\N") \
.option("inferShema", "true") \
.option("mergeSchema", "true") \
.csv(csvData)
df.write.format("delta").mode("append").save(tableLocation)
However, in my case each Event Hubs message is a CSV string, and they may come from many sources. So each message must be processed separately, because each message may end up being saved to a different delta table.
When I try to execute this same code inside a foreach statement, it doesn't work. There are no errors shown in the logs, and I can't find any saved table.
So maybe I am doing something wrong when calling the foreach. See the code below:
def SaveData(row):
    ...  # the same code as above
dfEventHubCSV.rdd.foreach(SaveData)
I tried to do this in a streaming context, but sadly ran into the same problem.
What is it about the foreach that makes it behave differently?
Below the full code I am running:
import pyspark.sql.types as t
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# row contains the fields Body and SdIds
#   Body: CSV string
#   SdIds: a string ID
def SaveData(row):
    # Each row's data is going to be added to a different table
    rowInfo = GetDestinationTableData(row['SdIds']).collect()
    table = rowInfo[0][4]
    schema = rowInfo[0][3]
    database = rowInfo[0][2]
    body = row['Body']
    tableLocation = "/delta/" + database + '/' + schema + '/' + table
    checkpointLocation = "/delta/" + database + '/' + schema + "/_checkpoints/" + table

    spark = SparkSession.builder.appName("CSV").getOrCreate()
    csvData = spark.sparkContext.parallelize(body.split('\n'))
    df = spark.read \
        .option("header", True) \
        .option("delimiter", "|") \
        .option("quote", "\"") \
        .option("nullValue", "\\N") \
        .option("inferSchema", "true") \
        .option("mergeSchema", "true") \
        .csv(csvData)
    df.write.format("delta").mode("append").save(tableLocation)

dfEventHubCSV.rdd.foreach(SaveData)
Well, in the end, as always, it was something very simple, but I didn't see this anywhere.
Basically, when you perform a foreach and the dataframe you want to save is built inside the loop, the worker, unlike the driver, won't automatically set up the "/dbfs/" prefix on the save path. So if you don't add "/dbfs/" manually, the data is saved locally on the worker and you will never find it.
That is why my loops weren't working.
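A sketch of what that fix looks like inside SaveData as described in this answer (same variables as in the question; whether the /dbfs/ prefix is required depends on your workspace setup):

# Inside SaveData: prefix the save path explicitly so the worker-side write
# lands on DBFS instead of the worker's local disk.
tableLocation = "/dbfs/delta/" + database + '/' + schema + '/' + table
df.write.format("delta").mode("append").save(tableLocation)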

How can I get fields one by one from MongoDB and put them into a LabeledPoint?

I want to create a LabeledPoint in Spark using data from MongoDB. I can get fields from MongoDB into Spark with:
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db1 = client.newumc
collection1 = db1.data_classification
q_2 = collection1.find({}, {'q2':1,'_id':0})
q_1 = collection1.find({}, {'q1':1,'_id':0})
q_38 = collection1.find({}, {'q38':1,'_id':0})
_result = collection1.find({},{'qresultat':1,'_id':0})
where q1, q2, q38 and qresultat are fields in MongoDB (q1, q2 and q38 are the features and qresultat is the label).
But this does not work for me:
lbpoint = LabeledPoint(result, array([q1, q2, q38]))
and type(q2) is
pymongo.cursor.Cursor
Can anyone please help me, or share existing decision tree code that works with MongoDB?
I solved the problem by using:
spark = SparkSession \
.builder \
.appName("myApp") \
.config("spark.mongodb.input.uri", "mongodb://127.0.0.1/newumc.classification_data") \
.config("spark.mongodb.output.uri", "mongodb://127.0.0.1/newumc.classification_data") \
.getOrCreate()
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
column1 = df[1]
column2 = df[2]
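The answer stops at selecting columns; one way to go from that dataframe to LabeledPoint objects (a sketch assuming the q1, q2, q38 and qresultat field names from the question and the RDD-based pyspark.mllib API):

from pyspark.mllib.regression import LabeledPoint

# One LabeledPoint per document: qresultat is the label, q1/q2/q38 are the features.
labeled = df.rdd.map(
    lambda row: LabeledPoint(float(row['qresultat']),
                             [float(row['q1']), float(row['q2']), float(row['q38'])])
)
print(labeled.take(2))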