How do you have an AWS Glue ETL job return a single file with all the results in it using PySpark?

I have a very basic AWS Glue ETL job that I created to select some fields from a data catalog built by a crawler pointing to an RDS database. Once the dataset is returned, I export the results in CSV format. This works; however, the output generates around 20 separate files. The dataset only has two rows in it right now, so only two files contain data and the rest just show the column headers with no second row. My requirement is to have a single CSV file that contains all of the data selected from the dataset. I have tried both the repartition and coalesce functions unsuccessfully: I am able to generate the single file, but my data is missing. I am new to AWS Glue and have been unable to figure this out, so any suggestions will be much appreciated.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue import DynamicFrame
def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    result = spark.sql(query)
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node PostgreSQL
PostgreSQL_node1644981751584 = glueContext.create_dynamic_frame.from_catalog(
    database="newApp",
    table_name="database_schema_staging_hdr",
    transformation_ctx="PostgreSQL_node1644981751584",
)
# Script generated for node SQL
SqlQuery0 = """
select * from myDataSource
"""
SQL_node1644981807578 = sparkSqlQuery(
    glueContext,
    query=SqlQuery0,
    mapping={"myDataSource": PostgreSQL_node1644981751584},
    transformation_ctx="SQL_node1644981807578",
)
# Script generated for node Amazon S3
AmazonS3_node1644981816657 = glueContext.write_dynamic_frame.from_options(
    frame=SQL_node1644981807578,
    connection_type="s3",
    format="csv",
    connection_options={"path": "s3://awsglueetloutput/", "partitionKeys": []},
    transformation_ctx="AmazonS3_node1644981816657",
)
job.commit()

You have to repartition the DynamicFrame to achieve that.
Example to end up with a single file: SQL_node1644981807578 = SQL_node1644981807578.repartition(1)
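A minimal sketch of how that fits into the job above (same node names as in the question; the DynamicFrame is repartitioned just before the S3 write so that all rows land in one output file):

# Collapse the DynamicFrame to a single partition so the CSV writer emits one file
SQL_node1644981807578 = SQL_node1644981807578.repartition(1)
AmazonS3_node1644981816657 = glueContext.write_dynamic_frame.from_options(
    frame=SQL_node1644981807578,
    connection_type="s3",
    format="csv",
    connection_options={"path": "s3://awsglueetloutput/", "partitionKeys": []},
    transformation_ctx="AmazonS3_node1644981816657",
)

Note that the single file will still get a Spark-generated part-file name; renaming it to a fixed key would need an extra step (for example a boto3 copy).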

Related

Loading data from AWS EMR to Redshift using Glue is very slow

I am trying to load data from AWS EMR (data stored in S3, with the Glue Catalog as metastore) to Redshift.
import sys
import boto3
from datetime import datetime,date
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.context import SparkContext
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.functions import to_date
from pyspark.sql import SQLContext
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.sparkSession
sc = spark.sparkContext
sqlContext = SQLContext(sc)
df = sqlContext.sql("Select * from classic_models.orderdetails where insert_date >= '2021-01-01' and insert_date < '2021-01-02' ")
dynamic_df = DynamicFrame.fromDF(df, glueContext, "dynamic_df")
redshift_target_table = "classic_models.orderdetails"
pre_actions = f"Truncate table {redshift_target_table};"
redshift_connection_opts = {
    "database": "dev",
    "dbtable": redshift_target_table,
    "aws_iam_role": "arn:aws:iam::*********",
    "preactions": pre_actions
}
s3_temp_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=dynamic_df,
    catalog_connection="redshift",
    connection_options=redshift_connection_opts,
    redshift_tmp_dir="s3://staging/orderdetails/%s/" % s3_temp_dir  # Need change
)
Extracting the data from Hive is quite fast, but loading it into Redshift takes long. By long I mean that if I load data for the past 10 days, the Glue job takes 16 minutes to complete, of which less than 1 minute is spent extracting the data from Hive and the rest only on loading it into Redshift.
More than half of the Hive table columns are of String data type.
Is there any other better and faster way to do it?

Create pyspark dataframe from parquet file

I am quite new to pyspark and I am still trying to figure out how things work. What I am trying to do is, after loading a parquet file in memory using pyarrow, to convert it into a pyspark dataframe. But I am getting an error.
I should mention that I am not reading directly through pyspark because the file is in S3, which gives me another error about "no filesystem for scheme s3",
so I am trying to work around that. Below I have a reproducible example.
import pyarrow.parquet as pq
import s3fs
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

s3 = s3fs.S3FileSystem()
parquet_file = pq.ParquetDataset('s3filepath.parquet', filesystem=s3)
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)
spark.createDataFrame(parquet_file)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-20-0cb2dd287606> in <module>
----> 1 spark.createDataFrame(pandas_dataframe)

/usr/local/spark/python/pyspark/sql/session.py in createDataFrame(self, data, schema, samplingRatio, verifySchema)
    746             rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
    747         else:
--> 748             rdd, schema = self._createFromLocal(map(prepare, data), schema)
    749         jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
    750         jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())

TypeError: 'ParquetDataset' object is not iterable
This is how you should read parquet files into a Spark dataframe:
import pyspark
from pyspark.sql import SQLContext

sc = pyspark.SparkContext('local', "retail")
sqlC = SQLContext(sc)
df = sqlC.read.parquet('path_to_file_or_dir')
You can read data from S3 via Spark as long as you have the access and secret keys for the S3 bucket ... this would be more efficient compared to going through Arrow via pandas and then converting to a Spark dataframe, because you would have to parallelize the serial read.
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", ACCESS_KEY)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", SECRET_KEY)
df = spark.read.parquet("s3://path/to/parquet/files")
source doc => https://docs.databricks.com/spark/latest/data-sources/aws/amazon-s3.html#access-aws-s3-directly
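Putting the two answers together, a self-contained sketch (credential values and the bucket path are placeholders; depending on the Hadoop version on the cluster, the connector may expect the fs.s3a.* keys and an s3a:// path instead of fs.s3n.*):

from pyspark.sql import SparkSession

# Placeholder credentials; in practice they would come from the environment or an IAM role
ACCESS_KEY = "..."
SECRET_KEY = "..."

spark = SparkSession.builder.appName("read-parquet-from-s3").getOrCreate()
sc = spark.sparkContext

# Point the Hadoop S3 connector at the bucket credentials
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", ACCESS_KEY)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", SECRET_KEY)

# Read the parquet files straight into a Spark dataframe
df = spark.read.parquet("s3://path/to/parquet/files")
df.show()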

How do I run SQL SELECT on AWS Glue created Dataframe in Spark?

I have the following job in AWS Glue which basically reads data from one table and extracts it as a CSV file in S3. However, I want to run a query on this table (a SELECT, SUM and GROUP BY) and get that output to CSV. How do I do this in AWS Glue? I am a newbie in Spark, so please help.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="db1",
    table_name="dbo1_expdb_dbo_stg_plan",
    transformation_ctx="datasource0")
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[("plan_code", "int", "plan_code", "int"), ("plan_id", "int", "plan_id", "int")],
    transformation_ctx="applymapping1")
datasink2 = glueContext.write_dynamic_frame.from_options(
    frame=applymapping1,
    connection_type="s3",
    connection_options={"path": "s3://bucket"},
    format="csv",
    transformation_ctx="datasink2")
job.commit()
The "create_dynamic_frame.from_catalog" function of glue context creates a dynamic frame and not dataframe. And dynamic frame does not support execution of sql queries.
To execute sql queries you will first need to convert the dynamic frame to dataframe, register a temp table in spark's memory and then execute the sql query on this temp table.
Sample code:
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from pyspark.sql import SQLContext
glueContext = GlueContext(SparkContext.getOrCreate())
spark_session = glueContext.spark_session
sqlContext = SQLContext(spark_session.sparkContext, spark_session)
DyF = glueContext.create_dynamic_frame.from_catalog(database="{{database}}", table_name="{{table_name}}")
df = DyF.toDF()
df.registerTempTable('{{name}}')
df = sqlContext.sql('{{your select query with table name that you used for temp table above}}')
df.write.format('{{orc/parquet/whatever}}').partitionBy("{{columns}}").save('path to s3 location')
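Applied to the question above, a hypothetical version of that pattern (the database, table and column names come from the question's job; the actual aggregation, header option and output path are assumptions):

DyF = glueContext.create_dynamic_frame.from_catalog(database="db1", table_name="dbo1_expdb_dbo_stg_plan")
df = DyF.toDF()
df.createOrReplaceTempView("stg_plan")

# Hypothetical SELECT / SUM / GROUP BY over the mapped columns
result = spark_session.sql("""
    SELECT plan_code, SUM(plan_id) AS plan_id_sum
    FROM stg_plan
    GROUP BY plan_code
""")

# coalesce(1) so the result comes out as a single CSV file in S3
result.coalesce(1).write.mode("overwrite").option("header", "true").csv("s3://bucket/output/")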
This is how I did it: convert the Glue dynamic frame to a Spark dataframe first, then use the glueContext object's sql method to run the query.
spark_dataframe = glue_dynamic_frame.toDF()
spark_dataframe.createOrReplaceTempView("spark_df")
glueContext.sql("""
SELECT *
FROM spark_df
LIMIT 10
""").show()

How to create child dataframe from xml file using Pyspark?

I have all of the supporting libraries in pyspark, and I am able to create a dataframe for the parent:
def xmlReader(root, row, filename):
    df = spark.read.format("com.databricks.spark.xml").options(rowTag=row, rootTag=root).load(filename)
    xref = df.select("genericEntity.entityId", "genericEntity.entityName", "genericEntity.entityType",
                     "genericEntity.inceptionDate", "genericEntity.updateTimestamp", "genericEntity.entityLongName")
    return xref

df1 = xmlReader("BOBML", "entityTransaction", "s3://dev.xml")
df1.head()
I am unable to create the child dataframe:
def xmlReader(root, row, filename):
    df2 = spark.read.format("com.databricks.spark.xml").options(rowTag=row, rootTag=root).load(filename)
    xref = df2.select("genericEntity.entityDetail", "genericEntity.entityDetialId", "genericEntity.updateTimestamp")
    return xref

df3 = xmlReader("BOBML", "s3://dev.xml")
df3.head()
I am not getting any output, and I was planning to do a union between the parent and child dataframes. Any help will be truly appreciated!
After more than 24 hours, I was able to solve the problem; thanks to everyone who at least looked at it.
Solution:
Step 1: Import a couple of libraries
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
Step 2 (parent): Read the XML file, print the schema, register a temp table, and create the dataframe.
Step 3 (child): Repeat step 2.
Step 4: Create the final dataframe by joining the child and parent dataframes.
Step 5: Load the data into S3 (write.csv / s3://path) or a database. A rough sketch of steps 2 to 5 follows below.
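For illustration, a minimal sketch of steps 2 to 5, reusing the spark-xml options and field names from the question; the child rowTag, the join key (entityId) and the output path are assumptions, since the original post does not show them:

# Step 2 (parent): read the parent-level fields
parent_df = (spark.read.format("com.databricks.spark.xml")
             .options(rowTag="entityTransaction", rootTag="BOBML")
             .load("s3://dev.xml")
             .select("genericEntity.entityId", "genericEntity.entityName"))
parent_df.printSchema()

# Step 3 (child): read the detail-level fields from the same file (same rowTag assumed)
child_df = (spark.read.format("com.databricks.spark.xml")
            .options(rowTag="entityTransaction", rootTag="BOBML")
            .load("s3://dev.xml")
            .select("genericEntity.entityId", "genericEntity.entityDetail"))

# Step 4: join child and parent on the assumed key
final_df = parent_df.join(child_df, on="entityId", how="inner")

# Step 5: write the combined result to S3 as CSV (path is a placeholder)
final_df.write.mode("overwrite").option("header", "true").csv("s3://dev-output/entity/")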

Pyspark - Averaging values for every data batch of Kafka Streams

I'm using Pyspark and Kafka to process data via live streams
I made a function that reads the Kafka stream batch by batch and calculates the average of the values in each batch.
I want the same thing, except that the value for the second batch should be the average over the first and second batches (the whole history, I mean). For the third batch, the average should be over the first, second, and third batches, and so on.
Going further, if the rows from the first batch could also be updated with the averages computed after the last batch, that would be great :)
This is what I've done so far:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
import json, time, os.path
kafka_brokers = "localhost:9092"
kafka_core_topic = "test"
sc = SparkContext(appName = "test-kafka")
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 3)
kvs = KafkaUtils.createDirectStream(ssc, [kafka_core_topic], {"metadata.broker.list": kafka_brokers})
parsed = kvs.map(lambda x: json.loads(x[1]))
@pandas_udf('double')
def mean_score(col):
    # Fill every row of the batch with the mean score of that batch
    return pd.Series([np.mean(col)] * len(col))

def getSparkSessionInstance(sparkConf):
    if "sparkSessionSingletonInstance" not in globals():
        globals()["sparkSessionSingletonInstance"] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()["sparkSessionSingletonInstance"]
def process(time, rdd):
    print("========= %s =========" % str(time))
    parquetfile = "sparkstream.parquet"
    spark = getSparkSessionInstance(rdd.context.getConf())
    schema = StructType([
        StructField('name', StringType()),
        StructField('score', IntegerType())
    ])
    data = spark.read.json(rdd, schema=schema)
    data = data.withColumn('mean_score', mean_score(data['score']))
    data.show()
    if os.path.isdir(parquetfile):
        data.write.mode('append').parquet(parquetfile)
    else:
        data.write.parquet(parquetfile)
parsed.foreachRDD(process)
ssc.start()
ssc.awaitTermination()
This gives the following result:
Thanks a lot for helping :)
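One possible way to get the cumulative average across batches, sketched on top of the code above: keep a naive running sum and count on the driver and stamp the cumulative mean onto each batch. This is only a toy approach (it does not rewrite rows already written for earlier batches; DStream state operations such as updateStateByKey, or re-aggregating the accumulated parquet, would be the more robust options):

# Driver-side running totals (assumption, not from the original post)
state = {"sum": 0.0, "count": 0}

def process_cumulative(time, rdd):
    spark = getSparkSessionInstance(rdd.context.getConf())
    schema = StructType([
        StructField('name', StringType()),
        StructField('score', IntegerType())
    ])
    data = spark.read.json(rdd, schema=schema)
    # Fold this batch into the running totals, then attach the cumulative mean to every row
    batch_sum = data.agg({"score": "sum"}).collect()[0][0] or 0
    state["sum"] += batch_sum
    state["count"] += data.count()
    if state["count"] > 0:
        data = data.withColumn("cumulative_mean_score", lit(state["sum"] / state["count"]))
    data.show()

parsed.foreachRDD(process_cumulative)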