I am trying to load data from AWS EMR (data stored in S3, with the Glue Data Catalog as the metastore) into Redshift.
import sys
import boto3
from datetime import datetime,date
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.context import SparkContext
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.functions import to_date
from pyspark.sql import SQLContext
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.sparkSession
sc = spark.sparkContext
sqlContext = SQLContext(sc)
df = sqlContext.sql("Select * from classic_models.orderdetails where insert_date >= '2021-01-01' and insert_date < '2021-01-02' ")
dynamic_df = DynamicFrame.fromDF(df, glueContext, "dynamic_df")
redshift_target_table = "classic_models.orderdetails"
pre_actions = f"Truncate table {redshift_target_table};"
redshift_connection_opts = {
"database": "dev",
"dbtable": redshift_target_table,
"aws_iam_role": "arn:aws:iam::*********",
"preactions": pre_actions
}
s3_temp_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
glueContext.write_dynamic_frame.from_jdbc_conf(
frame = dynamic_df,
catalog_connection = "redshift",
connection_options = redshift_connection_opts,
redshift_tmp_dir = "s3://staging/orderdetails/%s/" % s3_temp_dir #Need change
)
Extracting the data from Hive is quite fast, but loading it into Redshift takes long. By long I mean that when I load the past 10 days of data, the Glue job takes 16 minutes to complete: less than a minute to extract the data from Hive, and the rest just to load it into Redshift.
More than half of the Hive table's columns are of String data type.
Is there a better or faster way to do this?
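One alternative worth trying, sketched below under assumptions (the staging bucket and the COPY execution step are hypothetical placeholders, not taken from the job above): stage the extracted data in S3 as a small number of Parquet files and let Redshift ingest them with a single COPY, which runs in parallel across the cluster.
# Hedged sketch of an alternative load path; the bucket name is a placeholder.
stage_path = "s3://my-staging-bucket/orderdetails/%s/" % s3_temp_dir
df.repartition(10).write.mode("overwrite").parquet(stage_path)  # fewer, larger files
# COPY statement Redshift would run to ingest the staged Parquet files;
# the IAM role is redacted here just as in the question.
copy_sql = """
COPY classic_models.orderdetails
FROM '%s'
IAM_ROLE 'arn:aws:iam::*********'
FORMAT AS PARQUET;
""" % stage_path
# Run copy_sql against Redshift with any SQL client (for example psycopg2 or the
# Redshift Data API); that step is omitted here.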
Related
I have a very basic AWS Glue ETL job that I created to select some fields from a data catalog that was built from a crawler I have pointing to an RDS database. Once the dataset is returned, I export the results in CSV format. This works; however, the output generates around 20 unique files. The dataset only has two rows in it right now, so only two files have data and the rest just show the column headers with no second row. My requirement is to have a single CSV file that contains all of the data selected from the dataset. I have tried both the repartition and coalesce functions unsuccessfully. I am able to generate the single file, but my data is missing. I am new to AWS Glue and have been unable to figure this out, so any suggestions will be much appreciated.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue import DynamicFrame
def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
for alias, frame in mapping.items():
frame.toDF().createOrReplaceTempView(alias)
result = spark.sql(query)
return DynamicFrame.fromDF(result, glueContext, transformation_ctx)
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node PostgreSQL
PostgreSQL_node1644981751584 = glueContext.create_dynamic_frame.from_catalog(
database="newApp",
table_name="database_schema_staging_hdr",
transformation_ctx="PostgreSQL_node1644981751584",
)
# Script generated for node SQL
SqlQuery0 = """
select * from myDataSource
"""
SQL_node1644981807578 = sparkSqlQuery(
glueContext,
query=SqlQuery0,
mapping={"myDataSource": PostgreSQL_node1644981751584},
transformation_ctx="SQL_node1644981807578",
)
# Script generated for node Amazon S3
AmazonS3_node1644981816657 = glueContext.write_dynamic_frame.from_options(
frame=SQL_node1644981807578,
connection_type="s3",
format="csv",
connection_options={"path": "s3://awsglueetloutput/", "partitionKeys": []},
transformation_ctx="AmazonS3_node1644981816657",
)
job.commit()
You have to repartition the DynamicFrame to achieve that.
For example, to end up with a single file: SQL_node1644981807578 = SQL_node1644981807578.repartition(1)
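A minimal sketch of where that call would sit in the generated script above, reusing the names from the question:
# Collapse to a single partition so the S3 sink writes one CSV file; this funnels
# all rows through one task, so it only suits small outputs.
SQL_node1644981807578 = SQL_node1644981807578.repartition(1)
AmazonS3_node1644981816657 = glueContext.write_dynamic_frame.from_options(
    frame=SQL_node1644981807578,
    connection_type="s3",
    format="csv",
    connection_options={"path": "s3://awsglueetloutput/", "partitionKeys": []},
    transformation_ctx="AmazonS3_node1644981816657",
)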
I am quite new to Spark and started with PySpark; I am learning to push data from Kafka to Hive using PySpark.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import *
from pyspark.streaming.kafka import KafkaUtils
from os.path import abspath
warehouseLocation = abspath("spark-warehouse")
spark = SparkSession.builder.appName("sparkstreaming").getOrCreate()
df = spark.read.format("kafka") \
    .option("startingoffsets", "earliest") \
    .option("kafka.bootstrap.servers", "kafka-server1:66,kafka-server2:66") \
    .option("kafka.security.protocol", "SSL") \
    .option("kafka.ssl.keystore.location", "mykeystore.jks") \
    .option("kafka.ssl.keystore.password", "mykeystorepassword") \
    .option("subscribe", "json_stream") \
    .load() \
    .selectExpr("CAST(value AS STRING)")
json_schema = df.schema
df1 = df.select($"value").select(from_json,json_schema).alias("data").select("data.*")
The above is not working. After extracting the data, I also want to insert it into a Hive table.
As I am completely new to this, I am looking for help.
Appreciated in advance! :)
from os.path import expanduser, join, abspath
from pyspark.sql import SparkSession
from pyspark.sql import Row
# warehouse_location points to the default location for managed databases and tables
warehouse_location = abspath('spark-warehouse')
spark = SparkSession \
.builder \
.appName("Python Spark SQL Hive integration example") \
.config("spark.sql.warehouse.dir", warehouse_location) \
.enableHiveSupport() \
.getOrCreate()
# spark is an existing SparkSession
spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive")
spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")
I'm using PySpark and Kafka to process data via live streams.
I made a function that reads the Kafka stream batch by batch and calculates the average of the values in each batch.
I want the same thing, except that the values for the second batch should be the average over the first and second batches (the whole history, I mean). For the third batch, the average should cover the first, second, and third batches, and so on.
Moreover, if the first batch could be updated with new values calculated from the values of the last batch, that would be great :)
This is what I've done so far:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
import json, time, os.path
kafka_brokers = "localhost:9092"
kafka_core_topic = "test"
sc = SparkContext(appName = "test-kafka")
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 3)
kvs = KafkaUtils.createDirectStream(ssc, [kafka_core_topic], {"metadata.broker.list": kafka_brokers})
parsed = kvs.map(lambda x: json.loads(x[1]))
@pandas_udf('double')
def mean_score(col):
return pd.Series([np.mean(col)] * len(col))
def getSparkSessionInstance(sparkConf):
if ("sparkSessionSingletonInstance" not in globals()):
globals()["sparkSessionSingletonInstance"] = SparkSession \
.builder \
.config(conf = sparkConf)\
.getOrCreate()
return globals()["sparkSessionSingletonInstance"]
def process(time, rdd):
print("========= %s =========" % str(time))
parquetfile = "sparkstream.parquet"
spark = getSparkSessionInstance(rdd.context.getConf())
schema = StructType([
StructField('name', StringType()),
StructField('score', IntegerType())
])
data = spark.read.json(rdd, schema = schema)
data = data.withColumn('mean_score', mean_score(data['score']))
data.show()
if os.path.isdir(parquetfile):
data.write.mode('append').parquet(parquetfile)
else:
data.write.parquet(parquetfile)
parsed.foreachRDD(process)
ssc.start()
ssc.awaitTermination()
This gives the following result:
Thanks a lot for helping :)
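One way to get an average over the whole history rather than per batch is to switch from the DStream API to Structured Streaming, where an aggregation in complete output mode is recomputed over everything received so far; a sketch assuming the same broker and topic, with a per-name average as an illustrative choice:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, avg
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
spark = SparkSession.builder.appName("test-kafka-structured").getOrCreate()
schema = StructType([
    StructField("name", StringType()),
    StructField("score", IntegerType())
])
# Read the same topic as a streaming DataFrame.
stream = (spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "test")
    .load()
    .selectExpr("CAST(value AS STRING) AS value"))
parsed = stream.select(from_json(col("value"), schema).alias("data")).select("data.*")
# The average is maintained over everything received so far, not just the current batch.
running_avg = parsed.groupBy("name").agg(avg("score").alias("mean_score"))
query = (running_avg.writeStream
    .outputMode("complete")   # re-emit the full, updated aggregate on each trigger
    .format("console")
    .start())
query.awaitTermination()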
I am running the following code with spark-submit (Spark 2.3.0) and getting "NameError: name 'sc' is not defined"
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, lit
from pyspark.sql.types import *
if __name__ == "__main__":
sc=SparkContext()
sqlContext = SQLContext(sc)
forecast = sc.read.load('/user/gg/LV_hadoop_example.csv',
format='csv', header='true', inferSchema='true', sep=',')
forecast = forecast.filter(forecast['Total_scaled_forecast'] > 0)
forecast.saveAsTextFile("word_count11.txt")
In Spark 2.3.0, the correct way to load a CSV file is to use a SparkSession:
from pyspark.sql import SparkSession
# initiate spark instance
spark = SparkSession.builder \
    .master("local") \
    .appName("abc") \
    .getOrCreate()
# read csv file
df = spark.read.csv('/user/gg/LV_hadoop_example.csv')
Check the documentation for more examples.
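For completeness, a sketch of the question's snippet rewritten around that SparkSession; the header/inferSchema options are assumptions, and the output path is reused from the question (the CSV writer produces a directory of part files):
forecast = spark.read.csv('/user/gg/LV_hadoop_example.csv',
                          header=True, inferSchema=True, sep=',')
forecast = forecast.filter(forecast['Total_scaled_forecast'] > 0)
# DataFrames have no saveAsTextFile; write through the DataFrameWriter instead.
forecast.write.mode('overwrite').csv('word_count11.txt')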
Here is my code:
import pandas as pd
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as fn
from pyspark.sql.functions import isnan, isnull
from pyspark.sql.functions import lit
from pyspark.sql.window import Window
spark = SparkSession.builder.appName(" ").getOrCreate()
file = r"D:\project\HistoryData.csv"
lines = pd.read_csv(file)
spark_df = spark.createDataFrame(lines, ['id', 'time', 'average', 'max', 'min'])
temp = Window.partitionBy("time").orderBy("id").rowsBetween(-1, 1)
df = spark_df.withColumn("movingAvg",fn.avg("average").over(temp))
df.show()
But it outputs this:
It outputs the same data, and some of the data disappears.
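If rows only seem to disappear in the printed output, note that show() displays 20 rows by default; a quick hedged check, assuming the DataFrame is small enough to print in full:
row_count = df.count()
df.show(row_count, truncate=False)  # print every row instead of the default 20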