Spark Job taking long time to append data to S3 - pyspark

I'm running a Spark job on EMR to convert a large zipped file (15 GB) to Parquet, but it is taking too long to write to S3.
I'm using r5 instances for the master node (1 instance) and the core nodes (3 instances).
Here is my code.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date
def main():
    """Convert the CSV data at the source S3 path to Parquet at the target path.

    Builds a Hive-enabled SparkSession, reads the source as CSV with a
    header row, repartitions the data into 50 partitions and appends it to
    the target location as Parquet. The session is always stopped, even if
    the read or the write fails.
    """
    spark = (
        SparkSession.builder
        .appName("csv-to-parquer-convertor")
        # Spark configuration keys are case-sensitive: the all-lowercase
        # "spark.sql.catalogimplementation" is not a recognised setting.
        .config("spark.sql.catalogImplementation", "hive")
        .config("hive.metastore.connect.retries", 3)
        .config(
            "hive.metastore.client.factory.class",
            "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
        )
        .enableHiveSupport()
        .getOrCreate()
    )
    try:
        tgt_filename = 'SOME_Prefix'
        src_path = 'SOURCE_S3_PATH'
        tgt_path = 'TARGET_ BUCKET' + tgt_filename
        # NOTE(review): if the source is a single gzip-compressed file it is
        # presumably read by one task, because gzip is not splittable; the
        # repartition below then only parallelises the write -- confirm
        # against the actual input format.
        df = spark.read.csv(src_path, header=True)
        partitioned_df = df.repartition(50)
        partitioned_df.write.mode('append').parquet(path=tgt_path)
    finally:
        # Release cluster resources even when the job raises.
        spark.stop()


if __name__ == "__main__":
    main()
Any suggestions would be much appreciated.

Related

PySpark - Spark 2 SQL submit - implicit cast of all columns raises a "Cannot up cast" error

I am trying to use Spark SQL from Spark 2 in a Cloudera environment and am getting the following error:
'pyspark.sql.utils.AnalysisException: u'Cannot up cast
other_column_from_table from decimal(32,22) to decimal(30,22) as it
may truncate\n;''
We do not use the column other_column_from_table in the SELECT statement, yet Spark SQL tries to cast it, and that is what causes the error. Below is the code:
spark2-submit /home/adonnert/teste_alexandre.py pyspark --deploy-mode cluster --driver-cores 2 --driver-memory 4G --executor-cores 2 --executor-memory 6G --name --master --conf "spark.sql.parquet.writeLegacyFormat=true"
import sys
from pyspark import SparkConf, SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.functions import coalesce
from pyspark.sql.functions import from_unixtime
import time
import traceback
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from datetime import datetime, timedelta, date
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,ArrayType,MapType
# Build (or reuse) a Hive-enabled session for the query below.
# The shuffle setting is spelled "spark.sql.shuffle.partitions" (plural);
# the singular form is not a Spark setting and would be silently ignored.
spark = (
    SparkSession.builder.appName("PySparkSQL_VRJ_EC_GDC_ALE")
    .enableHiveSupport()
    .config('hive.exec.dynamic.partition', 'True')
    .config('hive.exec.dynamic.partition.mode', 'nonstrict')
    .config("spark.debug.maxToStringFields", "200")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.sql.inMemoryColumnarStorage.compressed", True)
    .config("spark.sql.inMemoryColumnarStorage.batchSize", 10000)
    .config("spark.sql.codegen", True)
    .getOrCreate()
)

# Keep the DataFrame itself in df_lead: DataFrame.show() returns None, so
# chaining it onto the assignment would leave df_lead unusable afterwards.
df_lead = spark.sql("""
SELECT
my_id,
value_number
FROM owner.table
WHERE date >= CAST(DATE_FORMAT(ADD_MONTHS(current_timestamp(),-13),'yyyyMM') AS BIGINT)
""")
df_lead.show(10)
Is there a way to prevent Spark SQL from casting a column that is not referenced in the query? The query fails before the DataFrame is even created, so I cannot apply a schema to it.

Writing data from Kafka to Hive using PySpark - stuck

I am quite new to Spark and have started with PySpark. I am learning how to push data from Kafka to Hive using PySpark.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import *
from pyspark.streaming.kafka import KafkaUtils
from os.path import abspath
warehouseLocation = abspath("spark-warehouse")
spark = SparkSession.builder.appName("sparkstreaming").getOrCreate()

# Batch-read the topic and keep only the message value, cast to a string.
df = (
    spark.read.format("kafka")
    .option("startingoffsets", "earliest")
    .option("kafka.bootstrap.servers", "kafka-server1:66,kafka-server2:66")
    .option("kafka.security.protocol", "SSL")
    .option("kafka.ssl.keystore.location", "mykeystore.jks")
    .option("kafka.ssl.keystore.password", "mykeystorepassword")
    .option("subscribe", "json_stream")
    .load()
    .selectExpr("CAST(value AS STRING)")
)

# df.schema only describes the Kafka frame (a single string column named
# "value"), not the JSON payload. Because this is a batch read, the payload
# schema can be inferred from the messages themselves.
# NOTE(review): inference scans the data; supply an explicit StructType
# instead if the payload schema is known.
json_schema = spark.read.json(df.rdd.map(lambda row: row.value)).schema

# Parse each JSON string into a struct column named "data", then flatten
# the struct's fields into top-level columns.
df1 = df.select(from_json(col("value"), json_schema).alias("data")).select("data.*")
The code above is not working. Once the data has been extracted, I want to insert it into a Hive table.
As I am completely new to this, I am looking for help.
Appreciated in advance! :)
from os.path import expanduser, join, abspath
from pyspark.sql import SparkSession
from pyspark.sql import Row
# Default location for managed databases and tables.
warehouse_location = abspath('spark-warehouse')

# Hive-enabled session whose warehouse directory is the path above.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)

# Create the table if needed, then load the sample key/value file into it.
for statement in (
    "CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive",
    "LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src",
):
    spark.sql(statement)

Pyspark - Averaging values for every data batch of Kafka Streams

I'm using Pyspark and Kafka to process data via live streams
I wrote a function that reads the Kafka stream batch by batch and calculates the average of the values in each batch.
I want the same thing, except that the average should cover the whole history: the value for the second batch should be the average over the first and second batches, the value for the third batch should be the average over the first, second and third batches, and so on.
In addition, it would be great if the rows of earlier batches could be updated with the average calculated after the latest batch :)
This is what I've done so far :
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
import json, time, os.path
# Kafka connection settings for the direct stream created below.
kafka_brokers = "localhost:9092"
kafka_core_topic = "test"
# Spark context for the job; only ERROR-level log output is kept.
sc = SparkContext(appName = "test-kafka")
sc.setLogLevel("ERROR")
# Streaming context on top of sc; the second argument is the batch interval
# (3 -- seconds, per the StreamingContext API).
ssc = StreamingContext(sc, 3)
# Direct stream over the single configured topic.
kvs = KafkaUtils.createDirectStream(ssc, [kafka_core_topic], {"metadata.broker.list": kafka_brokers})
# Decode element [1] of every record as JSON (presumably the message value
# of a (key, value) pair -- confirm against the KafkaUtils documentation).
parsed = kvs.map(lambda x: json.loads(x[1]))
@pandas_udf('double')
def mean_score(col):
    """Return the mean of ``col`` repeated once per input row.

    Registered as a pandas UDF returning doubles so that it can be applied
    to a DataFrame column; without the decorator the function would receive
    a Spark Column object, which supports neither ``np.mean`` nor ``len``.

    NOTE(review): the mean is presumably computed per pandas batch handed to
    the UDF, not over the whole DataFrame -- confirm for large inputs.
    """
    return pd.Series([np.mean(col)] * len(col))
def getSparkSessionInstance(sparkConf):
    """Return the SparkSession cached in the module globals, creating it once.

    The session is built from ``sparkConf`` on the first call and stored
    under the global name ``sparkSessionSingletonInstance``; later calls
    return the stored object and ignore ``sparkConf``.
    """
    registry = globals()
    key = "sparkSessionSingletonInstance"
    if key in registry:
        return registry[key]
    session = SparkSession.builder.config(conf=sparkConf).getOrCreate()
    registry[key] = session
    return session
def process(time, rdd):
    """Handle one streaming batch: parse it, add the mean score, persist it.

    Args:
        time: Batch time passed in by ``foreachRDD``; only printed.
        rdd: RDD holding the records of the current batch.

    The batch is read as JSON with a fixed ``name``/``score`` schema, gets a
    ``mean_score`` column, is shown on the console and is appended to the
    ``sparkstream.parquet`` dataset.
    """
    print("========= %s =========" % str(time))
    parquetfile = "sparkstream.parquet"
    spark = getSparkSessionInstance(rdd.context.getConf())
    schema = StructType([
        StructField('name', StringType()),
        StructField('score', IntegerType()),
    ])
    data = spark.read.json(rdd, schema=schema)
    data = data.withColumn('mean_score', mean_score(data['score']))
    data.show()
    # 'append' creates the dataset when it does not exist yet, so no
    # existence check is needed. The previous os.path.isdir() test only
    # looked at the driver's local filesystem, which is the wrong place to
    # look when the output path lives on a distributed filesystem.
    data.write.mode('append').parquet(parquetfile)
# Run process() on the RDD of every batch of the parsed stream.
parsed.foreachRDD(process)
# Start the streaming computation and block until it terminates.
ssc.start()
ssc.awaitTermination()
This gives the following result :
Thanks a lot for helping :)

sc is not defined while running executable python code

I am running the following code with spark-submit (Spark 2.3.0) and getting "NameError: name 'sc' is not defined".
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, lit
from pyspark.sql.types import *
if __name__ == "__main__":
    # SparkContext is not among the names imported at the top of this
    # script, so bring it into scope here before using it.
    from pyspark import SparkContext

    sc = SparkContext()
    sqlContext = SQLContext(sc)
    # The DataFrame reader lives on the SQLContext; a SparkContext has no
    # ``read`` attribute.
    forecast = sqlContext.read.load(
        '/user/gg/LV_hadoop_example.csv',
        format='csv', header='true', inferSchema='true', sep=',',
    )
    # Keep only the rows with a positive scaled forecast.
    forecast = forecast.filter(forecast['Total_scaled_forecast'] > 0)
    # saveAsTextFile is an RDD method, so go through the DataFrame's
    # underlying RDD to write the rows out as text.
    forecast.rdd.saveAsTextFile("word_count11.txt")
In Spark 2.3.0, the correct way to load a CSV file is:
from pyspark.sql import SparkSession
# initiate spark instance
# The chained builder calls are wrapped in parentheses: a line that starts
# with "." is only valid inside brackets or after a backslash continuation.
spark = (
    SparkSession.builder
    .master("local")
    .appName("abc")
    .getOrCreate()
)
# read csv file
df = spark.read.csv('/user/gg/LV_hadoop_example.csv')
Check the documentation for more examples.

I ran into some problems using PySpark on Jupyter

The problem occurs when I load the file.
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pyspark
from pyspark.sql import SparkSession
# Session used for the example; the config entry is a placeholder option.
session_builder = SparkSession.builder.appName("Mytest")
session_builder = session_builder.config("spark.some.config.option", "some-value")
spark = session_builder.getOrCreate()

# Read the training set stored in LIBSVM format.
libsvm_reader = spark.read.format("libsvm")
data = libsvm_reader.load("/test/test.txt")