I ran into some problems using PySpark on Jupyter - pyspark

The problem occurs when I load the file.
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Mytest") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# Load training data
data = spark.read.format("libsvm").load("/test/test.txt")
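If the load itself is what fails, two common causes are a path that is not reachable from the Spark driver and executors, and the libsvm reader not knowing the number of features. A minimal sketch, assuming the file actually lives on the local filesystem and (purely for illustration) has 4 features:

# Sketch: point at an explicit local-file URI and tell the libsvm source the
# feature count; both the file:// path and the value 4 are assumptions.
data = spark.read.format("libsvm") \
    .option("numFeatures", "4") \
    .load("file:///test/test.txt")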

Related

Pyspark - Spark2 SQL submit - Implicit casting of all columns gives "Cannot up cast" error

I am trying to use Spark SQL from Spark 2 on a Cloudera environment and getting the following error:
'pyspark.sql.utils.AnalysisException: u'Cannot up cast
other_column_from_table from decimal(32,22) to decimal(30,22) as it
may truncate\n;''
We do not use the column other_column_from_table that Spark SQL tries to cast in the select statement, yet it is the cause of the error. Below is the code:
spark2-submit /home/adonnert/teste_alexandre.py pyspark --deploy-mode cluster --driver-cores 2 --driver-memory 4G --executor-cores 2 --executor-memory 6G --name --master --conf "spark.sql.parquet.writeLegacyFormat=true"
import sys
from pyspark import SparkConf, SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.functions import coalesce
from pyspark.sql.functions import from_unixtime
import time
import traceback
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from datetime import datetime, timedelta, date
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,ArrayType,MapType
spark = SparkSession.builder.appName("PySparkSQL_VRJ_EC_GDC_ALE") \
.enableHiveSupport() \
.config('hive.exec.dynamic.partition', 'True') \
.config('hive.exec.dynamic.partition.mode','nonstrict') \
.config("spark.debug.maxToStringFields","200") \
.config("spark.sql.shuffle.partition", "200") \
.config("spark.sql.inMemoryColumnarStorage.compressed", True) \
.config("spark.sql.inMemoryColumnarStorage.batchSize",10000) \
.config("spark.sql.codegen",True) \
.getOrCreate()
df_lead = spark.sql("""
SELECT
my_id,
value_number
FROM owner.table
WHERE date >= CAST(DATE_FORMAT(ADD_MONTHS(current_timestamp(),-13),'yyyyMM') AS BIGINT)
""").show(10)
Is there a way to prevent Spark SQL from casting a column that is not even selected? It does not even generate the df, so I cannot apply a schema to it.
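One thing that may be worth trying (a sketch only; whether it avoids the analysis-time up-cast depends on the table's metadata) is to read the table through the DataFrame API and prune the columns before any action, so that only the columns you actually need take part in the plan:

# Sketch: read the table and keep only the needed columns before filtering.
# Whether this sidesteps the cast of other_column_from_table is an assumption
# to verify against the real table.
df_lead = (
    spark.table("owner.table")
         .select("my_id", "value_number", "date")
         .where("date >= CAST(DATE_FORMAT(ADD_MONTHS(current_timestamp(), -13), 'yyyyMM') AS BIGINT)")
         .select("my_id", "value_number")
)
df_lead.show(10)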

Spark Job taking long time to append data to S3

I'm running a Spark job on EMR and trying to convert a large zipped file (15 GB) to Parquet, but it is taking too long to write to S3.
I'm using r5 instances: one for the master node and three for core nodes.
Here is my code.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date

def main():
    spark = SparkSession \
        .builder \
        .appName("csv-to-parquer-convertor") \
        .config("spark.sql.catalogimplementation", "hive") \
        .config("hive.metastore.connect.retries", 3) \
        .config("hive.metastore.client.factory.class", "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory") \
        .enableHiveSupport().getOrCreate()

    tgt_filename = 'SOME_Prefix'
    src_path = 'SOURCE_S3_PATH'
    tgt_path = 'TARGET_ BUCKET' + tgt_filename

    df = spark.read.csv(src_path, header=True)
    partitioned_df = df.repartition(50)
    partitioned_df.write.mode('append').parquet(path=tgt_path)
    spark.stop()

if __name__ == "__main__":
    main()
Any suggestions will be much appreciated.
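One detail that often dominates jobs like this (a sketch under the assumption that the 15 GB input is a single gzip file): gzip is not splittable, so the whole file is read by one task, and only after the repartition(50) shuffle do all executors share the write. Checking the partition count before the write and compressing the Parquet output are cheap sanity checks:

# Sketch: confirm the write is parallel and emit snappy-compressed Parquet.
# The partition count of 50 is taken from the question, not a tuned value.
df = spark.read.csv(src_path, header=True)
partitioned_df = df.repartition(50)
print(partitioned_df.rdd.getNumPartitions())  # should report 50 after the shuffle
partitioned_df.write.mode('append').option("compression", "snappy").parquet(tgt_path)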

Loading data from AWS EMR to Redshift using Glue is very slow

I am trying to load data from AWS EMR (data stored in S3, with the Glue Catalog as the metastore) into Redshift.
import sys
import boto3
from datetime import datetime,date
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.context import SparkContext
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.functions import to_date
from pyspark.sql import SQLContext
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.sparkSession
sc = spark.sparkContext
sqlContext = SQLContext(sc)
df = sqlContext.sql("Select * from classic_models.orderdetails where insert_date >= '2021-01-01' and insert_date < '2021-01-02' ")
# Convert the Spark DataFrame to a Glue DynamicFrame for the Redshift write
dynamic_df = DynamicFrame.fromDF(df, glueContext, "dynamic_df")
redshift_target_table = "classic_models.orderdetails"
pre_actions = f"Truncate table {redshift_target_table};"
redshift_connection_opts = {
    "database": "dev",
    "dbtable": redshift_target_table,
    "aws_iam_role": "arn:aws:iam::*********",
    "preactions": pre_actions
}
s3_temp_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=dynamic_df,
    catalog_connection="redshift",
    connection_options=redshift_connection_opts,
    redshift_tmp_dir="s3:/staging/orderdetails/%s/" % s3_temp_dir  # Need change
)
Extracting the data from Hive is quite fast, but loading it into Redshift takes a long time. For example, when I load data for the past 10 days, the Glue job takes 16 minutes to complete: less than 1 minute of that is extracting the data from Hive, and the rest is only loading the data into Redshift.
More than half of the Hive table columns are of String data type.
Is there a better and faster way to do this?
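A low-effort thing to try (a sketch; the column names below are placeholders, and the benefit assumes Redshift's COPY can load several files from the temporary S3 directory in parallel) is to trim the DataFrame to only the columns the Redshift table needs and repartition before converting it to a DynamicFrame:

# Sketch: keep only the required columns and produce several files in the
# temporary S3 directory so the COPY into Redshift can run in parallel.
# "col_a" and "col_b" are hypothetical names; the repartition count is not tuned.
slim_df = df.select("col_a", "col_b").repartition(8)
dynamic_df = DynamicFrame.fromDF(slim_df, glueContext, "dynamic_df")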

Writing data from kafka to hive using pyspark - stuck

I am quite new to Spark and started with PySpark; I am learning to push data from Kafka to Hive using PySpark.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import *
from pyspark.streaming.kafka import KafkaUtils
from os.path import abspath
warehouseLocation = abspath("spark-warehouse")
spark = SparkSession.builder.appName("sparkstreaming").getOrCreate()
df = spark.read.format("kafka").option("startingoffsets", "earliest").option("kafka.bootstrap.servers", "kafka-server1:66,kafka-server2:66").option("kafka.security.protocol", "SSL").option("kafka.ssl.keystore.location", "mykeystore.jks").option("kafka.ssl.keystore.password","mykeystorepassword").option("subscribe","json_stream").load().selectExpr("CAST(value AS STRING)")
json_schema = df.schema
df1 = df.select($"value").select(from_json,json_schema).alias("data").select("data.*")
The above is not working. After extracting the data, I want to insert it into a Hive table.
As I am completely new, I am looking for help.
Appreciated in advance! :)
from os.path import expanduser, join, abspath
from pyspark.sql import SparkSession
from pyspark.sql import Row
# warehouse_location points to the default location for managed databases and tables
warehouse_location = abspath('spark-warehouse')
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .enableHiveSupport() \
    .getOrCreate()
# spark is an existing SparkSession
spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive")
spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")

sc is not defined while running executable python code

I am running the following code with spark-submit (Spark 2.3.0) and getting "NameError: name 'sc' is not defined"
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, lit
from pyspark.sql.types import *
if __name__ == "__main__":
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    forecast = sc.read.load('/user/gg/LV_hadoop_example.csv',
                            format='csv', header='true', inferSchema='true', sep=',')
    forecast = forecast.filter(forecast['Total_scaled_forecast'] > 0)
    forecast.saveAsTextFile("word_count11.txt")
In Spark 2.3.0, the correct way to load a CSV file is to use a SparkSession:
from pyspark.sql import SparkSession
# initiate spark instance
spark = SparkSession.builder \
    .master("local") \
    .appName("abc") \
    .getOrCreate()
# read csv file
df = spark.read.csv('/user/gg/LV_hadoop_example.csv')
Check the documentation for more examples.
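If the goal is the same filter-and-save flow as in the question, a sketch along these lines stays entirely on the SparkSession API (the output path is illustrative, and a DataFrame write produces a directory rather than a single file):

# Sketch: read with a header and inferred types, filter, and write the result.
df = spark.read.csv('/user/gg/LV_hadoop_example.csv',
                    header=True, inferSchema=True, sep=',')
filtered = df.filter(df['Total_scaled_forecast'] > 0)
filtered.write.mode('overwrite').csv('/user/gg/word_count11_out')  # illustrative path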