Writing data from Kafka to Hive using PySpark - stuck - pyspark

I am quite new to Spark and started with PySpark. I am learning to push data from Kafka to Hive using PySpark.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import *
from pyspark.streaming.kafka import KafkaUtils
from os.path import abspath

warehouseLocation = abspath("spark-warehouse")

spark = SparkSession.builder.appName("sparkstreaming").getOrCreate()

df = spark.read.format("kafka") \
    .option("startingOffsets", "earliest") \
    .option("kafka.bootstrap.servers", "kafka-server1:66,kafka-server2:66") \
    .option("kafka.security.protocol", "SSL") \
    .option("kafka.ssl.keystore.location", "mykeystore.jks") \
    .option("kafka.ssl.keystore.password", "mykeystorepassword") \
    .option("subscribe", "json_stream") \
    .load() \
    .selectExpr("CAST(value AS STRING)")
json_schema = df.schema
df1 = df.select($"value").select(from_json,json_schema).alias("data").select("data.*")
The above is not working. After extracting the data, I want to insert it into a Hive table.
As I am completely new to this, I am looking for help.
Appreciated in advance! :)
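For the parsing step, the Scala-style $"value" column reference does not exist in PySpark, and from_json needs an explicit schema describing the JSON payload rather than the schema of the Kafka DataFrame itself. A minimal sketch, assuming the messages carry two hypothetical fields id and name (the SSL options are omitted for brevity):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Hive support is enabled here because the goal is to write to a Hive table later.
spark = SparkSession.builder.appName("sparkstreaming").enableHiveSupport().getOrCreate()

# The schema describes the JSON payload; id and name are placeholder field names.
json_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)
])

df = spark.read.format("kafka") \
    .option("kafka.bootstrap.servers", "kafka-server1:66,kafka-server2:66") \
    .option("subscribe", "json_stream") \
    .option("startingOffsets", "earliest") \
    .load() \
    .selectExpr("CAST(value AS STRING)")

# Parse the JSON string into a struct column and flatten it into top-level columns.
df1 = df.select(from_json(col("value"), json_schema).alias("data")).select("data.*")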

from os.path import expanduser, join, abspath
from pyspark.sql import SparkSession
from pyspark.sql import Row

# warehouse_location points to the default location for managed databases and tables
warehouse_location = abspath('spark-warehouse')

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .enableHiveSupport() \
    .getOrCreate()

# spark is an existing SparkSession
spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive")
spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")

Related

PySpark - Spark 2 SQL submit - implicitly casting all columns gives 'Cannot up cast' error

I am trying to use Spark SQL from Spark 2 on a Cloudera environment and getting the following error:
'pyspark.sql.utils.AnalysisException: u'Cannot up cast
other_column_from_table from decimal(32,22) to decimal(30,22) as it
may truncate\n;''
We do not use the column other_column_from_table that Spark SQL tries to cast in the SELECT statement, yet it is the cause of the error. Below is the code:
spark2-submit /home/adonnert/teste_alexandre.py pyspark --deploy-mode cluster --driver-cores 2 --driver-memory 4G --executor-cores 2 --executor-memory 6G --name --master --conf "spark.sql.parquet.writeLegacyFormat=true"
import sys
from pyspark import SparkConf, SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.functions import coalesce
from pyspark.sql.functions import from_unixtime
import time
import traceback
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from datetime import datetime, timedelta, date
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,ArrayType,MapType
spark = SparkSession.builder.appName("PySparkSQL_VRJ_EC_GDC_ALE") \
.enableHiveSupport() \
.config('hive.exec.dynamic.partition', 'True') \
.config('hive.exec.dynamic.partition.mode','nonstrict') \
.config("spark.debug.maxToStringFields","200") \
.config("spark.sql.shuffle.partition", "200") \
.config("spark.sql.inMemoryColumnarStorage.compressed", True) \
.config("spark.sql.inMemoryColumnarStorage.batchSize",10000) \
.config("spark.sql.codegen",True) \
.getOrCreate()
df_lead = spark.sql("""
    SELECT
        my_id,
        value_number
    FROM owner.table
    WHERE date >= CAST(DATE_FORMAT(ADD_MONTHS(current_timestamp(), -13), 'yyyyMM') AS BIGINT)
""").show(10)
Is there a way to prevent Spark SQL from casting a column that is not even selected? It does not even produce the DataFrame, so I cannot apply a schema to it.
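The cast check happens while Spark analyzes the full table schema against the underlying files, so dropping the column from the SELECT does not avoid it. One possible workaround, sketched here under the assumption that the table is backed by Parquet files at a known location (the path below is hypothetical), is to read those files directly and project only the needed columns, so the mismatched decimal column never has to be reconciled with the metastore schema:

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.appName("PySparkSQL_VRJ_EC_GDC_ALE").enableHiveSupport().getOrCreate()

# Hypothetical HDFS location of the Parquet files behind owner.table.
src_path = "/warehouse/owner.db/table"

df_lead = spark.read.parquet(src_path) \
    .where(expr("date >= CAST(DATE_FORMAT(ADD_MONTHS(current_timestamp(), -13), 'yyyyMM') AS BIGINT)")) \
    .select("my_id", "value_number")

df_lead.show(10)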

Spark Job taking long time to append data to S3

I'm running a Spark job on EMR and trying to convert a large zipped file (15 GB) to Parquet, but it is taking too long to write to S3.
I'm using r5 instances for the master (1 instance) and core nodes (3 instances).
Here is my code.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date

def main():
    spark = SparkSession \
        .builder \
        .appName("csv-to-parquet-convertor") \
        .config("spark.sql.catalogImplementation", "hive") \
        .config("hive.metastore.connect.retries", 3) \
        .config("hive.metastore.client.factory.class", "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory") \
        .enableHiveSupport().getOrCreate()

    tgt_filename = 'SOME_Prefix'
    src_path = 'SOURCE_S3_PATH'
    tgt_path = 'TARGET_BUCKET' + tgt_filename

    df = spark.read.csv(src_path, header=True)
    partitioned_df = df.repartition(50)
    partitioned_df.write.mode('append').parquet(path=tgt_path)
    spark.stop()

if __name__ == "__main__":
    main()
Any suggestion will be much appreciated.
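One likely bottleneck worth checking first: a single gzipped CSV is not splittable, so the whole 15 GB file is read and decompressed by one task before repartition(50) takes effect, and the S3 write inherits that delay. A minimal sketch of one common mitigation (the partition count of 200 and the maxRecordsPerFile value are illustrative assumptions, not tuned recommendations):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("csv-to-parquet-convertor").enableHiveSupport().getOrCreate()

src_path = 'SOURCE_S3_PATH'        # placeholder, as in the original snippet
tgt_path = 'TARGET_BUCKET/prefix'  # placeholder

df = spark.read.csv(src_path, header=True)

# Spread the single gzip input partition across the cluster as early as possible,
# then cap the size of each output file instead of relying on one big final write.
(df.repartition(200)
   .write
   .mode('append')
   .option("maxRecordsPerFile", 1000000)
   .parquet(tgt_path))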

Unable to create Hive tables in PySpark inside Google Colab using enableHiveSupport?

Here is my code:
from pyspark import SparkContext, SparkConf
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, HiveContext
from pyspark.sql import SparkSession
sparkSession = (SparkSession
                .builder
                .master("local")
                .appName('Colab')
                .config('spark.ui.port', '4050')
                .enableHiveSupport()
                .getOrCreate()
                )
sc.createOrReplaceTempView("my_temp_table");
spark.sql("drop table if exists my_table");
spark.sql("CREATE TABLE Starcraft AS SELECT * FROM my_temp_table")
I tried changing
master("local")
with
master("yarn")
and also tried adding
.config("spark.sql.catalogImplementation","hive")\
but to no avail.
The error:-
AnalysisException: Hive support is required to CREATE Hive TABLE (AS SELECT);;
'CreateTable `Starcraft`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, ErrorIfExists
+- Project [Name#3094, Dept#3095, Salary#3096]
+- SubqueryAlias my_temp_table
+- Relation[Name#3094,Dept#3095,Salary#3096] csv
master("yarn")
Is it a problem of the environment or the code?
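The AnalysisException means the session that executes the CREATE TABLE was not built with Hive support. Two things stand out in the snippet: the temp view is registered on sc (a SparkContext) and the SQL runs through spark, while the session that actually has enableHiveSupport() is bound to sparkSession. A minimal sketch of one consistent version, assuming the data comes from a hypothetical CSV file starcraft.csv:

from pyspark.sql import SparkSession

spark = (SparkSession
         .builder
         .master("local[*]")
         .appName('Colab')
         .config('spark.ui.port', '4050')
         .config("spark.sql.catalogImplementation", "hive")
         .enableHiveSupport()
         .getOrCreate())

# Placeholder source file; replace with the DataFrame already loaded in the notebook.
df = spark.read.csv("starcraft.csv", header=True, inferSchema=True)

# Register the view on the DataFrame (not on the SparkContext) and run SQL on the same session.
df.createOrReplaceTempView("my_temp_table")
spark.sql("DROP TABLE IF EXISTS Starcraft")
spark.sql("CREATE TABLE Starcraft AS SELECT * FROM my_temp_table")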

sc is not defined while running executable python code

I am running the following code with spark-submit (Spark 2.3.0) and getting "NameError: name 'sc' is not defined":
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, lit
from pyspark.sql.types import *
if __name__ == "__main__":
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    forecast = sc.read.load('/user/gg/LV_hadoop_example.csv',
                            format='csv', header='true', inferSchema='true', sep=',')
    forecast = forecast.filter(forecast['Total_scaled_forecast'] > 0)
    forecast.saveAsTextFile("word_count11.txt")
In Spark 2.3.0, the correct way to load a CSV file is to use a SparkSession:
from pyspark.sql import SparkSession

# initiate spark instance
spark = SparkSession.builder \
    .master("local") \
    .appName("abc") \
    .getOrCreate()

# read csv file
df = spark.read.csv('/user/gg/LV_hadoop_example.csv')
Check the documentation for more examples.
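Applied to the original snippet, a complete version might look like the sketch below. Note that a SparkContext has no .read and a DataFrame has no saveAsTextFile, so this sketch reads through the SparkSession and writes through the DataFrameWriter; the output path is a placeholder.

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local") \
    .appName("abc") \
    .getOrCreate()

forecast = spark.read.load('/user/gg/LV_hadoop_example.csv',
                           format='csv', header='true', inferSchema='true', sep=',')
forecast = forecast.filter(forecast['Total_scaled_forecast'] > 0)

# Write the result with the DataFrameWriter instead of saveAsTextFile.
forecast.write.mode('overwrite').csv('word_count11_out')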

I ran into some problems using PySpark on Jupyter

The problem occurs when I load the file.
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("Mytest") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
# Load training data
data = spark.read.format("libsvm").load("/test/test.txt")
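Without the actual error it is hard to say what fails here, but one common issue at exactly this point is the path: load("/test/test.txt") resolves against the default filesystem (HDFS on a cluster), so a local file in a Jupyter setup may need an explicit file:// scheme. A minimal end-to-end sketch, assuming a local libsvm file at a placeholder path and continuing with the Naive Bayes imports already shown:

from pyspark.sql import SparkSession
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder \
    .appName("Mytest") \
    .getOrCreate()

# Placeholder path; the file:// scheme forces the local filesystem rather than HDFS.
data = spark.read.format("libsvm").load("file:///test/test.txt")

train, test = data.randomSplit([0.8, 0.2], seed=42)

model = NaiveBayes(smoothing=1.0, modelType="multinomial").fit(train)
predictions = model.transform(test)

accuracy = MulticlassClassificationEvaluator(metricName="accuracy").evaluate(predictions)
print("Test accuracy:", accuracy)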