sc is not defined while running executable python code - pyspark

I am running the following code with spark-submit (Spark 2.3.0) and getting "NameError: name 'sc' is not defined":
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, lit
from pyspark.sql.types import *
# Script entry point: create a SparkContext and an SQLContext, load a CSV file,
# keep the rows with a positive Total_scaled_forecast, and save the result.
# NOTE(review): the statements after the `if` carry no indentation here, so the
# snippet does not parse as shown — presumably the indent was lost in pasting.
if __name__ == "__main__":
# NOTE(review): SparkContext is not among the names imported just above.
sc=SparkContext()
sqlContext = SQLContext(sc)
# Load the CSV, passing the format/header/inferSchema/sep options to the reader.
# NOTE(review): `read` is accessed on the SparkContext here — confirm that is
# the intended object.
forecast = sc.read.load('/user/gg/LV_hadoop_example.csv',
format='csv', header='true', inferSchema='true', sep=',')
# Keep only the rows whose Total_scaled_forecast is greater than zero.
forecast = forecast.filter(forecast['Total_scaled_forecast'] > 0)
forecast.saveAsTextFile("word_count11.txt")

In Spark 2.3.0, the correct way to load a CSV file is through a SparkSession:
from pyspark.sql import SparkSession
# Create (or reuse) a local SparkSession named "abc". The surrounding
# parentheses let the builder chain span several lines; without them each
# line starting with "." is a syntax error.
spark = (
    SparkSession.builder
    .master("local")
    .appName("abc")
    .getOrCreate()
)
# Read the CSV file into a DataFrame.
df = spark.read.csv('/user/gg/LV_hadoop_example.csv')
Check the documentation for more examples.

Related

Unable to create Hive tables in pyspark inside google Colab using hivesupportenabled?

Here is my code: -
from pyspark import SparkContext, SparkConf
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, HiveContext
from pyspark.sql import SparkSession
# Build (or reuse) a local SparkSession named 'Colab', with the UI port set to
# 4050 and Hive support requested on the builder.
sparkSession = (SparkSession
.builder
.master("local")
.appName('Colab')
.config('spark.ui.port', '4050')
.enableHiveSupport()
.getOrCreate()
)
# NOTE(review): the session above is bound to `sparkSession`; `sc` and `spark`
# are not defined in the lines shown — confirm where they come from.
sc.createOrReplaceTempView("my_temp_table");
spark.sql("drop table if exists my_table");
# Create the Starcraft table from the temp view; this is the CREATE TABLE AS
# SELECT statement named in the reported AnalysisException.
spark.sql("CREATE TABLE Starcraft AS SELECT * FROM my_temp_table")
I tried changing
master("local")
to
master("yarn")
and also tried adding
.config("spark.sql.catalogImplementation","hive")\
but to no avail.
The error:-
AnalysisException: Hive support is required to CREATE Hive TABLE (AS SELECT);;
'CreateTable `Starcraft`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, ErrorIfExists
+- Project [Name#3094, Dept#3095, Salary#3096]
+- SubqueryAlias my_temp_table
+- Relation[Name#3094,Dept#3095,Salary#3096] csv
master("yarn")
Is it a problem of the environment or the code?

Writing data from Kafka to Hive using PySpark - stuck

I am quite new to Spark and started with PySpark. I am learning to push data from Kafka to Hive using PySpark.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import *
from pyspark.streaming.kafka import KafkaUtils
from os.path import abspath
# Absolute path of a "spark-warehouse" directory (not used in the lines shown).
warehouseLocation = abspath("spark-warehouse")
# Create or reuse the SparkSession for this job.
spark = SparkSession.builder.appName("sparkstreaming").getOrCreate()
# Read the "json_stream" topic from Kafka over SSL, starting at the earliest
# offsets, and keep only the `value` column cast to a string.
df = spark.read.format("kafka").option("startingoffsets", "earliest").option("kafka.bootstrap.servers", "kafka-server1:66,kafka-server2:66").option("kafka.security.protocol", "SSL").option("kafka.ssl.keystore.location", "mykeystore.jks").option("kafka.ssl.keystore.password","mykeystorepassword").option("subscribe","json_stream").load().selectExpr("CAST(value AS STRING)")
# NOTE(review): this is the schema of `df` itself (the single string `value`
# column selected above), not a schema describing the JSON payload.
json_schema = df.schema
# NOTE(review): `$"value"` is not Python syntax, and `from_json` is passed
# here without being called — confirm the intended expression.
df1 = df.select($"value").select(from_json,json_schema).alias("data").select("data.*")
The above is not working. Once the data is extracted, I want to insert it into a Hive table.
As I am completely new to this, I am looking for help.
Thanks in advance! :)
from os.path import expanduser, join, abspath
from pyspark.sql import SparkSession
from pyspark.sql import Row
# warehouse_location points to the default location for managed databases and tables
warehouse_location = abspath('spark-warehouse')
# Build (or reuse) a SparkSession with Hive support enabled and the warehouse
# directory set to the path computed above.
spark = SparkSession \
.builder \
.appName("Python Spark SQL Hive integration example") \
.config("spark.sql.warehouse.dir", warehouse_location) \
.enableHiveSupport() \
.getOrCreate()
# spark is an existing SparkSession
# Create the table `src` (key INT, value STRING) if it does not exist yet.
spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive")
# Load the local file kv1.txt into `src`.
spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")

streamWrite with append option and window function

I'm trying to writeStream using the append option, but I get an error.
Code:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window
from pyspark.sql.functions import col, column, count, when
# Create or reuse the SparkSession for this job and reduce log output to errors.
spark = SparkSession\
.builder\
.appName("get_sensor_data")\
.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
# NOTE(review): `lines`, `windowDuration` and `slideDuration` are not defined
# in the lines shown — presumably created earlier in the script.
# Keep the message value (renamed to 'Sensor') and its timestamp.
Sensor = lines.select(lines.value.alias('Sensor'),
lines.timestamp)
# Per time window, count the rows whose Sensor value is "LR1 On", "LR2 On" or
# "LD On", after declaring a 10-second watermark on `timestamp`, then order
# the result by window.
# NOTE(review): the window is built from `Sensor.timestamp`, i.e. the column
# of the DataFrame before withWatermark was applied — confirm the watermark
# is actually associated with the grouping column.
windowedCounts = Sensor.withWatermark('timestamp', '10 seconds').groupBy(
window(Sensor.timestamp, windowDuration, slideDuration)).\
agg(count(when(col('Sensor')=="LR1 On",True)).alias('LR1'),\
count(when(col('Sensor')=="LR2 On",True)).alias('LR2'),\
count(when(col('Sensor')=="LD On",True)).alias('LD')).\
orderBy('window')
# Start a streaming query that writes the counts to the console in append mode.
query = windowedCounts\
.writeStream\
.outputMode('append')\
.format("console")\
.start()
Error:
Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark
The reason for using the append option is to save as a CSV file later.
I think this problem is caused by the window function, but I don't know how to solve it.

Pyspark - Averaging values for every data batch of Kafka Streams

I'm using Pyspark and Kafka to process data via live streams
I made a function that reads Kafka stream by batch and calculate the average values of data for each batch.
I want the same thing, except that the values for the second batch should be the average over the first and second batches (i.e. over the whole history so far). For the third batch, the average should be taken over the first, second and third batches, and so on.
Moreover, if the first batch could be updated with new values calculated from the values of the latest batch, that would be great :)
This is what I've done so far :
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
import json, time, os.path
# Kafka broker address and the topic to consume.
kafka_brokers = "localhost:9092"
kafka_core_topic = "test"
# Spark context (logging limited to errors) and a StreamingContext built on it.
# NOTE(review): the 3 is presumably the batch interval in seconds — confirm.
sc = SparkContext(appName = "test-kafka")
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 3)
# Direct Kafka stream on the topic; the second element of each record is
# parsed as JSON.
kvs = KafkaUtils.createDirectStream(ssc, [kafka_core_topic], {"metadata.broker.list": kafka_brokers})
parsed = kvs.map(lambda x: json.loads(x[1]))
#pandas_udf('double')
def mean_score(col):
    """Return a Series as long as *col* with every entry set to the mean of *col*.

    NOTE(review): the commented-out ``pandas_udf('double')`` line just above
    suggests this was meant to be registered as a pandas UDF — confirm.

    Args:
        col: A sized collection of numbers (e.g. a pandas Series).

    Returns:
        A pandas Series of ``len(col)`` copies of ``np.mean(col)``.
    """
    return pd.Series([np.mean(col)] * len(col))
def getSparkSessionInstance(sparkConf):
    """Return the SparkSession cached in module globals, creating it on first use.

    The session is stored under the global name
    "sparkSessionSingletonInstance", so later calls return the same object
    and ignore their argument.

    Args:
        sparkConf: Configuration passed to the builder when the session is
            first created.

    Returns:
        The cached SparkSession.
    """
    if "sparkSessionSingletonInstance" not in globals():
        globals()["sparkSessionSingletonInstance"] = (
            SparkSession
            .builder
            .config(conf=sparkConf)
            .getOrCreate()
        )
    return globals()["sparkSessionSingletonInstance"]
def process(time, rdd):
    """Handle one batch: parse it, add the batch mean score, show and persist it.

    Args:
        time: Batch time supplied by ``foreachRDD``; only printed.
        rdd: The batch's RDD, read as JSON with ``name`` (string) and
            ``score`` (integer) fields.
    """
    print("========= %s =========" % str(time))
    parquetfile = "sparkstream.parquet"
    spark = getSparkSessionInstance(rdd.context.getConf())
    schema = StructType([
        StructField('name', StringType()),
        StructField('score', IntegerType()),
    ])
    data = spark.read.json(rdd, schema=schema)
    data = data.withColumn('mean_score', mean_score(data['score']))
    data.show()
    # Append mode writes the output on the first batch and adds to it on
    # later ones, so no existence check is needed. The previous
    # os.path.isdir() test only looked at the local filesystem and would
    # pick the wrong branch when the output lives on another filesystem.
    data.write.mode('append').parquet(parquetfile)
# Run process() on each RDD of the parsed stream, then start the streaming
# context and block until it terminates.
parsed.foreachRDD(process)
ssc.start()
ssc.awaitTermination()
This gives the following result :
Thanks a lot for helping :)

I ran into some problems using PySpark on Jupyter

The problem occurs when I load the file.
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pyspark
from pyspark.sql import SparkSession
# Create or reuse a SparkSession named "Mytest" with one sample config option.
spark = SparkSession.builder \
.appName("Mytest") \
.config("spark.some.config.option","some-value") \
.getOrCreate()
# Load training data
# The file is read with the "libsvm" data source format.
data = spark.read.format("libsvm").load("/test/test.txt")