Pyspark - Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe - pyspark

I am new to PySpark and just started learning it by following this.
I have made changes to the code to run it on my local system. Below is how I am setting up PySpark:
# Import PySpark related modules
import findspark
findspark.init()
import pyspark
import os
import sys
# Modules used by the statistics helpers further below
import statistics as stats
import numpy as np
import pandas as pd
os.environ['PYSPARK_PYTHON'] = 'python'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions
from pyspark.sql.functions import lit, desc, col, size, array_contains, \
    isnan, udf, hour, array_min, array_max, countDistinct
from pyspark.sql.types import *
MAX_MEMORY = '15G'
# Initialize a spark session.
conf = pyspark.SparkConf().setMaster("local[*]") \
    .set('spark.executor.heartbeatInterval', 10000) \
    .set("spark.core.connection.ack.wait.timeout", "20000") \
    .set("spark.executor.memory", MAX_MEMORY) \
    .set("spark.driver.memory", MAX_MEMORY) \
    .set("spark.driver.maxResultSize", MAX_MEMORY) \
    .set("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .set("spark.memory.offHeap.enabled", "true") \
    .set("spark.memory.offHeap.size", "10g") \
    .set("spark.network.timeout", "300000")
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Pyspark guide") \
        .config(conf=conf) \
        .getOrCreate()
    return spark

spark = init_spark()
filename_data = 'endomondoHR.json'
# Load the main data set into pyspark data frame
df = spark.read.json(filename_data, mode="DROPMALFORMED")
print('Data frame type: ' + str(type(df)))
While executing the code in the cell below:
# Helper function to calculate statistic(s) of the column name from a tuple x of (sport, records list of the column);
# the stats to calculate are also given as an input.
def calculate_stats(x, column_name, stat_list):
    sport, records_list = x
    stat_dict = {'sport': sport}
    if 'min' in stat_list:
        min_stat = min(records_list)
        stat_dict.update({'min ' + column_name: min_stat})
    if 'max' in stat_list:
        max_stat = max(records_list)
        stat_dict.update({'max ' + column_name: max_stat})
    if 'mean' in stat_list:
        average_stat = stats.mean(records_list)
        stat_dict.update({'mean ' + column_name: average_stat})
    if 'stdev' in stat_list:
        std_stat = stats.stdev(records_list)
        stat_dict.update({'stdev ' + column_name: std_stat})
    if '50th percentile' in stat_list:
        median_stat = stats.median(records_list)
        stat_dict.update({'50th percentile ' + column_name: median_stat})
    if '25th percentile' in stat_list:
        percentile_25th_stat = np.percentile(records_list, 25)
        stat_dict.update({'25th percentile ' + column_name: percentile_25th_stat})
    if '75th percentile' in stat_list:
        percentile_75th_stat = np.percentile(records_list, 75)
        stat_dict.update({'75th percentile ' + column_name: percentile_75th_stat})
    if '95th percentile' in stat_list:
        percentile_95th_stat = np.percentile(records_list, 95)
        stat_dict.update({'95th percentile ' + column_name: percentile_95th_stat})
    return stat_dict

def to_list(a):
    return a

def extend(a, b):
    a.extend(b)
    return a

def retrieve_array_column_stat_df(df, column_name, stat_list):
    # Convert sport & "column_name" to an RDD to easily calculate the statistics of intervals by sport
    sport_record_rdd = df.select('sport', column_name).rdd \
        .map(tuple).combineByKey(to_list, extend, extend).persist()
    # Calculate statistics of the input column by calling the calculate_stats function defined above
    record_statistic_df = pd.DataFrame(sport_record_rdd.map(
        lambda x: calculate_stats(x, column_name, stat_list)).collect()
    )
    # Set proper dataframe column order
    columns_order = ['sport'] + [stat + ' ' + column_name for stat in stat_list]
    # Reorder columns
    return record_statistic_df[columns_order]

stat_list = ['min', '25th percentile', 'mean', '50th percentile',
             '75th percentile', '95th percentile', 'max', 'stdev']
interval_statistic_df = retrieve_array_column_stat_df(df, column_name='interval', stat_list=stat_list)
print('\nLet\'s look at statistics for interval, in seconds (by sport):')
interval_statistic_df
I am getting the error below:
Py4JJavaError Traceback (most recent call last)
c:\Users\shiv\Desktop\test\pyspark_analysis_bigdata.ipynb Cell 45 in <cell line: 55>()
51 return record_statistic_df[columns_order]
53 stat_list = ['min', '25th percentile', 'mean', '50th percentile',
54 '75th percentile', '95th percentile', 'max', 'stdev']
---> 55 interval_statistic_df = retrieve_array_column_stat_df(df, column_name='interval', stat_list=stat_list)
56 print('\nLet\'s look at statistic for interval, in seconds (by sport):' )
57 interval_statistic_df
c:\Users\shiv\Desktop\test\pyspark_analysis_bigdata.ipynb Cell 45 in retrieve_array_column_stat_df(df, column_name, stat_list)
41 sport_record_rdd = df.select('sport', column_name).rdd \
42 .map(tuple).combineByKey(to_list, extend, extend).persist()
44 # Calculate statistics of the input column by calling calculate_stats function defined above
---> 45 record_statistic_df = pd.DataFrame(sport_record_rdd.map(
46 lambda x: calculate_stats(x, column_name,stat_list)).collect()
47 )
48 # Set proper dataframe column orders
49 columns_order = ['sport'] + [stat + ' ' + column_name for stat in stat_list]
File c:\Users\shiv\miniconda3\lib\site-packages\pyspark\rdd.py:1197, in RDD.collect(self)
1195 with SCCallSiteSync(self.context):
1196 assert self.ctx._jvm is not None
-> 1197 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
1198 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
File c:\Users\shiv\miniconda3\lib\site-packages\py4j\java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1318 proto.END_COMMAND_PART
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
File c:\Users\shiv\miniconda3\lib\site-packages\pyspark\sql\utils.py:190, in capture_sql_exception.<locals>.deco(*a, **kw)
188 def deco(*a: Any, **kw: Any) -> Any:
189 try:
--> 190 return f(*a, **kw)
191 except Py4JJavaError as e:
192 converted = convert_exception(e.java_exception)
File c:\Users\shiv\miniconda3\lib\site-packages\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 155.0 failed 1 times, most recent failure: Lost task 5.0 in stage 155.0 (TID 1775) (shiv executor driver): java.net.SocketException: Connection reset by peer
at java.base/sun.nio.ch.NioSocketImpl.implWrite(NioSocketImpl.java:413)
at java.base/sun.nio.ch.NioSocketImpl.write(NioSocketImpl.java:433)
at java.base/sun.nio.ch.NioSocketImpl$2.write(NioSocketImpl.java:812)
at java.base/java.net.Socket$SocketOutputStream.write(Socket.java:1120)
at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:125)
at java.base/java.io.BufferedOutputStream.implWrite(BufferedOutputStream.java:215)
at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:199)
at java.base/java.io.DataOutputStream.write(DataOutputStream.java:112)
at java.base/java.io.FilterOutputStream.write(FilterOutputStream.java:108)
at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:295)
at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:307)
at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:307)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:438)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2066)
at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:272)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:104)
at java.base/java.lang.reflect.Method.invoke(Method.java:578)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: java.net.SocketException: Connection reset by peer
at java.base/sun.nio.ch.NioSocketImpl.implWrite(NioSocketImpl.java:413)
at java.base/sun.nio.ch.NioSocketImpl.write(NioSocketImpl.java:433)
at java.base/sun.nio.ch.NioSocketImpl$2.write(NioSocketImpl.java:812)
at java.base/java.net.Socket$SocketOutputStream.write(Socket.java:1120)
at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:125)
at java.base/java.io.BufferedOutputStream.implWrite(BufferedOutputStream.java:215)
at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:199)
at java.base/java.io.DataOutputStream.write(DataOutputStream.java:112)
at java.base/java.io.FilterOutputStream.write(FilterOutputStream.java:108)
at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:295)
at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:307)
at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:307)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:438)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2066)
at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:272)
I am not able to understand the error; can anybody help here?
Important info:
PySpark and Spark version - 3.3.1
Python version - 3.9.13
Windows 11, 16 GB RAM, Intel Core i5-11400H, NVIDIA GeForce GTX 1650 with 4 GB GDDR6 dedicated VRAM
Java version - 19.0.2
I tried allocating maximum memory to the driver and executors, increased the heartbeat interval and the network timeout, and later used findspark to initialize Spark. The setup above is the latest version of the code, and I am still getting the same error.
Update
I tried running the code on a chunk of the data, and with the changes below it ran successfully:
df_min = df.limit(1000)
stat_list = ['min', '25th percentile', 'mean', '50th percentile',
             '75th percentile', '95th percentile', 'max', 'stdev']
interval_statistic_df = retrieve_array_column_stat_df(df_min, column_name='interval', stat_list=stat_list)
print('\nLet\'s look at statistic for interval, in seconds (by sport):' )
interval_statistic_df
Therefore, the size of the data seems to be the issue. Can anyone help me optimize the code so that it avoids the RDD conversion, or uses map() and collect() more effectively?
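For reference, one direction I am exploring is to keep everything in the DataFrame API so that only the small aggregated result is brought back to the driver. This is only a sketch, assuming 'interval' is an array of numbers and that approximate percentiles from percentile_approx are acceptable; the function name retrieve_array_column_stat_df_sql is just a placeholder:
from pyspark.sql import functions as F
def retrieve_array_column_stat_df_sql(df, column_name):
    # Explode the array column so each element becomes a row, then aggregate per sport
    exploded = df.select('sport', F.explode(column_name).alias('value'))
    return exploded.groupBy('sport').agg(
        F.min('value').alias('min ' + column_name),
        F.percentile_approx('value', 0.25).alias('25th percentile ' + column_name),
        F.mean('value').alias('mean ' + column_name),
        F.percentile_approx('value', 0.5).alias('50th percentile ' + column_name),
        F.percentile_approx('value', 0.75).alias('75th percentile ' + column_name),
        F.percentile_approx('value', 0.95).alias('95th percentile ' + column_name),
        F.max('value').alias('max ' + column_name),
        F.stddev('value').alias('stdev ' + column_name),
    )
# Only the per-sport summary rows are collected, not the raw records
interval_statistic_df = retrieve_array_column_stat_df_sql(df, 'interval').toPandas()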

Related

How to fix this error in PySpark with the select method?

I'm following an example from the internet, but it gives me an error that I can't solve. The code is the following:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark import SparkContext
from IPython.display import display, HTML
from pyspark.sql import SQLContext
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.sql import Column as c
from pyspark.sql.functions import array, udf, lit, col as c
import pyspark.sql.functions as f
pd.set_option('max_colwidth',100)
plt.style.use('seaborn-paper')
try:
    sc = SparkContext('local[*]')
except:
    sc = SparkContext.getOrCreate('local[*]')
sqlContext = SQLContext(sc)
# Reading the data frames
whiteWinnePath = 'winequality-White.csv'
redWinnePath = 'winequality-Red.csv'
"""
df_p = pd.read_csv(whiteWinnePath, sep =";")
print("Mostrar el data frame")
display(df_p)
"""
whiteWinneDF = sqlContext.createDataFrame(pd.read_csv(whiteWinnePath, sep = ";")).withColumn('type',lit(0))
redWinneDF = sqlContext.createDataFrame(pd.read_csv(redWinnePath, sep = ";")).withColumn('type',lit(1))
whiteWinneDF.printSchema()
# Splitting into training and test sets
whiteTrainingDF, whiteTestingDF = whiteWinneDF.randomSplit([0.7,0.3])
redTrainingDF, redTestingDF = redWinneDF.randomSplit([0.7,0.3])
trainingDF = whiteTrainingDF.union(redTrainingDF)
testingDF = whiteTestingDF.union(redTestingDF)
# Preparing the dataframe for PCA
idCol = ['type']
features = [column for column in redWinneDF.columns if column not in idCol]
p = len(features)
meanVector = trainingDF.describe().where(c('summary')==lit('mean')).toPandas()[[0][1:p+1]].values
"""
meanVector2= meanVector1.toPandas()
#print("="*50)
#print(type(meanVector2))
#meanVector= meanVector2.as_matrix()#[0][1:p+1]
meanVector= meanVector2[[0][1:p+1]].values
"""
labeledVectorsDF = trainingDF.select(features+['type']).rdd\
.map(lambda x:(Vectors.dense(x[0:p]-Vectors.dense(meanVector)),x[p]))\
.toDF(['features','type'])
labeledVectorsDF.limit(5).toPandas()
When I run the code, this is the error I get:
File "D:\UGR\Investigación\Cosas de Reinaldo\mis script\Seleccion_caracteristicas\PCA_wine_quality.py", line 78, in <module>
labeledVectorsDF = trainingDF.select(features+['type']).rdd\
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 61, in toDF
return sparkSession.createDataFrame(self, schema, sampleRatio)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 605, in createDataFrame
return self._create_dataframe(data, schema, samplingRatio, verifySchema)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 628, in _create_dataframe
rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 425, in _createFromRDD
struct = self._inferSchema(rdd, samplingRatio, names=schema)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 396, in _inferSchema
first = rdd.first()
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\rdd.py", line 1464, in first
rs = self.take(1)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\rdd.py", line 1446, in take
res = self.context.runJob(self, takeUpToNumLeft, p)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\context.py", line 1118, in runJob
sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
File "C:\spark-3.0.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py", line 1304, in __call__
return_value = get_return_value(
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\utils.py", line 128, in deco
return f(*a, **kw)
File "C:\spark-3.0.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\protocol.py", line 326, in get_return_value
raise Py4JJavaError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 17, LAPTOP-3G67L0HS, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
I don't know if the error is in the select method or in the rdd conversion. I verified that features contains all the column names of the dataframe except type.
How could I solve it?

PySpark MultilayerPerceptronClassifier seems to not work with OneHotEncoding

I'm running a dummy example to perform classification with PySpark.
I created an ETL pipeline in which the labels are transformed with one-hot encoding, but
PySpark throws:
IllegalArgumentException: 'requirement failed: Column label must be of type numeric but was actually of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'
Code for Sparse One-hot
from pyspark.ml.feature import StringIndexer, StandardScaler, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import rand
df = spark.createDataFrame([
("Music", 3.45,1245),
("Sports", 4.49,3456),
("Music", 1.22, 323),
("Animals", 2.45,24)], ["category", "rating", "views"])
"""ETL Pipeline over
the whole dataset
"""
indexer = StringIndexer(inputCol="category", outputCol="class",handleInvalid="skip")
encoder = OneHotEncoderEstimator(inputCols=["class"],
outputCols=["label"])
encoder.setDropLast(False)
vectorizer = VectorAssembler(inputCols=["rating","views"],
outputCol="unscaled_features")
etl_pipeline = Pipeline(stages=[indexer,encoder,vectorizer])
etlModel = etl_pipeline.fit(df)
tr_df = etlModel.transform(df)
tr_df.show()
"""Training Pipeline
"""
train_data, test_data = tr_df.randomSplit([.8, .2],seed=23487)
scaler = StandardScaler(inputCol="unscaled_features", outputCol="features",
withStd=True, withMean=True)
# specify layers for the neural network:
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=1, seed=1234)
ml_pipeline = Pipeline(stages=[scaler, trainer])
mlModel = ml_pipeline.fit(train_data)
result = mlModel.transform(test_data)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
Out
+--------+------+-----+-----+-------------+-----------------+
|category|rating|views|class| label|unscaled_features|
+--------+------+-----+-----+-------------+-----------------+
| Music| 3.45| 1245| 0.0|(3,[0],[1.0])| [3.45,1245.0]|
| Sports| 4.49| 3456| 2.0|(3,[2],[1.0])| [4.49,3456.0]|
| Music| 1.22| 323| 0.0|(3,[0],[1.0])| [1.22,323.0]|
| Animals| 2.45| 24| 1.0|(3,[1],[1.0])| [2.45,24.0]|
+--------+------+-----+-----+-------------+-----------------+
IllegalArgumentException: 'requirement failed: Column label must be of type numeric but was actually of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'
The weird thing is that although I convert the SparseVector of one-hot labels to a DenseVector, the error still remains. It seems that MultilayerPerceptronClassifier converts dense labels to sparse, but it is not working properly...
Code for ETL with dense one-hot
from pyspark.ml.feature import StringIndexer, StandardScaler, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import DenseVector
from pyspark.sql import Row
from pyspark.sql.functions import rand
df = spark.createDataFrame([
("Music", 3.45,1245),
("Sports", 4.49,3456),
("Music", 1.22, 323),
("Animals", 2.45,24)], ["category", "rating", "views"])
"""ETL Pipeline over
the whole dataset
"""
indexer = StringIndexer(inputCol="category", outputCol="class",handleInvalid="skip")
encoder = OneHotEncoderEstimator(inputCols=["class"],
outputCols=["label"])
encoder.setDropLast(False)
vectorizer = VectorAssembler(inputCols=["rating","views"],
outputCol="unscaled_features")
etl_pipeline = Pipeline(stages=[indexer,encoder,vectorizer])
etlModel = etl_pipeline.fit(df)
tr_df = etlModel.transform(df)
tr_df = tr_df.select("label", "unscaled_features")
rdd = tr_df.rdd.map(lambda x: Row(label=DenseVector(x[0].toArray()),unscaled_features=x[1])
if (len(x)>1 and hasattr(x[0], "toArray"))
else Row(label=None, unscaled_features=DenseVector([])))
tr_df = rdd.toDF()
tr_df.show()
"""Training Pipeline
"""
train_data, test_data = tr_df.randomSplit([.8, .2],seed=23487)
scaler = StandardScaler(inputCol="unscaled_features", outputCol="features",
withStd=True, withMean=True)
# specify layers for the neural network:
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=1, seed=1234)
ml_pipeline = Pipeline(stages=[scaler, trainer])
mlModel = ml_pipeline.fit(train_data)
result = mlModel.transform(test_data)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
Out
+-------------+-----------------+
| label|unscaled_features|
+-------------+-----------------+
|[1.0,0.0,0.0]| [3.45,1245.0]|
|[0.0,0.0,1.0]| [4.49,3456.0]|
|[1.0,0.0,0.0]| [1.22,323.0]|
|[0.0,1.0,0.0]| [2.45,24.0]|
+-------------+-----------------+
IllegalArgumentException: 'requirement failed: Column label must be of type numeric but was actually of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'
UPDATE 1: REMOVING ONE-HOT ENCODING FROM PIPELINE
Code
from pyspark.ml.feature import StringIndexer, StandardScaler, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import rand
df = spark.createDataFrame([
("Music", 3.45,1245),
("Sports", 4.49,3456),
("Music", 1.22, 323),
("Animals", 2.45,24)], ["category", "rating", "views"])
"""ETL Pipeline over
the whole dataset
"""
indexer = StringIndexer(inputCol="category", outputCol="label",handleInvalid="skip")
# encoder = OneHotEncoderEstimator(inputCols=["class"],
# outputCols=["label"])
# encoder.setDropLast(False)
vectorizer = VectorAssembler(inputCols=["rating","views"],
outputCol="unscaled_features")
etl_pipeline = Pipeline(stages=[indexer,vectorizer])
etlModel = etl_pipeline.fit(df)
tr_df = etlModel.transform(df)
tr_df.show()
"""Training Pipeline
"""
train_data, test_data = tr_df.randomSplit([.8, .2],seed=23487)
scaler = StandardScaler(inputCol="unscaled_features", outputCol="features",
withStd=True, withMean=True)
# specify layers for the neural network:
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
ml_pipeline = Pipeline(stages=[scaler, trainer])
mlModel = ml_pipeline.fit(train_data)
result = mlModel.transform(test_data)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
OUT
+--------+------+-----+-----+-----------------+
|category|rating|views|label|unscaled_features|
+--------+------+-----+-----+-----------------+
| Music| 3.45| 1245| 0.0| [3.45,1245.0]|
| Sports| 4.49| 3456| 2.0| [4.49,3456.0]|
| Music| 1.22| 323| 0.0| [1.22,323.0]|
| Animals| 2.45| 24| 1.0| [2.45,24.0]|
+--------+------+-----+-----+-----------------+
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-9-58967f1d5bce> in <module>
60
61 ml_pipeline = Pipeline(stages=[scaler, trainer])
---> 62 mlModel = ml_pipeline.fit(train_data)
63 result = mlModel.transform(test_data)
64 predictionAndLabels = result.select("prediction", "label")
~/.local/lib/python3.5/site-packages/pyspark/ml/base.py in fit(self, dataset, params)
130 return self.copy(params)._fit(dataset)
131 else:
--> 132 return self._fit(dataset)
133 else:
134 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
~/.local/lib/python3.5/site-packages/pyspark/ml/pipeline.py in _fit(self, dataset)
107 dataset = stage.transform(dataset)
108 else: # must be an Estimator
--> 109 model = stage.fit(dataset)
110 transformers.append(model)
111 if i < indexOfLastEstimator:
~/.local/lib/python3.5/site-packages/pyspark/ml/base.py in fit(self, dataset, params)
130 return self.copy(params)._fit(dataset)
131 else:
--> 132 return self._fit(dataset)
133 else:
134 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
~/.local/lib/python3.5/site-packages/pyspark/ml/wrapper.py in _fit(self, dataset)
293
294 def _fit(self, dataset):
--> 295 java_model = self._fit_java(dataset)
296 model = self._create_model(java_model)
297 return self._copyValues(model)
~/.local/lib/python3.5/site-packages/pyspark/ml/wrapper.py in _fit_java(self, dataset)
290 """
291 self._transfer_params_to_java()
--> 292 return self._java_obj.fit(dataset._jdf)
293
294 def _fit(self, dataset):
~/.local/lib/python3.5/site-packages/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
~/.local/lib/python3.5/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
~/.local/lib/python3.5/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o870.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 26.0 failed 1 times, most recent failure: Lost task 0.0 in stage 26.0 (TID 26, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException
at java.lang.System.arraycopy(Native Method)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:665)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:664)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:664)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:660)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:222)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD.count(RDD.scala:1168)
at org.apache.spark.mllib.optimization.LBFGS$.runLBFGS(LBFGS.scala:195)
at org.apache.spark.mllib.optimization.LBFGS.optimize(LBFGS.scala:142)
at org.apache.spark.ml.ann.FeedForwardTrainer.train(Layer.scala:854)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$train$1.apply(MultilayerPerceptronClassifier.scala:249)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$train$1.apply(MultilayerPerceptronClassifier.scala:205)
at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:205)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:114)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArrayIndexOutOfBoundsException
at java.lang.System.arraycopy(Native Method)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:665)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:664)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:664)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:660)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:222)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
PySpark version
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.4.4
/_/
Using Scala version 2.11.12, OpenJDK 64-Bit Server VM, 1.8.0_222
Java Version
openjdk version "1.8.0_222"
OpenJDK Runtime Environment (build 1.8.0_222-8u222-b10-1ubuntu1~16.04.1-b10)
OpenJDK 64-Bit Server VM (build 25.222-b10, mixed mode)
You should run the features through a VectorAssembler, but you don't need to one-hot encode the label column. Just pass the label column as numeric classes, as they are:
+------+-----------------+
| label|unscaled_features|
+------+-----------------+
| 0| [3.45,1245.0]|
| 2| [4.49,3456.0]|
| 0| [1.22,323.0]|
| 1| [2.45,24.0]|
+------+-----------------+
This should solve your error.
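For instance, a minimal sketch of the pipeline without the encoder, on the toy dataframe from the question (assuming two input features, so the first layer size is 2, and three classes, so the last layer size is 3; the hidden layer sizes are arbitrary):
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import MultilayerPerceptronClassifier
indexer = StringIndexer(inputCol="category", outputCol="label", handleInvalid="skip")
vectorizer = VectorAssembler(inputCols=["rating", "views"], outputCol="unscaled_features")
scaler = StandardScaler(inputCol="unscaled_features", outputCol="features",
                        withStd=True, withMean=True)
# first layer = number of assembled features (2), last layer = number of classes (3)
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 5, 4, 3],
                                         blockSize=128, seed=1234)
pipeline = Pipeline(stages=[indexer, vectorizer, scaler, trainer])
model = pipeline.fit(df)
result = model.transform(df)
The label produced by StringIndexer is already a numeric class index, which is exactly what MultilayerPerceptronClassifier expects.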

AttributeError: 'NoneType' object has no attribute 'setCallSite' pyspark after indexedRowMatrix columnSimilarities()

I'm working on code that executed correctly with this dataframe before, but this time when I run it, I get an error. (The only difference is that I used persist() on the dataframe this time.)
simMat = IndexedRMat.columnSimilarities()
executes correctly, but then this part:
columns = ['product1', 'product2', 'sim']
vals = simMat.entries.map(lambda e: (e.i, e.j, e.value)).collect()
dfsim = spark.createDataFrame(vals, columns)
generates this error:
AttributeErrorTraceback (most recent call last)
<ipython-input-100-11502084c71b> in <module>()
1 columns = ['product1', 'product2', 'sim']
----> 2 vals = simMat.entries.map(lambda e: (e.i, e.j, e.value)).collect()
3 dfsim = spark.createDataFrame(vals, columns)
/opt/spark-2.3.0-SNAPSHOT-bin-spark-master/python/pyspark/rdd.pyc in collect(self)
806 to be small, as all the data is loaded into the driver's memory.
807 """
--> 808 with SCCallSiteSync(self.context) as css:
809 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
810 return list(_load_from_socket(port, self._jrdd_deserializer))
/opt/spark-2.3.0-SNAPSHOT-bin-spark-master/python/pyspark/traceback_utils.pyc in __enter__(self)
70 def __enter__(self):
71 if SCCallSiteSync._spark_stack_depth == 0:
---> 72 self._context._jsc.setCallSite(self._call_site)
73 SCCallSiteSync._spark_stack_depth += 1
74
AttributeError: 'NoneType' object has no attribute 'setCallSite'
What does it mean? I'm new to Spark and didn't find an explanation for this type of error.

convert nested list to Dataframe : Pyspark

I tried to convert a nested list to a DataFrame by following the answers in this link:
List to DataFrame in pyspark
my_data =[['apple','ball','ballon'],['cat','camel','james'],['none','focus','cake']]
from pyspark.sql import Row
R = Row('ID', 'words')
spark.createDataFrame([R(i, x) for i, x in enumerate(my_data)]).show()
But I obtain this error :
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-147-780a8d7196df> in <module>()
----> 5 spark.createDataFrame([R(i, x) for i, x in enumerate(my_data)]).show()
F:\spark\spark\python\pyspark\sql\session.py in createDataFrame(self, data, schema, samplingRatio, verifySchema)
--> 689 rdd, schema = self._createFromLocal(map(prepare, data), schema)
F:\spark\spark\python\pyspark\sql\session.py in _createFromLocal(self, data, schema)
--> 424 return self._sc.parallelize(data), schema
F:\spark\spark\python\pyspark\context.py in parallelize(self, c, numSlices)
--> 484 jrdd = self._serialize_to_jvm(c, numSlices, serializer)
F:\spark\spark\python\pyspark\context.py in _serialize_to_jvm(self, data, parallelism, serializer)
--> 493 tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
~\Anaconda3\lib\tempfile.py in NamedTemporaryFile(mode, buffering, encoding, newline, suffix, prefix, dir, delete)
547 flags |= _os.O_TEMPORARY
548
--> 549 (fd, name) = _mkstemp_inner(dir, prefix, suffix, flags, output_type)
550 try:
551 file = _io.open(fd, mode, buffering=buffering,
~\Anaconda3\lib\tempfile.py in _mkstemp_inner(dir, pre, suf, flags, output_type)
258 file = _os.path.join(dir, pre + name + suf)
259 try:
--> 260 fd = _os.open(file, flags, 0o600)
261 except FileExistsError:
262 continue # try again
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\*****\\AppData\\Local\\Temp\\spark-e340269d-a29e-4b95-90d3-c424a04fcb0a\\pyspark-f7fce557-e11b-47c9-b7a5-81e72a360b36\\tmp7n0s97t2'
I was getting the same error from a Jupyter notebook with PySpark.
It worked after restarting the notebook kernel.

PySpark java.io.IOException: No FileSystem for scheme: https

I am using Windows locally and trying to load an XML file with the following Python code, and I am getting this error. Does anyone know how to resolve it?
This is the code:
df1 = sqlContext.read.format("xml").options(rowTag="IRS990EZ").load("https://irs-form-990.s3.amazonaws.com/201611339349202661_public.xml")
and this is the error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-7-4832eb48a4aa> in <module>()
----> 1 df1 = sqlContext.read.format("xml").options(rowTag="IRS990EZ").load("https://irs-form-990.s3.amazonaws.com/201611339349202661_public.xml")
C:\SPARK_HOME\spark-2.2.0-bin-hadoop2.7\python\pyspark\sql\readwriter.py in load(self, path, format, schema, **options)
157 self.options(**options)
158 if isinstance(path, basestring):
--> 159 return self._df(self._jreader.load(path))
160 elif path is not None:
161 if type(path) != list:
C:\SPARK_HOME\spark-2.2.0-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
C:\SPARK_HOME\spark-2.2.0-bin-hadoop2.7\python\pyspark\sql\utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
C:\SPARK_HOME\spark-2.2.0-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling o38.load.
: java.io.IOException: No FileSystem for scheme: https
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(FileInputFormat.java:500)
at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(FileInputFormat.java:469)
at org.apache.spark.SparkContext$$anonfun$newAPIHadoopFile$2.apply(SparkContext.scala:1160)
at org.apache.spark.SparkContext$$anonfun$newAPIHadoopFile$2.apply(SparkContext.scala:1148)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.SparkContext.withScope(SparkContext.scala:701)
at org.apache.spark.SparkContext.newAPIHadoopFile(SparkContext.scala:1148)
at com.databricks.spark.xml.util.XmlFile$.withCharset(XmlFile.scala:46)
at com.databricks.spark.xml.DefaultSource$$anonfun$createRelation$1.apply(DefaultSource.scala:62)
at com.databricks.spark.xml.DefaultSource$$anonfun$createRelation$1.apply(DefaultSource.scala:62)
at com.databricks.spark.xml.XmlRelation$$anonfun$1.apply(XmlRelation.scala:47)
at com.databricks.spark.xml.XmlRelation$$anonfun$1.apply(XmlRelation.scala:46)
at scala.Option.getOrElse(Option.scala:121)
at com.databricks.spark.xml.XmlRelation.<init>(XmlRelation.scala:45)
at com.databricks.spark.xml.DefaultSource.createRelation(DefaultSource.scala:65)
at com.databricks.spark.xml.DefaultSource.createRelation(DefaultSource.scala:43)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:306)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:156)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Unknown Source)
Somehow PySpark is unable to load from http or https. One of my colleagues found the answer, so here is the solution.
Before creating the Spark context and SQL context, we need to run these two lines of code:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.4.1 pyspark-shell'
After creating the SparkContext and SQLContext via sc = pyspark.SparkContext.getOrCreate() and sqlContext = SQLContext(sc),
add the http or https URL to the SparkContext using sc.addFile(url):
Data_XMLFile = sqlContext.read.format("xml").options(rowTag="anytaghere").load(pyspark.SparkFiles.get("*_public.xml")).coalesce(10).cache()
This solution worked for me.
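Putting those steps together, a minimal sketch (file name, rowTag and package version are copied from the question; the spark-xml coordinates may need adjusting for your Spark/Scala build):
import os
import pyspark
from pyspark import SparkFiles
from pyspark.sql import SQLContext
# must be set before the SparkContext is created
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.4.1 pyspark-shell'
sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
url = "https://irs-form-990.s3.amazonaws.com/201611339349202661_public.xml"
sc.addFile(url)  # downloads the file and makes it available to the workers
df1 = sqlContext.read.format("xml") \
    .options(rowTag="IRS990EZ") \
    .load(SparkFiles.get("201611339349202661_public.xml"))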
The error message says it all: you cannot use the dataframe reader & load to access files on the web (http or https). I suggest you first download the file locally.
See the pyspark.sql.DataFrameReader docs for more on the available sources (in general: local file system, HDFS, and databases via JDBC).
Unrelated to the error, notice that you seem to use the format part of the command incorrectly: assuming that you use the XML Data Source for Apache Spark package, the correct usage should be format('com.databricks.spark.xml') (see the example).
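For example, a quick sketch combining both points (download the file locally first, then use the full format name; the local file name here is just an assumption):
import urllib.request
local_path = "201611339349202661_public.xml"  # assumed local file name
urllib.request.urlretrieve(
    "https://irs-form-990.s3.amazonaws.com/201611339349202661_public.xml", local_path)
df1 = sqlContext.read.format("com.databricks.spark.xml") \
    .options(rowTag="IRS990EZ") \
    .load(local_path)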
I've made a similar but slightly different error: I forgot the "s3://" prefix in the file path. After adding this prefix to form "s3://path/to/object", the following code works:
my_data = spark.read.format("com.databricks.spark.csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.option("delimiter", ",")\
.load("s3://path/to/object")
I was also having a similar issue with a CSV file; basically, we were trying to load a CSV file into Spark.
We were able to load the file successfully by making use of the pandas library: first we loaded the file into a pandas data frame, and then we used it to create the Spark data frame.
from pyspark.sql import SparkSession
import pandas as pd
spark = SparkSession.builder.appName('appName').getOrCreate()
pdf = pd.read_csv('file path with https')
sdf = spark.createDataFrame(pdf)