I'm following an example from the internet, but it gives me an error that I can't solve. The code is the following:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark import SparkContext
from IPython.display import display, HTML
from pyspark.sql import SQLContext
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.sql import Column as c
from pyspark.sql.functions import array, udf, lit, col as c
import pyspark.sql.functions as f
pd.set_option('max_colwidth', 100)
plt.style.use('seaborn-paper')

# Start a local context, or reuse the one that is already running
# (SparkContext raises ValueError when a second context is requested).
try:
    sc = SparkContext('local[*]')
except ValueError:
    sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Read the data frames
whiteWinnePath = 'winequality-White.csv'
redWinnePath = 'winequality-Red.csv'
# Optional preview of the raw file with pandas:
# df_p = pd.read_csv(whiteWinnePath, sep=";")
# print("Mostrar el data frame")
# display(df_p)
whiteWinneDF = sqlContext.createDataFrame(
    pd.read_csv(whiteWinnePath, sep=";")).withColumn('type', lit(0))
redWinneDF = sqlContext.createDataFrame(
    pd.read_csv(redWinnePath, sep=";")).withColumn('type', lit(1))
whiteWinneDF.printSchema()

# Split into training and test sets
whiteTrainingDF, whiteTestingDF = whiteWinneDF.randomSplit([0.7, 0.3])
redTrainingDF, redTestingDF = redWinneDF.randomSplit([0.7, 0.3])
trainingDF = whiteTrainingDF.union(redTrainingDF)
testingDF = whiteTestingDF.union(redTestingDF)

# Prepare the data frame for PCA
idCol = ['type']
features = [column for column in redWinneDF.columns if column not in idCol]
p = len(features)

# Per-feature mean of the training set, used to centre the data.
# The previous indexing `[[0][1:p+1]]` evaluated `[0][1:p+1]` first, which is
# an empty list, so no columns were selected and the mean vector was empty;
# subtracting it from a row then failed inside the RDD map (the failure
# reported in the traceback that follows this script).
# describe() reports its statistics as strings, hence the cast to float.
summaryDF = trainingDF.describe().where(c('summary') == lit('mean')).toPandas()
meanVector = summaryDF[features].iloc[0].astype(float).values

# Build (centred feature vector, type) pairs; the subtraction is done
# element-wise on plain numbers before the dense vector is created.
labeledVectorsDF = (
    trainingDF.select(features + ['type']).rdd
    .map(lambda x: (
        Vectors.dense([v - m for v, m in zip(x[0:p], meanVector)]),
        x[p],
    ))
    .toDF(['features', 'type'])
)
labeledVectorsDF.limit(5).toPandas()
When I run the code, this is the error I get:
File "D:\UGR\Investigación\Cosas de Reinaldo\mis script\Seleccion_caracteristicas\PCA_wine_quality.py", line 78, in <module>
labeledVectorsDF = trainingDF.select(features+['type']).rdd\
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 61, in toDF
return sparkSession.createDataFrame(self, schema, sampleRatio)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 605, in createDataFrame
return self._create_dataframe(data, schema, samplingRatio, verifySchema)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 628, in _create_dataframe
rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 425, in _createFromRDD
struct = self._inferSchema(rdd, samplingRatio, names=schema)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 396, in _inferSchema
first = rdd.first()
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\rdd.py", line 1464, in first
rs = self.take(1)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\rdd.py", line 1446, in take
res = self.context.runJob(self, takeUpToNumLeft, p)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\context.py", line 1118, in runJob
sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
File "C:\spark-3.0.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py", line 1304, in __call__
return_value = get_return_value(
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\utils.py", line 128, in deco
return f(*a, **kw)
File "C:\spark-3.0.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\protocol.py", line 326, in get_return_value
raise Py4JJavaError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 17, LAPTOP-3G67L0HS, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
I don't know if the error is with the select method or the rdd method. I verified that features has all the names of the columns of the dataframe except type.
How could I solve it?
I'm running a dummy example to perform classification with PySpark.
I created an ETL pipeline in which the labels are one-hot encoded, but
PySpark throws:
IllegalArgumentException: 'requirement failed: Column label must be of type numeric but was actually of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'
Code for Sparse One-hot
from pyspark.ml.feature import StringIndexer, StandardScaler, OneHotEncoderEstimator, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import rand
# Reproduction script: builds a four-row DataFrame, one-hot encodes the
# category into `label`, assembles two numeric columns into a feature vector
# and fits a MultilayerPerceptronClassifier on it.
# NOTE(review): `spark` is not defined in this snippet -- presumably an
# existing SparkSession provided by the notebook; confirm.
df = spark.createDataFrame([
("Music", 3.45,1245),
("Sports", 4.49,3456),
("Music", 1.22, 323),
("Animals", 2.45,24)], ["category", "rating", "views"])
"""ETL Pipeline over
the whole dataset
"""
# Map each category string to a numeric index stored in `class`.
indexer = StringIndexer(inputCol="category", outputCol="class",handleInvalid="skip")
# One-hot encode `class` into `label`. The printed output shows `label` as a
# sparse vector such as (3,[0],[1.0]); this vector-typed label is what the
# "Column label must be of type numeric" error complains about.
encoder = OneHotEncoderEstimator(inputCols=["class"],
outputCols=["label"])
encoder.setDropLast(False)
# NOTE(review): VectorAssembler is not among the imports listed for this
# snippet -- presumably already imported in the session; confirm.
vectorizer = VectorAssembler(inputCols=["rating","views"],
outputCol="unscaled_features")
etl_pipeline = Pipeline(stages=[indexer,encoder,vectorizer])
etlModel = etl_pipeline.fit(df)
tr_df = etlModel.transform(df)
tr_df.show()
"""Training Pipeline
"""
train_data, test_data = tr_df.randomSplit([.8, .2],seed=23487)
# Standardise the assembled features into the `features` column.
scaler = StandardScaler(inputCol="unscaled_features", outputCol="features",
withStd=True, withMean=True)
# specify layers for the neural network:
# NOTE(review): the first layer size is 4 while only two columns (rating,
# views) are assembled into the feature vector -- verify that the input layer
# size matches the feature count.
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=1, seed=1234)
ml_pipeline = Pipeline(stages=[scaler, trainer])
mlModel = ml_pipeline.fit(train_data)
result = mlModel.transform(test_data)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
Out
+--------+------+-----+-----+-------------+-----------------+
|category|rating|views|class| label|unscaled_features|
+--------+------+-----+-----+-------------+-----------------+
| Music| 3.45| 1245| 0.0|(3,[0],[1.0])| [3.45,1245.0]|
| Sports| 4.49| 3456| 2.0|(3,[2],[1.0])| [4.49,3456.0]|
| Music| 1.22| 323| 0.0|(3,[0],[1.0])| [1.22,323.0]|
| Animals| 2.45| 24| 1.0|(3,[1],[1.0])| [2.45,24.0]|
+--------+------+-----+-----+-------------+-----------------+
IllegalArgumentException: 'requirement failed: Column label must be of type numeric but was actually of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'
The weird thing is that even when I convert the SparseVector of one-hot labels to a DenseVector, the error remains. It seems that MultilayerPerceptronClassifier converts dense labels to sparse, but it is not working properly...
Code for ETL with dense one-hot
from pyspark.ml.feature import StringIndexer, StandardScaler, OneHotEncoderEstimator, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import rand
# Reproduction script, second attempt: same pipeline as before, but the
# one-hot `label` vectors are converted to DenseVector before training.
# NOTE(review): `spark` is not defined in this snippet -- presumably an
# existing SparkSession provided by the notebook; confirm.
df = spark.createDataFrame([
("Music", 3.45,1245),
("Sports", 4.49,3456),
("Music", 1.22, 323),
("Animals", 2.45,24)], ["category", "rating", "views"])
"""ETL Pipeline over
the whole dataset
"""
# Map each category string to a numeric index stored in `class`.
indexer = StringIndexer(inputCol="category", outputCol="class",handleInvalid="skip")
# One-hot encode `class` into the vector-typed `label` column.
encoder = OneHotEncoderEstimator(inputCols=["class"],
outputCols=["label"])
encoder.setDropLast(False)
# NOTE(review): VectorAssembler is not among the imports listed for this
# snippet -- presumably already imported in the session; confirm.
vectorizer = VectorAssembler(inputCols=["rating","views"],
outputCol="unscaled_features")
etl_pipeline = Pipeline(stages=[indexer,encoder,vectorizer])
etlModel = etl_pipeline.fit(df)
tr_df = etlModel.transform(df)
tr_df = tr_df.select("label", "unscaled_features")
# Rebuild every row with a dense `label` vector; rows without a usable label
# get label=None and an empty feature vector.
# The label is still a vector after this step (the printed output shows
# [1.0,0.0,0.0]), so the same "must be of type numeric" error is raised.
# NOTE(review): Row and DenseVector are not among the imports listed for this
# snippet -- presumably already imported in the session; confirm.
rdd = tr_df.rdd.map(lambda x: Row(label=DenseVector(x[0].toArray()),unscaled_features=x[1])
if (len(x)>1 and hasattr(x[0], "toArray"))
else Row(label=None, unscaled_features=DenseVector([])))
tr_df = rdd.toDF()
tr_df.show()
"""Training Pipeline
"""
train_data, test_data = tr_df.randomSplit([.8, .2],seed=23487)
# Standardise the assembled features into the `features` column.
scaler = StandardScaler(inputCol="unscaled_features", outputCol="features",
withStd=True, withMean=True)
# specify layers for the neural network:
# NOTE(review): the first layer size is 4 while only two columns (rating,
# views) are assembled into the feature vector -- verify that the input layer
# size matches the feature count.
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=1, seed=1234)
ml_pipeline = Pipeline(stages=[scaler, trainer])
mlModel = ml_pipeline.fit(train_data)
result = mlModel.transform(test_data)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
Out
+-------------+-----------------+
| label|unscaled_features|
+-------------+-----------------+
|[1.0,0.0,0.0]| [3.45,1245.0]|
|[0.0,0.0,1.0]| [4.49,3456.0]|
|[1.0,0.0,0.0]| [1.22,323.0]|
|[0.0,1.0,0.0]| [2.45,24.0]|
+-------------+-----------------+
IllegalArgumentException: 'requirement failed: Column label must be of type numeric but was actually of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'
UPDATE 1: REMOVING ONE-HOT ENCODING FROM PIPELINE
Code
from pyspark.ml.feature import StringIndexer, StandardScaler, OneHotEncoderEstimator, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import rand
# VectorAssembler is used below but is missing from the import lines above.
from pyspark.ml.feature import VectorAssembler

# Toy dataset: one categorical column (the class to predict) and two numeric
# feature columns.
# NOTE: `spark` is assumed to be an existing SparkSession (e.g. the one the
# notebook provides).
df = spark.createDataFrame(
    [
        ("Music", 3.45, 1245),
        ("Sports", 4.49, 3456),
        ("Music", 1.22, 323),
        ("Animals", 2.45, 24),
    ],
    ["category", "rating", "views"],
)

# ETL pipeline over the whole dataset.
# The StringIndexer output is used directly as `label`: the classifier needs
# a numeric class index, not a one-hot vector, so no encoder stage is used.
indexer = StringIndexer(inputCol="category", outputCol="label", handleInvalid="skip")
vectorizer = VectorAssembler(inputCols=["rating", "views"],
                             outputCol="unscaled_features")
etl_pipeline = Pipeline(stages=[indexer, vectorizer])
etlModel = etl_pipeline.fit(df)
tr_df = etlModel.transform(df)
tr_df.show()

# Training pipeline.
train_data, test_data = tr_df.randomSplit([.8, .2], seed=23487)
scaler = StandardScaler(inputCol="unscaled_features", outputCol="features",
                        withStd=True, withMean=True)
# Layer sizes for the neural network. The first entry must equal the length
# of the feature vector (2: rating, views) and the last the number of classes
# (3). The earlier input size of 4 did not match the 2-element feature
# vectors, which is consistent with the ArrayIndexOutOfBoundsException raised
# from DataStacker in the traceback reported with this script.
layers = [2, 5, 4, 3]
# Create the trainer and set its parameters.
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
ml_pipeline = Pipeline(stages=[scaler, trainer])
mlModel = ml_pipeline.fit(train_data)
result = mlModel.transform(test_data)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
OUT
+--------+------+-----+-----+-----------------+
|category|rating|views|label|unscaled_features|
+--------+------+-----+-----+-----------------+
| Music| 3.45| 1245| 0.0| [3.45,1245.0]|
| Sports| 4.49| 3456| 2.0| [4.49,3456.0]|
| Music| 1.22| 323| 0.0| [1.22,323.0]|
| Animals| 2.45| 24| 1.0| [2.45,24.0]|
+--------+------+-----+-----+-----------------+
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-9-58967f1d5bce> in <module>
60
61 ml_pipeline = Pipeline(stages=[scaler, trainer])
---> 62 mlModel = ml_pipeline.fit(train_data)
63 result = mlModel.transform(test_data)
64 predictionAndLabels = result.select("prediction", "label")
~/.local/lib/python3.5/site-packages/pyspark/ml/base.py in fit(self, dataset, params)
130 return self.copy(params)._fit(dataset)
131 else:
--> 132 return self._fit(dataset)
133 else:
134 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
~/.local/lib/python3.5/site-packages/pyspark/ml/pipeline.py in _fit(self, dataset)
107 dataset = stage.transform(dataset)
108 else: # must be an Estimator
--> 109 model = stage.fit(dataset)
110 transformers.append(model)
111 if i < indexOfLastEstimator:
~/.local/lib/python3.5/site-packages/pyspark/ml/base.py in fit(self, dataset, params)
130 return self.copy(params)._fit(dataset)
131 else:
--> 132 return self._fit(dataset)
133 else:
134 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
~/.local/lib/python3.5/site-packages/pyspark/ml/wrapper.py in _fit(self, dataset)
293
294 def _fit(self, dataset):
--> 295 java_model = self._fit_java(dataset)
296 model = self._create_model(java_model)
297 return self._copyValues(model)
~/.local/lib/python3.5/site-packages/pyspark/ml/wrapper.py in _fit_java(self, dataset)
290 """
291 self._transfer_params_to_java()
--> 292 return self._java_obj.fit(dataset._jdf)
293
294 def _fit(self, dataset):
~/.local/lib/python3.5/site-packages/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
~/.local/lib/python3.5/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
~/.local/lib/python3.5/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o870.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 26.0 failed 1 times, most recent failure: Lost task 0.0 in stage 26.0 (TID 26, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException
at java.lang.System.arraycopy(Native Method)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:665)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:664)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:664)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:660)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:222)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD.count(RDD.scala:1168)
at org.apache.spark.mllib.optimization.LBFGS$.runLBFGS(LBFGS.scala:195)
at org.apache.spark.mllib.optimization.LBFGS.optimize(LBFGS.scala:142)
at org.apache.spark.ml.ann.FeedForwardTrainer.train(Layer.scala:854)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$train$1.apply(MultilayerPerceptronClassifier.scala:249)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$train$1.apply(MultilayerPerceptronClassifier.scala:205)
at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:205)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:114)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArrayIndexOutOfBoundsException
at java.lang.System.arraycopy(Native Method)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:665)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:664)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:664)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:660)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:222)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
PySPark version
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.4.4
/_/
Using Scala version 2.11.12, OpenJDK 64-Bit Server VM, 1.8.0_222
Java Version
openjdk version "1.8.0_222"
OpenJDK Runtime Environment (build 1.8.0_222-8u222-b10-1ubuntu1~16.04.1-b10)
OpenJDK 64-Bit Server VM (build 25.222-b10, mixed mode)
You should run the features via a VectorAssembler but you don't need to do one-hot-encoding on the label column. You should just pass the labels column as numeric classes as they are:
+------+-----------------+
| label|unscaled_features|
+------+-----------------+
| 0| [3.45,1245.0]|
| 2| [4.49,3456.0]|
| 0| [1.22,323.0]|
| 1| [2.45,24.0]|
+------+-----------------+
This should solve your error.
I tried to convert a nested list to a DataFrame by following the answers in this link:
List to DataFrame in pyspark
from pyspark.sql import Row

# Each inner list becomes the `words` value of one row; `ID` is its position
# in the outer list.
my_data = [
    ['apple', 'ball', 'ballon'],
    ['cat', 'camel', 'james'],
    ['none', 'focus', 'cake'],
]
R = Row('ID', 'words')
rows = [R(*indexed_words) for indexed_words in enumerate(my_data)]
spark.createDataFrame(rows).show()
​
But I obtain this error :
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-147-780a8d7196df> in <module>()
----> 5 spark.createDataFrame([R(i, x) for i, x in enumerate(my_data)]).show()
F:\spark\spark\python\pyspark\sql\session.py in createDataFrame(self, data, schema, samplingRatio, verifySchema)
--> 689 rdd, schema = self._createFromLocal(map(prepare, data), schema)
F:\spark\spark\python\pyspark\sql\session.py in _createFromLocal(self, data, schema)
--> 424 return self._sc.parallelize(data), schema
F:\spark\spark\python\pyspark\context.py in parallelize(self, c, numSlices)
--> 484 jrdd = self._serialize_to_jvm(c, numSlices, serializer)
F:\spark\spark\python\pyspark\context.py in _serialize_to_jvm(self, data, parallelism, serializer)
--> 493 tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
~\Anaconda3\lib\tempfile.py in NamedTemporaryFile(mode, buffering, encoding, newline, suffix, prefix, dir, delete)
547 flags |= _os.O_TEMPORARY
548
--> 549 (fd, name) = _mkstemp_inner(dir, prefix, suffix, flags, output_type)
550 try:
551 file = _io.open(fd, mode, buffering=buffering,
~\Anaconda3\lib\tempfile.py in _mkstemp_inner(dir, pre, suf, flags, output_type)
258 file = _os.path.join(dir, pre + name + suf)
259 try:
--> 260 fd = _os.open(file, flags, 0o600)
261 except FileExistsError:
262 continue # try again
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\*****\\AppData\\Local\\Temp\\spark-e340269d-a29e-4b95-90d3-c424a04fcb0a\\pyspark-f7fce557-e11b-47c9-b7a5-81e72a360b36\\tmp7n0s97t2'
I was getting the same error from a Jupyter notebook with PySpark.
It worked after restarting the notebook kernel.
I am on a local Windows machine, trying to load an XML file with the following Python code, and I get the error below. Does anyone know how to resolve it?
This is the code:
# Remote file that is passed straight to the XML reader; the traceback that
# follows shows Spark rejecting the https scheme.
irs_url = "https://irs-form-990.s3.amazonaws.com/201611339349202661_public.xml"
xml_reader = sqlContext.read.format("xml").options(rowTag="IRS990EZ")
df1 = xml_reader.load(irs_url)
and this is the error
Py4JJavaError Traceback (most recent call last)
<ipython-input-7-4832eb48a4aa> in <module>()
----> 1 df1 = sqlContext.read.format("xml").options(rowTag="IRS990EZ").load("https://irs-form-990.s3.amazonaws.com/201611339349202661_public.xml")
C:\SPARK_HOME\spark-2.2.0-bin-hadoop2.7\python\pyspark\sql\readwriter.py in load(self, path, format, schema, **options)
157 self.options(**options)
158 if isinstance(path, basestring):
--> 159 return self._df(self._jreader.load(path))
160 elif path is not None:
161 if type(path) != list:
C:\SPARK_HOME\spark-2.2.0-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
C:\SPARK_HOME\spark-2.2.0-bin-hadoop2.7\python\pyspark\sql\utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
C:\SPARK_HOME\spark-2.2.0-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling o38.load.
: java.io.IOException: No FileSystem for scheme: https
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(FileInputFormat.java:500)
at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(FileInputFormat.java:469)
at org.apache.spark.SparkContext$$anonfun$newAPIHadoopFile$2.apply(SparkContext.scala:1160)
at org.apache.spark.SparkContext$$anonfun$newAPIHadoopFile$2.apply(SparkContext.scala:1148)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.SparkContext.withScope(SparkContext.scala:701)
at org.apache.spark.SparkContext.newAPIHadoopFile(SparkContext.scala:1148)
at com.databricks.spark.xml.util.XmlFile$.withCharset(XmlFile.scala:46)
at com.databricks.spark.xml.DefaultSource$$anonfun$createRelation$1.apply(DefaultSource.scala:62)
at com.databricks.spark.xml.DefaultSource$$anonfun$createRelation$1.apply(DefaultSource.scala:62)
at com.databricks.spark.xml.XmlRelation$$anonfun$1.apply(XmlRelation.scala:47)
at com.databricks.spark.xml.XmlRelation$$anonfun$1.apply(XmlRelation.scala:46)
at scala.Option.getOrElse(Option.scala:121)
at com.databricks.spark.xml.XmlRelation.<init>(XmlRelation.scala:45)
at com.databricks.spark.xml.DefaultSource.createRelation(DefaultSource.scala:65)
at com.databricks.spark.xml.DefaultSource.createRelation(DefaultSource.scala:43)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:306)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:156)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Unknown Source)
PySpark cannot load files over http or https directly. One of my colleagues found a workaround, so here is the solution.
Before creating the Spark context and SQL context, run these two lines of code:
import os

# Set before the Spark context is created: asks spark-submit to pull in the
# spark-xml package.
os.environ.update(
    PYSPARK_SUBMIT_ARGS='--packages com.databricks:spark-xml_2.11:0.4.1 pyspark-shell'
)
After creating the SparkContext and SQLContext with sc = pyspark.SparkContext.getOrCreate() and sqlContext = SQLContext(sc),
add the http or https URL to the context with sc.addFile(url), then read the downloaded copy:
# Read the file previously registered with sc.addFile(url): SparkFiles.get
# returns the local path under which Spark stored it.
xml_reader = sqlContext.read.format("xml").options(rowTag="anytaghere")
local_xml_path = pyspark.SparkFiles.get("*_public.xml")
Data_XMLFile = xml_reader.load(local_xml_path).coalesce(10).cache()
This solution worked for me.
The error message says it all: you cannot use the DataFrame reader's load to access files on the web (http or https). I suggest you first download the file locally.
See the pyspark.sql.DataFrameReader docs for more on the available sources (in general, local file system, HDFS, and databases via JDBC).
Irrelevantly to the error, notice that you seem to use the format part of the command incorrectly: assuming that you use the XML Data Source for Apache Spark package, the correct usage should be format('com.databricks.spark.xml') (see the example).
I made a similar but slightly different mistake: I forgot the "s3://" prefix in the file path. After adding the prefix to form "s3://path/to/object", the following code works:
# Configure the CSV reader in one call, then load from the fully qualified
# S3 path (including the "s3://" scheme).
csv_reader = spark.read.format("com.databricks.spark.csv").options(
    header="true",
    inferSchema="true",
    delimiter=",",
)
my_data = csv_reader.load("s3://path/to/object")
I had a similar issue when trying to load a CSV file into Spark.
We were able to load it by going through pandas: first we read the file into a pandas DataFrame, and then we created the Spark DataFrame from it.
from pyspark.sql import SparkSession
import pandas as pd
# Let pandas read the CSV, then hand the resulting frame to Spark.
session_builder = SparkSession.builder.appName('appName')
spark = session_builder.getOrCreate()
pdf = pd.read_csv('file patth with https')
sdf = spark.createDataFrame(pdf)