I'm running a duummy example to perform classification with PySpark.
I created an ETL pipeline in which labels are transformed to OneHotEncoding, but
PySpark throws:
IllegalArgumentException: 'requirement failed: Column label must be of type numeric but was actually of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'
Code for Sparse One-hot
from pyspark.ml.feature import StringIndexer, StandardScaler, OneHotEncoderEstimator, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import rand
df = spark.createDataFrame([
("Music", 3.45,1245),
("Sports", 4.49,3456),
("Music", 1.22, 323),
("Animals", 2.45,24)], ["category", "rating", "views"])
"""ETL Pipeline over
the whole dataset
"""
indexer = StringIndexer(inputCol="category", outputCol="class",handleInvalid="skip")
encoder = OneHotEncoderEstimator(inputCols=["class"],
outputCols=["label"])
encoder.setDropLast(False)
vectorizer = VectorAssembler(inputCols=["rating","views"],
outputCol="unscaled_features")
etl_pipeline = Pipeline(stages=[indexer,encoder,vectorizer])
etlModel = etl_pipeline.fit(df)
tr_df = etlModel.transform(df)
tr_df.show()
"""Training Pipeline
"""
train_data, test_data = tr_df.randomSplit([.8, .2],seed=23487)
scaler = StandardScaler(inputCol="unscaled_features", outputCol="features",
withStd=True, withMean=True)
# specify layers for the neural network:
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=1, seed=1234)
ml_pipeline = Pipeline(stages=[scaler, trainer])
mlModel = ml_pipeline.fit(train_data)
result = mlModel.transform(test_data)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
Out
+--------+------+-----+-----+-------------+-----------------+
|category|rating|views|class| label|unscaled_features|
+--------+------+-----+-----+-------------+-----------------+
| Music| 3.45| 1245| 0.0|(3,[0],[1.0])| [3.45,1245.0]|
| Sports| 4.49| 3456| 2.0|(3,[2],[1.0])| [4.49,3456.0]|
| Music| 1.22| 323| 0.0|(3,[0],[1.0])| [1.22,323.0]|
| Animals| 2.45| 24| 1.0|(3,[1],[1.0])| [2.45,24.0]|
+--------+------+-----+-----+-------------+-----------------+
IllegalArgumentException: 'requirement failed: Column label must be of type numeric but was actually of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'
The weird thing is that although I convert the SparseVecotr of one-hot-labels to a DenseVector, the error still remains. It seems that MultilayerPerceptronClassifier converts dense labels to sparse but it is not working properly...
Code for ETL with dense one-hot
from pyspark.ml.feature import StringIndexer, StandardScaler, OneHotEncoderEstimator, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import rand
df = spark.createDataFrame([
("Music", 3.45,1245),
("Sports", 4.49,3456),
("Music", 1.22, 323),
("Animals", 2.45,24)], ["category", "rating", "views"])
"""ETL Pipeline over
the whole dataset
"""
indexer = StringIndexer(inputCol="category", outputCol="class",handleInvalid="skip")
encoder = OneHotEncoderEstimator(inputCols=["class"],
outputCols=["label"])
encoder.setDropLast(False)
vectorizer = VectorAssembler(inputCols=["rating","views"],
outputCol="unscaled_features")
etl_pipeline = Pipeline(stages=[indexer,encoder,vectorizer])
etlModel = etl_pipeline.fit(df)
tr_df = etlModel.transform(df)
tr_df = tr_df.select("label", "unscaled_features")
rdd = tr_df.rdd.map(lambda x: Row(label=DenseVector(x[0].toArray()),unscaled_features=x[1])
if (len(x)>1 and hasattr(x[0], "toArray"))
else Row(label=None, unscaled_features=DenseVector([])))
tr_df = rdd.toDF()
tr_df.show()
"""Training Pipeline
"""
train_data, test_data = tr_df.randomSplit([.8, .2],seed=23487)
scaler = StandardScaler(inputCol="unscaled_features", outputCol="features",
withStd=True, withMean=True)
# specify layers for the neural network:
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=1, seed=1234)
ml_pipeline = Pipeline(stages=[scaler, trainer])
mlModel = ml_pipeline.fit(train_data)
result = mlModel.transform(test_data)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
Out
+-------------+-----------------+
| label|unscaled_features|
+-------------+-----------------+
|[1.0,0.0,0.0]| [3.45,1245.0]|
|[0.0,0.0,1.0]| [4.49,3456.0]|
|[1.0,0.0,0.0]| [1.22,323.0]|
|[0.0,1.0,0.0]| [2.45,24.0]|
+-------------+-----------------+
IllegalArgumentException: 'requirement failed: Column label must be of type numeric but was actually of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'
UPDATE 1: REMOVING ONE-HOT ENCODING FROM PIPELINE
Code
from pyspark.ml.feature import StringIndexer, StandardScaler, OneHotEncoderEstimator, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import rand
df = spark.createDataFrame([
("Music", 3.45,1245),
("Sports", 4.49,3456),
("Music", 1.22, 323),
("Animals", 2.45,24)], ["category", "rating", "views"])
"""ETL Pipeline over
the whole dataset
"""
indexer = StringIndexer(inputCol="category", outputCol="label",handleInvalid="skip")
# encoder = OneHotEncoderEstimator(inputCols=["class"],
# outputCols=["label"])
# encoder.setDropLast(False)
vectorizer = VectorAssembler(inputCols=["rating","views"],
outputCol="unscaled_features")
etl_pipeline = Pipeline(stages=[indexer,vectorizer])
etlModel = etl_pipeline.fit(df)
tr_df = etlModel.transform(df)
tr_df.show()
"""Training Pipeline
"""
train_data, test_data = tr_df.randomSplit([.8, .2],seed=23487)
scaler = StandardScaler(inputCol="unscaled_features", outputCol="features",
withStd=True, withMean=True)
# specify layers for the neural network:
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
ml_pipeline = Pipeline(stages=[scaler, trainer])
mlModel = ml_pipeline.fit(train_data)
result = mlModel.transform(test_data)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
OUT
+--------+------+-----+-----+-----------------+
|category|rating|views|label|unscaled_features|
+--------+------+-----+-----+-----------------+
| Music| 3.45| 1245| 0.0| [3.45,1245.0]|
| Sports| 4.49| 3456| 2.0| [4.49,3456.0]|
| Music| 1.22| 323| 0.0| [1.22,323.0]|
| Animals| 2.45| 24| 1.0| [2.45,24.0]|
+--------+------+-----+-----+-----------------+
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-9-58967f1d5bce> in <module>
60
61 ml_pipeline = Pipeline(stages=[scaler, trainer])
---> 62 mlModel = ml_pipeline.fit(train_data)
63 result = mlModel.transform(test_data)
64 predictionAndLabels = result.select("prediction", "label")
~/.local/lib/python3.5/site-packages/pyspark/ml/base.py in fit(self, dataset, params)
130 return self.copy(params)._fit(dataset)
131 else:
--> 132 return self._fit(dataset)
133 else:
134 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
~/.local/lib/python3.5/site-packages/pyspark/ml/pipeline.py in _fit(self, dataset)
107 dataset = stage.transform(dataset)
108 else: # must be an Estimator
--> 109 model = stage.fit(dataset)
110 transformers.append(model)
111 if i < indexOfLastEstimator:
~/.local/lib/python3.5/site-packages/pyspark/ml/base.py in fit(self, dataset, params)
130 return self.copy(params)._fit(dataset)
131 else:
--> 132 return self._fit(dataset)
133 else:
134 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
~/.local/lib/python3.5/site-packages/pyspark/ml/wrapper.py in _fit(self, dataset)
293
294 def _fit(self, dataset):
--> 295 java_model = self._fit_java(dataset)
296 model = self._create_model(java_model)
297 return self._copyValues(model)
~/.local/lib/python3.5/site-packages/pyspark/ml/wrapper.py in _fit_java(self, dataset)
290 """
291 self._transfer_params_to_java()
--> 292 return self._java_obj.fit(dataset._jdf)
293
294 def _fit(self, dataset):
~/.local/lib/python3.5/site-packages/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
~/.local/lib/python3.5/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
~/.local/lib/python3.5/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o870.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 26.0 failed 1 times, most recent failure: Lost task 0.0 in stage 26.0 (TID 26, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException
at java.lang.System.arraycopy(Native Method)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:665)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:664)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:664)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:660)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:222)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD.count(RDD.scala:1168)
at org.apache.spark.mllib.optimization.LBFGS$.runLBFGS(LBFGS.scala:195)
at org.apache.spark.mllib.optimization.LBFGS.optimize(LBFGS.scala:142)
at org.apache.spark.ml.ann.FeedForwardTrainer.train(Layer.scala:854)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$train$1.apply(MultilayerPerceptronClassifier.scala:249)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier$$anonfun$train$1.apply(MultilayerPerceptronClassifier.scala:205)
at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:205)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:114)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArrayIndexOutOfBoundsException
at java.lang.System.arraycopy(Native Method)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:665)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:664)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:664)
at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:660)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:222)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
PySPark version
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.4.4
/_/
Using Scala version 2.11.12, OpenJDK 64-Bit Server VM, 1.8.0_222
Java Version
openjdk version "1.8.0_222"
OpenJDK Runtime Environment (build 1.8.0_222-8u222-b10-1ubuntu1~16.04.1-b10)
OpenJDK 64-Bit Server VM (build 25.222-b10, mixed mode)
You should run the features via a VectorAssembler but you don't need to do one-hot-encoding on the label column. You should just pass the labels column as numeric classes as they are:
+------+-----------------+
| label|unscaled_features|
+------+-----------------+
| 0| [3.45,1245.0]|
| 2| [4.49,3456.0]|
| 0| [1.22,323.0]|
| 1| [2.45,24.0]|
+------+-----------------+
This should solve your error.
Related
I am new to pyspark and just started with pyspark by following this
I have made the changes to the code basis on my local system. Below is the code how I am setting up pyspark
# Import PySpark related modules
import findspark
findspark.init()
import pyspark
import os
import sys
os.environ['PYSPARK_PYTHON'] = 'python'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions
from pyspark.sql.functions import lit, desc, col, size, array_contains\
, isnan, udf, hour, array_min, array_max, countDistinct
from pyspark.sql.types import *
MAX_MEMORY = '15G'
# Initialize a spark session.
conf = pyspark.SparkConf().setMaster("local[*]") \
.set('spark.executor.heartbeatInterval', 10000) \
.set('spark.network.timeout', 10000) \
.set("spark.core.connection.ack.wait.timeout", "20000") \
.set("spark.executor.memory", MAX_MEMORY) \
.set("spark.driver.memory", MAX_MEMORY) \
.set("spark.driver.maxResultSize", MAX_MEMORY) \
.set("spark.sql.execution.arrow.pyspark.enabled", "true") \
.set("spark.memory.offHeap.enabled","true") \
.set("spark.memory.offHeap.size","10g") \
.set("spark.network.timeout", "300000")
def init_spark():
spark = SparkSession \
.builder \
.appName("Pyspark guide") \
.config(conf=conf) \
.getOrCreate()
return spark
spark = init_spark()
filename_data = 'endomondoHR.json'
# Load the main data set into pyspark data frame
df = spark.read.json(filename_data, mode="DROPMALFORMED")
print('Data frame type: ' + str(type(df)))
While executing the code in cell
# Helper function to calculate statistic(s) of the column name from a tuple x of (sport, records list of the column)
#, the stats to calculate is also given as an input
def calculate_stats(x,column_name, stat_list):
sport, records_list = x
stat_dict = {'sport': sport}
if 'min' in stat_list:
min_stat = min(records_list)
stat_dict.update({'min ' + column_name : min_stat})
if 'max' in stat_list:
max_stat = max(records_list)
stat_dict.update({'max ' + column_name: max_stat})
if 'mean' in stat_list:
average_stat = stats.mean(records_list)
stat_dict.update({'mean ' + column_name: average_stat})
if 'stdev' in stat_list:
std_stat = stats.stdev(records_list)
stat_dict.update({'stdev ' + column_name: std_stat})
if '50th percentile' in stat_list:
median_stat = stats.median(records_list)
stat_dict.update({'50th percentile ' + column_name: median_stat})
if '25th percentile' in stat_list:
percentile_25th_stat = np.percentile(records_list, 25)
stat_dict.update({'25th percentile ' + column_name: percentile_25th_stat})
if '75th percentile' in stat_list:
percentile_75th_stat = np.percentile(records_list, 75)
stat_dict.update({'75th percentile ' + column_name: percentile_75th_stat})
if '95th percentile' in stat_list:
percentile_95th_stat = np.percentile(records_list, 95)
stat_dict.update({'95th percentile ' + column_name: percentile_95th_stat})
return stat_dict
def to_list(a):
return a
def extend(a, b):
a.extend(b)
return a
def retrieve_array_column_stat_df(df, column_name, stat_list):
# Convert sport & "column_name" to RDD to easily calculate the statistics of intervals by sports
sport_record_rdd = df.select('sport', column_name).rdd \
.map(tuple).combineByKey(to_list, extend, extend).persist()
# Calculate statistics of the input column by calling calculate_stats function defined above
record_statistic_df = pd.DataFrame(sport_record_rdd.map(
lambda x: calculate_stats(x, column_name,stat_list)).collect()
)
# Set proper dataframe column orders
columns_order = ['sport'] + [stat + ' ' + column_name for stat in stat_list]
# Re order columns
return record_statistic_df[columns_order]
stat_list = ['min', '25th percentile', 'mean', '50th percentile',
'75th percentile', '95th percentile', 'max', 'stdev']
interval_statistic_df = retrieve_array_column_stat_df(df, column_name='interval', stat_list=stat_list)
print('\nLet\'s look at statistic for interval, in seconds (by sport):' )
interval_statistic_df
I am getting below error
Py4JJavaError Traceback (most recent call last)
c:\Users\shiv\Desktop\test\pyspark_analysis_bigdata.ipynb Cell 45 in <cell line: 55>()
51 return record_statistic_df[columns_order]
53 stat_list = ['min', '25th percentile', 'mean', '50th percentile',
54 '75th percentile', '95th percentile', 'max', 'stdev']
---> 55 interval_statistic_df = retrieve_array_column_stat_df(df, column_name='interval', stat_list=stat_list)
56 print('\nLet\'s look at statistic for interval, in seconds (by sport):' )
57 interval_statistic_df
c:\Users\shiv\Desktop\test\pyspark_analysis_bigdata.ipynb Cell 45 in retrieve_array_column_stat_df(df, column_name, stat_list)
41 sport_record_rdd = df.select('sport', column_name).rdd \
42 .map(tuple).combineByKey(to_list, extend, extend).persist()
44 # Calculate statistics of the input column by calling calculate_stats function defined above
---> 45 record_statistic_df = pd.DataFrame(sport_record_rdd.map(
46 lambda x: calculate_stats(x, column_name,stat_list)).collect()
47 )
48 # Set proper dataframe column orders
49 columns_order = ['sport'] + [stat + ' ' + column_name for stat in stat_list]
File c:\Users\shiv\miniconda3\lib\site-packages\pyspark\rdd.py:1197, in RDD.collect(self)
1195 with SCCallSiteSync(self.context):
1196 assert self.ctx._jvm is not None
-> 1197 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
1198 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
File c:\Users\shiv\miniconda3\lib\site-packages\py4j\java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1318 proto.END_COMMAND_PART
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
File c:\Users\shiv\miniconda3\lib\site-packages\pyspark\sql\utils.py:190, in capture_sql_exception.<locals>.deco(*a, **kw)
188 def deco(*a: Any, **kw: Any) -> Any:
189 try:
--> 190 return f(*a, **kw)
191 except Py4JJavaError as e:
192 converted = convert_exception(e.java_exception)
File c:\Users\shiv\miniconda3\lib\site-packages\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 155.0 failed 1 times, most recent failure: Lost task 5.0 in stage 155.0 (TID 1775) (shiv executor driver): java.net.SocketException: Connection reset by peer
at java.base/sun.nio.ch.NioSocketImpl.implWrite(NioSocketImpl.java:413)
at java.base/sun.nio.ch.NioSocketImpl.write(NioSocketImpl.java:433)
at java.base/sun.nio.ch.NioSocketImpl$2.write(NioSocketImpl.java:812)
at java.base/java.net.Socket$SocketOutputStream.write(Socket.java:1120)
at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:125)
at java.base/java.io.BufferedOutputStream.implWrite(BufferedOutputStream.java:215)
at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:199)
at java.base/java.io.DataOutputStream.write(DataOutputStream.java:112)
at java.base/java.io.FilterOutputStream.write(FilterOutputStream.java:108)
at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:295)
at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:307)
at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:307)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:438)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2066)
at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:272)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:104)
at java.base/java.lang.reflect.Method.invoke(Method.java:578)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: java.net.SocketException: Connection reset by peer
at java.base/sun.nio.ch.NioSocketImpl.implWrite(NioSocketImpl.java:413)
at java.base/sun.nio.ch.NioSocketImpl.write(NioSocketImpl.java:433)
at java.base/sun.nio.ch.NioSocketImpl$2.write(NioSocketImpl.java:812)
at java.base/java.net.Socket$SocketOutputStream.write(Socket.java:1120)
at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:125)
at java.base/java.io.BufferedOutputStream.implWrite(BufferedOutputStream.java:215)
at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:199)
at java.base/java.io.DataOutputStream.write(DataOutputStream.java:112)
at java.base/java.io.FilterOutputStream.write(FilterOutputStream.java:108)
at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:295)
at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:307)
at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:307)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:438)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2066)
at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:272)
I am not able to understand the error, can anybody help here?
Important info -
pyspark and spark version - 3.3.1
python version - 3.9.13
Windows 11, 16GB, Intel Core i5-11400H, NVIDIA-GeForce GTX 1650 with 4GB GDDR6 dedicated VRAM
Java version - 19.0.2
I tried allocating maximum memory to driver and executors, increased heartbeat interval and network timeout and then later used findspark to intialize spark. The above code set up is the latest and still getting the same error.
Update
I tried running the code with the chunk of data and made the changes, the code ran successfully:
df_min = df.limit(1000)
stat_list = ['min', '25th percentile', 'mean', '50th percentile',
'75th percentile', '95th percentile', 'max', 'stdev']
interval_statistic_df = retrieve_array_column_stat_df(df_min, column_name='interval', stat_list=stat_list)
print('\nLet\'s look at statistic for interval, in seconds (by sport):' )
interval_statistic_df
Therefore, the size of the data is the issue. Can anyone help me with the optimized code to omit the use of rdd and effectively use map() and collect()
I am reading some data (8 GB) data from multiple files, filter the data doing some null check and performing some upliftings (operations) on columns like cleaning column value for this I have 6 to 7 functions (custom functionality, cannot use spark functions) that are registered as UDFs. Then I write the final result to tables and CSV files, now on 'dataframe.write.saveAsTable()' and on writing 'CSV' I get EOF exception Seek past end of file. This exception does not occur everytime, like if I run 20 times it may occur once. I am unable to find its reason and cause because it is not reproduce-able, (Getting this in both in scala and pyspark), will appreciate any help or hint. Looking forward. Thanks
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-4146672194126555> in <module>()
331 saveMergedLogs(
332 dataframeLogs
333 );
<command-3860558353011740> in saveMergedLogs(dataframeLogs)
45 # ---------------------------------------------------------------------------------------------------------------------------------
46 spark.sql("DROP TABLE IF EXISTS dbo.UsageLogs");
---> 47 dataframeLogs.write.saveAsTable("dbo.UsageLogs")
/databricks/spark/python/pyspark/sql/readwriter.py in saveAsTable(self, name, format, mode, partitionBy, **options)
773 if format is not None:
774 self.format(format)
--> 775 self._jwrite.saveAsTable(name)
776
777 #since(1.4)
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o3615.saveAsTable.
: org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:196)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:192)
at org.apache.spark.sql.execution.datasources.DataSource.writeAndRead(DataSource.scala:553)
at org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.saveDataIntoTable(createDataSourceTables.scala:216)
at org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.run(createDataSourceTables.scala:175)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:110)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:108)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:128)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$5.apply(SparkPlan.scala:183)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:180)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:131)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:114)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:114)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:690)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:690)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:99)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:228)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:85)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:158)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:690)
at org.apache.spark.sql.DataFrameWriter.createTable(DataFrameWriter.scala:487)
at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:466)
at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:414)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 9 in stage 58.0 failed 4 times, most recent failure: Lost task 9.3 in stage 58.0 (TID 8630, 10.139.64.7, executor 0): java.io.EOFException: Cannot seek past end of file
at com.microsoft.azure.datalake.store.ADLFileInputStream.seek(ADLFileInputStream.java:262)
at com.databricks.adl.AdlFsInputStream.seek(AdlFsInputStream.java:64)
at org.apache.hadoop.fs.FSDataInputStream.seek(FSDataInputStream.java:62)
at com.databricks.spark.metrics.FSInputStreamWithMetrics.seek(FileSystemWithMetrics.scala:207)
at org.apache.hadoop.fs.FSDataInputStream.seek(FSDataInputStream.java:62)
at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.initialize(LineRecordReader.java:107)
at org.apache.spark.sql.execution.datasources.HadoopFileLinesReader.<init>(HadoopFileLinesReader.scala:65)
at org.apache.spark.sql.execution.datasources.HadoopFileLinesReader.<init>(HadoopFileLinesReader.scala:47)
at org.apache.spark.sql.execution.datasources.csv.TextInputCSVDataSource$.readFile(CSVDataSource.scala:201)
at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat$$anonfun$buildReader$2.apply(CSVFileFormat.scala:147)
at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat$$anonfun$buildReader$2.apply(CSVFileFormat.scala:140)
at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:147)
at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:134)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.getNext(FileScanRDD.scala:226)
at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:196)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:338)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:196)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage443.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11$$anon$1.hasNext(WholeStageCodegenExec.scala:622)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:139)
at org.apache.spark.scheduler.Task.run(Task.scala:112)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1432)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:503)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2100)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2088)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2087)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2087)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1076)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1076)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1076)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2319)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2267)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2255)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:873)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2252)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:166)
... 36 more
Caused by: java.io.EOFException: Cannot seek past end of file
at com.microsoft.azure.datalake.store.ADLFileInputStream.seek(ADLFileInputStream.java:262)
at com.databricks.adl.AdlFsInputStream.seek(AdlFsInputStream.java:64)
at org.apache.hadoop.fs.FSDataInputStream.seek(FSDataInputStream.java:62)
at com.databricks.spark.metrics.FSInputStreamWithMetrics.seek(FileSystemWithMetrics.scala:207)
at org.apache.hadoop.fs.FSDataInputStream.seek(FSDataInputStream.java:62)
at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.initialize(LineRecordReader.java:107)
at org.apache.spark.sql.execution.datasources.HadoopFileLinesReader.<init>(HadoopFileLinesReader.scala:65)
at org.apache.spark.sql.execution.datasources.HadoopFileLinesReader.<init>(HadoopFileLinesReader.scala:47)
at org.apache.spark.sql.execution.datasources.csv.TextInputCSVDataSource$.readFile(CSVDataSource.scala:201)
at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat$$anonfun$buildReader$2.apply(CSVFileFormat.scala:147)
at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat$$anonfun$buildReader$2.apply(CSVFileFormat.scala:140)
at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:147)
at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:134)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.getNext(FileScanRDD.scala:226)
at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:196)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:338)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:196)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage443.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11$$anon$1.hasNext(WholeStageCodegenExec.scala:622)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:139)
at org.apache.spark.scheduler.Task.run(Task.scala:112)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1432)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:503)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
So I worked a lot on it, but there is no solution for it. The way I managed was I specified a retry in ADF databricks activity. So when ever this issue occurs it can rerun the notebook in databricks and it passes. Still this is not a perfect solution but it works.
I met same issue when reading data from multiple files. The files have same columns but the order of the columns are not exactly the same. Since in pandas we can set sort=True when append DataFrame so that same columns in different position won't get confused, I suspected it is the order of the columns that cause the issue. I'm not 100% sure but it worked after I re-write the files with same order of columns.
I am using local windows and trying to load the XML file with the following code on python, and i am having this error, do anyone knows how to resolve it,
this is the code
df1 = sqlContext.read.format("xml").options(rowTag="IRS990EZ").load("https://irs-form-990.s3.amazonaws.com/201611339349202661_public.xml")
and this is the error
Py4JJavaError Traceback (most recent call last)
<ipython-input-7-4832eb48a4aa> in <module>()
----> 1 df1 = sqlContext.read.format("xml").options(rowTag="IRS990EZ").load("https://irs-form-990.s3.amazonaws.com/201611339349202661_public.xml")
C:\SPARK_HOME\spark-2.2.0-bin-hadoop2.7\python\pyspark\sql\readwriter.py in load(self, path, format, schema, **options)
157 self.options(**options)
158 if isinstance(path, basestring):
--> 159 return self._df(self._jreader.load(path))
160 elif path is not None:
161 if type(path) != list:
C:\SPARK_HOME\spark-2.2.0-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
C:\SPARK_HOME\spark-2.2.0-bin-hadoop2.7\python\pyspark\sql\utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
C:\SPARK_HOME\spark-2.2.0-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling o38.load.
: java.io.IOException: No FileSystem for scheme: https
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(FileInputFormat.java:500)
at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(FileInputFormat.java:469)
at org.apache.spark.SparkContext$$anonfun$newAPIHadoopFile$2.apply(SparkContext.scala:1160)
at org.apache.spark.SparkContext$$anonfun$newAPIHadoopFile$2.apply(SparkContext.scala:1148)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.SparkContext.withScope(SparkContext.scala:701)
at org.apache.spark.SparkContext.newAPIHadoopFile(SparkContext.scala:1148)
at com.databricks.spark.xml.util.XmlFile$.withCharset(XmlFile.scala:46)
at com.databricks.spark.xml.DefaultSource$$anonfun$createRelation$1.apply(DefaultSource.scala:62)
at com.databricks.spark.xml.DefaultSource$$anonfun$createRelation$1.apply(DefaultSource.scala:62)
at com.databricks.spark.xml.XmlRelation$$anonfun$1.apply(XmlRelation.scala:47)
at com.databricks.spark.xml.XmlRelation$$anonfun$1.apply(XmlRelation.scala:46)
at scala.Option.getOrElse(Option.scala:121)
at com.databricks.spark.xml.XmlRelation.<init>(XmlRelation.scala:45)
at com.databricks.spark.xml.DefaultSource.createRelation(DefaultSource.scala:65)
at com.databricks.spark.xml.DefaultSource.createRelation(DefaultSource.scala:43)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:306)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:156)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Unknown Source)
Somehow pyspark is unable to load the http or https, one of my colleague found the answer for this so here is the solution,
before creating the spark context and sql context we need to load this two line of code
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.4.1 pyspark-shell'
after creating the sparkcontext and sqlcontext from sc = pyspark.SparkContext.getOrCreate and sqlContext = SQLContext(sc)
add the http or https url into the sc by using sc.addFile(url)
Data_XMLFile = sqlContext.read.format("xml").options(rowTag="anytaghere").load(pyspark.SparkFiles.get("*_public.xml")).coalesce(10).cache()
this solution worked for me
The error message says it all: you cannot use dataframe reader & load to access files on the web (http or htpps). I suggest you first download the file locally.
See the pyspark.sql.DataFrameReader docs for more on the available sources (in general, local file system, HDFS, and databases via JDBC).
Irrelevantly to the error, notice that you seem to use the format part of the command incorrectly: assuming that you use the XML Data Source for Apache Spark package, the correct usage should be format('com.databricks.spark.xml') (see the example).
I've commit a similar but slightly different error: forgot the "s3://" prefix to file path. After adding this prefix to form "s3://path/to/object" the following code works:
my_data = spark.read.format("com.databricks.spark.csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.option("delimiter", ",")\
.load("s3://path/to/object")
I was also having a similar issue with the CSV file basically we were trying to load a CSV file into spark.
We were able to load the file successfully by making use of the pandas' library, first we loaded the file into the pandas data frame, and then by using the pandas we were able to load the data into the spark data frame.
from pyspark.sql import SparkSession
import pandas as pd
spark = SparkSession.builder.appName('appName').getOrCreate()
pdf = pd.read_csv('file patth with https')
sdf = spark.createDataFrame(pdf)
I am trying to run the following code which leverages graphframes, and I am getting an error now which, to the best of my knowledge and after some hours of Googling, I cannot resolve. It seems like a class cannot be loaded, but I don't really know what else I should be doing.
Can someone please have another look at the code and error below? I have followed the instructions from here, and in case you want to quickly give it a try, you can find my dataset here.
"""
Program: RUNNING GRAPH ANALYTICS WITH SPARK GRAPH-FRAMES:
Author: Dr. C. Hadjinikolis
Date: 14/09/2016
Description: This is the application's core module from where everything is executed.
The module is responsible for:
1. Loading Spark
2. Loading GraphFrames
3. Running analytics by leveraging other modules in the package.
"""
# IMPORT OTHER LIBS -------------------------------------------------------------------------------#
import os
import sys
import pandas as pd
# IMPORT SPARK ------------------------------------------------------------------------------------#
# Path to Spark source folder
USER_FILE_PATH = "/Users/christoshadjinikolis"
SPARK_PATH = "/PycharmProjects/GenesAssociation"
SPARK_FILE = "/spark-2.0.0-bin-hadoop2.7"
SPARK_HOME = USER_FILE_PATH + SPARK_PATH + SPARK_FILE
os.environ['SPARK_HOME'] = SPARK_HOME
# Append pySpark to Python Path
sys.path.append(SPARK_HOME + "/python")
sys.path.append(SPARK_HOME + "/python" + "/lib/py4j-0.10.1-src.zip")
try:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
from graphframes import *
except ImportError as ex:
print "Can not import Spark Modules", ex
sys.exit(1)
# GLOBAL VARIABLES --------------------------------------------------------------------------------#
# Configure spark properties
CONF = (SparkConf()
.setMaster("local")
.setAppName("My app")
.set("spark.executor.memory", "10g")
.set("spark.executor.instances", "4"))
# Instantiate SparkContext object
SC = SparkContext(conf=CONF)
# Instantiate SQL_SparkContext object
SQL_CONTEXT = SQLContext(SC)
# MAIN CODE ---------------------------------------------------------------------------------------#
if __name__ == "__main__":
# Main Path to CSV files
DATA_PATH = '/PycharmProjects/GenesAssociation/data/'
FILE_NAME = 'gene_gene_associations_50k.csv'
# LOAD DATA CSV USING PANDAS -----------------------------------------------------------------#
print "STEP 1: Loading Gene Nodes -------------------------------------------------------------"
# Read csv file and load as df
GENES = pd.read_csv(USER_FILE_PATH + DATA_PATH + FILE_NAME,
usecols=['OFFICIAL_SYMBOL_A'],
low_memory=True,
iterator=True,
chunksize=1000)
# Concatenate chunks into list & convert to dataFrame
GENES_DF = pd.DataFrame(pd.concat(list(GENES), ignore_index=True))
# Remove duplicates
GENES_DF_CLEAN = GENES_DF.drop_duplicates(keep='first')
# Name Columns
GENES_DF_CLEAN.columns = ['id']
# Output dataFrame
print GENES_DF_CLEAN
# Create vertices
VERTICES = SQL_CONTEXT.createDataFrame(GENES_DF_CLEAN)
# Show some vertices
print VERTICES.take(5)
print "STEP 2: Loading Gene Edges -------------------------------------------------------------"
# Read csv file and load as df
EDGES = pd.read_csv(USER_FILE_PATH + DATA_PATH + FILE_NAME,
usecols=['OFFICIAL_SYMBOL_A', 'OFFICIAL_SYMBOL_B', 'EXPERIMENTAL_SYSTEM'],
low_memory=True,
iterator=True,
chunksize=1000)
# Concatenate chunks into list & convert to dataFrame
EDGES_DF = pd.DataFrame(pd.concat(list(EDGES), ignore_index=True))
# Name Columns
EDGES_DF.columns = ["src", "dst", "rel_type"]
# Output dataFrame
print EDGES_DF
# Create vertices
EDGES = SQL_CONTEXT.createDataFrame(EDGES_DF)
# Show some edges
print EDGES.take(5)
print "STEP 3: Generating the Graph -----------------------------------------------------------"
GENES_GRAPH = GraphFrame(VERTICES, EDGES)
print "STEP 4: Running Various Basic Analytics ------------------------------------------------"
print "Vertex in-Degree -----------------------------------------------------------------------"
GENES_GRAPH.inDegrees.sort('inDegree', ascending=False).show()
print "Vertex out-Degree ----------------------------------------------------------------------"
GENES_GRAPH.outDegrees.sort('outDegree', ascending=False).show()
print "Vertex degree --------------------------------------------------------------------------"
GENES_GRAPH.degrees.sort('degree', ascending=False).show()
print "Triangle Count -------------------------------------------------------------------------"
RESULTS = GENES_GRAPH.triangleCount()
RESULTS.select("id", "count").show()
print "Label Propagation ----------------------------------------------------------------------"
GENES_GRAPH.labelPropagation(maxIter=10).show() # Convergence is not guaranteed
print "PageRank -------------------------------------------------------------------------------"
GENES_GRAPH.pageRank(resetProbability=0.15, tol=0.01)\
.vertices.sort('pagerank', ascending=False).show()
print "STEP 5: Find Shortest Paths w.r.t. Landmarks -------------------------------------------"
# Shortest paths
SHORTEST_PATH = GENES_GRAPH.shortestPaths(landmarks=["ARF3", "MAP2K4"])
SHORTEST_PATH.select("id", "distances").show()
print "STEP 6: Save Vertices and Edges --------------------------------------------------------"
# Save vertices and edges as Parquet to some location.
# Note: You can't overwrite existing vertices and edges directories.
GENES_GRAPH.vertices.write.parquet("vertices")
GENES_GRAPH.edges.write.parquet("edges")
print "STEP 7: Load "
# Load the vertices and edges back.
SAME_VERTICES = GENES_GRAPH.read.parquet("vertices")
SAME_EDGES = GENES_GRAPH.read.parquet("edges")
# Create an identical GraphFrame.
SAME_GENES_GRAPH = GF.GraphFrame(SAME_VERTICES, SAME_EDGES)
# END OF FILE -------------------------------------------------------------------------------------#
This is the output:
Ivy Default Cache set to: /Users/username/.ivy2/cache
The jars for the packages stored in: /Users/username/.ivy2/jars
:: loading settings :: url = jar:file:/Users/username/PycharmProjects/GenesAssociation/spark-2.0.0-bin-hadoop2.7/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent;1.0
confs: [default]
found graphframes#graphframes;0.2.0-spark2.0-s_2.11 in list
found com.typesafe.scala-logging#scala-logging-api_2.11;2.1.2 in list
found com.typesafe.scala-logging#scala-logging-slf4j_2.11;2.1.2 in list
found org.scala-lang#scala-reflect;2.11.0 in list
[2.11.0] org.scala-lang#scala-reflect;2.11.0
found org.slf4j#slf4j-api;1.7.7 in list
:: resolution report :: resolve 391ms :: artifacts dl 14ms
:: modules in use:
com.typesafe.scala-logging#scala-logging-api_2.11;2.1.2 from list in [default]
com.typesafe.scala-logging#scala-logging-slf4j_2.11;2.1.2 from list in [default]
graphframes#graphframes;0.2.0-spark2.0-s_2.11 from list in [default]
org.scala-lang#scala-reflect;2.11.0 from list in [default]
org.slf4j#slf4j-api;1.7.7 from list in [default]
---------------------------------------------------------------------
| | modules || artifacts |
| conf | number| search|dwnlded|evicted|| number|dwnlded|
---------------------------------------------------------------------
| default | 5 | 0 | 0 | 0 || 5 | 0 |
---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent
confs: [default]
0 artifacts copied, 5 already retrieved (0kB/11ms)
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel).
16/09/20 11:00:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
OK1
Traceback (most recent call last):
File "/Users/username/PycharmProjects/GenesAssociation/main.py", line 32, in <module>
g = GraphFrame(v, e)
File "/Users/tjhunter/work/graphframes/python/graphframes/graphframe.py", line 62, in __init__
File "/Users/tjhunter/work/graphframes/python/graphframes/graphframe.py", line 34, in _java_api
File "/Users/christoshadjinikolis/PycharmProjects/GenesAssociation/spark-2.0.0-bin-hadoop2.7/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py", line 933, in __call__
File "/Users/christoshadjinikolis/PycharmProjects/GenesAssociation/spark-2.0.0-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/Users/username/PycharmProjects/GenesAssociation/spark-2.0.0-bin-hadoop2.7/python/lib" \
"/py4j-0.10.1-src.zip/py4j/protocol.py", line 312, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o53.newInstance.
: java.lang.NoClassDefFoundError: com/typesafe/scalalogging/slf4j/LazyLogging
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:763)
at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
at java.net.URLClassLoader.defineClass(URLClassLoader.java:467)
at java.net.URLClassLoader.access$100(URLClassLoader.java:73)
at java.net.URLClassLoader$1.run(URLClassLoader.java:368)
at java.net.URLClassLoader$1.run(URLClassLoader.java:362)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:361)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:763)
at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
at java.net.URLClassLoader.defineClass(URLClassLoader.java:467)
at java.net.URLClassLoader.access$100(URLClassLoader.java:73)
at java.net.URLClassLoader$1.run(URLClassLoader.java:368)
at java.net.URLClassLoader$1.run(URLClassLoader.java:362)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:361)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at org.graphframes.GraphFrame$.<init>(GraphFrame.scala:677)
at org.graphframes.GraphFrame$.<clinit>(GraphFrame.scala)
at org.graphframes.GraphFramePythonAPI.<init>(GraphFramePythonAPI.scala:11)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at java.lang.Class.newInstance(Class.java:442)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:211)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: com.typesafe.scalalogging.slf4j.LazyLogging
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 43 more
Process finished with exit code 1
I've had the same issue with spark/scala I've resolved it by adding the fowlowing jar to classpath:
spark-shell --jars scala-logging_2.12-3.5.0.jar
You can find the jar here:
https://mvnrepository.com/artifact/com.typesafe.scala-logging/scala-logging_2.12/3.5.0
Source: https://github.com/graphframes/graphframes/issues/113
I was attempting a simple rightOuterJoin, in Pyspark. The datasets which I am trying to join are the following
temp1.take(5)
Out[138]:
[u'tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8',
u'TA-00001,C-1622,2,C-1629,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA',
u'TA-00002,C-1312,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA',
u'TA-00003,C-1312,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA',
u'TA-00004,C-1312,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA']
In [139]:
temp2.take(5)
Out[139]:
[u'tube_assembly_id,spec1,spec2,spec3,spec4,spec5,spec6,spec7,spec8,spec9,spec10',
u'TA-00001,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA',
u'TA-00002,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA',
u'TA-00003,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA',
u'TA-00004,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA']
The join command is as follows
In [140]:
temp4 = temp1.rightOuterJoin(temp2)
temp4
Out[140]:
PythonRDD[191] at RDD at PythonRDD.scala:43
However, When I attempt to do any of the operations like
temp4.take(4) or temp4.count() I get the long error as listed below
Py4JJavaError Traceback (most recent call last)
<ipython-input-141-3372dfa2c550> in <module>()
----> 1 temp4.take(5)
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/rdd.py in take(self, num)
1222
1223 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-> 1224 res = self.context.runJob(self, takeUpToNumLeft, p, True)
1225
1226 items += res
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/context.py in runJob(self, rdd, partitionFunc, partitions, allowLocal)
840 mappedRDD = rdd.mapPartitions(partitionFunc)
841 port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions,
--> 842 allowLocal)
843 return list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
844
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 74.0 failed 1 times, most recent failure: Lost task 0.0 in stage 74.0 (TID 78, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/worker.py", line 101, in main
process()
File "/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/worker.py", line 96, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/serializers.py", line 236, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/rdd.py", line 1806, in <lambda>
map_values_fn = lambda (k, v): (k, f(v))
ValueError: too many values to unpack
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:135)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:176)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:94)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:87)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.api.python.PythonRDD$WriterThread$$anonfun$run$1.apply(PythonRDD.scala:243)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1618)
at org.apache.spark.api.python.PythonRDD$WriterThread.run(PythonRDD.scala:205)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1204)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1193)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1192)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
Appreciate help on this. I am new to Pyspark