How to fix this error in PySpark with the select method?

I'm following an example from the internet, but it gives me an error that I can't solve. The code is the following:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark import SparkContext
from IPython.display import display, HTML
from pyspark.sql import SQLContext
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.sql import Column as c
from pyspark.sql.functions import array, udf, lit, col as c
import pyspark.sql.functions as f
pd.set_option('max_colwidth',100)
plt.style.use('seaborn-paper')
try:
    sc = SparkContext('local[*]')
except:
    sc = SparkContext.getOrCreate('local[*]')
sqlContext = SQLContext(sc)
# Reading the dataframes
whiteWinnePath = 'winequality-White.csv'
redWinnePath = 'winequality-Red.csv'
"""
df_p = pd.read_csv(whiteWinnePath, sep =";")
print("Mostrar el data frame")
display(df_p)
"""
whiteWinneDF = sqlContext.createDataFrame(pd.read_csv(whiteWinnePath, sep = ";")).withColumn('type',lit(0))
redWinneDF = sqlContext.createDataFrame(pd.read_csv(redWinnePath, sep = ";")).withColumn('type',lit(1))
whiteWinneDF.printSchema()
# Splitting into training and test sets
whiteTrainingDF, whiteTestingDF = whiteWinneDF.randomSplit([0.7,0.3])
redTrainingDF, redTestingDF = redWinneDF.randomSplit([0.7,0.3])
trainingDF = whiteTrainingDF.union(redTrainingDF)
testingDF = whiteTestingDF.union(redTestingDF)
# Preparing the dataframe for PCA
idCol = ['type']
features = [column for column in redWinneDF.columns if column not in idCol]
p = len(features)
meanVector = trainingDF.describe().where(c('summary')==lit('mean')).toPandas()[[0][1:p+1]].values
"""
meanVector2= meanVector1.toPandas()
#print("="*50)
#print(type(meanVector2))
#meanVector= meanVector2.as_matrix()#[0][1:p+1]
meanVector= meanVector2[[0][1:p+1]].values
"""
labeledVectorsDF = trainingDF.select(features+['type']).rdd\
    .map(lambda x:(Vectors.dense(x[0:p]-Vectors.dense(meanVector)),x[p]))\
    .toDF(['features','type'])
labeledVectorsDF.limit(5).toPandas()
When I run the code, this is the error I get:
File "D:\UGR\Investigación\Cosas de Reinaldo\mis script\Seleccion_caracteristicas\PCA_wine_quality.py", line 78, in <module>
labeledVectorsDF = trainingDF.select(features+['type']).rdd\
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 61, in toDF
return sparkSession.createDataFrame(self, schema, sampleRatio)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 605, in createDataFrame
return self._create_dataframe(data, schema, samplingRatio, verifySchema)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 628, in _create_dataframe
rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 425, in _createFromRDD
struct = self._inferSchema(rdd, samplingRatio, names=schema)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 396, in _inferSchema
first = rdd.first()
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\rdd.py", line 1464, in first
rs = self.take(1)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\rdd.py", line 1446, in take
res = self.context.runJob(self, takeUpToNumLeft, p)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\context.py", line 1118, in runJob
sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
File "C:\spark-3.0.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py", line 1304, in __call__
return_value = get_return_value(
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\utils.py", line 128, in deco
return f(*a, **kw)
File "C:\spark-3.0.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\protocol.py", line 326, in get_return_value
raise Py4JJavaError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 17, LAPTOP-3G67L0HS, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
I don't know whether the error comes from the select method or from the rdd conversion. I verified that features contains all of the dataframe's column names except type.
How could I solve it?
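For what it's worth, here is a minimal sketch of one way the mean-centering step could be rewritten; it is not a verified fix, and it reuses c, lit, features, p, Vectors and trainingDF from the snippet above. The selector [[0][1:p+1]] evaluates to an empty list, and describe() returns its statistics as strings, so both the column indexing and the dtype likely need changing before the subtraction inside map can work:
import numpy as np

# Sketch: take the "mean" row of describe(), keep only the feature columns,
# and cast the string values to float so they can be subtracted.
summaryPDF = trainingDF.describe().where(c('summary') == lit('mean')).toPandas()
meanVector = summaryPDF[features].astype(float).values[0]   # shape (p,)

labeledVectorsDF = trainingDF.select(features + ['type']).rdd\
    .map(lambda x: (Vectors.dense(np.array(x[0:p], dtype=float) - meanVector), x[p]))\
    .toDF(['features', 'type'])
labeledVectorsDF.limit(5).toPandas()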

Related

Convert pointcloud csv to hdf5 to train on PointCNN network

I am trying to train my point cloud data on PointCNN so I need to convert my dataset to hdf5 as used in PointCNN. PointCNN used the modelnet40_ply_hdf5_2048 dataset.
I have tried converting my custom dataset but I am having issues with the label.
I tried this to get the label/shape_names
import os

shape_ids = {}
shape_ids = [line.rstrip() for line in open(os.path.join(PATH, 'filelist1.txt'))]
shape_names = ['_'.join(x.split('_')[0:-1]) for x in shape_ids]
datapath = [(shape_names[i], os.path.join(PATH, shape_names[i], shape_ids[i]))
            for i in range(len(shape_ids))]
Convert to h5py file
import numpy as np
from tqdm import tqdm
import h5py
filenames = [line.rstrip() for line in open(os.path.join(PATH))]
f = h5py.File("filename", 'w')
data = np.zeros((len(filenames), 1024, 3))
for i in range(0, len(datapath)):
    fn = datapath[i]
    cls = classes[datapath[i][0]]
    label = np.array([cls]).astype(np.int32)
    csvreader = np.genfromtxt("data1/" + filenames[i] + ".csv", delimiter=",").astype(np.float32)
    for j in range(0, 1024):
        data[i, j] = [csvreader[j][0], csvreader[j][1], csvreader[j][2]]
    label
dset1 = f.create_dataset("data", data=data, compression="gzip", compression_opts=4)
dset2 = f.create_dataset("label", data=label, compression="gzip", compression_opts=1)
f.close()
It did convert successfully, but when I tried to train on PointCNN I got:
PointCNN training
------Building model-------
------Successfully Built model-------
Traceback (most recent call last):
File "train_pytorch.py", line 174, in <module>
current_data, current_label, _ = provider.shuffle_data(current_data, np.squeeze(current_label))
File "provider.py", line 28, in shuffle_data
idx = np.arange(len(labels))
TypeError: len() of unsized object
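One hedged guess, given that np.squeeze(current_label) fails inside len(): the conversion loop above only keeps the last per-sample label (a one-element array), so the label dataset ends up effectively scalar. Here is a sketch of the same loop collecting one label per sample instead; it reuses classes, datapath, filenames, data and f from the snippet above:
labels = np.zeros((len(datapath),), dtype=np.int32)   # one label per sample
for i in range(len(datapath)):
    labels[i] = classes[datapath[i][0]]
    csvreader = np.genfromtxt("data1/" + filenames[i] + ".csv", delimiter=",").astype(np.float32)
    for j in range(1024):
        data[i, j] = [csvreader[j][0], csvreader[j][1], csvreader[j][2]]

dset1 = f.create_dataset("data", data=data, compression="gzip", compression_opts=4)
dset2 = f.create_dataset("label", data=labels, compression="gzip", compression_opts=1)
f.close()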

How to create an array in Pyspark with normal distribution with scipy.stats with UDF (or any other way)?

I am currently working on migrating Python scripts to PySpark. I have this Python script that works fine:
### PYTHON
import pandas as pd
import scipy.stats as st
def fnNormalDistribution(mean, std, n):
    box = list(eval('st.norm')(*[mean,std]).rvs(n))
    return box
df = pd.DataFrame([[18.2500365,2.7105814157004193],
[9.833353,2.121324586200329],
[41.55563866666666,7.118716782527054]],
columns = ['mean','std'])
df
| mean | std |
|------------|----------|
| 18.250037| 2.710581|
| 9.833353| 2.121325|
| 41.555639| 7.118717|
n = 100 #Example
df['random_values'] = df.apply(lambda row: fnNormalDistribution(row["mean"], row["std"], n), axis=1)
df
| mean | std | random_values |
|------------|----------|--------------------------------------------------|
| 18.250037| 2.710581|[17.752189993958638, 18.883038367927465, 16.39...]|
| 9.833353| 2.121325|[10.31806454283759, 8.732261487201594, 11.6782...]|
| 41.555639| 7.118717|[38.17469739795093, 43.16514466083524, 49.2668...]|
but when I try to migrate it to PySpark I get the following error:
### PYSPARK
import pyspark.sql.functions as f
import pyspark.sql.types as t

def fnNormalDistribution(mean, std, n):
    box = list(eval('st.norm')(*[mean,std]).rvs(n))
    return box

udf_fnNomalDistribution = f.udf(fnNormalDistribution, t.ArrayType(t.DoubleType()))
columns = ['mean','std']
data = [(18.2500365,2.7105814157004193),
(9.833353,2.121324586200329),
(41.55563866666666,7.118716782527054)]
df = spark.createDataFrame(data=data,schema=columns)
df.show()
| mean | std |
|------------|----------|
| 18.250037| 2.710581|
| 9.833353| 2.121325|
| 41.555639| 7.118717|
df = df.withColumn('random_values', udf_fnNomalDistribution('mean','std',f.lit(n)))
df.show()
PythonException:
An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 604, in main
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 596, in process
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\serializers.py", line 211, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\serializers.py", line 132, in dump_stream
for obj in iterator:
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\serializers.py", line 200, in _batched
for item in iterator:
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 450, in mapper
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 450, in <genexpr>
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 85, in <lambda>
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\util.py", line 73, in wrapper
return f(*args, **kwargs)
File "C:\Users\Ubits\AppData\Local\Temp/ipykernel_10604/2493247477.py", line 2, in fnNormalDistribution
File "<string>", line 1, in <module>
NameError: name 'st' is not defined
Is there some way to use the same function in PySpark, or to get the random_values column in another way? I googled it without success.
Thanks
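For reference, here is a minimal sketch of the direct fix the answer below alludes to: moving the scipy import inside the function so the executors can resolve st themselves. It reuses the Spark df built above and assumes n = 100:
import pyspark.sql.functions as f
import pyspark.sql.types as t

def fnNormalDistribution(mean, std, n):
    # imported on the workers, so "st" is always defined there
    import scipy.stats as st
    return st.norm(mean, std).rvs(n).tolist()

udf_fnNormalDistribution = f.udf(fnNormalDistribution, t.ArrayType(t.DoubleType()))
n = 100
df = df.withColumn('random_values', udf_fnNormalDistribution('mean', 'std', f.lit(n)))
df.show()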
I was trying this and it can really be fixed by moving st inside fnNormalDistribution like samkart suggested.
I will just leave my example here as Fugue may provide a more readable way to bring this to Spark, especially around handling schema. Full code below.
import pandas as pd
def fnNormalDistribution(mean, std, n):
    import scipy.stats as st
    box = (eval('st.norm')(*[mean,std]).rvs(n)).tolist()
    return box
df = pd.DataFrame([[18.2500365,2.7105814157004193],
[9.833353,2.121324586200329],
[41.55563866666666,7.118716782527054]],
columns = ['mean','std'])
n = 100 #Example
def helper(df: pd.DataFrame) -> pd.DataFrame:
    df['random_values'] = df.apply(lambda row: fnNormalDistribution(row["mean"], row["std"], n), axis=1)
    return df
from fugue import transform
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
# transform can take either pandas of spark DataFrame as input
# If engine is none, it will run on pandas
sdf = transform(df,
                helper,
                schema="*, random_values:[float]",
                engine=spark)
sdf.show()

show() raises an error after applying a pandas UDF to a dataframe

I am having problems making this trial code work. The final line df.select(plus_one(col("x"))).show() fails. I also tried saving the result in a variable (vardf = df.select(plus_one(col("x"))), followed by vardf.show()), and that fails too.
import pyspark
import pandas as pd
from typing import Iterator
from pyspark.sql.functions import col, pandas_udf, struct
spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
pdf = pd.DataFrame([1, 2, 3], columns=["x"])
df = spark.createDataFrame(pdf)
df.show()
@pandas_udf("long")
def plus_one(batch_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    for s in batch_iter:
        yield s + 1
df.select(plus_one(col("x"))).show()
Error message (parts of it):
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\spyder_kernels\py3compat.py", line 356, in compat_exec
exec(code, globals, locals)
File "c:\bigdatasetup\dataanalysiswithpythonandpyspark-trunk\code\ch09\untitled0.py", line 24, in
df.select(plus_one(col("x"))).show()
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\pyspark\sql\dataframe.py", line 494, in show
print(self._jdf.showString(n, 20, vertical))
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\py4j\java_gateway.py", line 1321, in call
return_value = get_return_value(
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\pyspark\sql\utils.py", line 117, in deco
raise converted from None
PythonException:
An exception was thrown from the Python worker. Please see the stack trace below.
...
...
ERROR 2022-04-21 09:48:24,423 7608 org.apache.spark.scheduler.TaskSetManager [task-result-getter-0] Task 0 in stage 3.0 failed 1 times; aborting job
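The worker stack trace is truncated, so this is only a hedged guess: on Windows setups like this one (Spyder/Anaconda), an opaque Python-worker crash on the first pandas UDF is often a driver/executor Python mismatch or a missing pyarrow. A sketch of pinning both interpreters before the session is created:
import os
import sys

# Assumption: the crash comes from the executors using a different interpreter
# than the driver. Point both at the interpreter running this script.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
# pandas UDFs also need pyarrow installed in this same environment:
#   pip install pyarrow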

org.jpmml.sparkml.PMMLBuilder does not exist in the JVM

Thanks a lot for any help.
My goal is to save a trained model in XML format, and I'm really struggling with this error and these warnings:
---------------------------------------------------------------------------
Exception in thread "Thread-4" java.lang.ExceptionInInitializerError
at java.base/java.lang.Class.forName0(Native Method)
at java.base/java.lang.Class.forName(Class.java:398)
at py4j.reflection.CurrentThreadClassLoadingStrategy.classForName(CurrentThreadClassLoadingStrategy.java:40)
at py4j.reflection.ReflectionUtil.classForName(ReflectionUtil.java:51)
at py4j.reflection.TypeUtil.forName(TypeUtil.java:243)
at py4j.commands.ReflectionCommand.getUnknownMember(ReflectionCommand.java:175)
at py4j.commands.ReflectionCommand.execute(ReflectionCommand.java:87)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.IllegalArgumentException: Expected Apache Spark ML version 3.1, got version 3.2 (3.2.0)
at org.jpmml.sparkml.ConverterFactory.checkVersion(ConverterFactory.java:114)
at org.jpmml.sparkml.PMMLBuilder.<init>(PMMLBuilder.java:481)
at org.jpmml.sparkml.PMMLBuilder.<clinit>(PMMLBuilder.java:545)
... 10 more
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/home/mbg/.local/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 480, in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/mbg/.local/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038, in send_command
response = connection.send_command(command)
File "/home/mbg/.local/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 503, in send_command
raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
/tmp/ipykernel_20251/3496938591.py in <module>
----> 1 pmmlBuilder = PMMLBuilder(sc, df_train, rfModel)
~/.local/lib/python3.8/site-packages/pyspark2pmml/__init__.py in __init__(self, sc, df, pipelineModel)
10 javaSchema = javaDf.schema.__call__()
11 javaPipelineModel = pipelineModel._to_java()
---> 12 javaPmmlBuilderClass = sc._jvm.org.jpmml.sparkml.PMMLBuilder
13 if(not isinstance(javaPmmlBuilderClass, JavaClass)):
14 raise RuntimeError("JPMML-SparkML not found on classpath")
~/.local/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py in __getattr__(self, name)
1647 answer[proto.CLASS_FQN_START:], self._gateway_client)
1648 else:
-> 1649 raise Py4JError("{0} does not exist in the JVM".format(new_fqn))
1650
1651
Py4JError: org.jpmml.sparkml.PMMLBuilder does not exist in the JVM
My code is the following:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
conf = SparkConf().setAppName("SparkApp_ETL_ML").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)
spark = SparkSession.builder.getOrCreate()
import pandas as pd
df=pd.read_parquet("https://s3.eu-de.cloud-object-storage.appdomain.cloud/cloud-object-storage-yy-cos-standard-js4/data.parquet")
sdf = spark.createDataFrame(df)
from pyspark.sql.types import DoubleType
sdf = sdf.withColumn("x", sdf.x.cast(DoubleType()))
sdf = sdf.withColumn("y", sdf.y.cast(DoubleType()))
sdf = sdf.withColumn("z", sdf.z.cast(DoubleType()))
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
input_columns = ["x", "y", "z"] # input columns to consider
train, test = sdf.randomSplit([0.8, 0.2], seed=1)
indexer = StringIndexer(inputCol="class", outputCol="label")
vectorAssembler = VectorAssembler(inputCols=input_columns, outputCol="features")
normalizer = MinMaxScaler(inputCol="features", outputCol="features_norm")
pipeline = Pipeline(stages=[indexer, vectorAssembler, normalizer])
binEval = MulticlassClassificationEvaluator().setMetricName("accuracy").setPredictionCol("prediction"). \
    setLabelCol("label")
df_train = pipeline.fit(train).transform(train)
df_test = pipeline.fit(test).transform(test)
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol='features_norm', labelCol='label', maxDepth=20, numTrees=7, seed=1)
rfModel = rf.fit(df_train)
from pyspark2pmml import PMMLBuilder
model_target = "HMP_frModel.xml"
pmmlBuilder = PMMLBuilder(sc, df_train, rfModel)
Everything works well until the last line of the code.
I tried all the solutions I found on the internet, but unfortunately without success.
I am working with a Jupyter notebook (not Anaconda), installed PySpark with pip, and added these variables to my .bashrc:
export PATH=$PATH:~/.local/bin
export SPARK_HOME=~/.local/lib/python3.8/site-packages/pyspark
export PYTHONPATH=$SPARK_HOME/python/lib/py4j-0.10.9.2-src.zip
export PATH=$SPARK_HOME/bin:$SPARK_HOME/python:$PATH
export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS='notebook'
I also downloaded the jar files jpmml-sparkml-executable-1.7.2.jar and jpmml-sparkml-executable-1.8.0.jar and put them in the directory ~/.local/lib/python3.8/site-packages/pyspark/jars.
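The nested exception ("Expected Apache Spark ML version 3.1, got version 3.2 (3.2.0)") suggests the 1.7.2 jar, built against Spark ML 3.1, is the one that gets loaded. Here is a hedged sketch of keeping only a matching build on the classpath: remove the mismatched jar from pyspark/jars and pass the one that targets your Spark release explicitly (the path below is hypothetical, and it assumes the 1.8.x line is the one matching Spark 3.2):
from pyspark.sql import SparkSession

# Sketch: point spark.jars at a single JPMML-SparkML build that matches Spark 3.2,
# instead of leaving two competing versions in site-packages/pyspark/jars.
spark = (SparkSession.builder
         .appName("SparkApp_ETL_ML")
         .master("local[*]")
         .config("spark.jars", "/home/mbg/jars/jpmml-sparkml-executable-1.8.0.jar")  # hypothetical path
         .getOrCreate())
sc = spark.sparkContext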

Cannot convert type <class 'pyspark.mllib.regression.LabeledPoint'> into Vector while using StreamingLinearRegressionWithSGD

I am using Spark Streaming to train a simple StreamingLinearRegressionWithSGD model with PySpark. First, I simulate a random input stream for the model (a fixed vector w dotted with a random vector x, plus Gaussian noise), with code like below:
import sys
import socket
import time
import random
import numpy as np
from functools import reduce
def return_random_guass_seq(_type=list, len_term=10, mu=0, sigma=1, _scale=1):
    return _type([_scale*random.gauss(mu, sigma) for i in range(len_term)])
# set w vector
len_term = 100
w = return_random_guass_seq(_type=np.array, _scale=10, len_term=len_term)
socket_server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
socket_server.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
socket_server.bind((sys.argv[1], int(sys.argv[2])))
socket_server.listen(5)
while True:
    conn, (ip, port) = socket_server.accept()
    print(ip, port)
    while True:
        random_sleep_time = random.random()
        noisy = random.gauss(0, 1)*5
        x = return_random_guass_seq(_type=np.array, len_term=len_term)
        y = x.dot(w) + noisy
        time.sleep(random_sleep_time)
        s = str(y) + reduce(lambda i, j: str(i)+','+str(j), x, '')
        print(s)
        conn.send(bytes(s, 'utf-8'))
        conn.send(bytes('\n', 'utf-8'))
socket_server.close()
Then I initialized the StreamingLinearRegressionWithSGD model and tried to call trainOn on the labeled stream:
import sys
from pyspark.streaming import StreamingContext
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD
import numpy as np
#initialize sc,then initialize ssc
sc = SparkContext(appName="StreamingModel")
#just print ssc result and error
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, batchDuration=5)# 5 seconds window
stream = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
#create model
len_term = 100
Model = StreamingLinearRegressionWithSGD(stepSize=0.1, numIterations=50)
Model.setInitialWeights(np.array([0.0, ]*len_term))
# create actions
labeledStream = stream.map(
    lambda x: tuple(x.split(','))
).map(
    lambda x: (float(x[0]), np.array([float(i) for i in x[1:]]))
).map(
    lambda x: LabeledPoint(label=x[0], features=x[1])
)
labeledStream.pprint()
Model.trainOn(labeledStream)
Model.predictOn(labeledStream).pprint()
# start sc_stream
ssc.start()
ssc.awaitTermination()
And I ran into the following error:
TypeError: Cannot convert type <class 'pyspark.mllib.regression.LabeledPoint'> into Vector.
I can't figure out why this error happened. Am I misunderstanding something obvious, or is this just an oversight on my part?
Full trace:
Traceback (most recent call last):
File "/home/zh/spark_hadoop/hadoop/tmp/nm-local-dir/usercache/zh/appcache/application_1512094491627_0072/container_1512094491627_0072_01_000003/pyspark.zip/pyspark/worker.py", line 177, in main
process()
File "/home/zh/spark_hadoop/hadoop/tmp/nm-local-dir/usercache/zh/appcache/application_1512094491627_0072/container_1512094491627_0072_01_000003/pyspark.zip/pyspark/worker.py", line 172, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/zh/spark_hadoop/hadoop/tmp/nm-local-dir/usercache/zh/appcache/application_1512094491627_0072/container_1512094491627_0072_01_000003/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/home/zh/spark_hadoop/spark/python/lib/pyspark.zip/pyspark/mllib/regression.py", line 746, in <lambda>
File "/home/zh/spark_hadoop/hadoop/tmp/nm-local-dir/usercache/zh/appcache/application_1512094491627_0072/container_1512094491627_0072_01_000003/pyspark.zip/pyspark/mllib/regression.py", line 121, in predict
x = _convert_to_vector(x)
File "/home/zh/spark_hadoop/hadoop/tmp/nm-local-dir/usercache/zh/appcache/application_1512094491627_0072/container_1512094491627_0072_01_000003/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 83, in _convert_to_vector
raise TypeError("Cannot convert type %s into Vector" % type(l))
TypeError: Cannot convert type <class 'pyspark.mllib.regression.LabeledPoint'> into Vector
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more
StreamingLinearAlgorithm.predictOn requires DStream[Vector]. You have to map:
Model.predictOn(labeledStream.map(lambda x: x.features)).pprint()
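In the question's script that would look roughly like this (training still consumes the LabeledPoint stream; only prediction gets the mapped feature vectors):
# trainOn expects a DStream of LabeledPoint, predictOn a DStream of feature vectors
Model.trainOn(labeledStream)
Model.predictOn(labeledStream.map(lambda lp: lp.features)).pprint()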