How to create an array of normally distributed values in PySpark with scipy.stats in a UDF (or any other way)?

I am currently working on migrating Python scripts to PySpark. I have this Python script that works fine:
### PYTHON
import pandas as pd
import scipy.stats as st
def fnNormalDistribution(mean, std, n):
    box = list(eval('st.norm')(*[mean, std]).rvs(n))
    return box
df = pd.DataFrame([[18.2500365, 2.7105814157004193],
                   [9.833353, 2.121324586200329],
                   [41.55563866666666, 7.118716782527054]],
                  columns=['mean', 'std'])
df
| mean | std |
|------------|----------|
| 18.250037| 2.710581|
| 9.833353| 2.121325|
| 41.555639| 7.118717|
n = 100 #Example
df['random_values'] = df.apply(lambda row: fnNormalDistribution(row["mean"], row["std"], n), axis=1)
df
| mean | std | random_values |
|------------|----------|--------------------------------------------------|
| 18.250037| 2.710581|[17.752189993958638, 18.883038367927465, 16.39...]|
| 9.833353| 2.121325|[10.31806454283759, 8.732261487201594, 11.6782...]|
| 41.555639| 7.118717|[38.17469739795093, 43.16514466083524, 49.2668...]|
But when I try to migrate it to PySpark, I get the following error:
### PYSPARK
import pyspark.sql.functions as f
import pyspark.sql.types as t

def fnNormalDistribution(mean, std, n):
    box = list(eval('st.norm')(*[mean, std]).rvs(n))
    return box

udf_fnNomalDistribution = f.udf(fnNormalDistribution, t.ArrayType(t.DoubleType()))
columns = ['mean', 'std']
data = [(18.2500365, 2.7105814157004193),
        (9.833353, 2.121324586200329),
        (41.55563866666666, 7.118716782527054)]
df = spark.createDataFrame(data=data, schema=columns)
df.show()
| mean | std |
|------------|----------|
| 18.250037| 2.710581|
| 9.833353| 2.121325|
| 41.555639| 7.118717|
df = df.withColumn('random_values', udf_fnNomalDistribution('mean','std',f.lit(n)))
df.show()
PythonException:
An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 604, in main
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 596, in process
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\serializers.py", line 211, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\serializers.py", line 132, in dump_stream
for obj in iterator:
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\serializers.py", line 200, in _batched
for item in iterator:
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 450, in mapper
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 450, in <genexpr>
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 85, in <lambda>
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\util.py", line 73, in wrapper
return f(*args, **kwargs)
File "C:\Users\Ubits\AppData\Local\Temp/ipykernel_10604/2493247477.py", line 2, in fnNormalDistribution
File "<string>", line 1, in <module>
NameError: name 'st' is not defined
Is there some way to use the same function in PySpark, or to get the random_values column in another way? I googled it with no luck.
Thanks

I was trying this and it can indeed be fixed by moving the scipy.stats import inside fnNormalDistribution, as samkart suggested.
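For reference, a minimal sketch of that UDF fix (assuming the same f and t aliases for pyspark.sql.functions and pyspark.sql.types used in the question) would look like this:

import pyspark.sql.functions as f
import pyspark.sql.types as t

def fnNormalDistribution(mean, std, n):
    # importing inside the function makes scipy.stats available on the executors
    import scipy.stats as st
    return st.norm(mean, std).rvs(n).tolist()

udf_fnNormalDistribution = f.udf(fnNormalDistribution, t.ArrayType(t.DoubleType()))
df = df.withColumn('random_values', udf_fnNormalDistribution('mean', 'std', f.lit(n)))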
I will just leave my example here as Fugue may provide a more readable way to bring this to Spark, especially around handling schema. Full code below.
import pandas as pd

def fnNormalDistribution(mean, std, n):
    import scipy.stats as st
    box = (eval('st.norm')(*[mean, std]).rvs(n)).tolist()
    return box
df = pd.DataFrame([[18.2500365, 2.7105814157004193],
                   [9.833353, 2.121324586200329],
                   [41.55563866666666, 7.118716782527054]],
                  columns=['mean', 'std'])
n = 100 #Example
def helper(df: pd.DataFrame) -> pd.DataFrame:
    df['random_values'] = df.apply(lambda row: fnNormalDistribution(row["mean"], row["std"], n), axis=1)
    return df
from fugue import transform
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
# transform can take either a pandas or a Spark DataFrame as input
# If engine is None, it will run on pandas
sdf = transform(df,
                helper,
                schema="*, random_values:[float]",
                engine=spark)
sdf.show()

Related

Convert pointcloud csv to hdf5 to train on PointCNN network

I am trying to train my point cloud data on PointCNN, so I need to convert my dataset to hdf5 in the format PointCNN uses (the modelnet40_ply_hdf5_2048 dataset).
I have tried converting my custom dataset, but I am having issues with the labels.
I tried this to get the labels/shape names:
import os

shape_ids = {}
shape_ids = [line.rstrip() for line in open(os.path.join(PATH, 'filelist1.txt'))]
shape_names = ['_'.join(x.split('_')[0:-1]) for x in shape_ids]
datapath = [(shape_names[i], os.path.join(PATH, shape_names[i], shape_ids[i]))
            for i in range(len(shape_ids))]
Converting to an h5py file:
import numpy as np
from tqdm import tqdm
import h5py

filenames = [line.rstrip() for line in open(os.path.join(PATH))]
f = h5py.File("filename", 'w')
data = np.zeros((len(filenames), 1024, 3))
for i in range(0, len(datapath)):
    fn = datapath[i]
    cls = classes[datapath[i][0]]
    label = np.array([cls]).astype(np.int32)
    csvreader = np.genfromtxt("data1/" + filenames[i] + ".csv", delimiter=",").astype(np.float32)
    for j in range(0, 1024):
        data[i, j] = [csvreader[j][0], csvreader[j][1], csvreader[j][2]]
    label
dset1 = f.create_dataset("data", data=data, compression="gzip", compression_opts=4)
dset2 = f.create_dataset("label", data=label, compression="gzip", compression_opts=1)
f.close()
It did convert successfully, but when I tried to train on PointCNN:
PointCNN training
------Building model-------
------Successfully Built model-------
Traceback (most recent call last):
File "train_pytorch.py", line 174, in <module>
current_data, current_label, _ = provider.shuffle_data(current_data, np.squeeze(current_label))
File "provider.py", line 28, in shuffle_data
idx = np.arange(len(labels))
TypeError: len() of unsized object
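For what it's worth, one likely source of the "len() of unsized object" error in a setup like this is that label only ever holds the last sample's class (an array of shape (1,)), so np.squeeze turns it into a 0-d array that has no length. A hedged sketch of the conversion loop that instead collects one label per sample (reusing the datapath, classes, and filenames variables from the question; the output filename is a placeholder) could look like this:

import numpy as np
import h5py

n_samples = len(datapath)
data = np.zeros((n_samples, 1024, 3), dtype=np.float32)
labels = np.zeros((n_samples,), dtype=np.int32)  # one label per sample

for i in range(n_samples):
    labels[i] = classes[datapath[i][0]]
    points = np.genfromtxt("data1/" + filenames[i] + ".csv", delimiter=",").astype(np.float32)
    data[i] = points[:1024, :3]  # keep the first 1024 xyz points

with h5py.File("filename.h5", "w") as fh:
    fh.create_dataset("data", data=data, compression="gzip", compression_opts=4)
    fh.create_dataset("label", data=labels, compression="gzip", compression_opts=1)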

show() raises an error after applying a pandas UDF to a DataFrame

I am having problems making this trial code work. The final line df.select(plus_one(col("x"))).show() doesn't work; I also tried saving the result in a variable (vardf = df.select(plus_one(col("x")))) and calling vardf.show(), which fails too.
import pyspark
import pandas as pd
from typing import Iterator
from pyspark.sql.functions import col, pandas_udf, struct
spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
pdf = pd.DataFrame([1, 2, 3], columns=["x"])
df = spark.createDataFrame(pdf)
df.show()
@pandas_udf("long")
def plus_one(batch_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    for s in batch_iter:
        yield s + 1

df.select(plus_one(col("x"))).show()
Error message (parts of it):
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\spyder_kernels\py3compat.py", line 356, in compat_exec
exec(code, globals, locals)
File "c:\bigdatasetup\dataanalysiswithpythonandpyspark-trunk\code\ch09\untitled0.py", line 24, in
df.select(plus_one(col("x"))).show()
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\pyspark\sql\dataframe.py", line 494, in show
print(self._jdf.showString(n, 20, vertical))
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\py4j\java_gateway.py", line 1321, in call
return_value = get_return_value(
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\pyspark\sql\utils.py", line 117, in deco
raise converted from None
PythonException:
An exception was thrown from the Python worker. Please see the stack trace below.
...
...
ERROR 2022-04-21 09:48:24,423 7608 org.apache.spark.scheduler.TaskSetManager [task-result-getter-0] Task 0 in stage 3.0 failed 1 times; aborting job

How to fix this error in PySpark with the select method?

I'm following an example from the internet, but it gives me an error that I can't solve. The code is the following:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark import SparkContext
from IPython.display import display, HTML
from pyspark.sql import SQLContext
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.sql import Column as c
from pyspark.sql.functions import array, udf, lit, col as c
import pyspark.sql.functions as f
pd.set_option('max_colwidth',100)
plt.style.use('seaborn-paper')
try:
    sc = SparkContext('local[*]')
except:
    sc = SparkContext.getOrCreate('local[*]')
sqlContext = SQLContext(sc)
# Reading the dataframes
whiteWinnePath = 'winequality-White.csv'
redWinnePath = 'winequality-Red.csv'
"""
df_p = pd.read_csv(whiteWinnePath, sep=";")
print("Show the data frame")
display(df_p)
"""
whiteWinneDF = sqlContext.createDataFrame(pd.read_csv(whiteWinnePath, sep=";")).withColumn('type', lit(0))
redWinneDF = sqlContext.createDataFrame(pd.read_csv(redWinnePath, sep=";")).withColumn('type', lit(1))
whiteWinneDF.printSchema()
# Splitting into training and test sets
whiteTrainingDF, whiteTestingDF = whiteWinneDF.randomSplit([0.7,0.3])
redTrainingDF, redTestingDF = redWinneDF.randomSplit([0.7,0.3])
trainingDF = whiteTrainingDF.union(redTrainingDF)
testingDF = whiteTestingDF.union(redTestingDF)
# Preparing the dataframe for PCA
idCol = ['type']
features = [column for column in redWinneDF.columns if column not in idCol]
p = len(features)
meanVector = trainingDF.describe().where(c('summary')==lit('mean')).toPandas()[[0][1:p+1]].values
"""
meanVector2= meanVector1.toPandas()
#print("="*50)
#print(type(meanVector2))
#meanVector= meanVector2.as_matrix()#[0][1:p+1]
meanVector= meanVector2[[0][1:p+1]].values
"""
labeledVectorsDF = trainingDF.select(features + ['type']).rdd\
    .map(lambda x: (Vectors.dense(x[0:p] - Vectors.dense(meanVector)), x[p]))\
    .toDF(['features', 'type'])
labeledVectorsDF.limit(5).toPandas()
When I run the code, this is the error I get:
File "D:\UGR\Investigación\Cosas de Reinaldo\mis script\Seleccion_caracteristicas\PCA_wine_quality.py", line 78, in <module>
labeledVectorsDF = trainingDF.select(features+['type']).rdd\
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 61, in toDF
return sparkSession.createDataFrame(self, schema, sampleRatio)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 605, in createDataFrame
return self._create_dataframe(data, schema, samplingRatio, verifySchema)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 628, in _create_dataframe
rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 425, in _createFromRDD
struct = self._inferSchema(rdd, samplingRatio, names=schema)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 396, in _inferSchema
first = rdd.first()
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\rdd.py", line 1464, in first
rs = self.take(1)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\rdd.py", line 1446, in take
res = self.context.runJob(self, takeUpToNumLeft, p)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\context.py", line 1118, in runJob
sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
File "C:\spark-3.0.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py", line 1304, in __call__
return_value = get_return_value(
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\utils.py", line 128, in deco
return f(*a, **kw)
File "C:\spark-3.0.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\protocol.py", line 326, in get_return_value
raise Py4JJavaError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 17, LAPTOP-3G67L0HS, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
I don't know if the error is in the select method or in the rdd conversion. I verified that features contains all the column names of the dataframe except type.
How could I solve it?
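As a side note (not a verified fix, just a sketch of one possible culprit): Spark's describe() returns its statistics as strings, and the indexing [[0][1:p+1]] evaluates to an empty column selection, so meanVector ends up empty and only fails later inside the lambda on the executors. One way to pull the per-feature means out as plain floats, reusing the trainingDF, features, and p names from the code above, might be:

import numpy as np

mean_row = trainingDF.describe().where(c('summary') == lit('mean')).toPandas()
meanVector = np.array([float(mean_row[feat].iloc[0]) for feat in features])

labeledVectorsDF = trainingDF.select(features + ['type']).rdd \
    .map(lambda x: (Vectors.dense(np.array(x[0:p], dtype=float) - meanVector), x[p])) \
    .toDF(['features', 'type'])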

org.jpmml.sparkml.PMMLBuilder does not exist in the JVM

Thanks a lot for any help.
My goal is to save a trained model in XML format, and I'm really struggling with this error and these warnings:
---------------------------------------------------------------------------
Exception in thread "Thread-4" java.lang.ExceptionInInitializerError
at java.base/java.lang.Class.forName0(Native Method)
at java.base/java.lang.Class.forName(Class.java:398)
at py4j.reflection.CurrentThreadClassLoadingStrategy.classForName(CurrentThreadClassLoadingStrategy.java:40)
at py4j.reflection.ReflectionUtil.classForName(ReflectionUtil.java:51)
at py4j.reflection.TypeUtil.forName(TypeUtil.java:243)
at py4j.commands.ReflectionCommand.getUnknownMember(ReflectionCommand.java:175)
at py4j.commands.ReflectionCommand.execute(ReflectionCommand.java:87)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.IllegalArgumentException: Expected Apache Spark ML version 3.1, got version 3.2 (3.2.0)
at org.jpmml.sparkml.ConverterFactory.checkVersion(ConverterFactory.java:114)
at org.jpmml.sparkml.PMMLBuilder.init(PMMLBuilder.java:481)
at org.jpmml.sparkml.PMMLBuilder.<clinit>(PMMLBuilder.java:545)
... 10 more
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/home/mbg/.local/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 480, in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/mbg/.local/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038, in send_command
response = connection.send_command(command)
File "/home/mbg/.local/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 503, in send_command
raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
/tmp/ipykernel_20251/3496938591.py in <module>
----> 1 pmmlBuilder = PMMLBuilder(sc, df_train, rfModel)
~/.local/lib/python3.8/site-packages/pyspark2pmml/__init__.py in __init__(self, sc, df, pipelineModel)
10 javaSchema = javaDf.schema.__call__()
11 javaPipelineModel = pipelineModel._to_java()
---> 12 javaPmmlBuilderClass = sc._jvm.org.jpmml.sparkml.PMMLBuilder
13 if(not isinstance(javaPmmlBuilderClass, JavaClass)):
14 raise RuntimeError("JPMML-SparkML not found on classpath")
~/.local/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py in __getattr__(self, name)
1647 answer[proto.CLASS_FQN_START:], self._gateway_client)
1648 else:
-> 1649 raise Py4JError("{0} does not exist in the JVM".format(new_fqn))
1650
1651
Py4JError: org.jpmml.sparkml.PMMLBuilder does not exist in the JVM
My code is the following:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
conf = SparkConf().setAppName("SparkApp_ETL_ML").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)
spark = SparkSession.builder.getOrCreate()
import pandas as pd
df=pd.read_parquet("https://s3.eu-de.cloud-object-storage.appdomain.cloud/cloud-object-storage-yy-cos-standard-js4/data.parquet")
sdf = spark.createDataFrame(df)
from pyspark.sql.types import DoubleType
sdf = sdf.withColumn("x", sdf.x.cast(DoubleType()))
sdf = sdf.withColumn("y", sdf.y.cast(DoubleType()))
sdf = sdf.withColumn("z", sdf.z.cast(DoubleType()))
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
input_columns = ["x", "y", "z"] # input columns to consider
train, test = sdf.randomSplit([0.8, 0.2], seed=1)
indexer = StringIndexer(inputCol="class", outputCol="label")
vectorAssembler = VectorAssembler(inputCols=input_columns, outputCol="features")
normalizer = MinMaxScaler(inputCol="features", outputCol="features_norm")
pipeline = Pipeline(stages=[indexer, vectorAssembler, normalizer])
binEval = MulticlassClassificationEvaluator().setMetricName("accuracy").setPredictionCol("prediction"). \
setLabelCol("label")
df_train = pipeline.fit(train).transform(train)
df_test = pipeline.fit(test).transform(test)
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol='features_norm', labelCol='label', maxDepth=20, numTrees=7, seed=1)
rfModel = rf.fit(df_train)
from pyspark2pmml import PMMLBuilder
model_target = "HMP_frModel.xml"
pmmlBuilder = PMMLBuilder(sc, df_train, rfModel)
Everything works well until the last line of the code.
I tried all the solutions I found on the internet, but unfortunately without success.
I am working with a Jupyter notebook (not Anaconda), installed PySpark with pip, and added these variables to my .bashrc:
export PATH=$PATH:~/.local/bin
export SPARK_HOME=~/.local/lib/python3.8/site-packages/pyspark
export PYTHONPATH=$SPARK_HOME/python/lib/py4j-0.10.9.2-src.zip
export PATH=$SPARK_HOME/bin:$SPARK_HOME/python:$PATH
export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS='notebook'
I also downloaded the jar files jpmml-sparkml-executable-1.7.2.jar and jpmml-sparkml-executable-1.8.0.jar and put them in the directory ~/.local/lib/python3.8/site-packages/pyspark/jars.
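In case it helps readers, the "Caused by" line in the trace above is a version check: the JPMML-SparkML build being loaded expects Spark ML 3.1 while Spark 3.2.0 is running, and shipping two different jpmml-sparkml-executable jars at once can also cause conflicts. A hedged sketch of pinning a single jar explicitly when the session is created (the path and version are placeholders, not a verified fix):

from pyspark.sql import SparkSession

# hypothetical: point spark.jars at the one JPMML-SparkML build that matches the installed Spark version
jpmml_jar = "/home/mbg/jars/jpmml-sparkml-executable-<matching-version>.jar"  # placeholder path

spark = (SparkSession.builder
         .appName("SparkApp_ETL_ML")
         .master("local[*]")
         .config("spark.jars", jpmml_jar)
         .getOrCreate())
sc = spark.sparkContext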

How to add metadata to a Pandas dataframe?

I use Pandas DataFrames heavily and need to attach some data to them, for example to record the creation time of a dataframe, an additional description, etc.
I just can't find any reserved field of the DataFrame class for keeping such data.
So I changed core\frame.py to add a line _reserved_slot = {} to solve my issue. I'm posting the question here just to ask: is it OK to do so? Or is there a better way to attach metadata to a dataframe/column/row?
#----------------------------------------------------------------------
# DataFrame class

class DataFrame(NDFrame):
    _auto_consolidate = True
    _verbose_info = True
    _het_axis = 1
    _col_klass = Series

    _AXIS_NUMBERS = {
        'index': 0,
        'columns': 1
    }
    _reserved_slot = {}  # Added by bigbug to keep extra data for the dataframe
    _AXIS_NAMES = dict((v, k) for k, v in _AXIS_NUMBERS.iteritems())
EDIT: (adding a demo of witingkuo's suggestion below)
>>> df = pd.DataFrame(np.random.randn(10,5), columns=list('ABCDEFGHIJKLMN')[0:5])
>>> df
A B C D E
0 0.5890 -0.7683 -1.9752 0.7745 0.8019
1 1.1835 0.0873 0.3492 0.7749 1.1318
2 0.7476 0.4116 0.3427 -0.1355 1.8557
3 1.2738 0.7225 -0.8639 -0.7190 -0.2598
4 -0.3644 -0.4676 0.0837 0.1685 0.8199
5 0.4621 -0.2965 0.7061 -1.3920 0.6838
6 -0.4135 -0.4991 0.7277 -0.6099 1.8606
7 -1.0804 -0.3456 0.8979 0.3319 -1.1907
8 -0.3892 1.2319 -0.4735 0.8516 1.2431
9 -1.0527 0.9307 0.2740 -0.6909 0.4924
>>> df._test = 'hello'
>>> df2 = df.shift(1)
>>> print df2._test
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "D:\Python\lib\site-packages\pandas\core\frame.py", line 2051, in __getattr__
(type(self).__name__, name))
AttributeError: 'DataFrame' object has no attribute '_test'
>>>
This is not supported right now. See https://github.com/pydata/pandas/issues/2485. The reason is that propagation of these attributes is non-trivial. You can certainly assign data, but almost all pandas operations return a new object, where the assigned data will be lost.
Your _reserved_slot will become a class variable, which might not work if you want to assign different values to different DataFrames. You can probably just assign what you want to the instance directly:
In [6]: import pandas as pd
In [7]: df = pd.DataFrame()
In [8]: df._test = 'hello'
In [9]: df._test
Out[9]: 'hello'
I think a decent workaround is putting your dataframe into a dictionary with your metadata as other keys. So if you have a dataframe with cashflows, like:
df = pd.DataFrame({'Amount': [-20, 15, 25, 30, 100]},index=pd.date_range(start='1/1/2018', periods=5))
You can create your dictionary with additional metadata and put the dataframe in it:
out = {'metadata': {'Name': 'Whatever', 'Account': 'Something else'}, 'df': df}
and then access it as out['df'].
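A quick illustration of how both parts stay reachable (using the keys defined above):

out['df'].head()               # the cashflow DataFrame
out['metadata']['Account']     # 'Something else'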