show() raises an error after applying a pandas UDF to a DataFrame - pyspark

I am having trouble getting this trial code to work. The final line, df.select(plus_one(col("x"))).show(), fails. I also tried saving the result in a variable (vardf = df.select(plus_one(col("x"))) followed by vardf.show()), and that fails too.
import pyspark
import pandas as pd
from typing import Iterator
from pyspark.sql.functions import col, pandas_udf, struct

spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("WARN")

pdf = pd.DataFrame([1, 2, 3], columns=["x"])
df = spark.createDataFrame(pdf)
df.show()

@pandas_udf("long")
def plus_one(batch_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    for s in batch_iter:
        yield s + 1

df.select(plus_one(col("x"))).show()
Error message (parts of it):
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\spyder_kernels\py3compat.py", line 356, in compat_exec
exec(code, globals, locals)
File "c:\bigdatasetup\dataanalysiswithpythonandpyspark-trunk\code\ch09\untitled0.py", line 24, in
df.select(plus_one(col("x"))).show()
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\pyspark\sql\dataframe.py", line 494, in show
print(self._jdf.showString(n, 20, vertical))
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\py4j\java_gateway.py", line 1321, in call
return_value = get_return_value(
File "C:\bigdatasetup\anaconda3\envs\pyspark-env\lib\site-packages\pyspark\sql\utils.py", line 117, in deco
raise converted from None
PythonException:
An exception was thrown from the Python worker. Please see the stack trace below.
...
...
ERROR 2022-04-21 09:48:24,423 7608 org.apache.spark.scheduler.TaskSetManager [task-result-getter-0] Task 0 in stage 3.0 failed 1 times; aborting job

Related

How to create an array in Pyspark with normal distribution with scipy.stats with UDF (or any other way)?

I am currently working on migrating Python scripts to PySpark. I have this Python script that works fine:
### PYTHON
import pandas as pd
import scipy.stats as st

def fnNormalDistribution(mean, std, n):
    box = list(eval('st.norm')(*[mean, std]).rvs(n))
    return box

df = pd.DataFrame([[18.2500365, 2.7105814157004193],
                   [9.833353, 2.121324586200329],
                   [41.55563866666666, 7.118716782527054]],
                  columns=['mean', 'std'])
df
| mean | std |
|------------|----------|
| 18.250037| 2.710581|
| 9.833353| 2.121325|
| 41.555639| 7.118717|
n = 100 #Example
df['random_values'] = df.apply(lambda row: fnNormalDistribution(row["mean"], row["std"], n), axis=1)
df
| mean | std | random_values |
|------------|----------|--------------------------------------------------|
| 18.250037| 2.710581|[17.752189993958638, 18.883038367927465, 16.39...]|
| 9.833353| 2.121325|[10.31806454283759, 8.732261487201594, 11.6782...]|
| 41.555639| 7.118717|[38.17469739795093, 43.16514466083524, 49.2668...]|
But when I try to migrate it to PySpark, I get the following error:
### PYSPARK
def fnNormalDistribution(mean, std, n):
    box = list(eval('st.norm')(*[mean, std]).rvs(n))
    return box

udf_fnNomalDistribution = f.udf(fnNormalDistribution, t.ArrayType(t.DoubleType()))

columns = ['mean', 'std']
data = [(18.2500365, 2.7105814157004193),
        (9.833353, 2.121324586200329),
        (41.55563866666666, 7.118716782527054)]
df = spark.createDataFrame(data=data, schema=columns)
df.show()
| mean | std |
|------------|----------|
| 18.250037| 2.710581|
| 9.833353| 2.121325|
| 41.555639| 7.118717|
df = df.withColumn('random_values', udf_fnNomalDistribution('mean','std',f.lit(n)))
df.show()
PythonException:
An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 604, in main
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 596, in process
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\serializers.py", line 211, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\serializers.py", line 132, in dump_stream
for obj in iterator:
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\serializers.py", line 200, in _batched
for item in iterator:
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 450, in mapper
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 450, in <genexpr>
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\worker.py", line 85, in <lambda>
File "C:\Spark\spark-3.1.2-bin-hadoop3.2\python\lib\pyspark.zip\pyspark\util.py", line 73, in wrapper
return f(*args, **kwargs)
File "C:\Users\Ubits\AppData\Local\Temp/ipykernel_10604/2493247477.py", line 2, in fnNormalDistribution
File "<string>", line 1, in <module>
NameError: name 'st' is not defined
Is there some way to use the same function in PySpark, or to get the random_values column in another way? I googled it with no success.
Thanks
I was trying this and it can indeed be fixed by moving the scipy.stats import inside fnNormalDistribution, as samkart suggested.
I will just leave my example here, as Fugue may provide a more readable way to bring this to Spark, especially around handling the schema. Full code below.
import pandas as pd

def fnNormalDistribution(mean, std, n):
    import scipy.stats as st
    box = (eval('st.norm')(*[mean, std]).rvs(n)).tolist()
    return box

df = pd.DataFrame([[18.2500365, 2.7105814157004193],
                   [9.833353, 2.121324586200329],
                   [41.55563866666666, 7.118716782527054]],
                  columns=['mean', 'std'])

n = 100  # Example

def helper(df: pd.DataFrame) -> pd.DataFrame:
    df['random_values'] = df.apply(lambda row: fnNormalDistribution(row["mean"], row["std"], n), axis=1)
    return df
from fugue import transform
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# transform can take either a pandas or a Spark DataFrame as input.
# If engine is None, it will run on pandas.
sdf = transform(df,
                helper,
                schema="*, random_values:[float]",
                engine=spark)
sdf.show()
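For completeness, here is a minimal sketch of the non-Fugue route that samkart's suggestion points to: keep the plain PySpark UDF from the question, but import scipy.stats inside the function so the name resolves on the executors. It assumes the same spark session and the f/t aliases used in the question, and it drops the eval('st.norm') indirection in favour of calling st.norm directly.
import pyspark.sql.functions as f
import pyspark.sql.types as t

def fnNormalDistribution(mean, std, n):
    # Import inside the function body so every Spark worker resolves scipy.stats itself.
    import scipy.stats as st
    return st.norm(mean, std).rvs(n).tolist()

udf_fnNormalDistribution = f.udf(fnNormalDistribution, t.ArrayType(t.DoubleType()))

n = 100  # as in the example above
df = df.withColumn('random_values', udf_fnNormalDistribution('mean', 'std', f.lit(n)))
df.show()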

How to fix this error in PySpark with the select method?

I'm following an example from the internet, but it gives me an error that I can't solve. The code is the following:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark import SparkContext
from IPython.display import display, HTML
from pyspark.sql import SQLContext
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.sql import Column as c
from pyspark.sql.functions import array, udf, lit, col as c
import pyspark.sql.functions as f

pd.set_option('max_colwidth', 100)
plt.style.use('seaborn-paper')

try:
    sc = SparkContext('local[*]')
except:
    sc = SparkContext.getOrCreate('local[*]')
sqlContext = SQLContext(sc)

# Reading the dataframes
whiteWinnePath = 'winequality-White.csv'
redWinnePath = 'winequality-Red.csv'

"""
df_p = pd.read_csv(whiteWinnePath, sep =";")
print("Mostrar el data frame")
display(df_p)
"""

whiteWinneDF = sqlContext.createDataFrame(pd.read_csv(whiteWinnePath, sep=";")).withColumn('type', lit(0))
redWinneDF = sqlContext.createDataFrame(pd.read_csv(redWinnePath, sep=";")).withColumn('type', lit(1))
whiteWinneDF.printSchema()

# Splitting into training and test sets
whiteTrainingDF, whiteTestingDF = whiteWinneDF.randomSplit([0.7, 0.3])
redTrainingDF, redTestingDF = redWinneDF.randomSplit([0.7, 0.3])
trainingDF = whiteTrainingDF.union(redTrainingDF)
testingDF = whiteTestingDF.union(redTestingDF)

# Preparing the dataframe for PCA
idCol = ['type']
features = [column for column in redWinneDF.columns if column not in idCol]
p = len(features)
meanVector = trainingDF.describe().where(c('summary')==lit('mean')).toPandas()[[0][1:p+1]].values

"""
meanVector2= meanVector1.toPandas()
#print("="*50)
#print(type(meanVector2))
#meanVector= meanVector2.as_matrix()#[0][1:p+1]
meanVector= meanVector2[[0][1:p+1]].values
"""

labeledVectorsDF = trainingDF.select(features+['type']).rdd\
    .map(lambda x: (Vectors.dense(x[0:p]-Vectors.dense(meanVector)), x[p]))\
    .toDF(['features', 'type'])
labeledVectorsDF.limit(5).toPandas()
When I run the code, this is the error I get:
File "D:\UGR\Investigación\Cosas de Reinaldo\mis script\Seleccion_caracteristicas\PCA_wine_quality.py", line 78, in <module>
labeledVectorsDF = trainingDF.select(features+['type']).rdd\
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 61, in toDF
return sparkSession.createDataFrame(self, schema, sampleRatio)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 605, in createDataFrame
return self._create_dataframe(data, schema, samplingRatio, verifySchema)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 628, in _create_dataframe
rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 425, in _createFromRDD
struct = self._inferSchema(rdd, samplingRatio, names=schema)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\session.py", line 396, in _inferSchema
first = rdd.first()
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\rdd.py", line 1464, in first
rs = self.take(1)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\rdd.py", line 1446, in take
res = self.context.runJob(self, takeUpToNumLeft, p)
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\context.py", line 1118, in runJob
sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
File "C:\spark-3.0.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py", line 1304, in __call__
return_value = get_return_value(
File "C:\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\utils.py", line 128, in deco
return f(*a, **kw)
File "C:\spark-3.0.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\protocol.py", line 326, in get_return_value
raise Py4JJavaError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 17, LAPTOP-3G67L0HS, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
I don't know if the error is with the select method or the rdd method. I verified that features has all the names of the columns of the dataframe except type.
How could I solve it?
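The traceback does not point at user code, but one suspicious spot in the snippet is the meanVector line: the indexer [[0][1:p+1]] slices the literal list [0], which yields an empty list, so no columns are actually selected from the describe() output. A hedged sketch of what that extraction may have been meant to do (the pandas indexing and the float cast are my guesses, not the original author's code):
# Hypothetical rewrite of the mean-vector extraction, for illustration only.
# describe() returns its statistics as strings, so cast them to float explicitly.
summary_pd = trainingDF.describe().toPandas()
mean_row = summary_pd.loc[summary_pd['summary'] == 'mean', features]
meanVector = mean_row.astype(float).values[0]  # 1-D array with one mean per feature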

JSON error while using Pandas output format

I am using the alpha_vantage TimeSeries API as below:
-----------------------------------------code------------------------------------
import pandas as pd
from alpha_vantage.timeseries import TimeSeries
from alpha_vantage.techindicators import TechIndicators
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
from pprint import pprint

# my key
key = 'mykey'
ts = TimeSeries(key, output_format='pandas')

def processMyBatch(batch, FD):
    for i in batch:
        df, meta_data = ts.get_quote_endpoint(i)
        FD = FD.append(df)
    return FD
Main code:
for i in batches:
    DF2 = processMyBatch(i, DF)
    DF = DF2
While the API worked for a few symbols (see the log below), somewhere while going through the list of symbols I suddenly got the following JSONDecodeError, even though I am using output_format='pandas'. Could you please shed some light on why this error occurred?
thank you
================error===============
/opt/scripts
starting now. fileName is: /mnt/NAS/Documents/../../../dailyquote2020-03-03.xlsx
completed the batch: ['AAPL', 'ABBV', 'AMZN', 'BAC', 'BNDX']
Waiting to honor API requirement: for 1 min
Waited: 65 sec
completed the batch: ['C', 'CNQ', 'CTSH', 'EEMV', 'FBGRX']
Waiting to honor API requirement: for 1 min
Waited: 65 sec
completed the batch: ['FDVV', 'FFNOX', 'FSMEX', 'FXAIX', 'GE']
Waiting to honor API requirement: for 1 min
Waited: 65 sec
Traceback (most recent call last):
File "getQuotes.py", line 55, in <module>
DF2=processMyBatch(i, DF)
File "getQuotes.py", line 29, in processMyBatch
df, meta_data = ts.get_quote_endpoint(i)
File "/home/username/.local/lib/python3.6/site-packages/alpha_vantage/alphavantage.py", line 174, in _format_wrapper
self, *args, **kwargs)
File "/home/username/.local/lib/python3.6/site-packages/alpha_vantage/alphavantage.py", line 159, in _call_wrapper
return self._handle_api_call(url), data_key, meta_data_key
File "/home/username/.local/lib/python3.6/site-packages/alpha_vantage/alphavantage.py", line 287, in _handle_api_call
json_response = response.json()
File "/home/username/.local/lib/python3.6/site-packages/requests/models.py", line 898, in json
return complexjson.loads(self.text, **kwargs)
File "/usr/lib/python3/dist-packages/simplejson/__init__.py", line 518, in loads
return _default_decoder.decode(s)
File "/usr/lib/python3/dist-packages/simplejson/decoder.py", line 370, in decode
obj, end = self.raw_decode(s)
File "/usr/lib/python3/dist-packages/simplejson/decoder.py", line 400, in raw_decode
return self.scan_once(s, idx=_w(s, idx).end())
simplejson.errors.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Added on 3/4/2020
..
..
completed the batch: ['FDVV', 'FFNOX', 'FSMEX', 'FXAIX', 'GE']
Waiting to honor API requirement: for 1 min
Waited: 65 sec
completed the batch: ['GOOGL', 'IGEB', 'IJH', 'IJR', 'IMTB']
Waiting to honor API requirement: for 1 min
Waited: 65 sec
Traceback (most recent call last):
File "getQuotes.py", line 55, in <module>
DF2=processMyBatch(i, DF)
..
..
Well I was getting an error like that today and it turns out the Alpha Vantage site is down!
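Since a JSONDecodeError at "line 1 column 1 (char 0)" means the API returned an empty or non-JSON body (for example an HTML error page while the service is down), one defensive option is to catch that case in the batch loop and retry or skip the symbol instead of crashing. A minimal sketch, reusing the ts object and batch structure from the question; the retry count and wait time are made up for illustration:
import time

def processMyBatch(batch, FD, retries=3, wait_sec=65):
    for symbol in batch:
        for attempt in range(retries):
            try:
                df, meta_data = ts.get_quote_endpoint(symbol)
                FD = FD.append(df)
                break
            except ValueError:
                # Both json and simplejson decode errors subclass ValueError;
                # wait and retry before giving up on this symbol.
                print("Non-JSON response for {}, attempt {}".format(symbol, attempt + 1))
                time.sleep(wait_sec)
    return FD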

Cannot cast ListType[tuple(float64 x 2)] to list(tuple(float64 x 2)) in numba

Hello, I am trying to use a typed List in numba v0.46.0:
>>> from numba.typed import List
>>> from numba import types
>>> mylist = List.empty_list(item_type=types.Tuple((types.f8, types.f8)))
>>> mylist2 = List.empty_list(item_type=types.List(dtype=types.Tuple((types.f8, types.f8))))
>>> mylist2.append(mylist)
but I got the following error. I am wondering how to fix it:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/lib/python3.7/site-packages/numba/typed/typedlist.py", line 223, in append
    _append(self, item)
  File "/usr/local/lib/python3.7/site-packages/numba/dispatcher.py", line 401, in _compile_for_args
    error_rewrite(e, 'typing')
  File "/usr/local/lib/python3.7/site-packages/numba/dispatcher.py", line 344, in error_rewrite
    reraise(type(e), e, None)
  File "/usr/local/lib/python3.7/site-packages/numba/six.py", line 668, in reraise
    raise value.with_traceback(tb)
numba.errors.TypingError: Failed in nopython mode pipeline (step: nopython frontend) Internal error at . Failed in nopython mode pipeline (step: nopython mode backend)
Cannot cast ListType[tuple(float64 x 2)] to list(tuple(float64 x 2)): %".24" = load {i8*, i8*}, {i8*, i8*}* %"item"

File "../../usr/local/lib/python3.7/site-packages/numba/listobject.py", line 434:
def impl(l, item):
    casteditem = _cast(item, itemty)
The following should work:
mylist2 = List.empty_list(item_type=types.ListType(itemty=types.Tuple((types.f8, types.f8))))
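For reference, a minimal end-to-end version of that fix. The key point is that a typed List instance has type types.ListType, so the outer list's item type must be declared with types.ListType rather than the reflected types.List:
from numba import types
from numba.typed import List

tuple_ty = types.Tuple((types.f8, types.f8))

# Inner list holds (float64, float64) tuples.
mylist = List.empty_list(tuple_ty)
mylist.append((1.0, 2.0))

# Outer list holds typed lists, so its item type is a ListType, not types.List.
mylist2 = List.empty_list(types.ListType(tuple_ty))
mylist2.append(mylist)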

using boto3 in a python3 virtual env in AWS Lambda

I am trying to use Python 3.4 and boto3 to walk an S3 bucket and publish some file locations to an RDS instance. The part of this effort I am having trouble with is using boto3. My lambda function looks like the following:
import subprocess

def lambda_handler(event, context):
    args = ("venv/bin/python3.4", "run.py")
    popen = subprocess.Popen(args, stdout=subprocess.PIPE)
    popen.wait()
    output = popen.stdout.read()
    print(output)
and, in my run.py file I have some lines:
import boto3
s3c = boto3.client('s3')
which cause an exception. The run.py file is not otherwise relevant to this question, so to keep this post concise: I've found that the same error is reproduced by executing this lambda function:
import subprocess

def lambda_handler(event, context):
    args = ("python3.4", "-c", "import boto3; print(boto3.client('s3'))")
    popen = subprocess.Popen(args, stdout=subprocess.PIPE)
    popen.wait()
    output = popen.stdout.read()
    print(output)
My logstream reports the error:
Event Data
START RequestId: 2b65421a-664d-11e6-81db-974c7c09d283 Version: $LATEST
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/var/runtime/boto3/__init__.py", line 79, in client
return _get_default_session().client(*args, **kwargs)
File "/var/runtime/boto3/session.py", line 250, in client
aws_session_token=aws_session_token, config=config)
File "/var/runtime/botocore/session.py", line 818, in create_client
client_config=config, api_version=api_version)
File "/var/runtime/botocore/client.py", line 63, in create_client
cls = self._create_client_class(service_name, service_model)
File "/var/runtime/botocore/client.py", line 85, in _create_client_class
base_classes=bases)
File "/var/runtime/botocore/hooks.py", line 227, in emit
return self._emit(event_name, kwargs)
File "/var/runtime/botocore/hooks.py", line 210, in _emit
response = handler(**kwargs)
File "/var/runtime/boto3/utils.py", line 61, in _handler
module = import_module(module)
File "/var/runtime/boto3/utils.py", line 52, in import_module
__import__(name)
File "/var/runtime/boto3/s3/inject.py", line 13, in <module>
from boto3.s3.transfer import S3Transfer
File "/var/runtime/boto3/s3/transfer.py", line 135, in <module>
from concurrent import futures
File "/var/runtime/concurrent/futures/__init__.py", line 8, in <module>
from concurrent.futures._base import (FIRST_COMPLETED,
File "/var/runtime/concurrent/futures/_base.py", line 357
raise type(self._exception), self._exception, self._traceback
^
SyntaxError: invalid syntax
END RequestId: 2b65421a-664d-11e6-81db-974c7c09d283
REPORT RequestId: 2b65421a-664d-11e6-81db-974c7c09d283 Duration: 2673.45 ms Billed Duration: 2700 ms Memory Size: 1024 MB Max Memory Used: 61 MB
I need to use boto3 downstream of run.py. Any ideas on how to resolve this are much appreciated. Thanks!
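The SyntaxError is raised while parsing /var/runtime/concurrent/futures/_base.py, which is the Python 2 futures backport that the Lambda runtime keeps on its module search path; the python3.4 child apparently inherits that path and picks it up instead of its own standard-library concurrent.futures. A hedged workaround sketch (untested assumption: the path leaks in via the inherited PYTHONPATH) is to launch the subprocess with a cleaned environment:
import os
import subprocess

def lambda_handler(event, context):
    # Strip the parent's PYTHONPATH (which points at /var/runtime and its
    # Python 2 'futures' backport) so the python3.4 child imports its own stdlib.
    env = {k: v for k, v in os.environ.items() if k != "PYTHONPATH"}
    args = ("venv/bin/python3.4", "run.py")
    popen = subprocess.Popen(args, stdout=subprocess.PIPE, env=env)
    popen.wait()
    output = popen.stdout.read()
    print(output)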