Trying to get Jupyter notebook to work with PySpark

I am fairly new to this, but I have followed numerous setup instructions and installed Anaconda to use Jupyter notebooks with PySpark. I've installed Python, PySpark, and winutils, and set the environment variables, but I still cannot get PySpark to work.
Everyone seems to have their own flavor of configuration.
When running this code:
import findspark
findspark.init()
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df = spark.sql('''select 'spark' as hello ''')
df.show()
...I get numerous errors ending with "TypeError: an integer is required (got type bytes)"
When I simply run:
import findspark
findspark.init()
...nothing happens at all.
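A note on the findspark part: findspark.init() prints nothing on success, so silence by itself is expected. A minimal sketch to confirm it actually resolved a Spark installation (the explicit path is only an example taken from the traceback below):
import findspark
findspark.init()                     # or findspark.init("C:\\spark-2.4.8-bin-hadoop2.7")
print(findspark.find())              # prints the SPARK_HOME that findspark resolved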
I am trying to set up a local Jupyter notebook environment to test code for use in AWS Glue.
Adding the full traceback:
TypeError Traceback (most recent call last)
<ipython-input-1-84edcc0ab449> in <module>
2 findspark.init()
3
----> 4 import pyspark # only run after findspark.init()
5 from pyspark.sql import SparkSession
6 spark = SparkSession.builder.getOrCreate()
C:\spark-2.4.8-bin-hadoop2.7\python\pyspark\__init__.py in <module>
49
50 from pyspark.conf import SparkConf
---> 51 from pyspark.context import SparkContext
52 from pyspark.rdd import RDD, RDDBarrier
53 from pyspark.files import SparkFiles
C:\spark-2.4.8-bin-hadoop2.7\python\pyspark\context.py in <module>
29 from py4j.protocol import Py4JError
30
---> 31 from pyspark import accumulators
32 from pyspark.accumulators import Accumulator
33 from pyspark.broadcast import Broadcast, BroadcastPickleRegistry
C:\spark-2.4.8-bin-hadoop2.7\python\pyspark\accumulators.py in <module>
95 import socketserver as SocketServer
96 import threading
---> 97 from pyspark.serializers import read_int, PickleSerializer
98
99
C:\spark-2.4.8-bin-hadoop2.7\python\pyspark\serializers.py in <module>
70 xrange = range
71
---> 72 from pyspark import cloudpickle
73 from pyspark.util import _exception_message
74
C:\spark-2.4.8-bin-hadoop2.7\python\pyspark\cloudpickle.py in <module>
143
144
--> 145 _cell_set_template_code = _make_cell_set_template_code()
146
147
C:\spark-2.4.8-bin-hadoop2.7\python\pyspark\cloudpickle.py in _make_cell_set_template_code()
124 )
125 else:
--> 126 return types.CodeType(
127 co.co_argcount,
128 co.co_kwonlyargcount,
TypeError: an integer is required (got type bytes)
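For what it's worth, this particular TypeError is a known symptom of running Spark 2.4.x on Python 3.8 or newer: the cloudpickle bundled with Spark 2.4 calls types.CodeType with the pre-3.8 argument list (Python 3.8 added co_posonlyargcount), which raises exactly this error. A minimal sketch to confirm the mismatch, assuming nothing beyond the paths shown in the traceback:
import sys, os
print(sys.version_info)              # 3.8+ together with Spark 2.4.x reproduces this error
print(os.environ.get("SPARK_HOME"))  # the traceback shows C:\spark-2.4.8-bin-hadoop2.7
# Typical remedies: run the notebook kernel on Python 3.7, or move to Spark 3.x,
# whose bundled cloudpickle understands the new CodeType signature.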

Related

'NoneType' object has no attribute 'shape' when using convert_sparkml function and FloatTensorType from onnxmltools library

I searched a lot for this problem, but to no avail.
This is the runtime version I have on my Databricks cluster: 10.4 LTS ML (includes Apache Spark 3.2.1, Scala 2.12)
I have this code below:
with mlflow.start_run():
    rf = RandomForestRegressor(labelCol='Duration',
                               featuresCol='scaled_features')
    name = "Random Forest"
    model = rf.fit(train)
    predictions = model.transform(test)
    evaluator_r2 = RegressionEvaluator(
        labelCol="Duration", predictionCol="prediction", metricName="r2")
    r2 = evaluator_r2.evaluate(predictions)
    mlflow.log_param("Algorithm", name)
    mlflow.log_metric("R2 score", r2)

    # Convert to ONNX model
    num_features = model.numFeatures  # this is 13 features and for GBTree as well
    #initial_type = buildInitialTypesSimple(scaled_data.select("scaled_features"))
    initial_type = [('features', FloatTensorType([1, num_features]))]
    onnx = convert_sparkml(model=model, name="Random Forest", initial_types=initial_type)

    # Log model
    mlflow.onnx.log_model(onnx.SerializeToString(), registered_model_name="onnx_random_forest")
The model trains well, but when it arrives at the line "onnx = convert_sparkml(model=model, name="Random Forest", initial_types=initial_type)", I get this error below:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<command-1060338014211318> in <module>
19 initial_type = [('features', FloatTensorType([1, num_features]))]
20
---> 21 onnx = convert_sparkml(model=model, name="Random Forest", initial_types=initial_type)
22
23 # Log model
/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/main.py in convert_sparkml(model, name, initial_types, doc_string, target_opset, targeted_onnx, custom_conversion_functions, custom_shape_calculators, spark_session)
164
165 from .sparkml.convert import convert
--> 166 return convert(model, name, initial_types, doc_string, target_opset, targeted_onnx,
167 custom_conversion_functions, custom_shape_calculators, spark_session)
168
/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/sparkml/convert.py in convert(model, name, initial_types, doc_string, target_opset, targeted_onnx, custom_conversion_functions, custom_shape_calculators, spark_session)
66
67 # Infer variable shapes
---> 68 topology.compile()
69
70 # Convert our Topology object into ONNX. The outcome is an ONNX model.
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in compile(self)
676 self._resolve_duplicates()
677 self._fix_shapes()
--> 678 self._infer_all_types()
679 self._check_structure()
680
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in _infer_all_types(self)
551 pass # in Keras converter, the shape calculator can be optional.
552 else:
--> 553 operator.infer_types()
554
555 def _resolve_duplicates(self):
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in infer_types(self)
105 def infer_types(self):
106 # Invoke a core inference function
--> 107 get_shape_calculator(self.type)(self)
108
109
/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/sparkml/operator_converters/decision_tree_regressor.py in calculate_decision_tree_regressor_output_shapes(operator)
31 def calculate_decision_tree_regressor_output_shapes(operator):
32 check_input_and_output_numbers(operator, input_count_range=1, output_count_range=1)
---> 33 N = operator.inputs[0].type.shape[0]
34 operator.outputs[0].type = FloatTensorType(shape=[N, 1])
35
AttributeError: 'NoneType' object has no attribute 'shape'
Here are the imports:
import pyspark.sql.functions as f
from pyspark.sql.types import *
import mlflow
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from onnxmltools import convert_sparkml
from onnxmltools.convert.sparkml.utils import buildInitialTypesSimple, FloatTensorType
import onnxmltools.convert.common.data_types
Optionally, I also used:
from skl2onnx.common.data_types import FloatTensorType
import onnxmltools.convert.common.data_types
from onnxmltools.convert.common.data_types import FloatTensorType
from py4j.java_gateway import java_import
java_import(spark._sc._jvm, "org.apache.spark.sql.api.python.*")
For the gradient-boosted tree I get a different error, but it points in the same direction.
with mlflow.start_run():
    name = 'GBTree'
    gbt = GBTRegressor(labelCol='Duration',
                       featuresCol='scaled_features')
    model = gbt.fit(train)
    predictions = model.transform(test)
    evaluator_r2 = RegressionEvaluator(
        labelCol="Duration", predictionCol="prediction", metricName="r2")
    r2 = evaluator_r2.evaluate(predictions)
    mlflow.log_param("Algorithm", name)
    mlflow.log_metric("R2 score", r2)

    # Convert to ONNX model
    num_features = model.numFeatures
    #initial_types = buildInitialTypesSimple(scaled_data.select("scaled_features"))
    initial_type = [('features', FloatTensorType([None, num_features]))]
    onnx = convert_sparkml(model=model, name="GBTree", initial_types=initial_type)

    # Log model
    mlflow.onnx.log_model(onnx.SerializeToString(), registered_model_name="onnx_GBTree")
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<command-3643615854794992> in <module>
20 initial_type = [('features', FloatTensorType([None, num_features]))]
21
---> 22 onnx = convert_sparkml(model=model, name="GBTree", initial_types=initial_type)
23
24 # Log model
/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/main.py in convert_sparkml(model, name, initial_types, doc_string, target_opset, targeted_onnx, custom_conversion_functions, custom_shape_calculators, spark_session)
164
165 from .sparkml.convert import convert
--> 166 return convert(model, name, initial_types, doc_string, target_opset, targeted_onnx,
167 custom_conversion_functions, custom_shape_calculators, spark_session)
168
/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/sparkml/convert.py in convert(model, name, initial_types, doc_string, target_opset, targeted_onnx, custom_conversion_functions, custom_shape_calculators, spark_session)
66
67 # Infer variable shapes
---> 68 topology.compile()
69
70 # Convert our Topology object into ONNX. The outcome is an ONNX model.
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in compile(self)
676 self._resolve_duplicates()
677 self._fix_shapes()
--> 678 self._infer_all_types()
679 self._check_structure()
680
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in _infer_all_types(self)
551 pass # in Keras converter, the shape calculator can be optional.
552 else:
--> 553 operator.infer_types()
554
555 def _resolve_duplicates(self):
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in infer_types(self)
105 def infer_types(self):
106 # Invoke a core inference function
--> 107 get_shape_calculator(self.type)(self)
108
109
/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/sparkml/operator_converters/gbt_classifier.py in calculate_gbt_classifier_output_shapes(operator)
66 def calculate_gbt_classifier_output_shapes(operator):
67 check_input_and_output_numbers(operator, input_count_range=1, output_count_range=[1, 2])
---> 68 check_input_and_output_types(operator, good_input_types=[FloatTensorType, Int64TensorType])
69 if len(operator.inputs[0].type.shape) != 2:
70 raise RuntimeError('Input must be a [N, C]-tensor')
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/utils.py in check_input_and_output_types(operator, good_input_types, good_output_types)
320 for variable in operator.inputs:
321 if type(variable.type) not in good_input_types:
--> 322 raise RuntimeError('Operator %s (type: %s) got an input %s with a wrong type %s. Only %s are allowed'
323 % (operator.full_name, operator.type, variable.full_name, type(variable.type),
324 good_input_types))
RuntimeError: Operator pyspark_ml_regression_GBTRegressionModel (type: pyspark.ml.regression.GBTRegressionModel) got an input scaled_features with a wrong type <class 'NoneType'>. Only [<class 'onnxconverter_common.data_types.FloatTensorType'>, <class 'onnxconverter_common.data_types.Int64TensorType'>] are allowed
If I missed something, please tell me and I will try to provide it, thank you.
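A hedged note on both tracebacks: the shape calculators receive operator.inputs[0].type as None, and the GBT error names the untyped input explicitly as scaled_features, while initial_types above declares a tensor called features. One thing worth trying (a sketch, assuming featuresCol='scaled_features' as in the code above) is to declare the initial type under the model's actual input column name:
num_features = model.numFeatures
# use the model's input column name so the converter can bind the declared type to it
initial_type = [('scaled_features', FloatTensorType([None, num_features]))]
onnx = convert_sparkml(model=model, name="Random Forest", initial_types=initial_type)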

Scipy module installed but unable to call sub package stats in jupyter notebook

I successfully installed SciPy but am unable to use the sub-package stats. I tried uninstalling and reinstalling SciPy in the Anaconda prompt, still no luck.
The other packages such as pandas, numpy, and matplotlib work fine.
import scipy ------> no error
But when I run the code below in a Jupyter notebook:
from scipy import stats
I get this error:
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_6988/2143707691.py in <module>
----> 1 from scipy import stats
~\anaconda3\envs\sampleenv\lib\site-packages\scipy\stats\__init__.py in <module>
439 """
440
--> 441 from .stats import *
442 from .distributions import *
443 from .morestats import *
~\anaconda3\envs\sampleenv\lib\site-packages\scipy\stats\stats.py in <module>
35 from numpy import array, asarray, ma
36
---> 37 from scipy.spatial.distance import cdist
38 from scipy.ndimage import measurements
39 from scipy._lib._util import (check_random_state, MapWrapper,
~\anaconda3\envs\sampleenv\lib\site-packages\scipy\spatial\__init__.py in <module>
100 from ._plotutils import *
101 from ._procrustes import procrustes
--> 102 from ._geometric_slerp import geometric_slerp
103
104 __all__ = [s for s in dir() if not s.startswith('_')]
~\anaconda3\envs\sampleenv\lib\site-packages\scipy\spatial\_geometric_slerp.py in <module>
6
7 import numpy as np
----> 8 from scipy.spatial.distance import euclidean
9
10
~\anaconda3\envs\sampleenv\lib\site-packages\scipy\spatial\distance.py in <module>
122 from . import _hausdorff
123 from ..linalg import norm
--> 124 from ..special import rel_entr
125
126 from . import _distance_pybind
~\anaconda3\envs\sampleenv\lib\site-packages\scipy\special\__init__.py in <module>
641 from .sf_error import SpecialFunctionWarning, SpecialFunctionError
642
--> 643 from . import _ufuncs
644 from ._ufuncs import *
645
ImportError: DLL load failed while importing _ufuncs: The specified module could not be found.
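On Windows, "DLL load failed while importing _ufuncs" usually means the SciPy binaries in the active environment do not match the interpreter or its numpy (for example, a pip-installed SciPy mixed into a conda env). A minimal sketch, assuming the sampleenv environment named in the traceback:
import sys
print(sys.executable)    # should point into ...\anaconda3\envs\sampleenv for this kernel
# then, in the Anaconda prompt for that environment (assuming mixed pip/conda binaries
# are the cause), reinstall both packages from one channel:
#   pip uninstall -y scipy numpy
#   conda install -y numpy scipy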

ImportError: cannot import name 'SAVE_STATE_WARNING' from 'torch.optim.lr_scheduler'

I am attempting to run this statement in a Jupyter notebook:
from transformers import BertForQuestionAnswering
I get the error:
ImportError: cannot import name 'SAVE_STATE_WARNING' from 'torch.optim.lr_scheduler' (C:\Users\sbing\.conda\envs\Tensorflow\lib\site-packages\torch\optim\lr_scheduler.py)
Here is the complete stack:
ImportError Traceback (most recent call last)
in
----> 1 from transformers import BertForQuestionAnswering
~\.conda\envs\Tensorflow\lib\site-packages\transformers\__init__.py in <module>
624
625 # Trainer
--> 626 from .trainer import Trainer
627 from .trainer_pt_utils import torch_distributed_zero_first
628 else:
~\.conda\envs\Tensorflow\lib\site-packages\transformers\trainer.py in <module>
67 TrainerState,
68 )
---> 69 from .trainer_pt_utils import (
70 DistributedTensorGatherer,
71 SequentialDistributedSampler,
~\.conda\envs\Tensorflow\lib\site-packages\transformers\trainer_pt_utils.py in <module>
38 SAVE_STATE_WARNING = ""
39 else:
---> 40 from torch.optim.lr_scheduler import SAVE_STATE_WARNING
41
42 logger = logging.get_logger(__name__)
ImportError: cannot import name 'SAVE_STATE_WARNING' from 'torch.optim.lr_scheduler' (C:\Users\sbing\.conda\envs\Tensorflow\lib\site-packages\torch\optim\lr_scheduler.py)
You need to update the transformers package to a newer version. You can achieve that by running this code:
!pip install transformers==4.11.3
For me, there is no error after updating. Refer to the official documentation for details.
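As a quick check after upgrading (a sketch; nothing beyond the import from the question is assumed):
import transformers
print(transformers.__version__)                     # should report 4.11.3 or newer
from transformers import BertForQuestionAnswering   # imports cleanly once the version no longer
                                                    # pulls SAVE_STATE_WARNING from torch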

How to fix "TypeError: an integer is required (got type bytes)" error when trying to import "from cdqa.pipeline.cdqa_sklearn import QAPipeline"

I am using Python 3.6, which let me import this package before, but now it is not working. I installed all the necessary dependencies as described for the cdQA package.
I'm not sure if I missed a step in the cdQA installation, like setting some environment variable, but I can't find any further detailed instructions.
TypeError Traceback (most recent call last)
<ipython-input-2-349c08f077b9> in <module>
1 from cdqa.utils.filters import filter_paragraphs
2 from cdqa.utils.download import download_model, download_bnpp_data
----> 3 from cdqa.pipeline.cdqa_sklearn import QAPipeline
~/code/cdQA/cdqa/pipeline/__init__.py in <module>
----> 1 from .cdqa_sklearn import QAPipeline
2
3 __all__ = ["QAPipeline"]
~/code/cdQA/cdqa/pipeline/cdqa_sklearn.py in <module>
----> 1 import joblib
2 import warnings
3
4 import pandas as pd
5 import numpy as np
~/.local/lib/python3.8/site-packages/joblib/__init__.py in <module>
117 from .numpy_pickle import load
118 from .compressor import register_compressor
--> 119 from .parallel import Parallel
120 from .parallel import delayed
121 from .parallel import cpu_count
~/.local/lib/python3.8/site-packages/joblib/parallel.py in <module>
26 from .my_exceptions import TransportableException
27 from .disk import memstr_to_bytes
---> 28 from ._parallel_backends import (FallbackToBackend, MultiprocessingBackend,
29 ThreadingBackend, SequentialBackend,
30 LokyBackend)
~/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py in <module>
20 from .pool import MemmappingPool
21 from multiprocessing.pool import ThreadPool
---> 22 from .executor import get_memmapping_executor
23
24 # Compat between concurrent.futures and multiprocessing TimeoutError
~/.local/lib/python3.8/site-packages/joblib/executor.py in <module>
12 from .disk import delete_folder
13 from ._memmapping_reducer import get_memmapping_reducers
---> 14 from .externals.loky.reusable_executor import get_reusable_executor
15
16
~/.local/lib/python3.8/site-packages/joblib/externals/loky/__init__.py in <module>
10
11 from .backend.context import cpu_count
---> 12 from .backend.reduction import set_loky_pickler
13 from .reusable_executor import get_reusable_executor
14 from .cloudpickle_wrapper import wrap_non_picklable_objects
~/.local/lib/python3.8/site-packages/joblib/externals/loky/backend/reduction.py in <module>
123 # global variable to change the pickler behavior
124 try:
--> 125 from joblib.externals import cloudpickle # noqa: F401
126 DEFAULT_ENV = "cloudpickle"
127 except ImportError:
~/.local/lib/python3.8/site-packages/joblib/externals/cloudpickle/__init__.py in <module>
1 from __future__ import absolute_import
2
----> 3 from .cloudpickle import *
4
5 __version__ = '0.8.0'
~/.local/lib/python3.8/site-packages/joblib/externals/cloudpickle/cloudpickle.py in <module>
150
151
--> 152 _cell_set_template_code = _make_cell_set_template_code()
153
154
~/.local/lib/python3.8/site-packages/joblib/externals/cloudpickle/cloudpickle.py in _make_cell_set_template_code()
131 )
132 else:
--> 133 return types.CodeType(
134 co.co_argcount,
135 co.co_kwonlyargcount,
TypeError: an integer is required (got type bytes)
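A hedged observation: the traceback paths point at ~/.local/lib/python3.8/site-packages, so the kernel is really running Python 3.8 even though the question says 3.6, and the joblib it finds bundles cloudpickle 0.8.0, which predates Python 3.8's types.CodeType change. A sketch of how to confirm which interpreter and which joblib the notebook actually uses:
import sys
print(sys.executable, sys.version_info)            # which Python the kernel runs on
import importlib.util
print(importlib.util.find_spec("joblib").origin)   # which joblib would be imported (not executed)
# Likely fixes: upgrade joblib (newer releases vendor a Python-3.8-aware cloudpickle),
# or run the notebook kernel on the Python 3.6 environment where cdQA worked before.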

%load_ext rpy2.ipython works in iPython but not in iPython Notebook

I have a problem getting rpy2 to run in the IPython notebook.
If I load
%load_ext rpy2.ipython
in IPython 4.0.3, everything is fine. But if I do the same thing in an IPython notebook I get:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-3-a69f80d0128e> in <module>()
----> 1 get_ipython().magic('load_ext rpy2.ipython')
C:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in magic(self, arg_s)
2334 magic_name, _, magic_arg_s = arg_s.partition(' ')
2335 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2336 return self.run_line_magic(magic_name, magic_arg_s)
2337
2338 #-------------------------------------------------------------------------
C:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_line_magic(self, magic_name, line)
2255 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2256 with self.builtin_trap:
-> 2257 result = fn(*args,**kwargs)
2258 return result
2259
<decorator-gen-65> in load_ext(self, module_str)
C:\Anaconda3\lib\site-packages\IPython\core\magic.py in <lambda>(f, *a, **k)
191 # but it's overkill for just that one bit of state.
192 def magic_deco(arg):
--> 193 call = lambda f, *a, **k: f(*a, **k)
194
195 if callable(arg):
C:\Anaconda3\lib\site-packages\IPython\core\magics\extension.py in load_ext(self, module_str)
64 if not module_str:
65 raise UsageError('Missing module name.')
---> 66 res = self.shell.extension_manager.load_extension(module_str)
67
68 if res == 'already loaded':
C:\Anaconda3\lib\site-packages\IPython\core\extensions.py in load_extension(self, module_str)
82 if module_str not in sys.modules:
83 with prepended_to_syspath(self.ipython_extension_dir):
---> 84 __import__(module_str)
85 mod = sys.modules[module_str]
86 if self._call_load_ipython_extension(mod):
C:\Anaconda3\lib\site-packages\rpy2\ipython\__init__.py in <module>()
----> 1 from .rmagic import load_ipython_extension
C:\Anaconda3\lib\site-packages\rpy2\ipython\rmagic.py in <module>()
50 # numpy and rpy2 imports
51
---> 52 import rpy2.rinterface as ri
53 import rpy2.robjects as ro
54 import rpy2.robjects.packages as rpacks
C:\Anaconda3\lib\site-packages\rpy2\rinterface\__init__.py in <module>()
72 if not os.path.exists(Rlib):
73 continue
---> 74 ctypes.CDLL(Rlib)
75 _win_ok = True
76 break
C:\Anaconda3\lib\ctypes\__init__.py in __init__(self, name, mode, handle, use_errno, use_last_error)
345
346 if handle is None:
--> 347 self._handle = _dlopen(self._name, mode)
348 else:
349 self._handle = handle
FileNotFoundError: [WinError 161] Der angegebene Pfadname ist ungültig
Is there some way to get both running? As rpy2 runs properly in IPython, I guess the installation should be correct.
Thanks,
Marv
There are likely more differences between the environment from which ipython is called and the one from which the notebook is called: the error Der angegebene Pfadname ist ungültig ("The specified path name is invalid") occurs while trying to load the R shared library.
You'd need to tell us a little more about how you start either ipython or the notebook.
Having said that, you should also note that rpy2 likely works better on Linux or OS X. If the IPython notebook is your primary interest, running it through a Docker container could be a good solution.
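A small sketch for narrowing this down (assuming a Windows setup like the one in the traceback): run the same cell in a plain IPython session and in a notebook cell and compare the output, since rpy2 locates R.dll through R_HOME and PATH before the ctypes.CDLL call that fails here.
import os
print(os.environ.get("R_HOME"))          # rpy2 uses this to locate the R installation
print(os.environ.get("R_USER"))
for p in os.environ["PATH"].split(os.pathsep):
    if os.path.exists(os.path.join(p, "R.dll")):
        print("R.dll found on PATH at:", p)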