How to get Spark 2.3 working in Jupyter Notebook

I am struggling to get Spark 2.3 working in Jupyter Notebook.
Currently I have a kernel created as below:
Create an environment file:
$ cat rxie20181012-pyspark.yml
name: rxie20181012-pyspark
dependencies:
- pyspark
Create an environment based on the environment file:
conda env create -f rxie20181012-pyspark.yml
Activate the new environment:
source activate rxie20181012-pyspark
Create a kernel based on the conda env:
sudo ./python -m ipykernel install --name rxie20181012-pyspark
--display-name "Python (rxie20181012-pyspark)"
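To confirm the kernel was registered, you can list the installed kernelspecs and check that the new name shows up (a quick sanity check, not part of the original steps):
$ jupyter kernelspec list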
kernel.json is as below:
cat /usr/local/share/jupyter/kernels/rxie20181012-pyspark/kernel.json
{
"display_name": "Python (rxie20181012-pyspark)",
"language": "python",
"argv": [
"/opt/cloudera/parcels/Anaconda-4.2.0/bin/python",
"-m",
"ipykernel",
"-f",
"{connection_file}"
]
}
After noticing that the notebook failed on import pyspark, I added an env section to kernel.json as below:
{
"display_name": "Python (rxie20181012-pyspark)",
"language": "python",
"argv": [
"/opt/cloudera/parcels/Anaconda-4.2.0/bin/python",
"-m",
"ipykernel",
"-f",
"{connection_file}"
],
"env": {
"HADOOP_CONF_DIR": "/etc/spark2/conf/yarn-conf",
"PYSPARK_PYTHON":"/opt/cloudera/parcels/Anaconda/bin/python",
"SPARK_HOME": "/opt/cloudera/parcels/SPARK2",
"PYTHONPATH": "/opt/cloudera/parcels/SPARK2/lib/spark2/python/lib/py4j-0.10.7-src.zip:/opt/cloudera/parcels/SPARK2/lib/spark2/python/",
"PYTHONSTARTUP": "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/shell.py",
"PYSPARK_SUBMIT_ARGS": " --master yarn --deploy-mode client pyspark-shell"
}
}
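A quick way to check that the kernel actually picked up these variables is to print them from a notebook cell (a minimal sketch):
import os
# Each of these should match the values in the kernel.json "env" section above.
for var in ("SPARK_HOME", "HADOOP_CONF_DIR", "PYTHONPATH", "PYSPARK_SUBMIT_ARGS"):
    print(var, "=", os.environ.get(var))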
Now there is no more error on import pyspark, but I am still not able to start a SparkSession:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('abc').getOrCreate()
OSError Traceback (most recent call last)
in ()
----> 1 spark = SparkSession.builder.appName('abc').getOrCreate()
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/sql/session.pyc
in getOrCreate(self)
171 for key, value in self._options.items():
172 sparkConf.set(key, value)
--> 173 sc = SparkContext.getOrCreate(sparkConf)
174 # This SparkContext may be an existing one.
175 for key, value in self._options.items():
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/context.pyc in
getOrCreate(cls, conf)
341 with SparkContext._lock:
342 if SparkContext._active_spark_context is None:
--> 343 SparkContext(conf=conf or SparkConf())
344 return SparkContext._active_spark_context
345
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/context.pyc in
__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
113 """
114 self._callsite = first_spark_call() or CallSite(None, None, None)
--> 115 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
116 try:
117 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/context.pyc in
_ensure_initialized(cls, instance, gateway, conf)
290 with SparkContext._lock:
291 if not SparkContext._gateway:
--> 292 SparkContext._gateway = gateway or launch_gateway(conf)
293 SparkContext._jvm = SparkContext._gateway.jvm
294
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/java_gateway.pyc
in launch_gateway(conf)
81 def preexec_func():
82 signal.signal(signal.SIGINT, signal.SIG_IGN)
---> 83 proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)
84 else:
85 # preexec_fn not supported on Windows
/opt/cloudera/parcels/Anaconda/lib/python2.7/subprocess.pyc in
__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines,
startupinfo, creationflags)
709 p2cread, p2cwrite,
710 c2pread, c2pwrite,
--> 711 errread, errwrite)
712 except Exception:
713 # Preserve original exception in case os.close raises.
/opt/cloudera/parcels/Anaconda/lib/python2.7/subprocess.pyc in
_execute_child(self, args, executable, preexec_fn, close_fds, cwd, env, universal_newlines, startupinfo, creationflags, shell, to_close,
p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite)
   1341                     raise
   1342             child_exception = pickle.loads(data)
-> 1343             raise child_exception
   1344
   1345
OSError: [Errno 2] No such file or directory
Can anyone help me sort it out, please? Thank you from the bottom of my heart.

Root cause identified, and it's working now:
"SPARK_HOME": "/opt/cloudera/parcels/SPARK2"
should be replaced by:
"SPARK_HOME": "/opt/cloudera/parcels/SPARK2/lib/spark2"

Related

Why is pip install not working in Jupyter notebook?

When I run pip3 install <package>, !pip3 install <package>, or !pip install <package>, I get this error. I also can't clone any repo in Jupyter; it gives the same error. This is my first time using Jupyter.
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
Input In [18], in <cell line: 1>()
----> 1 get_ipython().run_line_magic('pip', 'install boto3')
File /lib/python3.9/site-packages/IPython/core/interactiveshell.py:2294, in InteractiveShell.run_line_magic(self, magic_name, line, _stack_depth)
2292 kwargs['local_ns'] = self.get_local_scope(stack_depth)
2293 with self.builtin_trap:
-> 2294 result = fn(*args, **kwargs)
2295 return result
File /lib/python3.9/site-packages/IPython/core/magics/packaging.py:75, in PackagingMagics.pip(self, line)
72 else:
73 python = shlex.quote(python)
---> 75 self.shell.system(" ".join([python, "-m", "pip", line]))
77 print("Note: you may need to restart the kernel to use updated packages.")
File /lib/python3.9/site-packages/IPython/core/interactiveshell.py:2451, in InteractiveShell.system_piped(self, cmd)
2446 raise OSError("Background processes not supported.")
2448 # we explicitly do NOT return the subprocess status code, because
2449 # a non-None value would trigger :func:`sys.displayhook` calls.
2450 # Instead, we store the exit_code in user_ns.
-> 2451 self.user_ns['_exit_code'] = system(self.var_expand(cmd, depth=1))
File /lib/python3.9/site-packages/IPython/utils/_process_posix.py:148, in ProcessHandler.system(self, cmd)
146 child = pexpect.spawnb(self.sh, args=['-c', cmd]) # Pexpect-U
147 else:
--> 148 child = pexpect.spawn(self.sh, args=['-c', cmd]) # Vanilla Pexpect
149 flush = sys.stdout.flush
150 while True:
151 # res is the index of the pattern that caused the match, so we
152 # know whether we've finished (if we matched EOF) or not
File /lib/python3.9/site-packages/IPython/utils/_process_posix.py:57, in ProcessHandler.sh(self)
55 self._sh = pexpect.which(shell_name)
56 if self._sh is None:
---> 57 raise OSError('"{}" shell not found'.format(shell_name))
59 return self._sh
I searched everywhere, but strangely it seems no one has faced this issue except me. Please suggest a solution; it's driving me crazy.
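The traceback itself narrows this down: the %pip magic shells the command out through pexpect, and ProcessHandler.sh raises this OSError when pexpect.which() cannot locate the shell named by the SHELL environment variable (falling back to sh). A diagnostic sketch to run in a cell, not a confirmed fix:
import os
import pexpect
# IPython resolves the shell roughly via pexpect.which(os.environ.get('SHELL', 'sh'));
# if either line below prints None, that lookup is what is failing.
print("SHELL =", os.environ.get("SHELL"))
print("which sh ->", pexpect.which("sh"))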

TypeError: 'JavaPackage' object is not callable for Xgboost in PySpark

I am trying to make the Scala XGBoost API available in my PySpark notebook, following this blog:
https://towardsdatascience.com/pyspark-and-xgboost-integration-tested-on-the-kaggle-titanic-dataset-4e75a568bdb
However, I keep running into the error below:
spark._jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator
<py4j.java_gateway.JavaPackage at 0x7fa650fe7a58>
from sparkxgb import XGBoostEstimator
xgboost = XGBoostEstimator(
featuresCol="features",
labelCol="Survival",
predictionCol="prediction"
)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-18-1765fb9e3344> in <module>
4 featuresCol="features",
5 labelCol="Survival",
----> 6 predictionCol="prediction"
7 )
~/spark-assembly-2.4.0-twttr-kryo3-scala2128-hadoop2.9.2.t05/python/pyspark/__init__.py in wrapper(self, *args, **kwargs)
108 raise TypeError("Method %s forces keyword arguments." % func.__name__)
109 self._input_kwargs = kwargs
--> 110 return func(self, **kwargs)
111 return wrapper
112
~/local/spark-3536cd7a-6188-4ca8-b3d0-57d42cd01531/userFiles-0a0d90bc-96b4-43f2-bf21-00ae0e6f7309/sparkxgb.zip/sparkxgb/xgboost.py in __init__(self, checkpoint_path, checkpointInterval, missing, nthread, nworkers, silent, use_external_memory, baseMarginCol, featuresCol, labelCol, predictionCol, weightCol, base_score, booster, eval_metric, num_class, num_round, objective, seed, alpha, colsample_bytree, colsample_bylevel, eta, gamma, grow_policy, max_bin, max_delta_step, max_depth, min_child_weight, reg_lambda, scale_pos_weight, sketch_eps, subsample, tree_method, normalize_type, rate_drop, sample_type, skip_drop, lambda_bias)
113
114 super(XGBoostEstimator, self).__init__()
--> 115 self._java_obj = self._new_java_obj("ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator", self.uid)
116 self._create_params_from_java()
117 self._setDefault(
~/spark-assembly-2.4.0-twttr-kryo3-scala2128-hadoop2.9.2.t05/python/pyspark/ml/wrapper.py in _new_java_obj(java_class, *args)
65 java_obj = getattr(java_obj, name)
66 java_args = [_py2java(sc, arg) for arg in args]
---> 67 return java_obj(*java_args)
68
69     @staticmethod
TypeError: 'JavaPackage' object is not callable
I already googled this error and tried the things below. I got all the ideas from this issue thread: https://github.com/JohnSnowLabs/spark-nlp/issues/232
Make sure Xgboost4j is in the SPARK_DIST_CLASSPATH. Already checked.
$echo $SPARK_DIST_CLASSPATH | tr " " "\n" | grep 'xgboost4j' | rev | cut -d'/' -f1 | rev
xgboost4j-0.72.jar
xgboost4j-spark.72.jar
Make sure they are added to EXTRA_CLASSPATH. - Done
Updating configs.
export PYSPARK_SUBMIT_ARGS="--conf spark.jars=$SPARK_HOME/jars/* --conf spark.driver.extraClassPath=$SPARK_HOME/jars/* --conf spark.executor.extraClassPath=$SPARK_HOME/jars/* pyspark-shell"
Environment info:
Machine: Linux
Using Jupyter Notebook.
Spark Version 2.4.0
python3.6
I found the problem. The sparkxgb.zip I downloaded from the internet was written for xgboost4j-0.72, whereas my jars were from xgboost4j-0.9, and the API has completely changed between those versions. As a result, the 0.9 version didn't have any class named ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator, hence the error. You can see the difference in the API below:
https://github.com/dmlc/xgboost/tree/release_0.72/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark
vs
https://github.com/dmlc/xgboost/tree/v0.90/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark
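A quick way to spot this kind of version mismatch up front, based on the diagnostic already shown in the question: ask py4j what the class name resolves to. A JavaPackage means the class is not on the JVM classpath; a usable class resolves to a JavaClass.
# If this prints py4j.java_gateway.JavaPackage, the class was not found on the
# classpath (e.g. a jar from the wrong xgboost4j version); a usable class
# resolves to a JavaClass instead.
cls = spark._jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator
print(type(cls))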

Unable to open files, with the path in Jupyter notebook

I reinstalled Anaconda after formatting my machine, since I was getting an error while opening files in Jupyter notebook.
Initially I tried to access the file from the Desktop location; when that raised an error, I tried again from the D drive. Neither attempt was successful.
salaries = pd.read_excel('D:\\housesales.xlsx')
Below is the error
FileNotFoundError Traceback (most recent call last)
<ipython-input-13-6d8e17cbb085> in <module>
----> 1 salaries = pd.read_excel('D:\\housesales.xlsx')
~\Anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    186             else:
    187                 kwargs[new_arg_name] = new_arg_value
--> 188             return func(*args, **kwargs)
    189         return wrapper
    190     return _deprecate_kwarg
~\Anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    186             else:
    187                 kwargs[new_arg_name] = new_arg_value
--> 188             return func(*args, **kwargs)
    189         return wrapper
    190     return _deprecate_kwarg
~\Anaconda3\lib\site-packages\pandas\io\excel.py in read_excel(io, sheet_name, header, names, index_col, parse_cols, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, verbose, parse_dates, date_parser, thousands, comment, skip_footer, skipfooter, convert_float, mangle_dupe_cols, **kwds)
    348
    349     if not isinstance(io, ExcelFile):
--> 350         io = ExcelFile(io, engine=engine)
    351
    352     return io.parse(
~\Anaconda3\lib\site-packages\pandas\io\excel.py in __init__(self, io, engine)
    651         self._io = _stringify_path(io)
    652
--> 653         self._reader = self._engines[engine](self._io)
    654
    655     def __fspath__(self):
~\Anaconda3\lib\site-packages\pandas\io\excel.py in __init__(self, filepath_or_buffer)
    422             self.book = xlrd.open_workbook(file_contents=data)
    423         elif isinstance(filepath_or_buffer, compat.string_types):
--> 424             self.book = xlrd.open_workbook(filepath_or_buffer)
    425         else:
    426             raise ValueError('Must explicitly set engine if not passing in'
~\Anaconda3\lib\site-packages\xlrd\__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
    109     else:
    110         filename = os.path.expanduser(filename)
--> 111     with open(filename, "rb") as f:
    112         peek = f.read(peeksz)
    113         if peek == b"PK\x03\x04":  # a ZIP file
FileNotFoundError: [Errno 2] No such file or directory: 'D:\\housesales.xlsx'
It sounds like your housesales.xlsx file is on your Desktop, but you did not include the Desktop folder in the path to your file:
salaries = pd.read_excel('D:\\Desktop\\housesales.xlsx')
I recommend using JupyterLab, as it has a file tree.
Running this shell command in a notebook cell will tell you the working directory of your Jupyter instance, so you know where it is looking for files:
!pwd
You could also move your file to that directory and then just access it as:
salaries = pd.read_excel('housesales.xlsx')
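Alternatively, you can check the working directory and the file's existence directly from Python (a small sketch; adjust the path to wherever the file actually lives):
import os
from pathlib import Path

print(os.getcwd())  # where the notebook is actually running
print(Path('D:/Desktop/housesales.xlsx').exists())  # does the target path exist?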

GCS Python access from colab

I am trying to access GCS from Colab using the following lines of code and get the error given below. Am I missing something, or does Colab not support this kind of GCS access? Is there any workaround or best practice I can use?
from google.cloud import storage
client = storage.Client()
bucket = client.get_bucket('busnet_videos')
blob = bucket.blob('my-test-file.txt')
blob.upload_from_string('this is test content!')
Error :
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-7-0ed440d78c8f> in <module>()
7 from google.cloud import storage
8
----> 9 client = storage.Client()
10 bucket = client.get_bucket('busnet_videos')
11 blob = bucket.blob('my-test-file.txt')
2 frames
/usr/local/lib/python3.6/dist-packages/google/cloud/storage/client.py in __init__(self, project, credentials, _http)
71 project = None
72 super(Client, self).__init__(
---> 73 project=project, credentials=credentials, _http=_http
74 )
75 if no_project:
/usr/local/lib/python3.6/dist-packages/google/cloud/client.py in __init__(self, project, credentials, _http)
221
222 def __init__(self, project=None, credentials=None, _http=None):
--> 223 _ClientProjectMixin.__init__(self, project=project)
224 Client.__init__(self, credentials=credentials, _http=_http)
/usr/local/lib/python3.6/dist-packages/google/cloud/client.py in __init__(self, project)
176 if project is None:
177 raise EnvironmentError(
--> 178 "Project was not passed and could not be "
179 "determined from the environment."
180 )
OSError: Project was not passed and could not be determined from the environment.
You may have to set the environment variables:
GOOGLE_APPLICATION_CREDENTIALS=SERVICE_ACCOUNT_KEY.json
and
PROJECT_ID=YOUR_GOOGLE_CLOUD_PROJECT_ID
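In Colab itself, a minimal way to do that from Python might look like the sketch below; the key-file path and project ID are placeholders you would replace with your own:
import os
# Assumes a service-account key has been uploaded to the Colab VM; both values
# below are placeholders, not from the original answer.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/SERVICE_ACCOUNT_KEY.json"

from google.cloud import storage
# Passing the project explicitly also avoids the "Project was not passed" error.
client = storage.Client(project="YOUR_GOOGLE_CLOUD_PROJECT_ID")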

Running gvmagic extension on jupyter notebook returning FileNotFounError

I am developing in Python 3.5.3, using the Jupyter notebook in Anaconda 2.5.0 (64-bit) on a Windows 10 machine. I am trying to use an extension called 'gvmagic', which is used for viewing graphs. The extension seems to load, but it returns a FileNotFoundError instead of a graph.
My input code is below (note: visualize_de_bruijn_graph is a custom function that builds a de Bruijn graph from a string):
dbg = visualize_de_bruijn_graph('ACGCGTCG', 3)
print(dbg)
which returns this graph:
digraph "DeBruijn Graph" {
CG [label="CG"] ;
TC [label="TC"] ;
GC [label="GC"] ;
AC [label="AC"] ;
GT [label="GT"] ;
AC -> CG ;
CG -> GC ;
GC -> CG ;
CG -> GT ;
GT -> TC ;
TC -> CG ;
}
Trying to visualize the graph with the following code:
%load_ext gvmagic
%dotstr dbg
returns the error below. I cannot figure out what file is missing, as all the files referenced are where they are supposed to be.
FileNotFoundError Traceback (most recent call last)
<ipython-input-17-d138faf6c47c> in <module>()
----> 1 get_ipython().magic('dotstr dbg')
C:\Users\username\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in magic(self, arg_s)
2161 magic_name, _, magic_arg_s = arg_s.partition(' ')
2162 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2163 return self.run_line_magic(magic_name, magic_arg_s)
2164
2165 #-------------------------------------------------------------------------
C:\Users\username\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_line_magic(self, magic_name, line)
2082 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2083 with self.builtin_trap:
-> 2084 result = fn(*args,**kwargs)
2085 return result
2086
<decorator-gen-126> in dotstr(self, line)
C:\Users\username\Anaconda3\lib\site-packages\IPython\core\magic.py in <lambda>(f, *a, **k)
191 # but it's overkill for just that one bit of state.
192 def magic_deco(arg):
--> 193 call = lambda f, *a, **k: f(*a, **k)
194
195 if callable(arg):
C:\Users\username\Anaconda3\lib\site-packages\IPython\extensions\gvmagic.py in dotstr(self, line)
50     @line_magic
51 def dotstr(self, line):
---> 52 self._from_str(line, 'dot')
53
54     @line_magic
C:\Users\username\Anaconda3\lib\site-packages\IPython\extensions\gvmagic.py in _from_str(self, line, layout_engine)
151 def _from_str(self, line, layout_engine):
152 s = self.shell.ev(line)
--> 153 data = run_graphviz(s, layout_engine)
154 if data:
155 display_svg(data, raw=True)
C:\Users\username\Anaconda3\lib\site-packages\IPython\extensions\gvmagic.py in run_graphviz(s, layout_engine)
30 cmd = ['dot', '-Tsvg', '-K', layout_engine]
31
---> 32 dot = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
33 stdoutdata, stderrdata = dot.communicate(s.encode('utf-8'))
34 status = dot.wait()
C:\Users\username\Anaconda3\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds)
674 c2pread, c2pwrite,
675 errread, errwrite,
--> 676 restore_signals, start_new_session)
677 except:
678 # Cleanup if the child failed starting.
C:\Users\username\Anaconda3\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
953 env,
954 cwd,
--> 955 startupinfo)
956 finally:
957 # Child is launched. Close the parent's copy of those pipe
FileNotFoundError: [WinError 2] The system cannot find the file specified
You have to install the Graphviz software on your PC. For Windows, for example, download it from https://graphviz.gitlab.io/_pages/Download/Download_windows.html.
In your IPython session, you then have to point to the install location, for example:
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin'
%load_ext gvmagic
dbg = visualize_de_bruijn_graph('ACGCGTCG', 3)
%dotstr dbg
And now it should work for you, just as it does for me! You can probably set the PATH variable on your PC instead of having to do it inside IPython each time.
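To confirm the Graphviz binary is actually visible to the notebook process, here is a one-line check (a sketch; gvmagic launches the dot executable via Popen, which is exactly the call that fails with WinError 2 when dot cannot be found):
import shutil
# Prints the full path to 'dot' if it is on PATH, or None if Graphviz is
# missing or not on PATH - the cause of the FileNotFoundError above.
print(shutil.which('dot'))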