GCS Python access from colab - google-cloud-storage

I am trying to access GCS from Colab using the following lines of code and get the given error. Am I missing something? Or Colab doesn't support this kind of GCS access? Is there any workaround or best practices I can use?
from google.cloud import storage
client = storage.Client()
bucket = client.get_bucket('busnet_videos')
blob = bucket.blob('my-test-file.txt')
blob.upload_from_string('this is test content!')
Error :
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-7-0ed440d78c8f> in <module>()
7 from google.cloud import storage
8
----> 9 client = storage.Client()
10 bucket = client.get_bucket('busnet_videos')
11 blob = bucket.blob('my-test-file.txt')
2 frames
/usr/local/lib/python3.6/dist-packages/google/cloud/storage/client.py in __init__(self, project, credentials, _http)
71 project = None
72 super(Client, self).__init__(
---> 73 project=project, credentials=credentials, _http=_http
74 )
75 if no_project:
/usr/local/lib/python3.6/dist-packages/google/cloud/client.py in __init__(self, project, credentials, _http)
221
222 def __init__(self, project=None, credentials=None, _http=None):
--> 223 _ClientProjectMixin.__init__(self, project=project)
224 Client.__init__(self, credentials=credentials, _http=_http)
/usr/local/lib/python3.6/dist-packages/google/cloud/client.py in __init__(self, project)
176 if project is None:
177 raise EnvironmentError(
--> 178 "Project was not passed and could not be "
179 "determined from the environment."
180 )
OSError: Project was not passed and could not be determined from the environment.

You may have to set a the environment variables:
GOOGLE_APPLICATION_CREDENTIALS=SERVICE_ACCOUNT_KEY.json
and
PROJECT_ID=YOUR_GOOGLE_CLOUD_PROJECT_ID

Related

ValueError: unrecognized engine zarr must be one of: ['scipy', 'store']

I am trying to open zarr file as,
import pandas as pd
import xarray as xr
xf = xr.open_zarr("../../data/processed/geolink_norge_dataset/geolink_norge_well_logs.zarr")
But there comes out the errors:
ValueError Traceback (most recent call last) <ipython-input-17-ff38d9c54463> in <module>
1 import pandas as pd
2 import xarray as xr
----> 3 xf = xr.open_zarr("../../data/processed/geolink_norge_dataset/geolink_norge_well_logs.zarr")
4
5 # We will use just the 30* wells
C:\ProgramData\Anaconda3\lib\site-packages\xarray\backends\zarr.py in open_zarr(store, group, synchronizer, chunks, decode_cf, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, consolidated, overwrite_encoded_chunks, chunk_store, storage_options, decode_timedelta, use_cftime, **kwargs)
685 }
686
--> 687 ds = open_dataset(
688 filename_or_obj=store,
689 group=group,
C:\ProgramData\Anaconda3\lib\site-packages\xarray\backends\api.py in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, backend_kwargs,
*args, **kwargs)
480 engine = plugins.guess_engine(filename_or_obj)
481
--> 482 backend = plugins.get_backend(engine)
483
484 decoders = _resolve_decoders_kwargs(
C:\ProgramData\Anaconda3\lib\site-packages\xarray\backends\plugins.py in get_backend(engine)
132 engines = list_engines()
133 if engine not in engines:
--> 134 raise ValueError(
135 f"unrecognized engine {engine} must be one of: {list(engines)}"
136 )
ValueError: unrecognized engine zarr must be one of: ['scipy','store']
Can anyone help to solve this problem?
I can confirm that I have installed the scipy and store packages.
You likely need to install the zarr package as well:
pip install zarr
If that doesn't work, try:
pip install xarray[complete]
See https://github.com/pydata/xarray/issues/5395#issuecomment-850483726 for more information.

Using pretrained models from sparknlp on Databricks

I am trying to follow the official examples from John Snow Labs but every time I get a TypeError: 'JavaPackage' object is not callable error. I followed all of the steps in the Databricks install documentation but no matter what walkthrough I try, either this one or this one it fails.
An example of the first (after doing the installs):
import sparknlp
from sparknlp.pretrained import *
pipeline = PretrainedPipeline('recognize_entities_dl', 'en')
recognize_entities_dl download started this may take some time.
TypeError: 'JavaPackage' object is not callable
TypeError Traceback (most recent call last)
<command-937510457011238> in <module>
----> 1 pipeline = PretrainedPipeline('recognize_entities_dl', 'en')
2
3 # ner_bert = NerDLModel.pretrained('ner_dl_bert')
4
5 # pipeline = PretrainedPipeline('recognize_entities_dl', 'en', 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ner_dl_bert_en_2.4.3_2.4_1584624951079.zip')
/databricks/python/lib/python3.7/site-packages/sparknlp/pretrained.py in __init__(self, name, lang, remote_loc, parse_embeddings, disk_location)
89 def __init__(self, name, lang='en', remote_loc=None, parse_embeddings=False, disk_location=None):
90 if not disk_location:
---> 91 self.model = ResourceDownloader().downloadPipeline(name, lang, remote_loc)
92 else:
93 self.model = PipelineModel.load(disk_location)
/databricks/python/lib/python3.7/site-packages/sparknlp/pretrained.py in downloadPipeline(name, language, remote_loc)
49 def downloadPipeline(name, language, remote_loc=None):
50 print(name + " download started this may take some time.")
---> 51 file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
52 if file_size == "-1":
53 print("Can not find the model to download please check the name!")
/databricks/python/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, name, language, remote_loc)
190 def __init__(self, name, language, remote_loc):
191 super(_GetResourceSize, self).__init__(
--> 192 "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize", name, language, remote_loc)
193
194
/databricks/python/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, java_obj, *args)
127 super(ExtendedJavaWrapper, self).__init__(java_obj)
128 self.sc = SparkContext._active_spark_context
--> 129 self._java_obj = self.new_java_obj(java_obj, *args)
130 self.java_obj = self._java_obj
131
/databricks/python/lib/python3.7/site-packages/sparknlp/internal.py in new_java_obj(self, java_class, *args)
137
138 def new_java_obj(self, java_class, *args):
--> 139 return self._new_java_obj(java_class, *args)
140
141 def new_java_array(self, pylist, java_class):
/databricks/spark/python/pyspark/ml/wrapper.py in _new_java_obj(java_class, *args)
65 java_obj = getattr(java_obj, name)
66 java_args = [_py2java(sc, arg) for arg in args]
---> 67 return java_obj(*java_args)
68
69 #staticmethod
TypeError: 'JavaPackage' object is not callable
I get a similar if not the exact error if I try:
pipeline = PretrainedPipeline('recognize_entities_dl', 'en', 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ner_dl_bert_en_2.4.3_2.4_1584624951079.zip')
I also get the same error for the second example. The Databricks Runtime Version is: 6.5 (includes Apache Spark 2.4.5, Scala 2.11), which is on the list of approved runtimes.
I'm not sure what the error messages mean or how to resolve them.
I found out that 'JavaPackage' object is not callable is caused by the spark-nlp (assembly jars) missing. So I made sure that these jars were downloaded and then placed in BOTH the executor and driver. E.g
when building the Spark docker image do something like
RUN cd /opt/spark/jars && \
wget https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/spark-nlp-assembly-2.6.4.jar
and also on the driver image/machine make sure the jar exists in the local directoy. Then set
conf.set("spark.driver.extraClassPath", "/opt/spark/jars/spark-nlp-assembly-2.6.4.jar")
conf.set("spark.executor.extraClassPath", "/opt/spark/jars/spark-nlp-assembly-2.6.4.jar")
The solution for databricks might be a bit different so instead of baking in the jars you may need to host them on S3 and refer to them that way.

TypeError: 'JavaPackage' object is not callable for Xgboost in PySpark

I am trying to make Scala Xgboost API available for my PySpark Notebook. And following this blog:
https://towardsdatascience.com/pyspark-and-xgboost-integration-tested-on-the-kaggle-titanic-dataset-4e75a568bdb
However, keep on running into below err:
spark._jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator
<py4j.java_gateway.JavaPackage at 0x7fa650fe7a58>
from sparkxgb import XGBoostEstimator
xgboost = XGBoostEstimator(
featuresCol="features",
labelCol="Survival",
predictionCol="prediction"
)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-18-1765fb9e3344> in <module>
4 featuresCol="features",
5 labelCol="Survival",
----> 6 predictionCol="prediction"
7 )
~/spark-assembly-2.4.0-twttr-kryo3-scala2128-hadoop2.9.2.t05/python/pyspark/__init__.py in wrapper(self, *args, **kwargs)
108 raise TypeError("Method %s forces keyword arguments." % func.__name__)
109 self._input_kwargs = kwargs
--> 110 return func(self, **kwargs)
111 return wrapper
112
~/local/spark-3536cd7a-6188-4ca8-b3d0-57d42cd01531/userFiles-0a0d90bc-96b4-43f2-bf21-00ae0e6f7309/sparkxgb.zip/sparkxgb/xgboost.py in __init__(self, checkpoint_path, checkpointInterval, missing, nthread, nworkers, silent, use_external_memory, baseMarginCol, featuresCol, labelCol, predictionCol, weightCol, base_score, booster, eval_metric, num_class, num_round, objective, seed, alpha, colsample_bytree, colsample_bylevel, eta, gamma, grow_policy, max_bin, max_delta_step, max_depth, min_child_weight, reg_lambda, scale_pos_weight, sketch_eps, subsample, tree_method, normalize_type, rate_drop, sample_type, skip_drop, lambda_bias)
113
114 super(XGBoostEstimator, self).__init__()
--> 115 self._java_obj = self._new_java_obj("ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator", self.uid)
116 self._create_params_from_java()
117 self._setDefault(
~/spark-assembly-2.4.0-twttr-kryo3-scala2128-hadoop2.9.2.t05/python/pyspark/ml/wrapper.py in _new_java_obj(java_class, *args)
65 java_obj = getattr(java_obj, name)
66 java_args = [_py2java(sc, arg) for arg in args]
---> 67 return java_obj(*java_args)
68
69 #staticmethod
TypeError: 'JavaPackage' object is not callable
I already google this error and tried below things. I got all ideas from this blog https://github.com/JohnSnowLabs/spark-nlp/issues/232 :
Make sure Xgboost4j is in the SPARK_DIST_CLASSPATH. Already checked.
$echo $SPARK_DIST_CLASSPATH | tr " " "\n" | grep 'xgboost4j' | rev | cut -d'/' -f1 | rev
xgboost4j-0.72.jar
xgboost4j-spark.72.jar
Make sure they are added to EXTRA_CLASSPATH. - Done
Updating configs.
'export PYSPARK_SUBMIT_ARGS="--conf spark.jars=$SPARK_HOME/jars/* --conf spark.driver.extraClassPath=$SPARK_HOME/jars/* --conf spark.executor.extraClassPath=$SPARK_HOME/jars/* pyspark-shell"',
Hardware Info:
Machine: Linux
Using Jupyter Notebook.
Spark Version 2.4.0
python3.6
I found the problem, The problem was that the sparkxbg.zip(which I downloaded over internet) is written for xgboost4j-0.72. However, my jars were from xgoost4j-0.9. And the API has been completetly changed. As a result 0.9 version didn't had any class named ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator. And hence the error. You can see the difference in API below:
https://github.com/dmlc/xgboost/tree/release_0.72/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark
vs
https://github.com/dmlc/xgboost/tree/v0.90/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark

Error with show variable in data viewer for jupyter notebook

In the recent VS Code release, they added this feature to view the active variables in the Jupyter Notebook and also, view the values in the variable with Data Viewer.
However, every time I am trying to view the values in Data Viewer, VS Code is throwing error below. It says that the reason is that the object of data type is Int64 and not string, but I am sure that should not be the reason to not show the variable. Anyone facing similar issues. I tried with a simple data frame and it's working fine.
Error: Failure during variable extraction:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-16-eae5f1f55b35> in <module>
97
98 # Transform this back into a string
---> 99 print(_VSCODE_json.dumps(_VSCODE_targetVariable))
100 del _VSCODE_targetVariable
101
~/anaconda3/lib/python3.7/json/__init__.py in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
229 cls is None and indent is None and separators is None and
230 default is None and not sort_keys and not kw):
--> 231 return _default_encoder.encode(obj)
232 if cls is None:
233 cls = JSONEncoder
~/anaconda3/lib/python3.7/json/encoder.py in encode(self, o)
197 # exceptions aren't as detailed. The list call should be roughly
198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
200 if not isinstance(chunks, (list, tuple)):
201 chunks = list(chunks)
~/anaconda3/lib/python3.7/json/encoder.py in iterencode(self, o, _one_shot)
255 self.key_separator, self.item_separator, self.sort_keys,
256 self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)
258
259 def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
~/anaconda3/lib/python3.7/json/encoder.py in default(self, o)
177
178 """
--> 179 raise TypeError(f'Object of type {o.__class__.__name__} '
180 f'is not JSON serializable')
181
TypeError: Object of type int64 is not JSON serializable

Unable to open files, with the path in Jupyter notebook

I have reinstalled the anaconda after formatting my machine, since I am getting error while opening the files in jupyter notebook.
Initially I tried access the file from desktop location, as I got an error again tried to access from D drive. both were not successful attempts.
salaries = pd.read_excel('D:\\housesales.xlsx')
Below is the error
FileNotFoundError Traceback (most recent call last) <ipython-input-13-6d8e17cbb085> in <module> ----> 1 salaries = pd.read_excel('D:\housesales.xlsx') ~\Anaconda3\lib\site-packages\pandas\util_decorators.py in wrapper(*args, **kwargs) 186 else: 187 kwargs[new_arg_name] = new_arg_value --> 188 return func(*args, **kwargs) 189 return wrapper 190 return _deprecate_kwarg ~\Anaconda3\lib\site-packages\pandas\util_decorators.py in wrapper(*args, **kwargs) 186 else: 187 kwargs[new_arg_name] = new_arg_value --> 188 return func(*args, **kwargs) 189 return wrapper 190 return _deprecate_kwarg ~\Anaconda3\lib\site-packages\pandas\io\excel.py in read_excel(io, sheet_name, header, names, index_col, parse_cols, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, verbose, parse_dates, date_parser, thousands, comment, skip_footer, skipfooter, convert_float, mangle_dupe_cols, **kwds) 348 349 if not isinstance(io, ExcelFile): --> 350 io = ExcelFile(io, engine=engine) 351 352 return io.parse( ~\Anaconda3\lib\site-packages\pandas\io\excel.py in init(self, io, engine) 651 self._io = _stringify_path(io) 652 --> 653 self._reader = self._enginesengine 654 655 def fspath(self): ~\Anaconda3\lib\site-packages\pandas\io\excel.py in init(self, filepath_or_buffer) 422 self.book = xlrd.open_workbook(file_contents=data) 423 elif isinstance(filepath_or_buffer, compat.string_types): --> 424 self.book = xlrd.open_workbook(filepath_or_buffer) 425 else: 426 raise ValueError('Must explicitly set engine if not passing in' ~\Anaconda3\lib\site-packages\xlrd__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows) 109 else: 110 filename = os.path.expanduser(filename) --> 111 with open(filename, "rb") as f: 112 peek = f.read(peeksz) 113 if peek == b"PK\x03\x04": # a ZIP file FileNotFoundError: [Errno 2] No such file or directory: 'D:\housesales.xlsx'
Sounds like your housesales.xlsx file is on your Desktop, but you do not include the Desktop folder in the path to your file.
salaries = pd.read_excel('D:\\Desktop\housesales.xlsx')
I recommend you use jupyter lab as it has a file tree.
Running this bash command in a notebook cell will tell you the working directory of your jupyter instance so you know where it is looking for files.
!pwd
You could also move your file to that directory and then just access it as
salaries = pd.read_excel('housesales.xlsx')