Error with show variable in Data Viewer for Jupyter notebook

In a recent VS Code release, a feature was added to view the active variables in a Jupyter notebook and to inspect their values with the Data Viewer.
However, every time I try to view the values in the Data Viewer, VS Code throws the error below. It says the reason is that the object is of data type int64 and not string, but I am sure that should not prevent showing the variable. Is anyone facing similar issues? I tried with a simple data frame and it works fine.
Error: Failure during variable extraction:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-16-eae5f1f55b35> in <module>
97
98 # Transform this back into a string
---> 99 print(_VSCODE_json.dumps(_VSCODE_targetVariable))
100 del _VSCODE_targetVariable
101
~/anaconda3/lib/python3.7/json/__init__.py in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
229 cls is None and indent is None and separators is None and
230 default is None and not sort_keys and not kw):
--> 231 return _default_encoder.encode(obj)
232 if cls is None:
233 cls = JSONEncoder
~/anaconda3/lib/python3.7/json/encoder.py in encode(self, o)
197 # exceptions aren't as detailed. The list call should be roughly
198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
200 if not isinstance(chunks, (list, tuple)):
201 chunks = list(chunks)
~/anaconda3/lib/python3.7/json/encoder.py in iterencode(self, o, _one_shot)
255 self.key_separator, self.item_separator, self.sort_keys,
256 self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)
258
259 def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
~/anaconda3/lib/python3.7/json/encoder.py in default(self, o)
177
178 """
--> 179 raise TypeError(f'Object of type {o.__class__.__name__} '
180 f'is not JSON serializable')
181
TypeError: Object of type int64 is not JSON serializable
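For context, this failure is reproducible outside VS Code: the traceback shows the extension serializing the variable's metadata with json.dumps, and the standard json module cannot handle numpy scalar types. A minimal sketch of the underlying limitation and a common workaround (this only illustrates the cause, not a fix for the Data Viewer itself):
import json
import numpy as np

value = np.int64(42)
try:
    json.dumps({'n': value})
except TypeError as e:
    print(e)  # Object of type int64 is not JSON serializable

# Routing numpy scalars through a default hook serializes them fine:
print(json.dumps({'n': value}, default=int))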


Error Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select

I have the following code, taken directly from here with some small modifications:
import pandas as pd
import torch
import json
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from torch import cuda

df = pd.read_pickle('df_final.pkl')
model = T5ForConditionalGeneration.from_pretrained('t5-base')
tokenizer = T5Tokenizer.from_pretrained('t5-base')
device = 'cuda' if cuda.is_available() else 'cpu'

text = ''.join(df[(df['col1'] == 'type') & (df['col2'] == 2)].col3.to_list())
preprocess_text = text.strip().replace("\n", "")
t5_prepared_Text = "summarize: " + preprocess_text
#print("original text preprocessed: \n", preprocess_text)

tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt", max_length=500000).to(device)

# summarize
summary_ids = model.generate(tokenized_text,
                             num_beams=4,
                             no_repeat_ngram_size=2,
                             min_length=30,
                             max_length=100,
                             early_stopping=True)

output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("\n\nSummarized text: \n", output)
When executing the model.generate() part, I get an error like this:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-12-e8e9819a85dc> in <module>
12 min_length=30,
13 max_length=100,
---> 14 early_stopping=True).to(device)
15
16 output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
~\Anaconda3\lib\site-packages\torch\autograd\grad_mode.py in decorate_no_grad(*args, **kwargs)
47 def decorate_no_grad(*args, **kwargs):
48 with self:
---> 49 return func(*args, **kwargs)
50 return decorate_no_grad
51
~\Anaconda3\lib\site-packages\transformers\generation_utils.py in generate(self, input_ids, max_length, min_length, do_sample, early_stopping, num_beams, temperature, top_k, top_p, repetition_penalty, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, length_penalty, no_repeat_ngram_size, num_return_sequences, attention_mask, decoder_start_token_id, use_cache, **model_specific_kwargs)
383 encoder = self.get_encoder()
384
--> 385 encoder_outputs: tuple = encoder(input_ids, attention_mask=attention_mask)
386
387 # Expand input ids if num_beams > 1 or num_return_sequences > 1
~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~\Anaconda3\lib\site-packages\transformers\modeling_t5.py in forward(self, input_ids, attention_mask, encoder_hidden_states, encoder_attention_mask, inputs_embeds, head_mask, past_key_value_states, use_cache, output_attentions, output_hidden_states, return_dict)
701 if inputs_embeds is None:
702 assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
--> 703 inputs_embeds = self.embed_tokens(input_ids)
704
705 batch_size, seq_length = input_shape
~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~\Anaconda3\lib\site-packages\torch\nn\modules\sparse.py in forward(self, input)
112 return F.embedding(
113 input, self.weight, self.padding_idx, self.max_norm,
--> 114 self.norm_type, self.scale_grad_by_freq, self.sparse)
115
116 def extra_repr(self):
~\Anaconda3\lib\site-packages\torch\nn\functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
1482 # remove once script supports set_grad_enabled
1483 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1484 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
1485
1486
RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select
I've searched this error and found some other threads like this one and this one, but they didn't help me much since their cases seem to be completely different. In my case there are no custom instances or classes created, so I don't know how to fix this or where the error comes from.
Could you please tell me where the error is coming from and how I could fix it?
Thank you very much in advance.
Try explicitly moving your model to the GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)
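Putting it together with the original script, a sketch of the relevant lines with the model weights and the tokenized input on the same device (everything else unchanged; the input text is a placeholder):
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# The input ids must live on the same device as the model weights.
tokenized_text = tokenizer.encode("summarize: ...", return_tensors="pt").to(device)
summary_ids = model.generate(tokenized_text,
                             num_beams=4,
                             no_repeat_ngram_size=2,
                             min_length=30,
                             max_length=100,
                             early_stopping=True)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))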

Using pretrained models from sparknlp on Databricks

I am trying to follow the official examples from John Snow Labs, but every time I get a TypeError: 'JavaPackage' object is not callable error. I followed all of the steps in the Databricks install documentation, but no matter which walkthrough I try, either this one or this one, it fails.
An example of the first (after doing the installs):
import sparknlp
from sparknlp.pretrained import *
pipeline = PretrainedPipeline('recognize_entities_dl', 'en')
recognize_entities_dl download started this may take some time.
TypeError: 'JavaPackage' object is not callable
TypeError Traceback (most recent call last)
<command-937510457011238> in <module>
----> 1 pipeline = PretrainedPipeline('recognize_entities_dl', 'en')
2
3 # ner_bert = NerDLModel.pretrained('ner_dl_bert')
4
5 # pipeline = PretrainedPipeline('recognize_entities_dl', 'en', 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ner_dl_bert_en_2.4.3_2.4_1584624951079.zip')
/databricks/python/lib/python3.7/site-packages/sparknlp/pretrained.py in __init__(self, name, lang, remote_loc, parse_embeddings, disk_location)
89 def __init__(self, name, lang='en', remote_loc=None, parse_embeddings=False, disk_location=None):
90 if not disk_location:
---> 91 self.model = ResourceDownloader().downloadPipeline(name, lang, remote_loc)
92 else:
93 self.model = PipelineModel.load(disk_location)
/databricks/python/lib/python3.7/site-packages/sparknlp/pretrained.py in downloadPipeline(name, language, remote_loc)
49 def downloadPipeline(name, language, remote_loc=None):
50 print(name + " download started this may take some time.")
---> 51 file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
52 if file_size == "-1":
53 print("Can not find the model to download please check the name!")
/databricks/python/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, name, language, remote_loc)
190 def __init__(self, name, language, remote_loc):
191 super(_GetResourceSize, self).__init__(
--> 192 "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize", name, language, remote_loc)
193
194
/databricks/python/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, java_obj, *args)
127 super(ExtendedJavaWrapper, self).__init__(java_obj)
128 self.sc = SparkContext._active_spark_context
--> 129 self._java_obj = self.new_java_obj(java_obj, *args)
130 self.java_obj = self._java_obj
131
/databricks/python/lib/python3.7/site-packages/sparknlp/internal.py in new_java_obj(self, java_class, *args)
137
138 def new_java_obj(self, java_class, *args):
--> 139 return self._new_java_obj(java_class, *args)
140
141 def new_java_array(self, pylist, java_class):
/databricks/spark/python/pyspark/ml/wrapper.py in _new_java_obj(java_class, *args)
65 java_obj = getattr(java_obj, name)
66 java_args = [_py2java(sc, arg) for arg in args]
---> 67 return java_obj(*java_args)
68
69 #staticmethod
TypeError: 'JavaPackage' object is not callable
I get a similar, if not the exact same, error if I try:
pipeline = PretrainedPipeline('recognize_entities_dl', 'en', 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ner_dl_bert_en_2.4.3_2.4_1584624951079.zip')
I also get the same error for the second example. The Databricks Runtime Version is: 6.5 (includes Apache Spark 2.4.5, Scala 2.11), which is on the list of approved runtimes.
I'm not sure what the error messages mean or how to resolve them.
I found out that 'JavaPackage' object is not callable is caused by the spark-nlp assembly jars being missing. So I made sure that these jars were downloaded and then placed on BOTH the executor and the driver. E.g.,
when building the Spark docker image do something like
RUN cd /opt/spark/jars && \
wget https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/spark-nlp-assembly-2.6.4.jar
and also on the driver image/machine make sure the jar exists in the local directory. Then set:
conf.set("spark.driver.extraClassPath", "/opt/spark/jars/spark-nlp-assembly-2.6.4.jar")
conf.set("spark.executor.extraClassPath", "/opt/spark/jars/spark-nlp-assembly-2.6.4.jar")
The solution for Databricks might be a bit different, so instead of baking in the jars you may need to host them on S3 and refer to them that way.
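For example, a hypothetical sketch of that approach (the bucket path is made up; spark.jars is a standard Spark setting that ships the jar to both the driver and the executors):
conf.set("spark.jars", "s3://my-bucket/jars/spark-nlp-assembly-2.6.4.jar")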

TypeError: 'JavaPackage' object is not callable for Xgboost in PySpark

I am trying to make the Scala XGBoost API available in my PySpark notebook, following this blog:
https://towardsdatascience.com/pyspark-and-xgboost-integration-tested-on-the-kaggle-titanic-dataset-4e75a568bdb
However, I keep running into the error below:
spark._jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator
<py4j.java_gateway.JavaPackage at 0x7fa650fe7a58>
from sparkxgb import XGBoostEstimator

xgboost = XGBoostEstimator(
    featuresCol="features",
    labelCol="Survival",
    predictionCol="prediction"
)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-18-1765fb9e3344> in <module>
4 featuresCol="features",
5 labelCol="Survival",
----> 6 predictionCol="prediction"
7 )
~/spark-assembly-2.4.0-twttr-kryo3-scala2128-hadoop2.9.2.t05/python/pyspark/__init__.py in wrapper(self, *args, **kwargs)
108 raise TypeError("Method %s forces keyword arguments." % func.__name__)
109 self._input_kwargs = kwargs
--> 110 return func(self, **kwargs)
111 return wrapper
112
~/local/spark-3536cd7a-6188-4ca8-b3d0-57d42cd01531/userFiles-0a0d90bc-96b4-43f2-bf21-00ae0e6f7309/sparkxgb.zip/sparkxgb/xgboost.py in __init__(self, checkpoint_path, checkpointInterval, missing, nthread, nworkers, silent, use_external_memory, baseMarginCol, featuresCol, labelCol, predictionCol, weightCol, base_score, booster, eval_metric, num_class, num_round, objective, seed, alpha, colsample_bytree, colsample_bylevel, eta, gamma, grow_policy, max_bin, max_delta_step, max_depth, min_child_weight, reg_lambda, scale_pos_weight, sketch_eps, subsample, tree_method, normalize_type, rate_drop, sample_type, skip_drop, lambda_bias)
113
114 super(XGBoostEstimator, self).__init__()
--> 115 self._java_obj = self._new_java_obj("ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator", self.uid)
116 self._create_params_from_java()
117 self._setDefault(
~/spark-assembly-2.4.0-twttr-kryo3-scala2128-hadoop2.9.2.t05/python/pyspark/ml/wrapper.py in _new_java_obj(java_class, *args)
65 java_obj = getattr(java_obj, name)
66 java_args = [_py2java(sc, arg) for arg in args]
---> 67 return java_obj(*java_args)
68
69 #staticmethod
TypeError: 'JavaPackage' object is not callable
I already googled this error and tried the things below. I got all the ideas from this GitHub issue: https://github.com/JohnSnowLabs/spark-nlp/issues/232
Make sure Xgboost4j is in the SPARK_DIST_CLASSPATH. Already checked.
$echo $SPARK_DIST_CLASSPATH | tr " " "\n" | grep 'xgboost4j' | rev | cut -d'/' -f1 | rev
xgboost4j-0.72.jar
xgboost4j-spark.72.jar
Make sure they are added to EXTRA_CLASSPATH. - Done
Updating configs.
'export PYSPARK_SUBMIT_ARGS="--conf spark.jars=$SPARK_HOME/jars/* --conf spark.driver.extraClassPath=$SPARK_HOME/jars/* --conf spark.executor.extraClassPath=$SPARK_HOME/jars/* pyspark-shell"',
Environment info:
Machine: Linux
Using Jupyter Notebook
Spark version 2.4.0
Python 3.6
I found the problem. The sparkxgb.zip (which I downloaded over the internet) was written for xgboost4j-0.72, but my jars were from xgboost4j-0.9, and the API has completely changed. As a result, the 0.9 version didn't have any class named ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator, hence the error. You can see the difference in the API below:
https://github.com/dmlc/xgboost/tree/release_0.72/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark
vs
https://github.com/dmlc/xgboost/tree/v0.90/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark
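As a quick sanity check, probing the JVM gateway directly (as in the snippet at the top of this question) tells you whether the class resolved: a py4j JavaPackage means the jar, or that version's class, is not on the classpath.
# Prints a JavaClass if resolvable; a JavaPackage means the class is missing.
print(spark._jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator)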

Torchtext AttributeError: 'Example' object has no attribute 'text_content'

I'm working with an RNN using PyTorch & Torchtext. I've got a problem with building the vocab. My code is as follows:
from torchtext.data import Field, LabelField, TabularDataset
import torch

# `tokenizer` is a custom tokenizing function defined elsewhere
TEXT = Field(tokenize=tokenizer, lower=True)
LABEL = LabelField(dtype=torch.float)

trainds = TabularDataset(
    path='drive/{}'.format(TRAIN_PATH), format='tsv',
    fields=[
        ('label_start', LABEL),
        ('label_end', None),
        ('title', None),
        ('symbol', None),
        ('text_content', TEXT),
    ])

testds = TabularDataset(
    path='drive/{}'.format(TEST_PATH), format='tsv',
    fields=[
        ('text_content', TEXT),
    ])

TEXT.build_vocab(trainds, testds)
When I want to build vocab, I'm getting this annoying error:
AttributeError: 'Example' object has no attribute 'text_content'
I'm sure that there is no missing text_content attribute. I wrote a try-except in order to display this specific case:
try:
    print(len(trainds[i]))
except:
    print(trainds[i].text_content)
Surprisingly, I don't get any error and this specific print command shows:
['znana', 'okresie', 'masarni', 'walc', 'y', 'myśl', 'programie', 'sprawy', ...]
So it indicates that the text_content attribute is there. When I run this on a smaller dataset, it works like a charm; the problem only occurs with the real data. I've run out of ideas. Maybe someone has had a similar case and can explain it.
My full traceback:
AttributeError Traceback (most recent call last)
<ipython-input-16-cf31866a07e7> in <module>()
155
156 if __name__ == "__main__":
--> 157 main()
158
<ipython-input-16-cf31866a07e7> in main()
117 break
118
--> 119 TEXT.build_vocab(trainds, testds)
120 print('zbudowano dla text')
121 LABEL.build_vocab(trainds)
/usr/local/lib/python3.6/dist-packages/torchtext/data/field.py in build_vocab(self, *args, **kwargs)
260 sources.append(arg)
261 for data in sources:
--> 262 for x in data:
263 if not self.sequential:
264 x = [x]
/usr/local/lib/python3.6/dist-packages/torchtext/data/dataset.py in __getattr__(self, attr)
152 if attr in self.fields:
153 for x in self.examples:
--> 154 yield getattr(x, attr)
155
156 #classmethod
AttributeError: 'Example' object has no attribute 'text_content'
This problem arises when the fields are not passed in the same order as they appear in the csv/tsv file. The order must be the same. Also check that you don't list more or fewer fields than the csv/tsv file actually has.
I had the same problem.
The reason was that some rows in my input csv dataset were empty.
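If empty rows are the culprit, a minimal sketch (assuming pandas; the file names are placeholders) that strips them before handing the tsv to TabularDataset:
import pandas as pd

# Drop rows where every column is empty, then write a cleaned tsv.
df = pd.read_csv('train.tsv', sep='\t')
df.dropna(how='all').to_csv('train_clean.tsv', sep='\t', index=False)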

TypeError: iteration over a 0-d array when trying to use TextLMDataBunch.from_csv in FastAI

The library expects utf-8. I tried to convert my us-ascii file to utf-8 using:
iconv -f us-ascii -t utf-8 src.csv > target.csv
When I did:
file -I target.csv
It still showed the charset as us-ascii. Then I found out that us-ascii is a subset of utf-8 and that file only guesses the file type.
However, if I use src.csv as input to TextLMDataBunch.from_csv(), it works. If I do:
cat src.csv > target.csv
and then use target.csv as input to the same function, it doesn't work and gives the following error:
TypeError Traceback (most recent call last)
<ipython-input-118-44bc7147d2a4> in <module>()
----> 1 data_lm = TextLMDataBunch.from_csv(sample_p, 'voila.csv')
/usr/local/lib/python3.6/dist-packages/fastai/text/data.py in from_csv(cls, path, csv_name, valid_pct, test, tokenizer, vocab, classes, header, text_cols, label_cols, label_delim, **kwargs)
180 test_df = None if test is None else pd.read_csv(Path(path)/test, header=header)
181 return cls.from_df(path, train_df, valid_df, test_df, tokenizer, vocab, classes, text_cols,
--> 182 label_cols, label_delim, **kwargs)
183
184 #classmethod
/usr/local/lib/python3.6/dist-packages/fastai/text/data.py in from_df(cls, path, train_df, valid_df, test_df, tokenizer, vocab, classes, text_cols, label_cols, label_delim, **kwargs)
165 src = ItemLists(path, TextList.from_df(train_df, path, cols=text_cols, processor=processor),
166 TextList.from_df(valid_df, path, cols=text_cols, processor=processor))
--> 167 src = src.label_for_lm() if cls==TextLMDataBunch else src.label_from_df(cols=label_cols, classes=classes, sep=label_delim)
168 if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
169 return src.databunch(**kwargs)
/usr/local/lib/python3.6/dist-packages/fastai/data_block.py in _inner(*args, **kwargs)
356 assert isinstance(fv, Callable)
357 def _inner(*args, **kwargs):
--> 358 self.train = ft(*args, **kwargs)
359 assert isinstance(self.train, LabelList)
360 self.valid = fv(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/fastai/text/data.py in label_for_lm(self, **kwargs)
285 "A special labelling method for language models."
286 self.__class__ = LMTextList
--> 287 return self.label_const(0, label_cls=LMLabel)
288
289 def reconstruct(self, t:Tensor):
/usr/local/lib/python3.6/dist-packages/fastai/data_block.py in label_const(self, const, **kwargs)
211 def label_const(self, const:Any=0, **kwargs)->'LabelList':
212 "Label every item with `const`."
--> 213 return self.label_from_func(func=lambda o: const, **kwargs)
214
215 def label_empty(self):
/usr/local/lib/python3.6/dist-packages/fastai/data_block.py in label_from_func(self, func, **kwargs)
219 def label_from_func(self, func:Callable, **kwargs)->'LabelList':
220 "Apply `func` to every input to get its label."
--> 221 return self.label_from_list([func(o) for o in self.items], **kwargs)
222
223 def label_from_folder(self, **kwargs)->'LabelList':
TypeError: iteration over a 0-d array
Can someone please tell me what is wrong? I am trying this on Google Colab, and I tried the character encoding changes both on Colab and on my Mac, with no results.
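One way to narrow this down (a diagnostic sketch, not a fix): the traceback shows from_csv going through pd.read_csv, so you can compare the two files byte-for-byte and check what pandas actually parses from each.
import pandas as pd

with open('src.csv', 'rb') as f1, open('target.csv', 'rb') as f2:
    print('identical bytes:', f1.read() == f2.read())

for name in ('src.csv', 'target.csv'):
    df = pd.read_csv(name)
    print(name, df.shape, df.dtypes.to_dict())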