Why does H2O import_file fail with col_names argument? - import

I get a strange error when I use the col_names= argument of h2o.import_file, even though setting the column names on a separate line afterwards works fine.
import os
import h2o
h2o.init() # It shows H2O_cluster_version 3.36.1.2 and Python version 3.9.7 final
os.system("wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.tr.bz2")
os.system("bzip2 -d ijcnn1.tr.bz2")
# These lines work
col_names = ['class'] + ['F' + str(i) for i in range(22)]
df1 = h2o.import_file(path="ijcnn1.tr")
df1.columns = col_names
# But this line does not work
df2 = h2o.import_file(path="ijcnn1.tr", col_names=col_names)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_20191/3817572867.py in <module>
----> 1 df2 = h2o.import_file(path="ijcnn1.tr", col_names=col_names)
~/anaconda3/lib/python3.9/site-packages/h2o/h2o.py in import_file(path, destination_frame, parse, header, sep, col_names, col_types, na_strings, pattern, skipped_columns, custom_non_data_line_markers, partition_by, quotechar, escapechar)
498 return lazy_import(path, pattern)
499 else:
--> 500 return H2OFrame()._import_parse(path, pattern, destination_frame, header, sep, col_names, col_types, na_strings,
501 skipped_columns, custom_non_data_line_markers, partition_by, quotechar, escapechar)
502
~/anaconda3/lib/python3.9/site-packages/h2o/frame.py in _import_parse(self, path, pattern, destination_frame, header, separator, column_names, column_types, na_strings, skipped_columns, custom_non_data_line_markers, partition_by, quotechar, escapechar)
459 path = os.path.abspath(path)
460 rawkey = h2o.lazy_import(path, pattern)
--> 461 self._parse(rawkey, destination_frame, header, separator, column_names, column_types, na_strings,
462 skipped_columns, custom_non_data_line_markers, partition_by, quotechar, escapechar)
463 return self
~/anaconda3/lib/python3.9/site-packages/h2o/frame.py in _parse(self, rawkey, destination_frame, header, separator, column_names, column_types, na_strings, skipped_columns, custom_non_data_line_markers, partition_by, quotechar, escapechar)
476 na_strings=None, skipped_columns=None, custom_non_data_line_markers=None, partition_by=None, quotechar=None,
477 escapechar=None):
--> 478 setup = h2o.parse_setup(rawkey, destination_frame, header, separator, column_names, column_types, na_strings,
479 skipped_columns, custom_non_data_line_markers, partition_by, quotechar, escapechar)
480 return self._parse_raw(setup)
~/anaconda3/lib/python3.9/site-packages/h2o/h2o.py in parse_setup(raw_frames, destination_frame, header, separator, column_names, column_types, na_strings, skipped_columns, custom_non_data_line_markers, partition_by, quotechar, escapechar)
872 % (len(column_names), parse_column_len))
873 else:
--> 874 if len(column_names) != len(j["column_types"]): raise ValueError(
875 "length of col_names should be equal to the number of columns: %d vs %d"
876 % (len(column_names), len(j["column_types"])))
ValueError: length of col_names should be equal to the number of columns: 23 vs 22

I think this is a bug (an off-by-one error) in the SVMLight parser, so I filed a bug here. For now, I'd recommend just naming the columns after you import the file. Thank you for the reproducible example and bug report!
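For completeness, the workaround on its own (a minimal sketch; H2OFrame.set_names should be equivalent to assigning to .columns as you already do):
import h2o
h2o.init()
# Import without col_names, then rename the columns afterwards.
col_names = ['class'] + ['F' + str(i) for i in range(22)]
df = h2o.import_file(path="ijcnn1.tr")
df.set_names(col_names)  # or: df.columns = col_names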

Related

JSONDecodeError when connecting to Dataverse with Databricks

When executing the code listed below to access an Azure Dataverse endpoint from Databricks, I get the following error:
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
The code is as follows:
list_data = call_dataverse_endpoint(f'https://{my_dv_orgName}.crm11.dynamics.com/api/data/v9.2/{my_dv_entity}')
The full error message is as follows:
JSONDecodeError Traceback (most recent call last)
<command-2234217625858083> in <module>
----> 1 list_data = call_dataverse_endpoint(f'https://{my_dv_orgName}.crm11.dynamics.com/api/data/v9.2/{my_dv_entity}')
<command-3028378077557710> in call_dataverse_endpoint(endpoint)
16
17 # Loop through the responses until odata.nextLink is gone.
---> 18 while "#odata.nextLink" in response.json():
19 # Append the data returned by the endpoint to the list
20 data.extend(response.json()["value"])
/databricks/python/lib/python3.8/site-packages/requests/models.py in json(self, **kwargs)
898 # used.
899 pass
--> 900 return complexjson.loads(self.text, **kwargs)
901
902 #property
/usr/lib/python3.8/json/__init__.py in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
355 parse_int is None and parse_float is None and
356 parse_constant is None and object_pairs_hook is None and not kw):
--> 357 return _default_decoder.decode(s)
358 if cls is None:
359 cls = JSONDecoder
/usr/lib/python3.8/json/decoder.py in decode(self, s, _w)
335
336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
338 end = _w(s, end).end()
339 if end != len(s):
/usr/lib/python3.8/json/decoder.py in raw_decode(self, s, idx)
353 obj, end = self.scan_once(s, idx)
354 except StopIteration as err:
--> 355 raise JSONDecodeError("Expecting value", s, err.value) from None
356 return obj, end
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
The function that is being called is as follows:
def call_dataverse_endpoint(endpoint):
    # Empty list
    data = []
    # Headers
    headers = {
        "Authorization": f"Bearer {my_dv_accessToken}",
        "Accept": "application/json",
        "Content-Type": "application/json; charset=utf-8"
    }
    # Initial request
    response = requests.get(endpoint, headers=headers)
    #print(print(f''' Fetching from initial url: {endpoint} ''' ))
    # Loop through the responses until odata.nextLink is gone.
    while "#odata.nextLink" in response.json():
        # Append the data returned by the endpoint to the list
        data.extend(response.json()["value"])
        # Request the odata.nextLink URL
        response = requests.get(response.json()["#odata.nextLink"], headers=headers)
        #print(f''' Fetching from successive url: {response.json()["#odata.nextLink"]} ''' )
    # Append nextLink response data
    data.extend(response.json()["value"])
    # Return
    return data
Any thoughts?
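A quick way to narrow this down is to look at what the endpoint actually returned before .json() is called (a sketch; the status and content-type checks are assumptions added for debugging, not part of the original function):
import requests
# Debugging sketch: inspect the raw response before parsing it as JSON.
# `endpoint` and `headers` are the same values used in call_dataverse_endpoint above.
response = requests.get(endpoint, headers=headers)
print(response.status_code)                   # auth failures often come back as non-JSON bodies
print(response.headers.get("Content-Type"))   # expect application/json
print(response.text[:500])                    # the start of the body that could not be parsed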

MongoClient insert_one works while mongoengine connect doesn't (unauthorized)

I try to insert a document using the mongoengine interface AFTER authenticating, but the insert is still denied. This doesn't happen when using MongoClient...
This is the mongoengine attempt to insert one document:
In [1]: from mongoengine import connect
In [2]: db = connect(host='localhost', port=27017, username='root', password='pass')
In [3]: db.local.col.insert_one({'a':1})
---------------------------------------------------------------------------
OperationFailure Traceback (most recent call last)
<ipython-input-3-55a23806fbb1> in <module>
----> 1 db.local.col.insert_one({'a':1})
~/venv3.8/lib/python3.8/site-packages/pymongo/collection.py in insert_one(self, document, bypass_document_validation, session)
696 write_concern = self._write_concern_for(session)
697 return InsertOneResult(
--> 698 self._insert(document,
699 write_concern=write_concern,
700 bypass_doc_val=bypass_document_validation,
~/venv3.8/lib/python3.8/site-packages/pymongo/collection.py in _insert(self, docs, ordered, check_keys, manipulate, write_concern, op_id, bypass_doc_val, session)
611 """Internal insert helper."""
612 if isinstance(docs, abc.Mapping):
--> 613 return self._insert_one(
614 docs, ordered, check_keys, manipulate, write_concern, op_id,
615 bypass_doc_val, session)
~/venv3.8/lib/python3.8/site-packages/pymongo/collection.py in _insert_one(self, doc, ordered, check_keys, manipulate, write_concern, op_id, bypass_doc_val, session)
600 _check_write_command_response(result)
601
--> 602 self.__database.client._retryable_write(
603 acknowledged, _insert_command, session)
604
~/venv3.8/lib/python3.8/site-packages/pymongo/mongo_client.py in _retryable_write(self, retryable, func, session)
1496 """Internal retryable write helper."""
1497 with self._tmp_session(session) as s:
-> 1498 return self._retry_with_session(retryable, func, s, None)
1499
1500 def _handle_getlasterror(self, address, error_msg):
~/venv3.8/lib/python3.8/site-packages/pymongo/mongo_client.py in _retry_with_session(self, retryable, func, session, bulk)
1382 retryable = (retryable and self.retry_writes
1383 and session and not session.in_transaction)
-> 1384 return self._retry_internal(retryable, func, session, bulk)
1385
1386 def _retry_internal(self, retryable, func, session, bulk):
~/venv3.8/lib/python3.8/site-packages/pymongo/mongo_client.py in _retry_internal(self, retryable, func, session, bulk)
1414 raise last_error
1415 retryable = False
-> 1416 return func(session, sock_info, retryable)
1417 except ServerSelectionTimeoutError:
1418 if is_retrying():
~/venv3.8/lib/python3.8/site-packages/pymongo/collection.py in _insert_command(session, sock_info, retryable_write)
588 command['bypassDocumentValidation'] = True
589
--> 590 result = sock_info.command(
591 self.__database.name,
592 command,
~/venv3.8/lib/python3.8/site-packages/pymongo/pool.py in command(self, dbname, spec, slave_ok, read_preference, codec_options, check, allowable_errors, check_keys, read_concern, write_concern, parse_write_concern_error, collation, session, client, retryable_write, publish_events, user_fields, exhaust_allowed)
681 self._raise_if_not_writable(unacknowledged)
682 try:
--> 683 return command(self, dbname, spec, slave_ok,
684 self.is_mongos, read_preference, codec_options,
685 session, client, check, allowable_errors,
~/venv3.8/lib/python3.8/site-packages/pymongo/network.py in command(sock_info, dbname, spec, slave_ok, is_mongos, read_preference, codec_options, session, client, check, allowable_errors, address, check_keys, listeners, max_bson_size, read_concern, parse_write_concern_error, collation, compression_ctx, use_op_msg, unacknowledged, user_fields, exhaust_allowed)
157 client._process_response(response_doc, session)
158 if check:
--> 159 helpers._check_command_response(
160 response_doc, sock_info.max_wire_version, None,
161 allowable_errors,
~/venv3.8/lib/python3.8/site-packages/pymongo/helpers.py in _check_command_response(response, max_wire_version, msg, allowable_errors, parse_write_concern_error)
165
166 msg = msg or "%s"
--> 167 raise OperationFailure(msg % errmsg, code, response,
168 max_wire_version)
169
OperationFailure: command insert requires authentication, full error: {'ok': 0.0, 'errmsg': 'command insert requires authentication', 'code': 13, 'codeName': 'Unauthorized'}
which fails, but the MongoClient works for some reason:
In [4]: from pymongo import MongoClient
In [5]: col = MongoClient(host='localhost', port=27017, username='root', password='pass')
In [6]: col.local.col.insert_one({'a':1})
Out[6]: <pymongo.results.InsertOneResult at 0x7ff2a347a8c0>
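One thing that might be worth trying (an assumption on my part, not verified against this setup): mongoengine's connect accepts an authentication_source argument, and pointing it at the admin database, where the root user is usually defined, is a common fix for this kind of "requires authentication" error:
from mongoengine import connect
# Sketch (assumption): authenticate against the 'admin' database explicitly.
db = connect(
    db='local',
    host='localhost',
    port=27017,
    username='root',
    password='pass',
    authentication_source='admin',
)
db.local.col.insert_one({'a': 1})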

Error Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select

I have the following code, taken directly from here, with some small modifications:
import pandas as pd
import torch
import json
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from torch import cuda
df = pd.read_pickle('df_final.pkl')
model = T5ForConditionalGeneration.from_pretrained('t5-base')
tokenizer = T5Tokenizer.from_pretrained('t5-base')
device = 'cuda' if cuda.is_available() else 'cpu'
text = ''.join(df[(df['col1'] == 'type') & (df['col2'] == 2)].col3.to_list())
preprocess_text = text.strip().replace("\n","")
t5_prepared_Text = "summarize: "+preprocess_text
#print ("original text preprocessed: \n", preprocess_text)
tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt", max_length = 500000).to(device)
# summmarize
summary_ids = model.generate(tokenized_text,
                             num_beams=4,
                             no_repeat_ngram_size=2,
                             min_length=30,
                             max_length=100,
                             early_stopping=True)
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print ("\n\nSummarized text: \n",output)
When executing the model.generate() part I get an error like this:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-12-e8e9819a85dc> in <module>
12 min_length=30,
13 max_length=100,
---> 14 early_stopping=True).to(device)
15
16 output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
~\Anaconda3\lib\site-packages\torch\autograd\grad_mode.py in decorate_no_grad(*args, **kwargs)
47 def decorate_no_grad(*args, **kwargs):
48 with self:
---> 49 return func(*args, **kwargs)
50 return decorate_no_grad
51
~\Anaconda3\lib\site-packages\transformers\generation_utils.py in generate(self, input_ids, max_length, min_length, do_sample, early_stopping, num_beams, temperature, top_k, top_p, repetition_penalty, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, length_penalty, no_repeat_ngram_size, num_return_sequences, attention_mask, decoder_start_token_id, use_cache, **model_specific_kwargs)
383 encoder = self.get_encoder()
384
--> 385 encoder_outputs: tuple = encoder(input_ids, attention_mask=attention_mask)
386
387 # Expand input ids if num_beams > 1 or num_return_sequences > 1
~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~\Anaconda3\lib\site-packages\transformers\modeling_t5.py in forward(self, input_ids, attention_mask, encoder_hidden_states, encoder_attention_mask, inputs_embeds, head_mask, past_key_value_states, use_cache, output_attentions, output_hidden_states, return_dict)
701 if inputs_embeds is None:
702 assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
--> 703 inputs_embeds = self.embed_tokens(input_ids)
704
705 batch_size, seq_length = input_shape
~\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~\Anaconda3\lib\site-packages\torch\nn\modules\sparse.py in forward(self, input)
112 return F.embedding(
113 input, self.weight, self.padding_idx, self.max_norm,
--> 114 self.norm_type, self.scale_grad_by_freq, self.sparse)
115
116 def extra_repr(self):
~\Anaconda3\lib\site-packages\torch\nn\functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
1482 # remove once script supports set_grad_enabled
1483 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1484 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
1485
1486
RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select
I've searched this error and found some other threads like this one and this one, but they didn't help me much since their cases seem to be completely different. In my case there are no custom instances or classes created, so I don't know how to fix this or where the error comes from.
Could you please tell me where the error is coming from and how I could fix it?
Thank you very much in advance.
Try explicitly moving your model to the GPU: the tokenized input is sent to the device with .to(device), but the model is never moved, so the embedding lookup gets CUDA input ids while the embedding weights are still on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)
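With the model on the GPU, a quick sanity check (just a sketch) is that the model parameters and the tokenized input report the same device before generate() is called:
# Both should print the same device (e.g. cuda:0) once the model has been moved.
print(next(model.parameters()).device)
print(tokenized_text.device)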

How to fix "float() argument must be a string or a number, not 'PngImageFile" error when keras calling numpy.asarray with dtype of float32

I am learning neural networks by building a multilayer perceptron for a binary classification problem, using Keras with TensorFlow as the backend.
Here is the source of image data.
I have followed this and this.
From the issues I found, I think the error is related to a corrupted image, but I tried the suggestions in those links and verified the images; they look fine to me, yet the error still persists.
The stack trace shows that the error occurred when Keras tried to convert the image data to a numpy array with dtype float32, so I tried converting an image to a numpy array myself: numpy.asarray(image) works, but numpy.asarray(image, dtype='float32'), which is what Keras does, does not.
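Here is that check in isolation (the path is just a placeholder for one of the dataset images):
import numpy
import PIL.Image

# Placeholder path to one of the dataset PNGs; this is the check described above.
image = PIL.Image.open('../data/breast_histopathology/10253_idx5_x1001_y1001_class0.png')
print(numpy.asarray(image))                   # works
print(numpy.asarray(image, dtype='float32'))  # fails like the call inside keras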
Assuming all imports are in place.
Here is the data preparation code:
image_data_path = '../data/breast_histopathology'
image_width = 50
image_height = 50
train_size_as_percentage = 0.8
validate_size_percentage_of_train_data = 0.1
data_extract_path = image_data_path + '_prep'
train_data_path = data_extract_path + '/training'
test_data_path = data_extract_path + '/testing'
validation_data_path = data_extract_path + '/validation'
if os.path.isdir(data_extract_path):
    shutil.rmtree(data_extract_path)
os.makedirs(train_data_path)
os.makedirs(train_data_path + '/0')
os.makedirs(train_data_path + '/1')
os.makedirs(test_data_path)
os.makedirs(test_data_path + '/0')
os.makedirs(test_data_path + '/1')
os.makedirs(validation_data_path)
os.makedirs(validation_data_path + '/0')
os.makedirs(validation_data_path + '/1')
image_paths = [image_path for image_path in glob.glob(image_data_path + '/**/*', recursive=True)]
random.seed(128)
random.shuffle(image_paths)
training_size = int(len(image_paths) * train_size_as_percentage)
training_image_paths = image_paths[:training_size]
testing_image_paths = image_paths[training_size:]
validation_size = int(len(training_image_paths) * validate_size_percentage_of_train_data)
validation_image_paths = training_image_paths[:validation_size]
training_image_paths = training_image_paths[validation_size:]
datasets = [
    (train_data_path, training_image_paths),
    (test_data_path, testing_image_paths),
    (validation_data_path, validation_image_paths)
]
for data_path, image_paths in datasets:
    for image_path in image_paths:
        filename = image_path.split(os.path.sep)[-1]
        # filename would be, 10253_idx5_x1001_y1001_class0.png,
        # the character before . and word after class are the
        # labeling for the image
        class_label = filename[-5:-4]
        copy_destination = '{}/{}/{}'.format(data_path, class_label, filename)
        if os.path.isfile(image_path):
            try:
                image = PIL.Image.open(image_path)
                image.verify()
                # print('=============')
                # print(filename)
                # print(image_path)
                # print(image)
                # print(image.size)
                # print(image.format)
                # print(image.mode)
                # print(image.verify())
                # print(numpy.asarray(image, dtype='float32'))
                # print('XXXXXXXXXXXXX')
                width, height = image.size
                if width == height == image_width and image.format == 'PNG':
                    shutil.copy2(image_path, copy_destination)
            except Exception as e:
                print(str(e))
                pass
Code to build and train the model
image_generator = keras_preprocessing.image.ImageDataGenerator()
train_data_generator = image_generator.flow_from_directory(
    directory=train_data_path,
    target_size=(image_width, image_height),
    color_mode='rgb',
    batch_size=32,
    class_mode='categorical',
    shuffle=True)
validation_data_generator = image_generator.flow_from_directory(
    directory=validation_data_path,
    target_size=(image_width, image_height),
    color_mode='rgb',
    batch_size=32,
    class_mode='categorical',
    shuffle=True)
test_data_generator = image_generator.flow_from_directory(
    directory=test_data_path,
    target_size=(image_width, image_height),
    color_mode='rgb',
    batch_size=1,
    class_mode='categorical',
    shuffle=False)
input_layer = keras_layers.Input(shape=(image_width, image_height))
hidden_layer_output_neuron = int((image_width + 1) / 2)
hidden_layer_0 = keras_layers.Dense(
    units=hidden_layer_output_neuron,
    activation=keras.activations.relu,
    use_bias=True)(input_layer)
hidden_layer_1_output_unit = 16
hidden_layer_1 = keras_layers.Dense(
    units=hidden_layer_1_output_unit,
    activation=keras.activations.relu,
    use_bias=True)(hidden_layer_0)
hidden_layer_2_output_unit = 8
hidden_layer_2 = keras_layers.Dense(units=hidden_layer_2_output_unit, activation=keras.activations.relu, use_bias=True)(hidden_layer_1)
output_layer = keras_layers.Dense(
    units=1,
    activation=keras.activations.relu,
    use_bias=True)(hidden_layer_0)
learning_rate = 0.001
model = keras_models.Model(inputs=input_layer, outputs=output_layer)
model.compile(
    optimizer=keras_optimizers.SGD(lr=learning_rate),
    loss=keras_losses.binary_crossentropy,
    metrics=[keras_metrics.Recall()])
model.fit_generator(
    generator=train_data_generator,
    steps_per_epoch=train_data_generator.n // train_data_generator.batch_size,
    validation_data=validation_data_generator,
    validation_steps=validation_data_generator.n // validation_data_generator.batch_size,
    epochs=100)
Expected result: No error
Actual result:
Epoch 1/100
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-85-e2ffe31934fb> in <module>
4 validation_data=validation_data_generator,
5 validation_steps=validation_data_generator.n // validation_data_generator.batch_size,
----> 6 epochs=10)
/usr/local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1513 shuffle=shuffle,
1514 initial_epoch=initial_epoch,
-> 1515 steps_name='steps_per_epoch')
1516
1517 def evaluate_generator(self,
/usr/local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_generator.py in model_iteration(model, data, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch, mode, batch_size, steps_name, **kwargs)
211 step = 0
212 while step < target_steps:
--> 213 batch_data = _get_next_batch(generator, mode)
214 if batch_data is None:
215 if is_dataset:
/usr/local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_generator.py in _get_next_batch(generator, mode)
353 """Retrieves the next batch of input data."""
354 try:
--> 355 generator_output = next(generator)
356 except (StopIteration, errors.OutOfRangeError):
357 return None
/usr/local/lib/python3.7/site-packages/tensorflow/python/keras/utils/data_utils.py in get(self)
653 except Exception: # pylint: disable=broad-except
654 self.stop()
--> 655 six.reraise(*sys.exc_info())
656
657
/usr/local/lib/python3.7/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
/usr/local/lib/python3.7/site-packages/tensorflow/python/keras/utils/data_utils.py in get(self)
647 try:
648 while self.is_running():
--> 649 inputs = self.queue.get(block=True).get()
650 self.queue.task_done()
651 if inputs is not None:
/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/pool.py in get(self, timeout)
655 return self._value
656 else:
--> 657 raise self._value
658
659 def _set(self, i, obj):
/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
119 job, i, func, args, kwds = task
120 try:
--> 121 result = (True, func(*args, **kwds))
122 except Exception as e:
123 if wrap_exception and func is not _helper_reraises_exception:
/usr/local/lib/python3.7/site-packages/tensorflow/python/keras/utils/data_utils.py in get_index(uid, i)
443 The value at index `i`.
444 """
--> 445 return _SHARED_SEQUENCES[uid][i]
446
447
/usr/local/lib/python3.7/site-packages/keras_preprocessing/image/iterator.py in __getitem__(self, idx)
63 index_array = self.index_array[self.batch_size * idx:
64 self.batch_size * (idx + 1)]
---> 65 return self._get_batches_of_transformed_samples(index_array)
66
67 def __len__(self):
/usr/local/lib/python3.7/site-packages/keras_preprocessing/image/iterator.py in _get_batches_of_transformed_samples(self, index_array)
225 target_size=self.target_size,
226 interpolation=self.interpolation)
--> 227 x = img_to_array(img, data_format=self.data_format)
228 # Pillow images should be closed after `load_img`,
229 # but not PIL images.
/usr/local/lib/python3.7/site-packages/keras_preprocessing/image/utils.py in img_to_array(img, data_format, dtype)
280 # or (channel, height, width)
281 # but original PIL image has format (width, height, channel)
--> 282 x = np.asarray(img, dtype=dtype)
283 if len(x.shape) == 3:
284 if data_format == 'channels_first':
/usr/local/lib/python3.7/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
TypeError: float() argument must be a string or a number, not 'PngImageFile'

TypeError: iteration over a 0-d array when trying to use TextLMDataBunch.from_csv in FastAI

The library expects utf-8. I tried to convert my us-ascii file to utf-8 using:
iconv -f us-ascii -t utf-8 src.csv > target.csv
When I did:
file -I target.csv
It still showed the charset as us-ascii. Then I found out that us-ascii is a subset of utf-8 and that the file command only guesses the encoding.
However, if I use src.csv as input to TextLMDataBunch.from_csv(), it works. If I do:
cat src.csv > target.csv
And then use target.csv as input to the same library, it doesn't work and gives the following error:
TypeError Traceback (most recent call last)
<ipython-input-118-44bc7147d2a4> in <module>()
----> 1 data_lm = TextLMDataBunch.from_csv(sample_p, 'voila.csv')
/usr/local/lib/python3.6/dist-packages/fastai/text/data.py in from_csv(cls, path, csv_name, valid_pct, test, tokenizer, vocab, classes, header, text_cols, label_cols, label_delim, **kwargs)
180 test_df = None if test is None else pd.read_csv(Path(path)/test, header=header)
181 return cls.from_df(path, train_df, valid_df, test_df, tokenizer, vocab, classes, text_cols,
--> 182 label_cols, label_delim, **kwargs)
183
184 #classmethod
/usr/local/lib/python3.6/dist-packages/fastai/text/data.py in from_df(cls, path, train_df, valid_df, test_df, tokenizer, vocab, classes, text_cols, label_cols, label_delim, **kwargs)
165 src = ItemLists(path, TextList.from_df(train_df, path, cols=text_cols, processor=processor),
166 TextList.from_df(valid_df, path, cols=text_cols, processor=processor))
--> 167 src = src.label_for_lm() if cls==TextLMDataBunch else src.label_from_df(cols=label_cols, classes=classes, sep=label_delim)
168 if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
169 return src.databunch(**kwargs)
/usr/local/lib/python3.6/dist-packages/fastai/data_block.py in _inner(*args, **kwargs)
356 assert isinstance(fv, Callable)
357 def _inner(*args, **kwargs):
--> 358 self.train = ft(*args, **kwargs)
359 assert isinstance(self.train, LabelList)
360 self.valid = fv(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/fastai/text/data.py in label_for_lm(self, **kwargs)
285 "A special labelling method for language models."
286 self.__class__ = LMTextList
--> 287 return self.label_const(0, label_cls=LMLabel)
288
289 def reconstruct(self, t:Tensor):
/usr/local/lib/python3.6/dist-packages/fastai/data_block.py in label_const(self, const, **kwargs)
211 def label_const(self, const:Any=0, **kwargs)->'LabelList':
212 "Label every item with `const`."
--> 213 return self.label_from_func(func=lambda o: const, **kwargs)
214
215 def label_empty(self):
/usr/local/lib/python3.6/dist-packages/fastai/data_block.py in label_from_func(self, func, **kwargs)
219 def label_from_func(self, func:Callable, **kwargs)->'LabelList':
220 "Apply `func` to every input to get its label."
--> 221 return self.label_from_list([func(o) for o in self.items], **kwargs)
222
223 def label_from_folder(self, **kwargs)->'LabelList':
TypeError: iteration over a 0-d array
Can someone please tell me what is wrong? I am trying this on Google Colab and tried the character encoding changes on Colab and on my Mac but with no results.
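One thing that might narrow this down is checking whether src.csv and target.csv actually differ at the byte level (a minimal sketch, using the filenames from the question):
# Compare the original and the copied CSV byte for byte.
with open('src.csv', 'rb') as a, open('target.csv', 'rb') as b:
    print(a.read() == b.read())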