How to convert a text file to a dictionary in python? - python-3.7

I have the following text file.
Rennes 244
Breast 244
Caen 176
Nantes 107
Paris 348
Calais 120
Paris 241
Rennes 176
Caen 120
Nancy 534
Paris 297
I am trying to convert this to a dictionary with the capitalized words as the keys. It should look like this:
roads = {'BREST': ['Rennes'],
'RENNES': ['Brest', 'Caen', 'Nantes', 'Paris'],
'CAEN': ['Calais', 'Paris', 'Rennes'],
'CALAIS': ['Caen', 'Nancy', 'Paris']

Assuming that you are reading from a file called input.txt, this produces the desired result.
from collections import defaultdict
d = defaultdict(list)
with open('input.txt', 'r') as f:
for line in
if not line: continue
if line.endswith(':'):
name = line.strip(':')
If you want to keep the numbers, you can create a dictionary for each entry in the file and store each contact with its associated number.
from collections import defaultdict
d = defaultdict(dict)
with open('input.txt', 'r') as f:
for line in
if not line: continue
if line.endswith(':'):
name = line.strip(':')
contact, number = line.split(' ')
d[name][contact] = int(number)
Which produces the following dictionary.
{'BREST': {'Rennes': 244},
'CAEN': {'Calais': 120, 'Paris': 241, 'Rennes': 176},
'CALAIS': {'Caen': 120, 'Nancy': 534, 'Paris': 297},
'RENNES': {'Breast': 244, 'Caen': 176, 'Nantes': 107, 'Paris': 348}}


Pyspark Cosine similarity Invalid argument, not a string or column

I am trying to calculate the cosine distance between two columns, title and headline, using a pre-trained BERT model, as shown below:
Dance Gavin Dance bass player Tim Feerick dead at 34
Prince Harry and Meghan Markle make secret visit to see Queen ahead of Invictus Games
["Dance Gavin Dance bass player Tim Feerick dead at 34"]
["Prince Harry and Meghan Markle make secret visit to see Queen ahead of Invictus Games"]
["Dance Gavin Dance bass player Tim Feerick dead at 34", "Prince Harry and Meghan Markle make secret visit to see Queen ahead of Invictus Games"]
# downloading bert
model = SentenceTransformer('bert-base-nli-mean-tokens')
from sentence_transformers import SentenceTransformer
import numpy as np
from pyspark.sql.types import FloatType
import pyspark.sql.functions as f
def cosine_similarity(sentence_embeddings, ind_a, ind_b):
s = sentence_embeddings
return[ind_a], s[ind_b]) / (np.linalg.norm(s[ind_a]) * np.linalg.norm(s[ind_b]))
#udf_bert = udf(cosine_similarity, FloatType())
s0 = "our president is a good leader he will not fail"
s1 = "our president is not a good leader he will fail"
s2 = "our president is a good leader"
s3 = "our president will succeed"
sentences = [s0, s1, s2, s3]
sentence_embeddings = model.encode(sentences)
s = sentence_embeddings
print(f"{s0} <--> {s1}: {udf_bert(sentence_embeddings, 0, 1)}")
print(f"{s0} <--> {s2}: {cosine_similarity(sentence_embeddings, 0, 2)}")
print(f"{s0} <--> {s3}: {cosine_similarity(sentence_embeddings, 0, 3)}")
test_df = test_df.withColumn("Similarities", (cosine_similarity(model.encode(test_df.arrayed), 0, 1))
As the example shows, the algorithm takes the concatenation of two arrays of strings and calculates the cosine distances between them.
When I run the function only on the sample texts (the part that is commented out), it works. But when I try to apply it to my dataframe by registering it as a UDF and calling it with the dataframe, I get the error below:
TypeError Traceback (most recent call last)
<command-757165186581086> in <module>
26 '''''
---> 28 test_df = test_df.withColumn("Similarities", f.lit(cosine_similarity(model.encode(test_df.arrayed), 0, 1)))
/databricks/spark/python/pyspark/sql/ in wrapper(*args)
197 #functools.wraps(self.func, assigned=assignments)
198 def wrapper(*args):
--> 199 return self(*args)
201 wrapper.__name__ = self._name
/databricks/spark/python/pyspark/sql/ in __call__(self, *cols)
177 judf = self._judf
178 sc = SparkContext._active_spark_context
--> 179 return Column(judf.apply(_to_seq(sc, cols, _to_java_column)))
181 # This function is for improving the online help system in the interactive interpreter.
/databricks/spark/python/pyspark/sql/ in _to_seq(sc, cols, converter)
60 """
61 if converter:
---> 62 cols = [converter(c) for c in cols]
63 return sc._jvm.PythonUtils.toSeq(cols)
/databricks/spark/python/pyspark/sql/ in <listcomp>(.0)
60 """
61 if converter:
---> 62 cols = [converter(c) for c in cols]
63 return sc._jvm.PythonUtils.toSeq(cols)
/databricks/spark/python/pyspark/sql/ in _to_java_column(col)
44 jcol = _create_column_from_name(col)
45 else:
---> 46 raise TypeError(
47 "Invalid argument, not a string or column: "
48 "{0} of type {1}. "
TypeError: Invalid argument, not a string or column: [-0.29246375 0.02216947 0.610355 -0.02230968 0.61386955 0.15291359]
The input of a UDF must be a Column or a column name, which is why Spark is complaining: Invalid argument, not a string or column: [-0.29246375 0.02216947 0.610355 -0.02230968 0.61386955 0.15291359]. You need to pass only the arrayed column and reference the model inside your UDF, something like this:
def cosine_similarity(sentence_embeddings, ind_a, ind_b):
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')
s = model.encode(arrayed)
return[ind_a], s[ind_b]) / (np.linalg.norm(s[ind_a]) * np.linalg.norm(s[ind_b]))
test_df = test_df.withColumn("Similarities", (cosine_similarity(test_df.arrayed, 0, 1))

Problem with pytorch dataset.imageFolder with custom dataset in Google Colab

I am trying to load a dataset for a classification task using pytorch, this is the code i use:
data_transforms = {
'train': transforms.Compose([
transforms.Normalize((0.5), (0.5))
'valid': transforms.Compose([
transforms.Normalize((0.5), (0.5))
# TODO: Load the datasets with ImageFolder
image_datasets = {x: datasets.ImageFolder(os.path.join("/content/drive/MyDrive/DatasetPersonale", x),
for x in ['train', 'valid']}
# TODO: Using the image datasets and the trainforms, define the dataloaders
batch_size = 32
dataloaders = {x:[x], batch_size=batch_size,
shuffle=True, num_workers=4)
for x in ['train', 'valid']}
class_names = image_datasets['train'].classes
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'valid']}
The code worked fine, but because my dataset was in grayscale, I needed to convert it to RGB, so I used this code:
rootdir = '/content/drive/MyDrive/DatasetPersonale/trainRGB'
for subdir, dirs, files in os.walk(rootdir):
for file in files:
filePath = os.path.join(subdir, file)
name = os.path.basename(filePath), mode="r")
if img.mode != "RGB":
My images are still JPEG, but they are now in RGB mode instead of L. The problem is that when I rerun the code that loads the dataset, I get this error:
FileNotFoundError Traceback (most recent call last)
<ipython-input-15-3dace4b0f21b> in <module>()
19 image_datasets = {x: datasets.ImageFolder(os.path.join("/content/drive/MyDrive/DatasetPersonale", x),
20 data_transforms[x])
---> 21 for x in ['trainRGB', 'validRGB']}
23 # TODO: Using the image datasets and the trainforms, define the dataloaders
4 frames
<ipython-input-15-3dace4b0f21b> in <dictcomp>(.0)
19 image_datasets = {x: datasets.ImageFolder(os.path.join("/content/drive/MyDrive/DatasetPersonale", x),
20 data_transforms[x])
---> 21 for x in ['trainRGB', 'validRGB']}
23 # TODO: Using the image datasets and the trainforms, define the dataloaders
/usr/local/lib/python3.7/dist-packages/torchvision/datasets/ in __init__(self, root, transform, target_transform, loader, is_valid_file)
311 transform=transform,
312 target_transform=target_transform,
--> 313 is_valid_file=is_valid_file)
314 self.imgs = self.samples
/usr/local/lib/python3.7/dist-packages/torchvision/datasets/ in __init__(self, root, loader, extensions, transform, target_transform, is_valid_file)
144 target_transform=target_transform)
145 classes, class_to_idx = self.find_classes(self.root)
--> 146 samples = self.make_dataset(self.root, class_to_idx, extensions, is_valid_file)
148 self.loader = loader
/usr/local/lib/python3.7/dist-packages/torchvision/datasets/ in make_dataset(directory, class_to_idx, extensions, is_valid_file)
190 "The class_to_idx parameter cannot be None."
191 )
--> 192 return make_dataset(directory, class_to_idx, extensions=extensions, is_valid_file=is_valid_file)
194 def find_classes(self, directory: str) -> Tuple[List[str], Dict[str, int]]:
/usr/local/lib/python3.7/dist-packages/torchvision/datasets/ in make_dataset(directory, class_to_idx, extensions, is_valid_file)
100 if extensions is not None:
101 msg += f"Supported extensions are: {', '.join(extensions)}"
--> 102 raise FileNotFoundError(msg)
104 return instances
FileNotFoundError: Found no valid file for the classes .ipynb_checkpoints. Supported extensions are: .jpg, .jpeg, .png, .ppm, .bmp, .pgm, .tif, .tiff, .webp
Does someone know why this error appears? I checked the extension of all the files and they are jpeg.
Thank you.
Problem: This happens because of the .ipynb_checkpoints folder inside /content/drive/MyDrive/DatasetPersonale/trainRGB. ImageFolder treats every subfolder of the root as a class, and this folder contains no files that can be read as images with a valid extension (.jpg, .jpeg, .png, .ppm, .bmp, .pgm, .tif, .tiff, .webp).
Solution: You can save all your images in a subfolder named 'images' and then change your root folder to /content/drive/MyDrive/DatasetPersonale/trainRGB/images, so that the .ipynb_checkpoints folder is not read along with your images.

Error Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select

I have the following code taken directly from here with some pretty little modifications:
import pandas as pd
import torch
import json
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from torch import cuda
df = pd.read_pickle('df_final.pkl')
model = T5ForConditionalGeneration.from_pretrained('t5-base')
tokenizer = T5Tokenizer.from_pretrained('t5-base')
device = 'cuda' if cuda.is_available() else 'cpu'
text = ''.join(df[(df['col1'] == 'type') & (df['col2'] == 2)].col3.to_list())
preprocess_text = text.strip().replace("\n","")
t5_prepared_Text = "summarize: "+preprocess_text
#print ("original text preprocessed: \n", preprocess_text)
tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt", max_length = 500000).to(device)
# summmarize
summary_ids = model.generate(tokenized_text,
output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print ("\n\nSummarized text: \n",output)
When executing the model.generate() part, I get an error like this:
RuntimeError Traceback (most recent call last)
<ipython-input-12-e8e9819a85dc> in <module>
12 min_length=30,
13 max_length=100,
---> 14 early_stopping=True).to(device)
16 output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
~\Anaconda3\lib\site-packages\torch\autograd\ in decorate_no_grad(*args, **kwargs)
47 def decorate_no_grad(*args, **kwargs):
48 with self:
---> 49 return func(*args, **kwargs)
50 return decorate_no_grad
~\Anaconda3\lib\site-packages\transformers\ in generate(self, input_ids, max_length, min_length, do_sample, early_stopping, num_beams, temperature, top_k, top_p, repetition_penalty, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, length_penalty, no_repeat_ngram_size, num_return_sequences, attention_mask, decoder_start_token_id, use_cache, **model_specific_kwargs)
383 encoder = self.get_encoder()
--> 385 encoder_outputs: tuple = encoder(input_ids, attention_mask=attention_mask)
387 # Expand input ids if num_beams > 1 or num_return_sequences > 1
~\Anaconda3\lib\site-packages\torch\nn\modules\ in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~\Anaconda3\lib\site-packages\transformers\ in forward(self, input_ids, attention_mask, encoder_hidden_states, encoder_attention_mask, inputs_embeds, head_mask, past_key_value_states, use_cache, output_attentions, output_hidden_states, return_dict)
701 if inputs_embeds is None:
702 assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
--> 703 inputs_embeds = self.embed_tokens(input_ids)
705 batch_size, seq_length = input_shape
~\Anaconda3\lib\site-packages\torch\nn\modules\ in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~\Anaconda3\lib\site-packages\torch\nn\modules\ in forward(self, input)
112 return F.embedding(
113 input, self.weight, self.padding_idx, self.max_norm,
--> 114 self.norm_type, self.scale_grad_by_freq, self.sparse)
116 def extra_repr(self):
~\Anaconda3\lib\site-packages\torch\nn\ in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
1482 # remove once script supports set_grad_enabled
1483 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1484 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select
I've searched for this error and found some other threads like this one and this one, but they didn't help me much since their cases seem to be completely different. In my case there are no custom instances or classes created, so I don't know how to fix this or where the error comes from.
Could you please tell me where is the error coming from and how could i fix it?
Thank you very much in advance.
Try explicitly moving your model to the GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

Error with show variable in data viewer for jupyter notebook

In the recent VS Code release, they added this feature to view the active variables in the Jupyter Notebook and also, view the values in the variable with Data Viewer.
However, every time I try to view the values in the Data Viewer, VS Code throws the error below. It says the reason is that the object's data type is int64 and not a string, but I am sure that should not be a reason to not show the variable. Is anyone facing a similar issue? I tried with a simple data frame and it works fine.
Error: Failure during variable extraction:
TypeError Traceback (most recent call last)
<ipython-input-16-eae5f1f55b35> in <module>
98 # Transform this back into a string
---> 99 print(_VSCODE_json.dumps(_VSCODE_targetVariable))
100 del _VSCODE_targetVariable
~/anaconda3/lib/python3.7/json/ in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
229 cls is None and indent is None and separators is None and
230 default is None and not sort_keys and not kw):
--> 231 return _default_encoder.encode(obj)
232 if cls is None:
233 cls = JSONEncoder
~/anaconda3/lib/python3.7/json/ in encode(self, o)
197 # exceptions aren't as detailed. The list call should be roughly
198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
200 if not isinstance(chunks, (list, tuple)):
201 chunks = list(chunks)
~/anaconda3/lib/python3.7/json/ in iterencode(self, o, _one_shot)
255 self.key_separator, self.item_separator, self.sort_keys,
256 self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)
259 def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
~/anaconda3/lib/python3.7/json/ in default(self, o)
178 """
--> 179 raise TypeError(f'Object of type {o.__class__.__name__} '
180 f'is not JSON serializable')
TypeError: Object of type int64 is not JSON serializable

Torchtext AttributeError: 'Example' object has no attribute 'text_content'

I'm working with RNN and using Pytorch & Torchtext. I've got a problem with building vocab in my RNN. My code is as follows:
TEXT = Field(tokenize=tokenizer, lower=True)
LABEL = LabelField(dtype=torch.float)
trainds = TabularDataset(
path='drive/{}'.format(TRAIN_PATH), format='tsv',
('label_start', LABEL),
('label_end', None),
('title', None),
('symbol', None),
('text_content', TEXT),
testds = TabularDataset(
path='drive/{}'.format(TEST_PATH), format='tsv',
('text_content', TEXT),
TEXT.build_vocab(trainds, testds)
When I want to build vocab, I'm getting this annoying error:
AttributeError: 'Example' object has no attribute 'text_content'
I'm sure that the text_content attribute is not missing. I added a try/except block in order to display this specific case:
Surprisingly, I don't get any error and this specific print command shows:
['znana', 'okresie', 'masarni', 'walc', 'y', 'myśl', 'programie', 'sprawy', ...]
So this indicates that the text_content attribute is there. When I run this on a smaller dataset, it works like a charm. The problem occurs only when I work with the full dataset. I have run out of ideas. Maybe someone has had a similar case and can explain it.
My full traceback:
AttributeError Traceback (most recent call last)
<ipython-input-16-cf31866a07e7> in <module>()
156 if __name__ == "__main__":
--> 157 main()
<ipython-input-16-cf31866a07e7> in main()
117 break
--> 119 TEXT.build_vocab(trainds, testds)
120 print('zbudowano dla text')
121 LABEL.build_vocab(trainds)
/usr/local/lib/python3.6/dist-packages/torchtext/data/ in build_vocab(self, *args, **kwargs)
260 sources.append(arg)
261 for data in sources:
--> 262 for x in data:
263 if not self.sequential:
264 x = [x]
/usr/local/lib/python3.6/dist-packages/torchtext/data/ in __getattr__(self, attr)
152 if attr in self.fields:
153 for x in self.examples:
--> 154 yield getattr(x, attr)
156 #classmethod
AttributeError: 'Example' object has no attribute 'text_content'
This problem arises when the fields are not passed in the same order as they appear in the csv/tsv file. The order must be the same. Also check that you have not listed more or fewer fields than there are in the csv/tsv file.
I had the same problem.
The reason was that some rows in my input csv dataset were empty.