I have a list of lists roots_3 = [['4', '5', '10'], ['11', '12', '13'], ['0', '17', '26'], ['1', '10', '15'], ['4', '19', '26'], ['11', '14', '22'],.... ]
and I want to turn all the values inside into integers, so I wrote a loop:
for i in range(len(roots_3)):
    for j in range(len(roots_3[i])):
        roots_3[i][j] = np.int(roots_3[i][j])
and this is the error I get:
ValueError: invalid literal for int() with base 10: '\x00\x00\x00...
Note that roots_3 is obtained from several text files like this:
for file in files_3:
    with open("/Users/stordd/Desktop/StageI2M/Leiden/k3/{}".format(file), 'r+', encoding="utf8", errors='ignore') as f:
        lines = f.readlines()
        z = []
        for line in lines:
            value = line.split()
            num_lines = len(f.readlines())
            z.append(value[1])
        roots_3.append(z)
        f.close()
I got the text files from a Python script running a C program:
start = time.time()
cmd = ["/Users/stordd/Desktop/StageI2M/C/forestenostre/grezza_foresta", "-w", "/Users/stordd/Desktop/StageI2M/Leiden/text_file/USA.txt", "-m", "3", "-e", "-0"]
ntrial = input("How many trials? ")
for i in range(int(ntrial)):
    # Open/Create the output file
    outFile = open("/Users/stordd/Desktop/StageI2M/Leiden/k3/{}.txt".format(i), 'ab')
    result = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    out = result.stdout.read()
    outFile.write(out)
    outFile.close()
    time.sleep(1)
end = time.time()
print(end - start)
and the output text files are very simple; they look like this:
radice 11
radice 14
radice 25
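For reference, a minimal conversion sketch (my assumption: the ValueError comes from stray NUL bytes like the '\x00' shown in the message, so they are stripped before calling the built-in int()):

# Hedged sketch: remove stray '\x00' bytes before converting each entry
for i in range(len(roots_3)):
    for j in range(len(roots_3[i])):
        cleaned = roots_3[i][j].replace('\x00', '').strip()
        if cleaned:  # skip entries that contained only NUL bytes
            roots_3[i][j] = int(cleaned)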
I am trying to follow a tutorial on how to make a deep learning chatbot with PyTorch. However, this code is quite complex for me, and it has stopped with an "IndexError: list index out of range". I looked the error up and get the gist of what it usually means, but since this code is very complex for me, I can't figure out how to solve the error.
This is the source tutorial: https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/chatbot_tutorial.ipynb#scrollTo=LTzdbPF-OBL9
Line 198 seems to be causing the error
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
This is the error log:
Start preparing training data ...
Reading lines...
Traceback (most recent call last):
  File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 221, in <module>
    voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
  File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 209, in loadPrepareData
    pairs = filterPairs(pairs)
  File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 202, in filterPairs
    return [pair for pair in pairs if filterPair(pair)]
  File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 202, in <listcomp>
    return [pair for pair in pairs if filterPair(pair)]
  File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 198, in filterPair
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
IndexError: list index out of range
Read 442563 sentence pairs
Process finished with exit code 1
And this is my code, copied from PyCharm, up to the block with the error. Since it is a huge amount of code, I could not copy all of it; the rest can be found in the GitHub source link above.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join("D:\Documents\Python\intents", corpus_name)
def printLines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)

printLines(os.path.join(corpus, "movie_lines.txt"))

# Splits each line of the file into a dictionary of fields
def loadLines(fileName, fields):
    lines = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]
            lines[lineObj['lineID']] = lineObj
    return lines

# Groups fields of lines from `loadLines` into conversations based on *movie_conversations.txt*
def loadConversations(fileName, lines, fields):
    conversations = []
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            convObj = {}
            for i, field in enumerate(fields):
                convObj[field] = values[i]
            # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]")
            lineIds = eval(convObj["utteranceIDs"])
            # Reassemble lines
            convObj["lines"] = []
            for lineId in lineIds:
                convObj["lines"].append(lines[lineId])
            conversations.append(convObj)
    return conversations

# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
    qa_pairs = []
    for conversation in conversations:
        # Iterate over all the lines of the conversation
        for i in range(len(conversation["lines"]) - 1):  # We ignore the last line (no answer for it)
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i+1]["text"].strip()
            # Filter wrong samples (if one of the lists is empty)
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])
    return qa_pairs
# Define path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")
delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))
# Initialize lines dict, conversations list, and field ids
lines = {}
conversations = []
MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
# Load lines and process conversations
print("\nProcessing corpus...")
lines = loadLines(os.path.join(corpus, "movie_lines.txt"), MOVIE_LINES_FIELDS)
print("\nLoading conversations...")
conversations = loadConversations(os.path.join(corpus, "movie_conversations.txt"),
                                  lines, MOVIE_CONVERSATIONS_FIELDS)
# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter)
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)
# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)
# Default word tokens
PAD_token = 0 # Used for padding short sentences
SOS_token = 1 # Start-of-sentence token
EOS_token = 2 # End-of-sentence token
class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count default tokens

        for word in keep_words:
            self.addWord(word)
MAX_LENGTH = 10 # Maximum sentence length to consider
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open(datafile, encoding='utf-8').\
        read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs
# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using filterPair condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]
# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs
# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)
MIN_COUNT = 3 # Minimum word count threshold for trimming
I really hope someone can help me fix this problem and help me understand why it happens.
In the end I changed
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
to
try:
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
except:
    return False
And now the code seems to be working.
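An alternative sketch (my own suggestion, not from the tutorial) is to check that the pair actually has two elements before indexing it, which drops only the malformed lines instead of swallowing every possible exception:

def filterPair(p):
    # Guard against malformed pairs, e.g. lines without a tab-separated response
    return len(p) == 2 and \
           len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH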
How to read numbers broken across two lines in MATLAB?
I am generating results in text files where some values are broken across two lines. Example:
text x = 1.
2345 text
What would code to read the value x = 1.2345 look like?
Suppose the value x = 1.2345 is in a file named name.txt.
When the value I'm looking for is not broken across lines:
text x = 1.2345 text
I use the following (working) code:
buffer = fileread('name.txt') ;
search = 'x = ' ;
local = strfind(buffer, search);
xvalue = sscanf(buffer(local(1,1)+numel(search):end), '%f', 1);
You can remove line breaks (and other "white space", if needed) before parsing the string:
>> str = sprintf('text x = 1.\n2345 text')
str =
'text x = 1.
2345 text'
>> str = regexprep(str, '\n', '')
str =
'text x = 1.2345 text'
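Combining this with the parsing code from the question, a sketch for reading name.txt could look like the following (assuming the whole file fits in memory and only line breaks need to be removed):

buffer = fileread('name.txt');
buffer = regexprep(buffer, '\r|\n', '');   % remove line breaks before parsing
search = 'x = ';
local  = strfind(buffer, search);
xvalue = sscanf(buffer(local(1,1)+numel(search):end), '%f', 1);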
I have the following string: ['', '+VZWRSRP: 64,6300,-101.70', '', 'OK', '']
I am trying to put everything after the second comma into the variable PCI, everything after the 3rd comma into the variable earfcn, and everything after the 4th comma into the variable RSRP.
As a start I wanted to test it with RSRP and the following regex:
cellinfo = ['', '+VZWRSRP: 64,6300,-101.70', '', 'OK', '']
rsrp = re.search('-(.+?)\'', cellinfo)
But somehow I can't get it working.
What's a good solution to achieve this?
I missed that the question is specifically about MicroPython; I haven't worked with that, so this answer works in normal Python.
import re
input_string = ", '+VZWRSRP: 64,6300,-101.70', '', 'OK', "
m = re.search(',.*?:(.*?),(.*?),(.*?),.*?,', input_string)
PCL = m.group(1)
earfcn = m.group(2)
RSRP = m.group(3)
returns:
PCL = 64
earfcn = 6300
RSRP = -101.70'
If you want the output to consist only of values that can be converted to integers or floats:
part = ".*?(-*\d+\.*\d*).*?"
m = re.search(',.*?:{},{},{},.*?,'.format(part,part,part), input_string)
Will do the trick.
If your string is '+VZWRSRP: 64,6300,-101.70', use
part = ".*?(-*\d+\.*\d*).*?"
m = re.search('.*?:{},{},{}'.format(part,part,part), input_string)
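Since cellinfo in the question is a list rather than one string, a small sketch (my own variant, assuming the '+VZWRSRP' element is always present) that picks out that element first and converts the captures to numbers might look like:

import re

cellinfo = ['', '+VZWRSRP: 64,6300,-101.70', '', 'OK', '']
# find the element that carries the measurement
line = next(s for s in cellinfo if s.startswith('+VZWRSRP'))
m = re.search(r'\+VZWRSRP:\s*(-?\d+),(-?\d+),(-?\d+\.?\d*)', line)
if m:
    pci = int(m.group(1))      # 64
    earfcn = int(m.group(2))   # 6300
    rsrp = float(m.group(3))   # -101.7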
I am having trouble creating a new document with mongoengine (Python 3). It does not seem possible to directly pass lists to ListFields.
I have the following setup:
# CONNECTION AND SETUP:
from mongoengine import *
connect('mongoengine_testing', host='localhost', port=27017)
class Chart(Document):
    instrument_ticker = StringField(max_length=40)
    chart_type = StringField(max_length=120)
    chart_name = StringField(max_length=120)
    x = ListField(StringField)
    y = ListField(StringField)
When I try to add a new Chart document like this it fails:
## THIS DOESN'T WORK:
chart = Chart(
    instrument_ticker = 'EURUSD',
    chart_type = 'weekday_avg',
    chart_name = 'Average Weekday',
    x = ['1', '2', '3', '4', '5'],
    y = ['13', '12', '24', '55', '32']
)
### ERROR MESSAGE
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-22-17d67eea1df7> in <module>()
4 chart_name = 'Average Weekday',
5 x = ['1', '2', '3', '4', '5'],
----> 6 y = ['13', '12', '24', '55', '32']
7 )
~/Development/python/mongoengine/mongo_env/lib/python3.4/site-packages/mongoengine/base/document.py in __init__(self, *args, **values)
113 field = self._fields.get(key)
114 if field and not isinstance(field, FileField):
--> 115 value = field.to_python(value)
116 setattr(self, key, value)
117 else:
~/Development/python/mongoengine/mongo_env/lib/python3.4/site-packages/mongoengine/base/fields.py in to_python(self, value)
324 self.field._auto_dereference = self._auto_dereference
325 value_dict = {key: self.field.to_python(item)
--> 326 for key, item in list(value.items())}
327 else:
328 Document = _import_class('Document')
~/Development/python/mongoengine/mongo_env/lib/python3.4/site-packages/mongoengine/base/fields.py in <dictcomp>(.0)
324 self.field._auto_dereference = self._auto_dereference
325 value_dict = {key: self.field.to_python(item)
--> 326 for key, item in list(value.items())}
327 else:
328 Document = _import_class('Document')
TypeError: to_python() missing 1 required positional argument: 'value'
But when I create it in multiple steps like this it works:
## THIS WORKS:
chart = Chart(
    instrument_ticker = 'EURUSD',
    chart_type = 'weekday_avg',
    chart_name = 'Average Weekday',
)
chart.x = ['1', '2', '3', '4', '5']
chart.y = ['13', '12', '24', '55', '32']
Is this expected behavior? Or what am I missing?
The problem was that I forgot the parentheses after StringField... ouch.
class Chart(Document):
    ...
    x = ListField(StringField())
    y = ListField(StringField())
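For completeness, a minimal sketch of the corrected model together with the original one-shot constructor (field values taken from the question, connection details assumed as above):

from mongoengine import Document, StringField, ListField, connect

connect('mongoengine_testing', host='localhost', port=27017)

class Chart(Document):
    instrument_ticker = StringField(max_length=40)
    chart_type = StringField(max_length=120)
    chart_name = StringField(max_length=120)
    # pass a field *instance*, so ListField knows how to convert each element
    x = ListField(StringField())
    y = ListField(StringField())

chart = Chart(
    instrument_ticker='EURUSD',
    chart_type='weekday_avg',
    chart_name='Average Weekday',
    x=['1', '2', '3', '4', '5'],
    y=['13', '12', '24', '55', '32'],
)
chart.save()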
I have been trying to write a program in Python that reads cell values from an Excel file, translates each cell's content from Estonian to English or Russian, and combines them into a single string. The results are printed to a text file. Estonian -> English seems to work fine, but with Russian, errors start appearing:
Traceback (most recent call last):
  File "erid.py", line 140, in <module>
    f.write(aNimed(row_index, 1, 'ru')+ '\n')
  File "erid.py", line 120, in aNimed
    nimi += komponendid[i].strip()
UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 8: ordinal not in range(128)

Traceback (most recent call last):
  File "erid.py", line 140, in <module>
    f.write(aNimed(row_index, 1, 'ru')+ '\n')
  File "erid.py", line 120, in aNimed
    nimi = nimi + komponendid[i][1:].strip()
UnicodeDecodeError: 'ascii' codec can't decode byte 0xd1 in position 9: ordinal not in range(128)
The first is triggered by the word "antibakteriaalne" and the second by " + hoobkäepide". I suspect that the "+" sign is the cause of the trouble in the second case, and not "ä". Some Russian characters seem to be a problem, while others aren't. I'm kind of out of ideas.
Python code:
# -*- coding: utf-8 -*-
from xlrd import open_workbook, cellname, XL_CELL_TEXT
from xlwt import Workbook
from xlutils.copy import copy
import sonaraamatud #dictionary
# open file with data
book = open_workbook('Datafile.xls')
# Safe write unicode to ignore unicode errors
# http://developer.plone.org/troubleshooting/unicode.html
def safe_write(failName, word):
    if type(word) == str:
        failName.write(word + '\n')
    else:
        failName.write(word.decode("utf-8") + '\n')

def safeDecode(word):
    if type(word) == str:
        word = unicode(word, 'utf-8', errors='ignore')
        return word
    else:
        word = unicode(word)
        return word
# Translate surface coating name
def translatePind(langa, langb, word):
    answ = ""
    if (sonaraamatud.kasOlemas3(langa, sonaraamatud.pinnaKatted) == True):
        answ = langa
        return answ
    # if langa is Estonian
    if (langa == 'et'):
        # if langb is English
        if (langb == 'en'):
            try:
                answ = sonaraamatud.pinnakattedEstEng[word]
            except KeyError:
                answ = word
        # if langb is Russian
        elif (langb == "ru"):
            try:
                answ = sonaraamatud.pinnakattedEngRus[sonaraamatud.pinnakattedEstEng[word]]
            except KeyError:
                answ = word
    # if langa is English
    elif (langa == "en"):
        # if langb is Estonian
        if (langb == "et"):
            try:
                answ = sonaraamatud.pinnakattedEngEst[word]
            except KeyError:
                answ = word
        # if langb is Russian
        elif (langb == "ru"):
            try:
                answ = sonaraamatud.pinnakattedEngRus[word]
            except KeyError:
                answ = "KeyError"
    return answ
def aNimed(row, sheetNr, lang):
    # Function combines the name
    # name: aNimed
    # #param: row, sheet number
    # #return: product name
    # select worksheet
    sheet = book.sheet_by_index(sheetNr)  # sheetNr
    komponendid = []
    nimi = ""
    if (lang == 'et'):
        komponendid.append(str(sheet.cell(row, 5).value))  # model
        komponendid.append('(' + sheet.cell(row, 6).value + ')')  # surface
        komponendid.append(sheet.cell(row, 7).value)  # extras
    elif (lang == 'en'):
        komponendid.append(str(sheet.cell(row, 5).value))  # model
        komponendid.append('(' + translatePind('et', 'en', sheet.cell(row, 6).value) + ')')
        komponendid.append(sheet.cell(row, 7).value)  # extras
    elif (lang == 'ru'):
        """
        Alternative method trying to use safeDecode, NOT working!
        komponendid.append(str(safeDecode(sheet.cell(row, 5).value)))  # model
        surface = safeDecode(sheet.cell(row, 6).value)
        komponendid.append('(' + translatePind('et', 'ru', str(surface)) + ')')
        komponendid.append(safeDecode(sheet.cell(row, 7).value))  # extras
        """
        komponendid.append(str(sheet.cell(row, 5).value))  # model
        komponendid.append('(' + translatePind('et', 'ru', sheet.cell(row, 6).value) + ')')
        komponendid.append(sheet.cell(row, 7).value)  # extras
    pikkus = len(komponendid)
    print(komponendid)
    for i in range(0, pikkus):
        if (komponendid[i] == "" or komponendid[i] == "()" or komponendid[i] == " "):
            i += 1
            continue
        elif (i == pikkus-1 and komponendid[i][0] != " "):
            print("1" + komponendid[i])
            nimi += komponendid[i].strip()
            i += 1
        elif (komponendid[i][0] == " " and komponendid[i][1] == "+"):
            # print("2" + komponendid[i])
            nimi = nimi + komponendid[i][1:].strip()
            i += 1
        else:
            # print("4" + komponendid[i])
            nimi = nimi + komponendid[i].strip() + " "
            i += 1
    return nimi
# Use: aNimed(row, sheetNr, lang)
sheet = book.sheet_by_index(7)
f = open('data.txt', 'w')
for row_index in range(1, sheet.nrows):
    # print(aNimed(row_index, 5, 'en'))
    f.write(aNimed(row_index, 1, 'ru') + '\n')
    # safe_write(f, aNimed(row_index, 1, 'ru'))
f.close()
It's not particularly elegant, but I think I have a workaround. Instead of reading from an Excel file, read from a CSV file. For example:
import csv

data = []
opened_file = open(csv_filename, 'rb')
reader = csv.reader(opened_file)
for row in reader:
    data.append(row)
opened_file.close()
Now you have your data saved as a list. Do your translating and save it as a different list, say translated_data. Now, and this is the key, you can open a new workbook with
from xlwt import Workbook

book = Workbook(encoding="utf8")
foo = book.add_sheet("foo")
for row_num in range(len(translated_data)):
    for col_num in range(len(translated_data[row_num])):
        foo.write(row_num, col_num, translated_data[row_num][col_num])
book.save("filename.xls")
The key is that you can specify the encoding if you use Workbook(), but if you use open_workbook(), it seems like you're stuck with ascii.
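Putting the two pieces together, an end-to-end sketch of this workaround (my assumptions: the spreadsheet has been exported to datafile.csv, and translate_row() is a placeholder for the dictionary lookups from sonaraamatud):

# -*- coding: utf-8 -*-
# Sketch of the CSV workaround described above (Python 2 / xlwt).
import csv
from xlwt import Workbook

def translate_row(row):
    return row  # placeholder: apply the sonaraamatud lookups here

data = []
with open('datafile.csv', 'rb') as opened_file:  # 'rb' for the Python 2 csv module
    for row in csv.reader(opened_file):
        data.append(row)

translated_data = [translate_row(row) for row in data]

book = Workbook(encoding="utf8")  # setting the encoding here is the key point
foo = book.add_sheet("foo")
for row_num, row in enumerate(translated_data):
    for col_num, value in enumerate(row):
        foo.write(row_num, col_num, value)
book.save("filename.xls")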