How do you order annotations by offset in brat?

When using the rapid annotation tool brat, the created annotations file presents the annotations in the order in which they were performed by the user. If you start at the beginning of a document and work through to the end, the annotations will naturally be in the correct offset order. However, if you need to go back earlier in the document and add another annotation, the offset order of the annotations in the output .ann file will be wrong.
How, then, can you rearrange the .ann file so that the annotations are in offset order when you are done? Is there some option within brat that allows you to do this, or does one have to write their own script?

Hearing nothing, I wrote a Python script to accomplish what I had set out to do. First, I reorder all annotations by begin index. Second, I resequence the label numbers so that they are once again in ascending order.
import optparse, sys

splitchar1 = '\t'
splitchar2 = ' '

# for brat, overlapping annotations are not permitted (or at least a warning
# is generated), so we can use this simplification and sort on the begin
# offset alone. it is probably a good idea anyway.
class AnnotationRecord:
    label = 'T0'
    type = ''
    begin = -1
    end = -1
    text = ''

    def __repr__(self):
        # parentheses keep this a single expression across lines
        return (self.label + splitchar1
                + self.type + splitchar2
                + str(self.begin) + splitchar2
                + str(self.end) + splitchar1 + self.text)

def create_record(parts):
    record = AnnotationRecord()
    record.label = parts[0]
    middle_parts = parts[1].split(splitchar2)
    record.type = middle_parts[0]
    record.begin = middle_parts[1]
    record.end = middle_parts[2]
    record.text = parts[2]  # keeps the trailing newline, so none is added on write
    return record

def main(filename, out_filename):
    fo = open(filename, 'r')
    lines = fo.readlines()
    fo.close()

    annotation_records = []
    for line in lines:
        parts = line.split(splitchar1)
        annotation_records.append(create_record(parts))

    # sort based upon begin offset
    sorted_records = sorted(annotation_records, key=lambda a: int(a.begin))

    # now relabel T1, T2, ... based upon the sorted order
    label_value = 1
    for sorted_record in sorted_records:
        sorted_record.label = 'T' + str(label_value)
        label_value += 1

    # now write the resulting file to disk
    fo = open(out_filename, 'w')
    for sorted_record in sorted_records:
        fo.write(sorted_record.__repr__())
    fo.close()

# format of the .ann file is: T<n> <Type> <Start> <End> <Text>
# args are input file, output file
if __name__ == '__main__':
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   usage=globals()['__doc__'],
                                   version='$Id$')
    parser.add_option('-v', '--verbose', action='store_true',
                      default=False, help='verbose output')
    (options, args) = parser.parse_args()
    if len(args) < 2:
        parser.error('missing argument')
    main(args[0], args[1])
    sys.exit(0)
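To illustrate, here is a hypothetical .ann fragment (the labels, types, offsets, and text are invented for this example). If the second annotation was added later in the session but occurs earlier in the document, the input might look like:
T1	Person 40 45	Alice
T2	Person 10 13	Bob
After running the script, the annotations are sorted by begin offset and relabeled:
T1	Person 10 13	Bob
T2	Person 40 45	Alice
Assuming the script is saved as reorder_ann.py (a name chosen here for illustration), it would be run as:
python reorder_ann.py input.ann output.ann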


Deep learning chatbot: IndexError: list index out of range

I am trying to follow a tutorial on how to make a deep learning chatbot with PyTorch. However, this code is quite complex for me, and it has stopped with an "IndexError: list index out of range". I looked the error up and get the gist of what it usually means, but seeing as this code is very complex for me, I can't figure out how to solve it.
This is the source tutorial: https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/chatbot_tutorial.ipynb#scrollTo=LTzdbPF-OBL9
Line 198 seems to be causing the error:
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
This is the error log:
Start preparing training data ...
Reading lines...
Traceback (most recent call last):
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 221, in <module>
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 209, in loadPrepareData
pairs = filterPairs(pairs)
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 202, in filterPairs
return [pair for pair in pairs if filterPair(pair)]
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 202, in <listcomp>
return [pair for pair in pairs if filterPair(pair)]
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 198, in filterPair
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
IndexError: list index out of range
Read 442563 sentence pairs
Process finished with exit code 1
And this is my code, copied from my PyCharm up to the block with the error. Since it is a huge amount of code, I could not copy all of it; the rest can be found in the source link above.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join("D:\Documents\Python\intents", corpus_name)

def printLines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)

printLines(os.path.join(corpus, "movie_lines.txt"))

# Splits each line of the file into a dictionary of fields
def loadLines(fileName, fields):
    lines = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]
            lines[lineObj['lineID']] = lineObj
    return lines

# Groups fields of lines from `loadLines` into conversations based on *movie_conversations.txt*
def loadConversations(fileName, lines, fields):
    conversations = []
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            convObj = {}
            for i, field in enumerate(fields):
                convObj[field] = values[i]
            # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]")
            lineIds = eval(convObj["utteranceIDs"])
            # Reassemble lines
            convObj["lines"] = []
            for lineId in lineIds:
                convObj["lines"].append(lines[lineId])
            conversations.append(convObj)
    return conversations

# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
    qa_pairs = []
    for conversation in conversations:
        # Iterate over all the lines of the conversation
        for i in range(len(conversation["lines"]) - 1):  # We ignore the last line (no answer for it)
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i+1]["text"].strip()
            # Filter wrong samples (if one of the lists is empty)
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])
    return qa_pairs

# Define path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")

delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Initialize lines dict, conversations list, and field ids
lines = {}
conversations = []
MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

# Load lines and process conversations
print("\nProcessing corpus...")
lines = loadLines(os.path.join(corpus, "movie_lines.txt"), MOVIE_LINES_FIELDS)
print("\nLoading conversations...")
conversations = loadConversations(os.path.join(corpus, "movie_conversations.txt"),
                                  lines, MOVIE_CONVERSATIONS_FIELDS)

# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter)
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)

# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)

# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))
        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count default tokens
        for word in keep_words:
            self.addWord(word)

MAX_LENGTH = 10  # Maximum sentence length to consider

# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open(datafile, encoding='utf-8').\
        read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using filterPair condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs

# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

MIN_COUNT = 3  # Minimum word count threshold for trimming
I really hope someone can help me fix this problem and help me understand why it happens.
In the end I changed
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
to
try:
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
except:
    return False
and now the code seems to be working. The error most likely occurs because some lines of formatted_movie_lines.txt do not contain the tab delimiter, so l.split('\t') in readVocs produces a pair with fewer than two elements, and accessing p[1] goes out of range; the try/except simply discards those malformed pairs.
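A more targeted version of the same fix (a sketch, not from the tutorial) would be to reject malformed pairs explicitly instead of using a bare except, which can mask unrelated errors:
# Returns True iff a pair is well-formed and both sentences are under MAX_LENGTH
def filterPair(p):
    # Lines without a tab delimiter split into fewer than two fields;
    # reject those instead of letting p[1] raise an IndexError
    if len(p) != 2:
        return False
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH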

MATLAB - replace strings in a text file with an index and delete lines without an index

I have a text file with a data structure like this:
30,311.263671875,158.188034058,20.6887207031,17.4877929688,0.000297248129755,aeroplane
30,350.668334961,177.547393799,19.1939697266,18.3677368164,0.00026999923648,aeroplane
30,367.98135376,192.697219849,16.7747192383,23.0987548828,0.000186387239864,aeroplane
30,173.569274902,151.629364014,38.0069885254,37.5704650879,0.000172595537151,aeroplane
30,553.904602051,309.903320312,660.893981934,393.194030762,5.19620243722e-05,aeroplane
30,294.739196777,156.249740601,16.3522338867,19.8487548828,1.7795707663e-05,aeroplane
30,34.1946258545,63.4127349854,475.104492188,318.754821777,6.71026540999e-06,aeroplane
30,748.506652832,0.350944519043,59.9415283203,28.3256549835,3.52978979379e-06,aeroplane
30,498.747009277,14.3766479492,717.006652832,324.668731689,1.61551643174e-06,aeroplane
30,81.6389465332,498.784301758,430.23046875,210.294677734,4.16855394647e-07,aeroplane
30,251.932098389,216.641052246,19.8385009766,20.7131652832,3.52147743106,bicycle
30,237.536972046,226.656692505,24.0902862549,15.7586669922,1.8601918593,bicycle
30,529.673400879,322.511322021,25.1921386719,21.6920166016,0.751171214506,bicycle
30,255.900146484,196.583847046,17.1589355469,27.4430847168,0.268321367912,bicycle
30,177.663650513,114.458488464,18.7516174316,16.6759414673,0.233057001606,bicycle
30,436.679382324,273.383331299,17.4342041016,19.6081542969,0.128449092153,bicycle
I want to index those files with a label file, and the result will be something like this:
60,509.277435303,284.482452393,26.1684875488,31.7470092773,0.00807665128377,15
60,187.909835815,170.448471069,40.0388793945,58.8763122559,0.00763951029512,15
60,254.447280884,175.946624756,18.7212677002,21.9440612793,0.00442053096776,15
However, there might be some classes that are not in the label class list, and I need to filter those lines out so that I can use load() to read the file (you can't have chars inside the text file and still use load()).
here is my implement:
function test(vName, meta)
    f_dt = fopen([vName '.txt'], 'r');
    f_indexed = fopen([vName '_indexed.txt'], 'w');
    lbls = loadlbl();
    count = 1;
    while(true)
        if(f_dt == -1)
            break;
        end
        dt = fgets(f_dt);
        if(dt == -1)
            break
        else
            parts = strsplit(dt, ',');
            dt_cls = parts{7};
            dt_cls = regexprep(dt_cls, '\s+', '');
            cls_idx = find(strcmp(lbls, dt_cls));
            if(~isempty(cls_idx))
                dt = strrep(dt, dt_cls, int2str(cls_idx));
                fprintf(f_indexed, dt);
            end
        end
    end
    fclose(f_indexed);
    if(f_dt ~= -1)
        fclose(f_dt);
    end
end
However, it works very slowly because the text file contains about 100 thousand lines. Is there any way I could do this task smarter and faster?
You may use textscan and get the indices/line numbers of the labels you want. Once you know the line numbers, you can extract what you want.
fid = fopen('data.txt') ;
S = textscan(fid,'%s','delimiter','\n') ;
S = S{1} ;
fclose(fid) ;
%% get bicycle lines
idx = strfind(S, 'bicycle');
idx = find(not(cellfun('isempty', idx)));
S_bicycle = S(idx)
%% write this to text file
fid2 = fopen('text.txt','wt') ;
fprintf(fid2,'%s\n',S_bicycle{:});
fclose(fid2) ;
From S_bicycle, you can extract your numbers.

Dynamic Output Arguments in for-loop

I am fairly new to MATLAB, and I created this script to help me gather 4 numbers out of an Excel file. This one works so far:
clear all
cd('F:/Wortpaare Analyse/Excel')
VPNumber = input('Which Participant Number?', 's');
filename = (['PAL_',VPNumber,'_Gain_Loss.xlsx']);
sheet = 1;
x1Range = 'N324';
GainBlock1 = xlsread(filename,sheet,x1Range);
x1Range = 'O324';
LossBlock1 = xlsread(filename,sheet,x1Range);
x1Range = 'AD324';
GainBlock2 = xlsread(filename,sheet,x1Range);
x1Range = 'AE324';
LossBlock2 = xlsread(filename,sheet,x1Range);
AnalyseProband = [GainBlock1, LossBlock1, GainBlock2, LossBlock2]
Now I would like to make a script that analyzes the first 20 Excel files, so I tried this:
clear all
cd('F:/Wortpaare Analyse/Excel')
for VPNumber = 1:20 % for the 20 files
    a = 'PAL_%d_Gain_Loss.xlsx';
    filename = sprintf(a, VPNumber) % specifies the name of the file
    sheet = 1;
    x1Range = 'N324';
    (['GainBlock1_', VPNumber]) = xlsread(filename, sheet, x1Range);
    ....
end
The problem seems to be that I can only have one output argument. I would like to change the output variable's name in each loop iteration so that GainBlock1 is not overwritten in every cycle.
In the end I would like to have these variables:
GainBlock1_1 (for the first excel sheet)
GainBlock1_2 (for the second excel sheet)
...
GainBlock1_20 (for the 20th excel sheet)
Is there a clever way to do that? I was able to write the first script fairly easily, but was unable to make any significant progress on the second one. Any help is greatly appreciated.
Best,
Luca
This can be accomplished by either storing the data in an array or a cell array. I believe an array will be sufficient for what you're trying to do. I've included a reorganized code sample below, broken out into functions to help make it easier to read and understand.
Basically, the first function handles the loop and controls where the data gets placed in the array, and the second function is your original script, which accepts the VPNumber as an input.
What this should return is a 20x4 array, where the first index selects the sheet the data was pulled from and the second index selects among [GainBlock1, LossBlock1, GainBlock2, LossBlock2]. For example, GainBlock1 for sheet 5 would be AnalyseProband(5,1), and LossBlock2 for sheet 11 would be AnalyseProband(11,4).
function AnalyseProband = getAllProbands()
    currentDir = pwd;
    returnToOriginalDir = onCleanup(@() cd(currentDir));
    cd('F:/Wortpaare Analyse/Excel')
    numProbands = 20;
    AnalyseProband = zeros(numProbands, 4);
    for n = 1:numProbands
        AnalyseProband(n,:) = getBlockInfoFromXLS(n);
    end
end

function AnalyseProband = getBlockInfoFromXLS(VPNumber)
    a = 'PAL_%d_Gain_Loss.xlsx';
    filename = sprintf(a, VPNumber); % specifies the name of the file
    sheet = 1;
    x1Range = 'N324';
    GainBlock1 = xlsread(filename, sheet, x1Range);
    x1Range = 'O324';
    LossBlock1 = xlsread(filename, sheet, x1Range);
    x1Range = 'AD324';
    GainBlock2 = xlsread(filename, sheet, x1Range);
    x1Range = 'AE324';
    LossBlock2 = xlsread(filename, sheet, x1Range);
    AnalyseProband = [GainBlock1, LossBlock1, GainBlock2, LossBlock2];
end

MATLAB: how to extract information from a file's header (text file)

I have many text files that have 35 lines of header followed by a large matrix of image data (that part can be ignored and does not need to be read at the moment). I want to be able to read the header lines and extract the information contained in them. For instance, the first few lines of the header are:
File Version Number: 1.0
Date: 06/05/2015
Time: 10:33:44 AM
===========================================================
Beam Voltage (-kV) = 13.000
Filament (W) = 4.052
Cond. (-kV) = 8.885
CenterX1 (V) = 10.7
CenterY1 (V) = -45.9
Objective (%) = 71.40
OctupoleX = -0.4653
OctupoleY = -0.1914
Angle (deg) = 0.00
...
I would like to be able to open this text file, read the values of the date and time the file was created, the filament power, the condenser voltage, the angle, etc., and save these in variables or send them to a text box in a GUI program.
I have tried several things, but since the values I want to extract sometimes come after a '=', sometimes after a ':', and sometimes simply after a '', I do not know how to approach this. Perhaps reading each line and looking for a match of a word?
Any help would be much appreciated.
Thanks,
Alex
This is not particularly difficult, and one of the ways to do it would be to parse line by line as you suggested. Something like this:
MAX_LINES_TO_READ = 35;
fid = fopen('input.txt');
lineCount = 0;
dateString = '';
beamVoltage = 0;
while ~feof(fid)
    line = fgetl(fid);
    lineCount = lineCount + 1;
    % check conditions for skipping loop body
    if isempty(line)
        continue
    elseif lineCount > MAX_LINES_TO_READ
        break
    end
    % find headers you are interested in
    if strfind(line, 'Date')
        % find the first location of the header separator
        idx = find(line == ':', 1);
        % extract substring starting from 1 char after separator
        % note: the trim is to get rid of leading/trailing whitespace
        dateString = strtrim(line(idx + 1 : end));
    elseif strfind(line, 'Beam Voltage')
        idx = find(line == '=', 1);
        beamVoltage = str2double(line(idx + 1 : end));
    end
end
fclose(fid);

genstrings does not work with macro for NSLocalizedString

I would like to shorten "NSLocalizedString" to "_", so I'm using the macro
#define _(x) NSLocalizedString(#x, #__FILE__)
But now, when I want to generate strings for localization with
find . -name \*.m | xargs genstrings
it generates nothing.
Any help?
You can tell genstrings to look for a different function by using the -s argument:
genstrings -s MyFunctionName ...
However, MyFunctionName must follow the same naming and argument conventions as one of the built-in NSLocalizedString macros.
In your case, you cannot just specify the string key; you must also specify the documentation string. In fact, you should never generate a strings file without both the string and the documentation. There are many languages where the actual phrase or word depends on context: German is a good example, where one car is "das Auto" and more than one is "die Autos". There are many more examples involving changes for gender, number, time, question versus statement, and yes versus no. The documentation string helps your translator figure out which translation to use.
In addition, the best practice is to use a key that is different from the native-language word, which means using NSLocalizedStringWithDefaultValue(key, table, bundle, val, comment).
You can specify nil for the table and [NSBundle mainBundle] for the bundle argument.
You can wrap this in a shorthand, but you still have to follow the StringWithDefaultValue name and the arguments for genstrings to work.
I strongly recommend you look at the WWDC 2012 session on Localization Tips and Tricks.
Maurice
You can use the -s option of genstrings. From the man page:
-s routine
Substitutes routine for NSLocalizedString. For example, -s MyLocalString will catch calls to MyLocalString and MyLocalStringFromTable.
So I think you could try :
genstrings -s _
I had the same problem when my NSLocalizedString macro took 1 argument instead of the 2 that genstrings expects, so I wrote a Python script that does the job.
The first argument to the script is the macro name and the second is the path to your project.
import fnmatch
import os
import sys
from xml.dom import minidom

function = sys.argv[1]
rootdir = sys.argv[2]

# Collect all .m files under the project root
files = []
for root, dirnames, filenames in os.walk(rootdir):
    for filename in fnmatch.filter(filenames, '*.m'):
        files.append(os.path.join(root, filename))

# Scan each file for calls to the macro and collect the string literals
strings = []
for file in files:
    lineNumber = 0
    for line in open(file):
        lineNumber += 1
        index = line.find(function)
        if (index != -1):
            callStr = line[index:]
            index = callStr.find('#')
            if (index == -1):
                print 'call with a variable/macro. file: ' + file + ' line: %d' % lineNumber
            else:
                callStr = callStr[index+1:]
                index = callStr.find('")')
                callStr = callStr[:index+1]
                if callStr not in strings:
                    strings.append(callStr)

# Write strings to file
f = open('Localizable.strings', 'w+')
for string in strings:
    f.write(string + ' = ' + string + ';\n\n')
f.close()
I have improved Or Arbel's script to handle the case where there are multiple macro calls on a single line:
import fnmatch
import os
from xml.dom import minidom
import sys

function = sys.argv[1]
rootdir = sys.argv[2]

# Collect all .m files under the project root
files = []
for root, dirnames, filenames in os.walk(rootdir):
    for filename in fnmatch.filter(filenames, '*.m'):
        files.append(os.path.join(root, filename))

# Scan each file; the inner while loop picks up every macro call on a line
strings = []
for file in files:
    lineNumber = 0
    for line in open(file):
        lineNumber += 1
        index = line.find(function)
        startIndex = 0
        while (index != -1):
            startIndex = index + 1
            callStr = line[index:]
            index = callStr.find('#')
            if (index == -1):
                print 'call with a variable/macro. file: ' + file + ' line: %d' % lineNumber
            else:
                callStr = callStr[index+1:]
                index = callStr.find('")')
                callStr = callStr[:index+1]
                if callStr not in strings:
                    strings.append(callStr)
            index = line.find(function, startIndex)

# Write strings to file
f = open('Localizable.strings', 'w+')
for string in strings:
    f.write(string + ' = ' + string + ';\n\n')
f.close()
f.close()