cur.execute() psycopg2.ProgrammingError: can't call .execute() on named cursors more than once - postgresql

I'm trying to run the code below, but it fails with the error above. Can someone please help? I've read about this in other posts, but I don't know how to apply the fixes here. I'm trying to iterate over the rows of this database and select 1400 random ones, and it stops with the error above.
def paragraph_generator(test=True, itersize=5000, year=None, state=None):
    con, cur = database_connection.connect(cursor_type="server")
    cur.itersize = itersize
    while True:
        sql = f"""
        SELECT
            text_id,
            lccn_sn,
            date,
            ed,
            seq,
            chroniclingamerica_meta.statefp,
            chroniclingamerica_meta.countyfp,
            text_ocr
        FROM
            chroniclingamerica natural join chroniclingamerica_meta
        WHERE date_part('year',date) BETWEEN 1860 AND 1920
        ORDER BY RANDOM()
        LIMIT 1400
        """
        if test:
            sql = (
                sql + " limit 10000"
            )  # limit 10000 means it only goes through 10000 lines of the database
        else:
            pass
        print(sql)
        cur.execute(sql)
        for p in cur.fetchall():
            tokens = stem_text(p[-1])  # Stem
            # print(tokens)
            tokens = (
                p[-1]
                .translate(str.maketrans("", "", punct))
                .replace("\n", " ")
                .lower()
                .split(" ")
            )
            tokens_3 = [
                a for a in tokens if len(a) == 3 if a in wn_lemmas
            ]  # For 3-letter words, only keep WordNet recognized tokens
            tokens = gensim.parsing.preprocessing.remove_short_tokens(
                tokens, minsize=4
            )  # Remove 1-, 2-, and 3-letter words
            tokens = tokens + tokens_3  # Add back in 3-letter WordNet-recognized tokens
            tokens = gensim.parsing.preprocessing.remove_stopword_tokens(
                tokens, stopwords=stop_words
            )  # Remove stopwords in stopword list above
            print("THIS IS THE LENGTH OF TOKENS")
            a = len(tokens)
            print(a)
            if len(tokens) != 0:
                ocr_2 = 1 - (
                    len([a for a in tokens if a in wn_lemmas]) / len(tokens)
                )  # Generate a measure for proportion of OCR errors in a page
            else:
                ocr_2 = float("nan")
            print("THIS IS OCR")
            print(ocr_2)
            ocr = ocr_2
            if ocr < 0.75 and ~np.isnan(
                ocr
            ):  # If the proportion of OCR errors in a page is less than 75%, then keep the page and all tokens
                tokens = tokens
            else:
                tokens = []  # Otherwise, give it an empty list (i.e. drop the page)
            yield tokens
    con.close()
Error:
cur.execute(sql)
psycopg2.ProgrammingError: can't call .execute() on named cursors more than once
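A likely cause, for anyone hitting this: a psycopg2 named (server-side) cursor is bound to a single statement, so the second pass through the while True loop calls .execute() on the same named cursor again, which raises exactly this error. A minimal sketch of the usual workaround, assuming con is an ordinary psycopg2 connection and process() is a hypothetical stand-in for the token-processing logic above, is to open a fresh named cursor for each execution:

while True:
    # a new server-side cursor for every execution; the name is arbitrary
    with con.cursor(name="paragraph_cursor") as cur:
        cur.itersize = itersize
        cur.execute(sql)
        for p in cur:         # stream rows, itersize at a time
            yield process(p)  # hypothetical stand-in for the logic above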

Related

Deep learning chatbot specific Index error list index out of range

I am trying to follow a tutorial on how to make a deep-learning chatbot with PyTorch. However, the code is quite complex for me, and it stops with an "IndexError: list index out of range". I looked the error up and get the gist of what it usually means, but since the code is well beyond my level I can't figure out how to fix it.
this is the source tutorial: https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/chatbot_tutorial.ipynb#scrollTo=LTzdbPF-OBL9
Line 198 seems to be causing the error
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
This is the error log
Start preparing training data ...
Reading lines...
Traceback (most recent call last):
  File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 221, in <module>
    voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
  File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 209, in loadPrepareData
    pairs = filterPairs(pairs)
  File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 202, in filterPairs
    return [pair for pair in pairs if filterPair(pair)]
  File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 202, in <listcomp>
    return [pair for pair in pairs if filterPair(pair)]
  File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 198, in filterPair
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
IndexError: list index out of range
Read 442563 sentence pairs
Process finished with exit code 1
And this is my code, copied from PyCharm up to the block with the error. Since the code is huge I could not copy all of it; the rest can be found in the GitHub source link above.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join("D:\Documents\Python\intents", corpus_name)

def printLines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)

printLines(os.path.join(corpus, "movie_lines.txt"))

# Splits each line of the file into a dictionary of fields
def loadLines(fileName, fields):
    lines = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]
            lines[lineObj['lineID']] = lineObj
    return lines

# Groups fields of lines from `loadLines` into conversations based on *movie_conversations.txt*
def loadConversations(fileName, lines, fields):
    conversations = []
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            convObj = {}
            for i, field in enumerate(fields):
                convObj[field] = values[i]
            # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]")
            lineIds = eval(convObj["utteranceIDs"])
            # Reassemble lines
            convObj["lines"] = []
            for lineId in lineIds:
                convObj["lines"].append(lines[lineId])
            conversations.append(convObj)
    return conversations

# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
    qa_pairs = []
    for conversation in conversations:
        # Iterate over all the lines of the conversation
        for i in range(len(conversation["lines"]) - 1):  # We ignore the last line (no answer for it)
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i+1]["text"].strip()
            # Filter wrong samples (if one of the lists is empty)
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])
    return qa_pairs

# Define path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")

delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Initialize lines dict, conversations list, and field ids
lines = {}
conversations = []
MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

# Load lines and process conversations
print("\nProcessing corpus...")
lines = loadLines(os.path.join(corpus, "movie_lines.txt"), MOVIE_LINES_FIELDS)
print("\nLoading conversations...")
conversations = loadConversations(os.path.join(corpus, "movie_conversations.txt"),
                                  lines, MOVIE_CONVERSATIONS_FIELDS)

# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter)
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)

# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)

# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))
        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count default tokens
        for word in keep_words:
            self.addWord(word)

MAX_LENGTH = 10  # Maximum sentence length to consider

# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open(datafile, encoding='utf-8').\
        read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using filterPair condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs

# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

MIN_COUNT = 3  # Minimum word count threshold for trimming
I really hope someone can help me fix this problem and help me understand why it happens.
In the end I changed
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
to
try:
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
except:
    return False
And now the code seems to be working.
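For what it's worth, a likely reason the try/except works (my reading, not from the original thread): some lines of formatted_movie_lines.txt split into fewer than two fields on '\t', for example blank or malformed lines, so p[1] raises the IndexError, and the bare except silently drops those pairs. A sketch of the same filter made explicit, keeping everything else as in the tutorial:

def filterPair(p):
    # a pair is only usable if the split produced both a query and a response
    if len(p) != 2:
        return False
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH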

How can I run my Maximum Drawdown Code without this ValueError Exception?

I'm trying to follow an exercise on calculating the maximum drawdown and maximum drawdown duration of a market-neutral vs. a long-only trading strategy.
I followed the code to a T and it worked perfectly up until now, but I am getting a ValueError exception. What code do I need to change for my code to work?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from MaxDD_Function import calculateMaxDD

# CALCULATING MAXDD AND CREATING THE FUNCTION.
def calculateMaxDD(cumret):
    highwatermark = np.zeros(cumret.shape)
    drawdown = np.zeros(cumret.shape)
    drawdownduration = np.zeros(cumret.shape)
    for t in np.arange(1, cumret.shape[0]):
        highwatermark[t] = (np.maximum(highwatermark[t -1]), cumret[t])
        drawdown[t] = ((1 + cumret[t]) / (1 + highwatermark[t]) - 1)
        if drawdown[t] == 0:
            drawdownduration[t] == 0
        else:
            drawdownduration[t] = drawdownduration[t -1] + 1
    maxDD, i = np.min(drawdown, np.argmin(drawdown))  # drawdown < 0 always
    maxDDD = np.max(drawdownduration)
    return (maxDD, maxDDD, i)
# First part of example. Read the csv data and calculate.
# The first dataframe/set for my strategy
df = pd.read_csv('IGE_daily.csv')
# print(df.head())
df.sort_values(by='Date', inplace=True)
dailyret = df.loc[:, 'Adj Close'].pct_change()
excessRet = ((dailyret - 0.04) / 252)
sharpeRatio = ((np.sqrt(252) * np.mean(excessRet)) / np.std(excessRet))
print(sharpeRatio)

# Second part of example
# This is the second dataframe/set for my strategy.
df2 = pd.read_csv('SPY.csv')
# The new data frame, with both datasets.
df = pd.merge(df, df2, on='Date', suffixes=('_IGE', '_SPY'))
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
df.sort_index(inplace=True)
dailyret = df[['Adj Close_IGE', 'Adj Close_SPY']].pct_change()  # Daily returns
dailyret.rename(columns={"Adj Close_IGE": "IGE", "Adj Close_SPY": "SPY"}, inplace=True)
netRet = (dailyret['IGE'] - dailyret['SPY']) / 2
sharpeRatio = np.sqrt(252) * np.mean(netRet) / np.std(netRet)
print(sharpeRatio)
cumret = np.cumprod(1 + netRet) - 1  # Cumulative return
# print(plt.plot(cumret))
# print(plt.show())  # Remember to always run plt.show to see the plot in terminal.
maxDrawdown, maxDrawdownDuration, startDrawdownDay = calculateMaxDD(cumret.values)
maxDrawdown = calculateMaxDD(cumret.values)
print(maxDrawdown)
Here are the results I got from the code above:
Ivies-MacBook-Pro:Quant_Trading Ivieidahosa$ python Ex3_4.py
-46.10531783058014
0.7743286831426566
Traceback (most recent call last):
  File "Ex3_4.py", line 76, in <module>
    maxDrawdown = calculateMaxDD(cumret.values)
  File "Ex3_4.py", line 15, in calculateMaxDD
    highwatermark[t] = (np.maximum(highwatermark[t -1]), cumret[t])
ValueError: invalid number of arguments
I expected maxDrawdown to be -0.09529268047208683, maxDrawdownDuration to be 497, and startDrawdownDay to be 1223.
Q: What code do I need to change for my code to work?
Your code calls a numpy function whose minimum call signature is np.maximum( <array_like_A>, <array_like_B> ).
The call fails once only one of the expected pair of values is delivered on the reported line (note where the closing parenthesis sits), or when a scalar or any other non-array-like object is passed into that call signature:
highwatermark[t] = ( np.maximum( highwatermark[t-1] ), cumret[t] )
Here a tuple gets constructed on the right-hand side of the value assignment (well, strictly speaking an object reference gets assigned in Python, but let's keep it short for easy reading), and its first item is expected to be the value returned from a call to the np.maximum(...) function documented above, which here receives only a single argument. And Hic Sunt Leones ...
You may like to start further bug-tracing by cross-checking the state of the objects and the call signature:
try:
    for t in np.arange( 1, cumret.shape[0] ):
        print( "The shape of <highwatermark[t-1]>-object was: ",
               highwatermark[t-1].shape, " for t == ", t
               )
except:
    print( "The <highwatermark[t-1]>-object was not a numpy array",
           " for t == ", t
           )
finally:
    print( np.maximum.__doc__ )
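For completeness, a minimal sketch of the corrected function, assuming the intent was the usual high-water-mark recursion; only the argument-packing slips are fixed (the misplaced closing parenthesis in the np.maximum call, np.min being handed two arguments, and one == that should be an assignment):

import numpy as np

def calculateMaxDD(cumret):
    highwatermark = np.zeros(cumret.shape)
    drawdown = np.zeros(cumret.shape)
    drawdownduration = np.zeros(cumret.shape)
    for t in np.arange(1, cumret.shape[0]):
        # both candidates go INSIDE the np.maximum(...) call
        highwatermark[t] = np.maximum(highwatermark[t - 1], cumret[t])
        drawdown[t] = (1 + cumret[t]) / (1 + highwatermark[t]) - 1
        if drawdown[t] == 0:
            drawdownduration[t] = 0  # assignment, not comparison
        else:
            drawdownduration[t] = drawdownduration[t - 1] + 1
    # two separate calls, not two arguments to np.min
    maxDD, i = np.min(drawdown), np.argmin(drawdown)  # drawdown < 0 always
    maxDDD = np.max(drawdownduration)
    return (maxDD, maxDDD, i)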

Cryptic TypeError: 'decimal.Decimal' object cannot be interpreted as an integer

I am struggling to understand why this function apparently fails in the Jupyter Notebook, but not in the IPython shell:
from decimal import Decimal

def present_value(r, n, fv=None, pmt=None):
    '''
    Function to compute the Present Value based on interest rate and
    a given future value.

    Arguments accepted
    ------------------
    * r = interest rate,
      which should be given in its original percentage, eg.
      5% instead of 0.05

    * n = number of periods for which the cash flow,
      either as annuity or single flow from one present value

    * fv = future value in dollars,
      if problem is annuity based, leave this empty

    * pmt = each annuity payment in dollars,
      if problem is single cash flow based, leave this empty
    '''
    original_args = [r, n, fv, pmt]
    dec_args = [Decimal(arg) if arg != None
                else arg
                for arg in original_args
                ]
    if dec_args[3] == None:
        return dec_args[2] / ((1 + (dec_args[0] / 100)) ** dec_args[1])
    elif dec_args[2] == None:
        # annuity_length = range(1, dec_args[1] + 1)
        # Not allowed to add a Decimal object
        # with an integer and to use it
        # in the range() function,
        # so we dereference the integer from original_args
        annuity_length = range(1, original_args[1] + 1)
        # Apply discounting to each annuity payment made
        # according to number of years left till end
        all_compounded_pmt = [dec_args[3] * (1 / ((1 + dec_args[0] / 100) ** time_left))
                              for time_left in annuity_length
                              ]
        return sum(all_compounded_pmt)
When I imported the module that this function resides in, named functions.py, using from functions import *, and then executed present_value(r=7, n=35, pmt = 11000), I got the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-93-c1cc587f7e27> in <module>()
----> 1 present_value(r=7, n=35, pmt = 11000)
/path_to_file/functions.py in present_value(r, n, fv, pmt)
73 if dec_args[3] == None:
74 return dec_args[2]/((1 + (dec_args[0]/100))**dec_args[1])
---> 75
76 elif dec_args[2] == None:
77 # annuity_length = range(1, dec_args[1]+1)
TypeError: 'decimal.Decimal' object cannot be interpreted as an integer
but in the IPython shell, evaluating this function works perfectly fine:
In [42]: functions.present_value(r=7, n=35, pmt = 11000)
Out[42]: Decimal('142424.39530474029537')
Can anyone please help me with this really confusing and obscure issue?
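One hint sits in the traceback itself: the arrow points at line 75 of functions.py, yet the code echoed for lines 73-77 does not line up with the function shown above. That usually means the notebook is still running a stale copy of the module, plausibly one in which the commented-out annuity_length = range(1, dec_args[1] + 1) was still live, and range() refuses a Decimal with exactly this TypeError. A small sketch to check both points, assuming the module really is named functions:

from decimal import Decimal

# range() only accepts integers; a live version of the old line
# reproduces the reported error:
#     range(1, Decimal(35) + 1)
#     TypeError: 'decimal.Decimal' object cannot be interpreted as an integer

# force the notebook to re-read the current file from disk
import importlib
import functions
importlib.reload(functions)
functions.present_value(r=7, n=35, pmt=11000)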

How do you order annotations by offset in brat?

When using the rapid annotator tool brat, it appears that the created annotations file presents the annotations in the order they were performed by the user. If you start at the beginning of a document and go to the end performing annotations, they will naturally be in the correct offset order. However, if you need to go back earlier in the document and add another annotation, the offset order of the annotations in the output .ann file will be out of order.
How then can you rearrange the .ann file such that the annotations are in offset order when you are done? Is there some option within brat that allows you to do this or is it something that one has to write their own script to perform?
Hearing nothing, I did write a python script to accomplish what I had set out to do. First, I reorder all annotations by begin index. Secondly, I resequence the label numbers so that they are once again in ascending order.
import optparse, sys

splitchar1 = '\t'
splitchar2 = ' '

# for brat, overlapping is not permitted (or at least a warning is generated)
# we could use this simplification in sorting by simply sorting on begin. it is
# probably a good idea anyway.
class AnnotationRecord:
    label = 'T0'
    type = ''
    begin = -1
    end = -1
    text = ''

    def __repr__(self):
        return (self.label + splitchar1
                + self.type + splitchar2
                + str(self.begin) + splitchar2
                + str(self.end) + splitchar1 + self.text)

def create_record(parts):
    record = AnnotationRecord()
    record.label = parts[0]
    middle_parts = parts[1].split(splitchar2)
    record.type = middle_parts[0]
    record.begin = middle_parts[1]
    record.end = middle_parts[2]
    record.text = parts[2]
    return record

def main(filename, out_filename):
    fo = open(filename, 'r')
    lines = fo.readlines()
    fo.close()

    annotation_records = []
    for line in lines:
        parts = line.split(splitchar1)
        annotation_records.append(create_record(parts))

    # sort based upon begin
    sorted_records = sorted(annotation_records, key=lambda a: int(a.begin))

    # now relabel based upon the sorted order
    label_value = 1
    for sorted_record in sorted_records:
        sorted_record.label = 'T' + str(label_value)
        label_value += 1

    # now write the resulting file to disk
    fo = open(out_filename, 'w')
    for sorted_record in sorted_records:
        fo.write(sorted_record.__repr__())
    fo.close()

# format of .ann file is T# Type Start End Text
# args are input file, output file
if __name__ == '__main__':
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   usage=globals()['__doc__'],
                                   version='$Id$')
    parser.add_option('-v', '--verbose', action='store_true',
                      default=False, help='verbose output')
    (options, args) = parser.parse_args()
    if len(args) < 2:
        parser.error('missing argument')
    main(args[0], args[1])
    sys.exit(0)
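For reference, the script takes the input and output paths as positional arguments; assuming it is saved as reorder_ann.py (the file names here are just examples):

python reorder_ann.py input.ann output.ann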

Renaming a Word document and saving its filename with its first 10 letters

I have recovered some Word documents from a corrupted hard drive using a piece of software called photorec. The problem is that the documents' names can't be recovered; they are all renamed by a sequence of numbers. There are over 2000 documents to sort through and I was wondering if I could rename them using some automated process.
Is there a script I could use to find the first 10 letters in the document and rename it with that? It would have to be able to cope with multiple documents having the same first 10 letters and so not write over documents with the same name. Also, it would have to avoid renaming the document with illegal characters (such as '?', '*', '/', etc.)
I only have a little bit of experience with Python and C, and even less with bash programming in Linux, so bear with me if I don't know exactly what I'm doing when writing a new script.
How about VBScript? Here is a sketch:
FolderName = "C:\Docs\"
Set fs = CreateObject("Scripting.FileSystemObject")
Set fldr = fs.GetFolder(Foldername)
Set ws = CreateObject("Word.Application")
For Each f In fldr.Files
If Left(f.name,2)<>"~$" Then
If InStr(f.Type, "Microsoft Word") Then
MsgBox f.Name
Set doc = ws.Documents.Open(Foldername & f.Name)
s = vbNullString
i = 1
Do While Trim(s) = vbNullString And i <= doc.Paragraphs.Count
s = doc.Paragraphs(i)
s = CleanString(Left(s, 10))
i = i + 1
Loop
doc.Close False
If s = "" Then s = "NoParas"
s1 = s
i = 1
Do While fs.FileExists(s1)
s1 = s & i
i = i + 1
Loop
MsgBox "Name " & Foldername & f.Name & " As " & Foldername & s1 _
& Right(f.Name, InStrRev(f.Name, "."))
'' This uses copy, because it seems safer
f.Copy Foldername & s1 & Right(f.Name, InStrRev(f.Name, ".")), False
'' MoveFile will copy the file:
'' fs.MoveFile Foldername & f.Name, Foldername & s1 _
'' & Right(f.Name, InStrRev(f.Name, "."))
End If
End If
Next
msgbox "Done"
ws.Quit
Set ws = Nothing
Set fs = Nothing
Function CleanString(StringToClean)
''http://msdn.microsoft.com/en-us/library/ms974570.aspx
Dim objRegEx
Set objRegEx = CreateObject("VBScript.RegExp")
objRegEx.IgnoreCase = True
objRegEx.Global = True
''Find anything not a-z, 0-9
objRegEx.Pattern = "[^a-z0-9]"
CleanString = objRegEx.Replace(StringToClean, "")
End Function
Word documents are stored in a custom format which places a load of binary cruft at the beginning of the file.
The simplest thing would be to knock something up in Python that searches for the first long run of ASCII chars. Here you go:
#!/usr/bin/python
import glob
import os

for file in glob.glob("*.doc"):
    f = open(file, "rb")
    new_name = ""
    chars = 0
    char = f.read(1)
    while char != "":
        if 0 < ord(char) < 128:
            if ord("a") <= ord(char) <= ord("z") or ord("A") <= ord(char) <= ord("Z") or ord("0") <= ord(char) <= ord("9"):
                new_name += char
            else:
                new_name += "_"
            chars += 1
            if chars == 100:
                new_name = new_name[:20] + ".doc"
                print "renaming " + file + " to " + new_name
                f.close()
                break
        else:
            new_name = ""
            chars = 0
        char = f.read(1)
    if new_name != "":
        os.rename(file, new_name)
NOTE: if you want to glob multiple directories you'll need to change the glob line accordingly. Also, this takes no account of whether the file you're trying to rename to already exists, so if you have multiple docs with the same first few chars you'll need to handle that.
The script finds the first chunk of 100 ASCII chars in a row (if you look for fewer than that you end up picking up doc keywords and such) and then uses the first 20 of these to make the new name, replacing anything that's not a-z, A-Z or 0-9 with underscores to avoid file-name issues.
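Since the answer leaves name collisions to the reader, here is a small sketch of one way to pick a free target name; rename_unique is a hypothetical helper, not part of the script above:

import os

def rename_unique(old_path, new_name):
    # append _1, _2, ... before the extension until the target is free
    base, ext = os.path.splitext(new_name)
    candidate = new_name
    n = 1
    while os.path.exists(candidate):
        candidate = "%s_%d%s" % (base, n, ext)
        n += 1
    os.rename(old_path, candidate)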
I found the first chunk of 100 ASCII chars in a row (if you look for less than that you end up picking up doc keywords and such) and then used the first 20 of these to make the new name, replacing anything that's not a-z A-Z or 0-9 with underscores to avoid file name issues.