Create words and its position in Pyspark - pyspark

Hi I am trying to create string which will have words and its position as it appear in the input string. I am able to do it in python using below code -
from collections import defaultdict
import re
s = 'Create a string with position from a string a'
wp = defaultdict(list)
for n, k in enumerate(s.split()):
wp[k].append(n+1)
raw_output = re.search('{(.*)}', str(wp)).group(1).replace('[','').replace(']','')
final_output = re.sub("(\d), '", r"\1 '", raw_output)
And output is
"'Create': 1 'a': 2, 7, 9 'string': 3, 8 'with': 4 'position': 5 'from': 6"
How can I do the same in pyspark?

Pyspark has few additional concepts you might need to revisit, using RDD apis is the best
for your problem statement.
Here is a code snippet that should work for you.
def positional_encoder(sentence):
words=sentence.split(" ")
indexes=list(range(0,len(words)))
return list(zip(words,indexes))
data_rdd = sc.parallelize(["Create a string with position from a string a"])
words_index=data_rdd.map(lambda sentence: positional_encoder(sentence))
## Just for debugging:
words_index.collect() ## Remove this after debugging

Related

Deep learning chatbot specific Index error list index out of range

I am trying to follow a tutorial on how to make a deeplearning chatbot with pytorch. However, this code is quite complex for me and it has stopped with a "IndexError: list index out of range". I looked the error up and get the gist of what it usually means, but seeing as this code is very complex for me I can't figure out how to solve the error.
this is the source tutorial: [https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/chatbot_tutorial.ipynb#scrollTo=LTzdbPF-OBL9][1]
Line 198 seems to be causing the error
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
This is the error log
Start preparing training data ...
Reading lines...
Traceback (most recent call last):
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 221, in <module>
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 209, in loadPrepareData
pairs = filterPairs(pairs)
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 202, in filterPairs
return [pair for pair in pairs if filterPair(pair)]
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 202, in <listcomp>
return [pair for pair in pairs if filterPair(pair)]
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 198, in filterPair
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
IndexError: list index out of range
Read 442563 sentence pairs
Process finished with exit code 1
And this is my code copied from my pycharm up to the block with the error. Seeing as its a huge code I could not copy the entire code. The rest of the code can be found in the github source link above.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join("D:\Documents\Python\intents", corpus_name)
def printLines(file, n=10):
with open(file, 'rb') as datafile:
lines = datafile.readlines()
for line in lines[:n]:
print(line)
printLines(os.path.join(corpus, "movie_lines.txt"))
# Splits each line of the file into a dictionary of fields
def loadLines(fileName, fields):
lines = {}
with open(fileName, 'r', encoding='iso-8859-1') as f:
for line in f:
values = line.split(" +++$+++ ")
# Extract fields
lineObj = {}
for i, field in enumerate(fields):
lineObj[field] = values[i]
lines[lineObj['lineID']] = lineObj
return lines
# Groups fields of lines from `loadLines` into conversations based on *movie_conversations.txt*
def loadConversations(fileName, lines, fields):
conversations = []
with open(fileName, 'r', encoding='iso-8859-1') as f:
for line in f:
values = line.split(" +++$+++ ")
# Extract fields
convObj = {}
for i, field in enumerate(fields):
convObj[field] = values[i]
# Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]")
lineIds = eval(convObj["utteranceIDs"])
# Reassemble lines
convObj["lines"] = []
for lineId in lineIds:
convObj["lines"].append(lines[lineId])
conversations.append(convObj)
return conversations
# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
qa_pairs = []
for conversation in conversations:
# Iterate over all the lines of the conversation
for i in range(len(conversation["lines"]) - 1): # We ignore the last line (no answer for it)
inputLine = conversation["lines"][i]["text"].strip()
targetLine = conversation["lines"][i+1]["text"].strip()
# Filter wrong samples (if one of the lists is empty)
if inputLine and targetLine:
qa_pairs.append([inputLine, targetLine])
return qa_pairs
# Define path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")
delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))
# Initialize lines dict, conversations list, and field ids
lines = {}
conversations = []
MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
# Load lines and process conversations
print("\nProcessing corpus...")
lines = loadLines(os.path.join(corpus, "movie_lines.txt"), MOVIE_LINES_FIELDS)
print("\nLoading conversations...")
conversations = loadConversations(os.path.join(corpus, "movie_conversations.txt"),
lines, MOVIE_CONVERSATIONS_FIELDS)
# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
writer = csv.writer(outputfile, delimiter=delimiter)
for pair in extractSentencePairs(conversations):
writer.writerow(pair)
# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)
# Default word tokens
PAD_token = 0 # Used for padding short sentences
SOS_token = 1 # Start-of-sentence token
EOS_token = 2 # End-of-sentence token
class Voc:
def __init__(self, name):
self.name = name
self.trimmed = False
self.word2index = {}
self.word2count = {}
self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
self.num_words = 3 # Count SOS, EOS, PAD
def addSentence(self, sentence):
for word in sentence.split(' '):
self.addWord(word)
def addWord(self, word):
if word not in self.word2index:
self.word2index[word] = self.num_words
self.word2count[word] = 1
self.index2word[self.num_words] = word
self.num_words += 1
else:
self.word2count[word] += 1
# Remove words below a certain count threshold
def trim(self, min_count):
if self.trimmed:
return
self.trimmed = True
keep_words = []
for k, v in self.word2count.items():
if v >= min_count:
keep_words.append(k)
print('keep_words {} / {} = {:.4f}'.format(
len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
))
# Reinitialize dictionaries
self.word2index = {}
self.word2count = {}
self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
self.num_words = 3 # Count default tokens
for word in keep_words:
self.addWord(word)
MAX_LENGTH = 10 # Maximum sentence length to consider
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
return ''.join(
c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn'
)
# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
s = unicodeToAscii(s.lower().strip())
s = re.sub(r"([.!?])", r" \1", s)
s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
s = re.sub(r"\s+", r" ", s).strip()
return s
# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
print("Reading lines...")
# Read the file and split into lines
lines = open(datafile, encoding='utf-8').\
read().strip().split('\n')
# Split every line into pairs and normalize
pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
voc = Voc(corpus_name)
return voc, pairs
# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
# Input sequences need to preserve the last word for EOS token
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
# Filter pairs using filterPair condition
def filterPairs(pairs):
return [pair for pair in pairs if filterPair(pair)]
# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
print("Start preparing training data ...")
voc, pairs = readVocs(datafile, corpus_name)
print("Read {!s} sentence pairs".format(len(pairs)))
pairs = filterPairs(pairs)
print("Trimmed to {!s} sentence pairs".format(len(pairs)))
print("Counting words...")
for pair in pairs:
voc.addSentence(pair[0])
voc.addSentence(pair[1])
print("Counted words:", voc.num_words)
return voc, pairs
# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
print(pair)
MIN_COUNT = 3 # Minimum word count threshold for trimming
I really hope someone can help me fix this problem and help me understand why it happens.
In the end I changed
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
to
try:
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
except:
return False
And now the code seems to be working.

Encoding problem while running text summarization code

Good Day
I was testing the functionality of a text summarization code published on the website: https://towardsdatascience.com/understand-text-summarization-and-create-your-own-summarizer-in-python-b26a9f09fc70.
The problem is that, when I call the function on a text file, the 'cp949' codec can't decode byte 0xe2 in position 205: illegal multibyte sequence error appears. I know, from other posts, that it is an error related to the encoding type of the file. Therefore, I changed the encoding type of the test2.txt file to UTF-8 (saving the file in Plain text format, then choosing UTF-8 on Text Encoding > Other Encoding), but I still get this error message.
Here is the code that I wrote:
Import libraries
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
test_text_word = "test2.txt"
def read_article(test_text_word):
file = open(test_text_word, "r")
filedata = file.readlines()
article = filedata[0].split(". ")
sentences = []`
for sentence in article:
print(sentence)
sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
sentences.pop()
return sentences
def sentence_similarity(sent1, sent2, stopwords=None):
if stopwords is None:
stopwords = []
sent1 = [w.lower() for w in sent1]
sent2 = [w.lower() for w in sent2]
all_words = list(set(sent1 + sent2))
vector1 = [0] * len(all_words)
vector2 = [0] * len(all_words)
# build the vector for the first sentence
for w in sent1:
if w in stopwords:
continue
vector1[all_words.index(w)] += 1
# build the vector for the second sentence
for w in sent2:
if w in stopwords:
continue
vector2[all_words.index(w)] += 1
return 1 - cosine_distance(vector1, vector2)
def build_similarity_matrix(sentences, stop_words):
# Create an empty similarity matrix
similarity_matrix = np.zeros((len(sentences), len(sentences)))
for idx1 in range(len(sentences)):
for idx2 in range(len(sentences)):
if idx1 == idx2: #ignore if both are same sentences
continue
similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
return similarity_matrix
def generate_summary(test_text_word, top_n=5):
stop_words = stopwords.words('english')
summarize_text = []
# Step 1 - Read text anc split it
sentences = read_article(test_text_word)
# Step 2 - Generate Similary Martix across sentences
sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
scores = nx.pagerank(sentence_similarity_graph)
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are ", ranked_sentence)
for i in range(top_n):
summarize_text.append(" ".join(ranked_sentence[i][1]))
# Step 5 - Offcourse, output the summarize texr
print("Summarize Text: \n", ". ".join(summarize_text))
The problem is that, when I run the code, with the following command:
generate_summary("test2.txt", 2)
I receive this error message: 'cp949' codec can't decode byte 0xe2 in position 205: illegal multibyte sequence
Should I change something in the code?
Thanks for your support.

pyspark randomForest feature importance: how to get column names from the column numbers

I am using the standard (string indexer + one hot encoder + randomForest) pipeline in spark, as shown below
labelIndexer = StringIndexer(inputCol = class_label_name, outputCol="indexedLabel").fit(data)
string_feature_indexers = [
StringIndexer(inputCol=x, outputCol="int_{0}".format(x)).fit(data)
for x in char_col_toUse_names
]
onehot_encoder = [
OneHotEncoder(inputCol="int_"+x, outputCol="onehot_{0}".format(x))
for x in char_col_toUse_names
]
all_columns = num_col_toUse_names + bool_col_toUse_names + ["onehot_"+x for x in char_col_toUse_names]
assembler = VectorAssembler(inputCols=[col for col in all_columns], outputCol="features")
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=100)
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels)
pipeline = Pipeline(stages=[labelIndexer] + string_feature_indexers + onehot_encoder + [assembler, rf, labelConverter])
crossval = CrossValidator(estimator=pipeline,
estimatorParamMaps=paramGrid,
evaluator=evaluator,
numFolds=3)
cvModel = crossval.fit(trainingData)
now after the the fit I can get the random forest and the feature importance using cvModel.bestModel.stages[-2].featureImportances, but this does not give me feature/ column names, rather just the feature number.
What I get is below:
print(cvModel.bestModel.stages[-2].featureImportances)
(1446,[3,4,9,18,20,103,766,981,983,1098,1121,1134,1148,1227,1288,1345,1436,1444],[0.109898803421,0.0967396441648,4.24568235244e-05,0.0369705839109,0.0163489685127,3.2286694534e-06,0.0208192703688,0.0815822887175,0.0466903663708,0.0227619959989,0.0850922269211,0.000113388896956,0.0924779490403,0.163835022713,0.118987129392,0.107373548367,3.35577640585e-05,0.000229569946193])
How can I map it back to some column names or column name + value format?
Basically to get the feature importance of random forest along with the column names.
The transformed dataset metdata has the required attributes.Here is an easy way to do -
create a pandas dataframe (generally feature list will not be huge, so no memory issues in storing a pandas DF)
pandasDF = pd.DataFrame(dataset.schema["features"].metadata["ml_attr"]
["attrs"]["binary"]+dataset.schema["features"].metadata["ml_attr"]["attrs"]["numeric"]).sort_values("idx")
Then create a broadcast dictionary to map. broadcast is necessary in a distributed environment.
feature_dict = dict(zip(pandasDF["idx"],pandasDF["name"]))
feature_dict_broad = sc.broadcast(feature_dict)
You can also look here and here
Hey why don't you just map it back to the original columns through list expansion. Here is an example:
# in your case: trainingData.columns
data_frame_columns = ["A", "B", "C", "D", "E", "F"]
# in your case: print(cvModel.bestModel.stages[-2].featureImportances)
feature_importance = (1, [1, 3, 5], [0.5, 0.5, 0.5])
rf_output = [(data_frame_columns[i], feature_importance[2][j]) for i, j in zip(feature_importance[1], range(len(feature_importance[2])))]
dict(rf_output)
{'B': 0.5, 'D': 0.5, 'F': 0.5}
I was not able to find any way to get the true initial list of the columns back after the ml algorithm, I am using this as the current workaround.
print(len(cols_now))
FEATURE_COLS=[]
for x in cols_now:
if(x[-6:]!="catVar"):
FEATURE_COLS+=[x]
else:
temp=trainingData.select([x[:-7],x[:-6]+"tmp"]).distinct().sort(x[:-6]+"tmp")
temp_list=temp.select(x[:-7]).collect()
FEATURE_COLS+=[list(x)[0] for x in temp_list]
print(len(FEATURE_COLS))
print(FEATURE_COLS)
I have kept a consistent suffix naming across all the indexer (_tmp) & encoder (_catVar) like:
column_vec_in = str_col
column_vec_out = [col+"_catVar" for col in str_col]
indexers = [StringIndexer(inputCol=x, outputCol=x+'_tmp')
for x in column_vec_in ]
encoders = [OneHotEncoder(dropLast=False, inputCol=x+"_tmp", outputCol=y)
for x,y in zip(column_vec_in, column_vec_out)]
tmp = [[i,j] for i,j in zip(indexers, encoders)]
tmp = [i for sublist in tmp for i in sublist]
This can be further improved and generalized, but currently this tedious work around works best

Mapping binary data in perl

I have the following predefined codes that represent an index in a binary bitmap:
0 = standard
1 = special
2 = regular
3 = late
4 = early
5 = on time
6 = generic
7 = rfu
An example value I would take as an input would be 213, which becomes 11010101 in binary. Index 0, 2, 4, 6, and 7 have their bit flipped indicating that this record is:
standard + regular + early + generic + rfu.
I am trying to figure out in perl how to take that binary data and build a string, like mentioned above with code + code + code, etc.
Any help would be greatly appreciated. Thanks.
Edit: My thoughts on how I might approach this are:
Convert decimal to binary
Find length of binary string
Using substr get the value (0 or 1) index by index
If index value = 1 then add relevant code to string
Is there a better way to go about this?
You can test bits on input from 0 to 7, and take only these that are set,
my $in = 213;
my #r = ("standard","special","regular","late","early","on time","generic","rfu");
print join " + ", #r[ grep { $in & (1 << $_) } 0 .. $#r ];
# or
# print join " + ", map { $in & (1<<$_) ? $r[$_] : () } 0 .. $#r;
output
standard + regular + early + generic + rfu

I'm having trouble adding two matrices in python

I want to add two matrcies in python 3 but the problem comes when I add input to the program
Here is my code
def addmatrix(a,b):
d=[]
n=0
while n < len (a):
c = []
k = 0
while k < len (a[0]) :
c.append(a[n][k]+b[n][k])
k += 1
n += 1
d.append (c)
return d
def main():
a = input("Enter a Matrix: ")
b = input("Enter another Matrix: ")
print (addmatrix(a,b))
main()
If the input is
Enter a Matrix: [[5,6], [1,2], [2,4]]
Enter another Matrix: [[2,3], [-6,0], [-2, 4]]
The output comes out as [['[['], ['[['], ['52'], [',,'], ['63'], [']]'], [',,'], [' '], ['[['], ['1-'], [',6'], ['2,'], [']0'], [',]'], [' ,'], ['[ '], ['2['], [',-'], ['42'], ['],'], ['] ']]
But if I take out the input from the program and make it so that
def main():
a = [[5,6], [1,2], [2,4]]
b = [[2,3], [-6,0], [-2, 4]]
print (addmatrix(a,b))
main()
The output then comes out as [[7, 9], [-5, 2], [0, 8]] which is correct.
Is there a way I can make my program work so that when a person inputs two matrices they add together? I'm new at python so any help will be appreciated :)
You will have to convert the user input into a Python object. Right now, it's a string.
You can use eval (which should not be used if you don't know what your users will input. I can type in __import__('os').system('rm /some/file.txt') and Python will delete a file):
a = eval(input("Enter a Matrix: "))
Or you can use ast.literal_eval, which is safe:
from ast import literal_eval
...
a = literal_eval(input("Enter a Matrix: "))
Try this:
import ast
def addmatrix(a,b):
return [map(sum, zip(*x)) for x in zip(a,b)]
def main():
a = ast.literal_eval(raw_input("Enter a Matrix: "))
b = ast.literal_eval(raw_input("Enter another Matrix: "))
print addmatrix(a,b)
main()