NotFittedError: Vocabulary not fitted or provided or TypeError: string indices must be integers - email

Hello everyone, I'm new here and want to start learning how machine learning works. I want to build a machine learning email spam detector in Colab, but something seems to be wrong:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from google.colab import files
uploaded = files.upload()
filename = 'spam.tsv'
content = []
with open(filename, "r") as file_content:
    for line in file_content.readlines():
        line = line.strip()
        content.append(line)
        spam = line
#print(spam)  # for testing if it is working
z = spam['EmailText']
y = spam["Label"]
z_train, z_test,y_train, y_test = train_test_split(z,y,test_size = 0.2)
count_vector = CountVectorizer()
features = count_vector.transform(z_train)
model = svm.SVC()
model.fit(features,y_train)
features_test = count_vector.transform(z_test)
print(model.score(features_test,y_test))
The errors I get are: NotFittedError: Vocabulary not fitted or provided, and TypeError: string indices must be integers.
I tried everything but nothing really works, haha.
I hope you can help.
Thank you!
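For anyone hitting the same pair of errors, here is a minimal sketch of the likely fix, assuming spam.tsv is tab-separated with Label and EmailText columns. The TypeError comes from spam ending up as the last line of the file (a plain string, so spam['EmailText'] indexes it with a string), and the NotFittedError from calling transform() before the vectorizer has learned a vocabulary. Loading the TSV with pandas and using fit_transform on the training split addresses both:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm

# Load the TSV into a DataFrame instead of looping over raw lines,
# so spam['EmailText'] selects a column rather than indexing a string.
spam = pd.read_csv('spam.tsv', sep='\t')  # assumes columns 'Label' and 'EmailText'

z = spam['EmailText']
y = spam['Label']
z_train, z_test, y_train, y_test = train_test_split(z, y, test_size=0.2)

count_vector = CountVectorizer()
features = count_vector.fit_transform(z_train)  # fit learns the vocabulary, then transforms
model = svm.SVC()
model.fit(features, y_train)

features_test = count_vector.transform(z_test)  # reuse the fitted vocabulary
print(model.score(features_test, y_test))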

Related

Issue with scipy.io.wavfile.read and scipy.fftpack.fft

from os.path import dirname, join as pjoin
import scipy.io as sio
from scipy.io import wavfile
from scipy.fftpack import fft
data_dir = pjoin(dirname(sio.__file__), 'tests', 'data')
wav_fname = pjoin(data_dir, 'test-44100Hz-2ch-32bit-float-be.wav')
print(wav_fname)
def create_FFT(fn, size=1000):
    sample_rate, X = wavfile.read(fn)
    fft_features = abs(fft(X)[:size])
    return (sample_rate, X, fft_features)
for wav_fn in wav_fname:
    samplerate, data, fft_features = create_FFT(wav_fn)
    print(f"number of channels = {data.shape[1]}")
    print("fft features are: {}".format(fft_features))
In the above code, if I don't include the FFT-specific code in the create_FFT function, I can read the file and print the number of channels. However, as soon as I include the FFT-specific code, I get the error "FileNotFoundError: [Errno 2] No such file or directory: 'C'".
Any help will be appreciated.
Found the answer: it was the for loop at the bottom.
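To spell out the fix (my reading, not spelled out above): wav_fname is a single path string, so for wav_fn in wav_fname iterates over its characters, and wavfile.read is called with 'C', the first character of the path. Looping over a list of paths avoids that:
# wav_fname is one path string; iterate over a list of paths, not over its characters
for wav_fn in [wav_fname]:
    samplerate, data, fft_features = create_FFT(wav_fn)
    print(f"number of channels = {data.shape[1]}")
    print("fft features are: {}".format(fft_features))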

Overlay a small image on multiple bigger images and save them in a different folder

Hi, I want to overlay or paste an image onto bigger images (I have a folder containing 10 images and want to overlay the smaller image on all 10) and save them in a different folder. I did try some things but ran into errors.
import glob
import numpy as np
import os
import cv2
from scipy import misc

outPath = r"C:\darkflow\Augmented Images\augmented_images\.."
cov = cv2.imread(r"C:\darkflow\Augmented Images\extracted\cover\extracted_cover.jpg")
bgs = [cv2.imread(file) for file in glob.glob(r"C:\darkflow\Augmented Images\images\*.jpg")]
x_offset = 100
y_offset = 100
for bg in bgs:
    bg[y_offset:y_offset+cov.shape[0], x_offset:x_offset+cov.shape[1]] = cov
    f_image = cv2.cvtColor(bg, cv2.COLOR_BGR2RGB)
    fullpath = os.path.join(outPath, 'augmented_' + bg)
    misc.imsave(fullpath, f_image)
With this code I get an error: ufunc 'add' did not contain a loop with signature matching types dtype('
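A note on that error, in case it helps others: 'augmented_' + bg adds a Python string to the NumPy image array bg, which is exactly the mismatched-dtype addition the ufunc 'add' message complains about. Building the filename from a counter (as the working code below does) avoids it, e.g.:
# hypothetical fix: name the file from a counter d, not from the image array itself
fullpath = os.path.join(outPath, 'augmented_%d.jpg' % d)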
I found the answer while looking into the code. My working code is:
from scipy import ndimage, misc
import glob
import numpy as np
import os
import cv2

cov = cv2.imread(r"C:\darkflow\Augmented Images\extracted\cover\extracted_cover.jpg")
bgs = [cv2.imread(file) for file in glob.glob(r"C:\darkflow\Augmented Images\images\*.jpg")]
d = 1
x_offset = 100
y_offset = 100
for bg in bgs:
    bg[y_offset:y_offset + cov.shape[0], x_offset:x_offset + cov.shape[1]] = cov
    filename = "images/file_%d.jpg" % d
    cv2.imwrite(filename, bg)
    d += 1
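One small follow-up: this working version saves into a hard-coded images/ folder rather than the different folder the question asked about. Assuming outPath from the first snippet is the intended destination (and that the folder exists), the write could instead be:
# write into the separate output folder instead of the hard-coded "images/" path
filename = os.path.join(outPath, "file_%d.jpg" % d)
cv2.imwrite(filename, bg)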

Loss isn't decreasing in neural network

I am implementing a variant of the CNN described by this paper.
My problem is that the loss isn't decreasing and I don't understand why. The same goes for the accuracy (it is stuck at roughly 0.5).
This is a two-class classification problem. I am using the data from this website:
I suspected the optimizer, so I changed it without any improvement. I am pretty sure the data I am using is OK because I used it with an LSTM and that classifier was fine.
Here is my code:
from keras.layers import Embedding
from keras.layers import Conv2D
from keras.models import Sequential
from keras.layers import MaxPooling2D
from keras.layers import Reshape
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Input
from keras import backend as K
from keras.models import Model
import tensorflow as tf
from sklearn import preprocessing
import keras
import numpy as np
import pdb
#using multiple filters
config = tf.ConfigProto()
# Don't pre-allocate memory; allocate as-needed
config.gpu_options.allow_growth = True
# Cap GPU memory allocation at 65% of the card
config.gpu_options.per_process_gpu_memory_fraction = 0.65
# Create a session with the above options specified.
K.tensorflow_backend.set_session(tf.Session(config=config))
class Classifier():
    def __init__(self, Vocab, maxlen=75):
        self.model = Sequential()
        self.EMBED_DIM = 30
        self.batch_size = 30
        self.MAX_SEQUENCE_LENGTH = maxlen
        self.nb_labels = 2
        self.model = Sequential()
        self.Vocab = Vocab
    def fit(self, X, y):
        #pdb.set_trace()
        mainIn = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')
        x = Embedding(len(self.Vocab)+2, self.EMBED_DIM, input_length=self.MAX_SEQUENCE_LENGTH)(mainIn)
        x = Reshape((1, self.MAX_SEQUENCE_LENGTH, self.EMBED_DIM))(x)
        x1 = Conv2D(128, strides=2, kernel_size=5, activation="relu", padding='same')(x)
        x1 = MaxPooling2D((self.MAX_SEQUENCE_LENGTH-5+1, 1), padding='same')(x1)
        x1 = Flatten()(x1)
        x2 = Conv2D(128, strides=2, kernel_size=4, activation="sigmoid", padding='same')(x)
        x2 = MaxPooling2D((self.MAX_SEQUENCE_LENGTH-4+1, 1), padding='same')(x2)
        x2 = Flatten()(x2)
        x3 = Conv2D(128, strides=2, kernel_size=3, activation="tanh", padding='same')(x)
        x3 = MaxPooling2D((self.MAX_SEQUENCE_LENGTH-3+1, 1), padding='same')(x3)
        x3 = Flatten()(x3)
        combinedX = keras.layers.concatenate([x1, x2, x3], axis=1)
        combinedX = Dense(64, activation="relu")(combinedX)
        combinedX = Dropout(0.2)(combinedX)
        #output = Dense(self.nb_labels, activation="sigmoid")(combinedX)
        #output = Dense(2, activation="softmax")(combinedX)
        output = Dense(1, activation="sigmoid")(combinedX)
        #encoder = preprocessing.LabelEncoder()
        #encoder.fit(y)
        #encoded_Y = encoder.transform(y)
        #labels = keras.utils.to_categorical(encoded_Y, num_classes=2)
        labels = y
        pdb.set_trace()
        inputs2 = X
        self.model = Model(inputs=mainIn, outputs=output)
        # self.model.compile(loss='binary_crossentropy',
        #                    optimizer='adam',
        #                    metrics=['acc'])
        self.model.compile(loss='binary_crossentropy',
                           optimizer='rmsprop',
                           metrics=['acc'])
        self.model.fit(inputs2, labels, epochs=7, batch_size=self.batch_size)
    def predict(self, X):
        return self.model.predict(np.array(X))
    def predict_proba(self, X):
        return self.model.predict(np.array(X), self.batch_size)
Here is my code to preprocess the data:
#loading the file
import pandas as pd
file2 = pd.read_csv("Sentiment_Analysis Dataset.csv", error_bad_lines=False)
#splitting into train and test sets
from sklearn.model_selection import train_test_split
Text = list(file2.SentimentText)
file2.groupby('Sentiment').count()
train_data, test_data, train_label, test_label = train_test_split(Text, file2.Sentiment, test_size=0.4, random_state=42)
#building the dictionary
vocabDic = dict()
for document in train_data:
    document = document.split(" ")
    for word in document:
        if word not in vocabDic:
            vocabDic[word] = len(vocabDic)+1
vocabDic['unk'] = len(vocabDic)+1
#coding the documents
def codeDocuments(documents, dictionnary):
    documentsArray = list()
    for i, document in enumerate(documents):
        tempList = list()
        document = document.split(" ")
        for word in document:
            if word in vocabDic:
                word = vocabDic[word]
            else:
                word = vocabDic['unk']
            tempList.append(word)
        documentsArray.append(tempList)
    return np.array(documentsArray)
train_docs = codeDocuments(train_data, vocabDic)
test_docs = codeDocuments(test_data, vocabDic)
#padding the documents
from keras.preprocessing import sequence
maxlen = 75
train_set = sequence.pad_sequences(train_docs, maxlen=maxlen)
test_set = sequence.pad_sequences(test_docs, maxlen=maxlen)
#calling the model
model = Classifier(vocabDic, maxlen)
model.fit(train_set[:50000], train_label[:50000])
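Not part of the original post, but a standard first check for a flat loss curve is whether the model can overfit a tiny slice of the data: if it cannot push training accuracy well above 0.5 on a few hundred examples, the problem is in the model or input pipeline rather than the optimizer. A minimal sketch using the classes above (note that the pdb.set_trace() call inside fit() pauses execution and would need to be removed first):
# sanity check (hypothetical): a healthy model should be able to memorize a tiny subset
small_model = Classifier(vocabDic, maxlen)
small_model.fit(train_set[:200], np.array(train_label[:200]))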

Printing confusion matrix to file produces illegal characters

I am classifying a set of images stored as tuples in a csv file.
The confusion matrix that I get in the terminal display is correct. But when I write that same confusion matrix to a file, it produces illegal characters (32-bit hex).
Here's the code:
from sklearn.metrics import confusion_matrix
import numpy as np
import os
import csv
from sklearn import svm
from sklearn import cross_validation
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn import metrics
import cPickle
def prec(num):
    return "%0.5f" % num

outfile = open("output/linear_svm_output.txt", "a")

for dim in [20, 30, 40]:
    images = []
    labels = []
    name = str(dim) + "x" + str(dim) + ".csv"
    with open(name, 'r') as file:
        reader = csv.reader(file, delimiter=',')
        for line in file:
            labels.append(line[0])
            line = line[2:]  # Remove the label
            image = [int(pixel) for pixel in line.split(',')]
            images.append(np.array(image))
    clf = svm.LinearSVC()
    print clf
    kf = cross_validation.KFold(len(images), n_folds=10, indices=True, shuffle=True, random_state=4)
    print "\nDividing dataset using `Kfold()` -:\n\nThe training dataset has been divided into " + str(len(kf)) + " parts\n"
    for train, test in kf:
        training_images = []
        training_labels = []
        for i in train:
            training_images.append(images[i])
            training_labels.append(labels[i])
        testing_images = []
        testing_labels = []
        for i in test:
            testing_images.append(images[i])
            testing_labels.append(labels[i])
        clf.fit(training_images, training_labels)
        predicted = clf.predict(testing_images)
        print prec(clf.score(testing_images, testing_labels))
        outfile.write(prec(clf.score(testing_images, testing_labels)))
        outfile.write(str(clf))
        outfile.write(confusion_matrix(testing_labels, predicted))
        print confusion_matrix(testing_labels, predicted)
        # outfile.write(metrics.classification_report(testing_labels, predicted))
    print "\nDividing dataset using `train_test_split()` -:\n"
    training_images, testing_images, training_labels, testing_labels = cross_validation.train_test_split(images, labels, test_size=0.2, random_state=0)
    clf = clf.fit(training_images, training_labels)
    score = clf.score(testing_images, testing_labels)
    predicted = clf.predict(testing_images)
    print prec(score)
    outfile.write(str(clf))
    outfile.write(confusion_matrix(testing_labels, predicted))
    print confusion_matrix(testing_labels, predicted)
    # outfile.write(metrics.classification_report(testing_labels, predicted))
Output in file:
302e 3939 3338 374c 696e 6561 7253 5643
2843 3d31 2e30 2c20 636c 6173 735f 7765
...
Use the following to print the matrix to file properly:
with open(filename, 'w') as f:
    f.write(np.array2string(confusion_matrix(y_test, pred), separator=', '))
That's because outfile.write(confusion_matrix(testing_labels, predicted)) writes the matrix out in binary format. If you want to write it as human-readable text, try this if you are using Python 2.x:
print >> outfile, confusion_matrix(testing_labels, predicted)
It just redirects stdout to outfile.
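For completeness (not in the original answers): in Python 3, the same redirection is spelled with the file argument of print():
# Python 3 equivalent of the Python 2 `print >> outfile, ...` form
print(confusion_matrix(testing_labels, predicted), file=outfile)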

Bokeh's hover tool in IPython refuses to display tooltips

I literally copied and pasted the example of how to use the hover tool from Bokeh's documentation and I still can't get this damn thing to work. I just want Bokeh's hover tool to display the x and y coordinates. I think I've implemented it correctly, but let me know if anything's wrong. (The ASCII file reads in flawlessly, the graph plots correctly, and all the other tools work.)
from bokeh.plotting import *
from bokeh.objects import HoverTool
from collections import OrderedDict
output_notebook()
%matplotlib inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import mpld3
from mpld3 import plugins, utils
mpld3.enable_notebook()
from pylab import *
import pandas as pd
import math  # needed for math.log10 in the lambdas below
chip1 = pd.io.parsers.read_table("Chip1_Buffer_ASCII", sep=";")
chip2 = pd.io.parsers.read_table("Chip2_Buffer_ASCII", sep=";")
chip3 = pd.io.parsers.read_table("Chip3_Buffer_ASCII", sep=";")
chip1_1=chip1
chip1_2=chip2
chip1_3=chip3
chip1_1["Frequency (Hz)"]=chip1["Frequency (Hz)"].map(lambda x: math.log10(x))
chip1_2["Frequency (Hz)"]=chip2["Frequency (Hz)"].map(lambda x: math.log10(x))
chip1_3["Frequency (Hz)"]=chip3["Frequency (Hz)"].map(lambda x: math.log10(x))
diff_1_2 = chip1 - chip2
diff_1_2["Frequency (Hz)"] = chip1_1["Frequency (Hz)"]
source1 = ColumnDataSource(chip1_1.to_dict("list"))
source2 = ColumnDataSource(chip1_2.to_dict("list"))
source3 = ColumnDataSource(chip1_3.to_dict("list"))
source4=ColumnDataSource(diff_1_2.to_dict("list"))
import bokeh.plotting as bk
bk.figure(plot_width=600,  # in units of px
          plot_height=600,
          title="Hello World!",
          tools="pan,wheel_zoom,box_zoom,select,reset,hover")
bk.hold()
bk.line("Frequency (Hz)", "-Phase (°)",line_width=2,source=source1,logx=True,color="red",xlim=[0, 10000])
bk.line("Frequency (Hz)", "-Phase (°)",line_width=2,source=source2,logx=True,color="green",xlim=[0, 10000])
bk.line("Frequency (Hz)", "-Phase (°)",line_width=2,source=source3,logx=True,color="orange",xlim=[0, 10000])
bk.line("Frequency (Hz)", "-Phase (°)",line_width=2,source=source4,logx=True,color="orange",xlim=[0, 10000])
hover = bk.curplot().select(dict(type=HoverTool))
hover.tooltips = OrderedDict([
    ("(x,y)", "($x, $y)"),
    ("index", "$index")
])
bk.show()
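No answer is recorded here, but one hedged guess: in Bokeh releases of this vintage the hover tool, as far as I know, did not produce tooltips for line glyphs, only for marker-style glyphs. If that is the cause, overlaying markers on the same data source gives the hover tool something to hit (a sketch assuming the same old bokeh.plotting API used above):
# assumption: hover fires on marker glyphs, so draw circles over the line from the same source
bk.scatter("Frequency (Hz)", "-Phase (°)", source=source1, size=4, color="red")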