Hello everyone, I'm new here and want to start learning how machine learning works. I want to build a machine learning email spam detector in Colab, but something seems to be wrong:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from google.colab import files
uploaded = files.upload()
filename = 'spam.tsv'
content = []
with open(filename, "r") as file_content:
    for line in file_content.readlines():
        line = line.strip()
        content.append(line)
        spam = line
#print(spam)  # for testing if it is working
z = spam['EmailText']
y = spam["Label"]
z_train, z_test, y_train, y_test = train_test_split(z, y, test_size=0.2)
count_vector = CountVectorizer()
features = count_vector.transform(z_train)
model = svm.SVC()
model.fit(features, y_train)
features_test = count_vector.transform(z_test)
print(model.score(features_test, y_test))
I get these errors:
NotFittedError: Vocabulary not fitted or provided
TypeError: string indices must be integers
I've tried everything but nothing really works, haha.
I hope you can help.
Thank you!
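For what it's worth, here is a minimal sketch of a likely fix, assuming spam.tsv is a tab-separated file with Label and EmailText columns (that's my assumption). The TypeError comes from indexing a plain string (spam ends up holding the last line of the file), and the NotFittedError from calling transform() before the vectorizer has learned a vocabulary, so read the file with pandas and use fit_transform on the training split:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm

# Assumption: spam.tsv is tab-separated with 'Label' and 'EmailText' columns
spam = pd.read_csv('spam.tsv', sep='\t')

z = spam['EmailText']
y = spam['Label']
z_train, z_test, y_train, y_test = train_test_split(z, y, test_size=0.2)

count_vector = CountVectorizer()
# fit_transform learns the vocabulary from the training texts;
# calling transform() on an unfitted vectorizer raises NotFittedError
features = count_vector.fit_transform(z_train)

model = svm.SVC()
model.fit(features, y_train)

# reuse the fitted vocabulary on the test split
features_test = count_vector.transform(z_test)
print(model.score(features_test, y_test))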
Hi, I want to overlay or paste an image onto bigger images (I have a folder containing 10 images and want to overlay the smaller image on all 10) and save them in a different folder. I did try a few things but ran into errors.
import numpy as np
import os
import glob
import cv2
from scipy import misc

outPath = r"C:\darkflow\Augmented Images\augmented_images\.."
cov = cv2.imread(r"C:\darkflow\Augmented Images\extracted\cover\extracted_cover.jpg")
bgs = [cv2.imread(file) for file in glob.glob(r"C:\darkflow\Augmented Images\images\*.jpg")]

for bg in bgs:
    bg[y_offset:y_offset+s_img.shape[0], x_offset:x_offset+s_img.shape[1]] = cov
    f_image = cv2.cvtColor(bg, cv2.COLOR_BGR2RGB)
    fullpath = os.path.join(outPath, 'augmented_' + bg)
    misc.imsave(fullpath, f_image)
With this code I get an error: ufunc 'add' did not contain a loop with signature matching types dtype('
I found the answer while looking into the code. My working code is:
import glob
import numpy as np
import os
import cv2

cov = cv2.imread(r"C:\darkflow\Augmented Images\extracted\cover\extracted_cover.jpg")
bgs = [cv2.imread(file) for file in glob.glob(r"C:\darkflow\Augmented Images\images\*.jpg")]

d = 1
x_offset = 100
y_offset = 100
for bg in bgs:
    # paste the cover image into each background at the given offset
    bg[y_offset:y_offset + cov.shape[0], x_offset:x_offset + cov.shape[1]] = cov
    filename = "images/file_%d.jpg" % d
    cv2.imwrite(filename, bg)
    d += 1
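A note on the original error, in case it helps: in fullpath = os.path.join(outPath, 'augmented_' + bg), bg is a NumPy image array, so 'augmented_' + bg asks NumPy to add a string to an array, which is exactly what produces the "ufunc 'add' did not contain a loop with signature matching types" message. If you would rather name the outputs after the input files, a hypothetical variant of the loop above (reusing the same imports and offsets) could iterate over the paths instead of the loaded images:

for path in glob.glob(r"C:\darkflow\Augmented Images\images\*.jpg"):
    bg = cv2.imread(path)
    bg[y_offset:y_offset + cov.shape[0], x_offset:x_offset + cov.shape[1]] = cov
    # build the name from the source file's basename, not from the image array
    cv2.imwrite(os.path.join("images", "augmented_" + os.path.basename(path)), bg)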
I need to resize a screenshot taken by mss in order to get a better reading from pytesseract, and I got it done with PIL + pyscreenshot, but I can't get it to work with mss.
from numpy import array, flip
from mss import mss
from pytesseract import image_to_string
from time import sleep
def screenshot():
    cap = array(mss().grab({'top': 171, 'left': 1088, 'width': 40, 'height': 17}))
    cap = flip(cap[:, :, :3], 2)  # drop the alpha channel and convert BGR -> RGB
    return cap

def read(param):
    tesseract = image_to_string(param)
    return tesseract

while True:
    print(read(screenshot()))
    sleep(2)
Here it is working with pyscreenshot:
from time import sleep
from PIL import Image, ImageOps
import pyscreenshot as ImageGrab
import pytesseract
while 1:
    test = ImageGrab.grab(bbox=(1088, 171, 1126, 187))
    testt = ImageOps.fit(test, (50, 28), method=Image.ANTIALIAS)
    testt.save('result.png')
    read = pytesseract.image_to_string(testt)
    print(read)
    sleep(2)
And I don't care about maintaining the aspect ratio; it works better that way with pytesseract.
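For reference, a minimal sketch of how the resize could work with mss, assuming OpenCV is available (cv2.resize, the (50, 28) target, and cubic interpolation are my choices, mirroring the PIL version):

import cv2
from numpy import array
from mss import mss
from pytesseract import image_to_string

def screenshot():
    cap = array(mss().grab({'top': 171, 'left': 1088, 'width': 40, 'height': 17}))
    cap = cv2.cvtColor(cap, cv2.COLOR_BGRA2RGB)  # mss captures in BGRA order
    # upscale the tiny capture before OCR, as the PIL version does
    return cv2.resize(cap, (50, 28), interpolation=cv2.INTER_CUBIC)

print(image_to_string(screenshot()))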
I am implementing a variant of the CNN described by this paper.
My problem is that the loss isn't decreasing and I don't understand why. The same has to be said about accuracy (stuck at roughly 0.5).
This is a two-class classification problem. I am using the data from this website:
I suspected the optimizer, so I changed it, without any improvement. I am pretty sure the data I am using is fine because I used it on an LSTM and that classifier worked well.
Here is my code:
from keras.layers import Embedding
from keras.layers import Conv2D
from keras.models import Sequential
from keras.layers import MaxPooling2D
from keras.layers import Reshape
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Input
from keras import backend as K
from keras.models import Model
import tensorflow as tf
from sklearn import preprocessing
import keras
import numpy as np
import pdb
#using multiple filters
config = tf.ConfigProto()
# Don't pre-allocate memory; allocate as needed
config.gpu_options.allow_growth = True
# Cap total GPU memory allocation at 65% of the device
config.gpu_options.per_process_gpu_memory_fraction = 0.65
# Create a session with the above options specified
K.tensorflow_backend.set_session(tf.Session(config=config))
class Classifier():
    def __init__(self, Vocab, maxlen=75):
        self.model = Sequential()
        self.EMBED_DIM = 30
        self.batch_size = 30
        self.MAX_SEQUENCE_LENGTH = maxlen
        self.nb_labels = 2
        self.model = Sequential()
        self.Vocab = Vocab

    def fit(self, X, y):
        #pdb.set_trace()
        mainIn = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')
        x = Embedding(len(self.Vocab)+2, self.EMBED_DIM, input_length=self.MAX_SEQUENCE_LENGTH)(mainIn)
        x = Reshape((1, self.MAX_SEQUENCE_LENGTH, self.EMBED_DIM))(x)
        # three parallel convolution branches with different kernel sizes
        x1 = Conv2D(128, strides=2, kernel_size=5, activation="relu", padding='same')(x)
        x1 = MaxPooling2D((self.MAX_SEQUENCE_LENGTH-5+1, 1), padding='same')(x1)
        x1 = Flatten()(x1)
        x2 = Conv2D(128, strides=2, kernel_size=4, activation="sigmoid", padding='same')(x)
        x2 = MaxPooling2D((self.MAX_SEQUENCE_LENGTH-4+1, 1), padding='same')(x2)
        x2 = Flatten()(x2)
        x3 = Conv2D(128, strides=2, kernel_size=3, activation="tanh", padding='same')(x)
        x3 = MaxPooling2D((self.MAX_SEQUENCE_LENGTH-3+1, 1), padding='same')(x3)
        x3 = Flatten()(x3)
        combinedX = keras.layers.concatenate([x1, x2, x3], axis=1)
        combinedX = Dense(64, activation="relu")(combinedX)
        combinedX = Dropout(0.2)(combinedX)
        #output = Dense(self.nb_labels, activation="sigmoid")(combinedX)
        #output = Dense(2, activation="softmax")(combinedX)
        output = Dense(1, activation="sigmoid")(combinedX)
        #encoder = preprocessing.LabelEncoder()
        #encoder.fit(y)
        #encoded_Y = encoder.transform(y)
        #labels = keras.utils.to_categorical(encoded_Y, num_classes=2)
        labels = y
        #pdb.set_trace()  # left from debugging
        inputs2 = X
        self.model = Model(inputs=mainIn, outputs=output)
        # self.model.compile(loss='binary_crossentropy',
        #                    optimizer='adam',
        #                    metrics=['acc'])
        self.model.compile(loss='binary_crossentropy',
                           optimizer='rmsprop',
                           metrics=['acc'])
        self.model.fit(inputs2, labels, epochs=7, batch_size=self.batch_size)

    def predict(self, X):
        return self.model.predict(np.array(X))

    def predict_proba(self, X):
        return self.model.predict(np.array(X), self.batch_size)
Here is my code to preprocess the data:
#loading the file
import pandas as pd
file2 = pd.read_csv("Sentiment_Analysis Dataset.csv", error_bad_lines=False)
#splitting into train and test set
from sklearn.model_selection import train_test_split
Text = list(file2.SentimentText)
file2.groupby('Sentiment').count()
train_data, test_data, train_label, test_label = train_test_split(Text, file2.Sentiment, test_size=0.4, random_state=42)

#building the dictionary
vocabDic = dict()
for document in train_data:
    document = document.split(" ")
    for word in document:
        if word not in vocabDic:
            vocabDic[word] = len(vocabDic)+1
vocabDic['unk'] = len(vocabDic)+1

#coding the documents
def codeDocuments(documents, dictionary):
    documentsArray = list()
    for i, document in enumerate(documents):
        tempList = list()
        document = document.split(" ")
        for word in document:
            if word in dictionary:
                word = dictionary[word]
            else:
                word = dictionary['unk']
            tempList.append(word)
        documentsArray.append(tempList)
    return documentsArray  # list of lists; pad_sequences handles the ragged lengths

train_docs = codeDocuments(train_data, vocabDic)
test_docs = codeDocuments(test_data, vocabDic)

#padding the documents
from keras.preprocessing import sequence
maxlen = 75
train_set = sequence.pad_sequences(train_docs, maxlen=maxlen)
test_set = sequence.pad_sequences(test_docs, maxlen=maxlen)

#calling the model
model = Classifier(vocabDic, maxlen)
model.fit(train_set[:50000], train_label[:50000])
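One hedged observation, in case it is relevant (my reading of the code, not something from the paper): after Reshape((1, maxlen, EMBED_DIM)) with Keras's default channels_last format, the word axis becomes the width, while MaxPooling2D((maxlen - k + 1, 1)) pools over the height axis, which has size 1, so the max-over-all-window-positions step never actually happens. A Kim-style text CNN usually puts words on the rows and uses kernels spanning the full embedding width, roughly like this standalone sketch of one branch (sizes are hypothetical):

from keras.layers import Input, Embedding, Reshape, Conv2D, MaxPooling2D, Flatten

maxlen, embed_dim, vocab_size = 75, 30, 10000  # illustrative sizes
inp = Input(shape=(maxlen,), dtype='int32')
x = Embedding(vocab_size, embed_dim, input_length=maxlen)(inp)
x = Reshape((maxlen, embed_dim, 1))(x)  # rows = words, cols = embedding, 1 channel
# each filter spans 5 consecutive words across the full embedding width
x1 = Conv2D(128, kernel_size=(5, embed_dim), activation='relu')(x)
x1 = MaxPooling2D(pool_size=(maxlen - 5 + 1, 1))(x1)  # max over all window positions
x1 = Flatten()(x1)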
For a given pandas Series I want to use a Bokeh slider to change the number of observations I show (the tail). Can this be done, and if it can, why won't my script work?
import pandas as pd
from bokeh.plotting import show, output_file
from bokeh.models import CustomJS, Slider, Column
from bokeh.io import output_notebook
ds = pd.Series([i for i in range(20)], pd.date_range('2016-01-02', periods=20, freq='D'))
tail = 5
ds.tail(tail)

#Bokeh slider to change value of "tail"
s1 = Slider(start=1, end=20, value=3, step=1)
s1.callback = CustomJS(args=dict(s1=s1, tail=tail), code="""
    ds.tail("tail", s1.get('value'));
""")
show(Column(s1))
ds.tail(tail)
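As written, the callback can't work: CustomJS runs as JavaScript in the browser, so it has no access to the pandas object ds (and a slider's value is read from cb_obj.value, not s1.get('value')). A minimal sketch of the usual pattern, assuming the goal is a plot whose visible tail follows the slider (the plot itself is my addition): push the full series into one ColumnDataSource and let the callback copy the last N points into a second source that drives the glyph.

import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, CustomJS, Slider
from bokeh.layouts import column

ds = pd.Series([i for i in range(20)], pd.date_range('2016-01-02', periods=20, freq='D'))

full = ColumnDataSource(data=dict(x=ds.index, y=ds.values))            # all observations
view = ColumnDataSource(data=dict(x=ds.index[-5:], y=ds.values[-5:]))  # the visible tail

p = figure(x_axis_type='datetime')
p.line('x', 'y', source=view)

s1 = Slider(start=1, end=20, value=5, step=1, title='tail')
s1.js_on_change('value', CustomJS(args=dict(full=full, view=view), code="""
    const n = cb_obj.value;
    view.data = {x: full.data['x'].slice(-n), y: full.data['y'].slice(-n)};
    view.change.emit();
"""))

show(column(s1, p))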