tesseract not able to detect korean language properly - tesseract

I am learning how to detect Korean text, for sample I am using Korean text present in back of package, but pytesseract.image_to_string(img_pl,lang='kor') is not able to segregate words separately when I query with level set to word
Here is my code
import numpy as np
import pandas as pd
import cv2
import PIL
import pytesseract
import html
import io
import os
img_cv2 = cv2.imread('/Users/aniketdeshmukh/Desktop/Korean Text Images/0.png')
img_pl = PIL.Image.open('/Users/aniketdeshmukh/Desktop/Korean Text Images/0.png')
text_pl = pytesseract.image_to_string(img_cv2,lang='kor')
text_pl = pytesseract.image_to_string(img_pl,lang='kor')
#print(text_pl)
data = pytesseract.image_to_data(img_pl,lang='kor')
dataList = list(map(lambda x: x.split('\t'),data.split('\n')))
df = pd.DataFrame(dataList[1:],columns=dataList[0])
#df
df.info
df.dropna(inplace=True)
col_int = ['level','page_num','block_num','par_num','line_num','word_num','left','top','width','height','conf']
df[col_int] = df[col_int].astype(int)
img_cv2 = cv2.imread('/Users/aniketdeshmukh/Desktop/Korean Text Images/0.png')
image = img_cv2.copy()
level = 'word'
for l,x,y,w,h,c in df[['level','left','top','width','height','conf']].values:
if level == 'page':
if l == 1:
cv2.rectangle(image,(x,y),(x+w,y+h),(0,0,0),2)
else:
continue
elif level == 'block':
if l == 2:
cv2.rectangle(image,(x,y),(x+w,y+h),(255,0,0),2)
else:
continue
elif level == 'para':
if l == 3:
cv2.rectangle(image,(x,y),(x+w,y+h),(0,255,0),2)
else:
continue
elif level == 'line':
if l == 4:
cv2.rectangle(image,(x,y),(x+w,y+h),(0,0,255),2)
else:
continue
elif level == 'word':
if l == 5:
cv2.rectangle(image,(x,y),(x+w,y+h),(0,255,0),2)
else:
continue
cv2.imshow("bounding box",image)
cv2.waitKey(0)
cv2.destroyAllWindows()
output I get is following even when level is set to 'word'

Related

Updating a Dash Callback using RadioItems

I am fairly new to python coding so I apologize in advance for my ignorance. I am trying to create a Dash App that drops outliers using standard deviation. The user selects a standard deviation using RadioItem inputs.
My question is what amendments do I need to make to my code so that the RadioItem value updates max_deviations using a callback?
Import packages, clean the data and define a query
import dash
import plotly.express as px
from dash import Dash, dcc, html, Input, Output, State
import pandas as pd
import numpy as np
app = dash.Dash(__name__)
server = app.server
df=pd.read_csv(r'C:\SVS_GIS\POWER BI\CSV_DATA\QSAS2021.csv', encoding='unicode_escape')
#SET DATE OF VALUATION
df['TIME'] = ((pd.to_datetime(df['Sale Date'], dayfirst=True)
.rsub(pd.to_datetime('01/10/2021', dayfirst=True))
.dt.days
)*-1)
df=df[df['TIME'] >= -365]
df = df.query("(SMA >=1 and SMA <= 3) and (LGA==60)")
prepare dataframe for dropping outliers
data = pd.DataFrame(data=df)
x = df.TIME
y = df.CHANGE
mean = np.mean(y)
standard_deviation = np.std(y)
distance_from_mean = abs(y - mean)
app layout
app.layout = html.Div([
html.Label("Standard Deviation Picker:", style={'fontSize':25, 'textAlign':'center'}),
html.Br(),
html.Label("1.0 = 68%, 2.0 = 95%, 3.0 = 99.7%", style={'fontSize':15,
'textAlign':'center'}),
html.Div(id="radio_items"),
dcc.RadioItems(
options=[{'label': i, 'value': i} for i in [1.0, 2.0, 3.0]],
value=2.0
),
html.Div([
dcc.Graph(id="the_graph")]
)])
callback
#app.callback(
Output("the_graph", "figure"),
Input("radio_items", 'value')
)
def update_graph(max_deviations):
not_outlier = distance_from_mean < max_deviations * standard_deviation
no_outliers = y[not_outlier]
trim_outliers = pd.DataFrame(data=no_outliers)
dff = pd.merge(trim_outliers, df, left_index=True, right_index=True)
return (dff)
fig = px.scatter(dff, x='TIME', y='CHANGE_y',
color ='SMA',
trendline='ols',
size='PV',
height=500,
width=800,
hover_name='SMA',
)
return dcc.Graph(id='the_graph', figure=fig)
if __name__ == '__main__':
app.run_server(debug=False)
Your dcc.RadioItems doesn't have an id prop. Add that, and make sure it matches the ID given in the callback, and you should be good.

Plotly Bar Chart Axes

Im trying to create a simple bar chart and deploy it within a dropdown menu. My problem is with the axes. I do not know why when I use px.bar the deploy just take one variable (x or y) and make the plot, but when I use px.scatter it works, it takes both variables.
from dash import Dash, dcc, html, Input, Output
import plotly.express as px
import pandas as pd
import numpy as np
app = Dash(__name__)
pf = fr.iloc[2:10,18:25]
pf.columns = [prueba1]
pf.index = ['CH + CHL','CH','CHL(UST)','CHL','CL + CHL','CL','CHL(UST)','CHL']
app.layout = html.Div(children=[
html.Div([dcc.Dropdown(pf.index,'CH', id='pandas-dropdown-2'),
html.Div(id='pandas-output-container')]),
dcc.Graph(id='example-graph')
])
Output('example-graph','figure'),
Input('pandas-dropdown-2', 'value'))
def update_graph(value_input):
pff = pf[pf.index == value_input]
fig = px.bar(pff, x = np.array(prueba1), y = list(pff.values))#,
return fig
__name__ == '__main__':
app.run_server(debug=True, use_reloader=False)
The result is this one:
enter image description here
Why the Y label is count instead of years?
But when I change from px.bar to px.scatter this happens:
enter image description here
It works!!! But I need the bar chart!

Deep learning chatbot specific Index error list index out of range

I am trying to follow a tutorial on how to make a deeplearning chatbot with pytorch. However, this code is quite complex for me and it has stopped with a "IndexError: list index out of range". I looked the error up and get the gist of what it usually means, but seeing as this code is very complex for me I can't figure out how to solve the error.
this is the source tutorial: [https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/chatbot_tutorial.ipynb#scrollTo=LTzdbPF-OBL9][1]
Line 198 seems to be causing the error
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
This is the error log
Start preparing training data ...
Reading lines...
Traceback (most recent call last):
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 221, in <module>
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 209, in loadPrepareData
pairs = filterPairs(pairs)
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 202, in filterPairs
return [pair for pair in pairs if filterPair(pair)]
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 202, in <listcomp>
return [pair for pair in pairs if filterPair(pair)]
File "D:\Documents\Python\python pycharm files\pythonProject4\3.9 Chatbot.py", line 198, in filterPair
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
IndexError: list index out of range
Read 442563 sentence pairs
Process finished with exit code 1
And this is my code copied from my pycharm up to the block with the error. Seeing as its a huge code I could not copy the entire code. The rest of the code can be found in the github source link above.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join("D:\Documents\Python\intents", corpus_name)
def printLines(file, n=10):
with open(file, 'rb') as datafile:
lines = datafile.readlines()
for line in lines[:n]:
print(line)
printLines(os.path.join(corpus, "movie_lines.txt"))
# Splits each line of the file into a dictionary of fields
def loadLines(fileName, fields):
lines = {}
with open(fileName, 'r', encoding='iso-8859-1') as f:
for line in f:
values = line.split(" +++$+++ ")
# Extract fields
lineObj = {}
for i, field in enumerate(fields):
lineObj[field] = values[i]
lines[lineObj['lineID']] = lineObj
return lines
# Groups fields of lines from `loadLines` into conversations based on *movie_conversations.txt*
def loadConversations(fileName, lines, fields):
conversations = []
with open(fileName, 'r', encoding='iso-8859-1') as f:
for line in f:
values = line.split(" +++$+++ ")
# Extract fields
convObj = {}
for i, field in enumerate(fields):
convObj[field] = values[i]
# Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]")
lineIds = eval(convObj["utteranceIDs"])
# Reassemble lines
convObj["lines"] = []
for lineId in lineIds:
convObj["lines"].append(lines[lineId])
conversations.append(convObj)
return conversations
# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
qa_pairs = []
for conversation in conversations:
# Iterate over all the lines of the conversation
for i in range(len(conversation["lines"]) - 1): # We ignore the last line (no answer for it)
inputLine = conversation["lines"][i]["text"].strip()
targetLine = conversation["lines"][i+1]["text"].strip()
# Filter wrong samples (if one of the lists is empty)
if inputLine and targetLine:
qa_pairs.append([inputLine, targetLine])
return qa_pairs
# Define path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")
delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))
# Initialize lines dict, conversations list, and field ids
lines = {}
conversations = []
MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
# Load lines and process conversations
print("\nProcessing corpus...")
lines = loadLines(os.path.join(corpus, "movie_lines.txt"), MOVIE_LINES_FIELDS)
print("\nLoading conversations...")
conversations = loadConversations(os.path.join(corpus, "movie_conversations.txt"),
lines, MOVIE_CONVERSATIONS_FIELDS)
# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
writer = csv.writer(outputfile, delimiter=delimiter)
for pair in extractSentencePairs(conversations):
writer.writerow(pair)
# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)
# Default word tokens
PAD_token = 0 # Used for padding short sentences
SOS_token = 1 # Start-of-sentence token
EOS_token = 2 # End-of-sentence token
class Voc:
def __init__(self, name):
self.name = name
self.trimmed = False
self.word2index = {}
self.word2count = {}
self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
self.num_words = 3 # Count SOS, EOS, PAD
def addSentence(self, sentence):
for word in sentence.split(' '):
self.addWord(word)
def addWord(self, word):
if word not in self.word2index:
self.word2index[word] = self.num_words
self.word2count[word] = 1
self.index2word[self.num_words] = word
self.num_words += 1
else:
self.word2count[word] += 1
# Remove words below a certain count threshold
def trim(self, min_count):
if self.trimmed:
return
self.trimmed = True
keep_words = []
for k, v in self.word2count.items():
if v >= min_count:
keep_words.append(k)
print('keep_words {} / {} = {:.4f}'.format(
len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
))
# Reinitialize dictionaries
self.word2index = {}
self.word2count = {}
self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
self.num_words = 3 # Count default tokens
for word in keep_words:
self.addWord(word)
MAX_LENGTH = 10 # Maximum sentence length to consider
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
return ''.join(
c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn'
)
# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
s = unicodeToAscii(s.lower().strip())
s = re.sub(r"([.!?])", r" \1", s)
s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
s = re.sub(r"\s+", r" ", s).strip()
return s
# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
print("Reading lines...")
# Read the file and split into lines
lines = open(datafile, encoding='utf-8').\
read().strip().split('\n')
# Split every line into pairs and normalize
pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
voc = Voc(corpus_name)
return voc, pairs
# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
# Input sequences need to preserve the last word for EOS token
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
# Filter pairs using filterPair condition
def filterPairs(pairs):
return [pair for pair in pairs if filterPair(pair)]
# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
print("Start preparing training data ...")
voc, pairs = readVocs(datafile, corpus_name)
print("Read {!s} sentence pairs".format(len(pairs)))
pairs = filterPairs(pairs)
print("Trimmed to {!s} sentence pairs".format(len(pairs)))
print("Counting words...")
for pair in pairs:
voc.addSentence(pair[0])
voc.addSentence(pair[1])
print("Counted words:", voc.num_words)
return voc, pairs
# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
print(pair)
MIN_COUNT = 3 # Minimum word count threshold for trimming
I really hope someone can help me fix this problem and help me understand why it happens.
In the end I changed
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
to
try:
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
except:
return False
And now the code seems to be working.

Encoding problem while running text summarization code

Good Day
I was testing the functionality of a text summarization code published on the website: https://towardsdatascience.com/understand-text-summarization-and-create-your-own-summarizer-in-python-b26a9f09fc70.
The problem is that, when I call the function on a text file, the 'cp949' codec can't decode byte 0xe2 in position 205: illegal multibyte sequence error appears. I know, from other posts, that it is an error related to the encoding type of the file. Therefore, I changed the encoding type of the test2.txt file to UTF-8 (saving the file in Plain text format, then choosing UTF-8 on Text Encoding > Other Encoding), but I still get this error message.
Here is the code that I wrote:
Import libraries
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
test_text_word = "test2.txt"
def read_article(test_text_word):
file = open(test_text_word, "r")
filedata = file.readlines()
article = filedata[0].split(". ")
sentences = []`
for sentence in article:
print(sentence)
sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
sentences.pop()
return sentences
def sentence_similarity(sent1, sent2, stopwords=None):
if stopwords is None:
stopwords = []
sent1 = [w.lower() for w in sent1]
sent2 = [w.lower() for w in sent2]
all_words = list(set(sent1 + sent2))
vector1 = [0] * len(all_words)
vector2 = [0] * len(all_words)
# build the vector for the first sentence
for w in sent1:
if w in stopwords:
continue
vector1[all_words.index(w)] += 1
# build the vector for the second sentence
for w in sent2:
if w in stopwords:
continue
vector2[all_words.index(w)] += 1
return 1 - cosine_distance(vector1, vector2)
def build_similarity_matrix(sentences, stop_words):
# Create an empty similarity matrix
similarity_matrix = np.zeros((len(sentences), len(sentences)))
for idx1 in range(len(sentences)):
for idx2 in range(len(sentences)):
if idx1 == idx2: #ignore if both are same sentences
continue
similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
return similarity_matrix
def generate_summary(test_text_word, top_n=5):
stop_words = stopwords.words('english')
summarize_text = []
# Step 1 - Read text anc split it
sentences = read_article(test_text_word)
# Step 2 - Generate Similary Martix across sentences
sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
scores = nx.pagerank(sentence_similarity_graph)
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are ", ranked_sentence)
for i in range(top_n):
summarize_text.append(" ".join(ranked_sentence[i][1]))
# Step 5 - Offcourse, output the summarize texr
print("Summarize Text: \n", ". ".join(summarize_text))
The problem is that, when I run the code, with the following command:
generate_summary("test2.txt", 2)
I receive this error message: 'cp949' codec can't decode byte 0xe2 in position 205: illegal multibyte sequence
Should I change something in the code?
Thanks for your support.

I am trying to write output of a python dataframe in a csv file.

I am getting an error on line 35 in the spamwriter.writerow() funtion. The error states that wRR should be byte instead of string. Please check the code given below:
import csv
import sys
import numpy as np
from numpy import genfromtxt
from numpy.linalg import inv
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d, Axes3D
Lambda = float(sys.argv[1])
Sigma2 = float(sys.argv[2])
X_train = genfromtxt(sys.argv[3], delimiter=',')
y_train = genfromtxt(sys.argv[4], delimiter=',')
X_test = genfromtxt(sys.argv[5], delimiter=',')
# Get the number of columns -> dimension of the input vector -> size of identity_matrix
N = X_train.shape[0]
d = X_train.shape[1]
N_test = X_test.shape[0]
######################
## PART 1 ##
######################
identity_matrix = np.identity(d)
LambdaDotIdentityMatrix = np.multiply(identity_matrix,Lambda)
XTransposeX = np.transpose(X_train).dot(X_train)
Inverse = inv(LambdaDotIdentityMatrix+XTransposeX)
XtransposeY = np.transpose(X_train).dot(y_train)
wRR = Inverse.dot(XtransposeY)
nameOfThePart1File = "wRR_"+str(int(Lambda))+".csv"
with open(nameOfThePart1File, 'wb') as csvfile:
spamwriter = csv.writer(csvfile, delimiter='\n', quotechar='|', quoting=csv.QUOTE_MINIMAL)
spamwriter.writerow(np.transpose(wRR))