chatterbot twitter_trainer ASCII encoding error - chatbot

I am trying to run the chatterbot's TwitterTrainer on a separate program like so:
from chatterbot import ChatBot
from chatterbot.trainers import TwitterTrainer
from settings import TWITTER
import logging
# Comment out the following line to disable verbose logging
logging.basicConfig(level=logging.INFO)
chatbot = ChatBot("TwitterBot",
logic_adapters=[
"chatterbot.logic.BestMatch"
],
input_adapter="chatterbot.input.TerminalAdapter",
output_adapter="chatterbot.output.TerminalAdapter",
database="./twitter-database.db",
twitter_consumer_key=TWITTER["CONSUMER_KEY"],
twitter_consumer_secret=TWITTER["CONSUMER_SECRET"],
twitter_access_token_key=TWITTER["ACCESS_TOKEN"],
twitter_access_token_secret=TWITTER["ACCESS_TOKEN_SECRET"],
trainer="chatterbot.trainers.TwitterTrainer",
random_seed_word="random"
)
chatbot.train()
chatbot.logger.info('Trained database generated successfully!')
And i get errors that look like that:
File "C:\Python27\lib\json\decoder.py", line 364, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end()) File "C:\Python27\lib\json\decoder.py", line 380, in raw_decode
obj, end = self.scan_once(s, idx) UnicodeDecodeError: 'utf8' codec can't decode byte 0x85 in position 94: invalid start byte
This program doesn't run more than 3 seconds straight, but some tweets are written to the twitter-database.db until exception occurs.
Also when looking at the trainer.py i saw this:
# TODO: Handle non-ascii characters properly
Any ideas about why this happens and how can i fix this?

Could you try to add Python Source Code Encoding top of your file # -*- coding: utf-8 -*-. These type error will occurs due to this. More information available here http://chatterbot.readthedocs.io/en/stable/encoding.html#fixing-encoding-errors

Related

Apache Beam ReadFromSpanner decoding issue

I'm trying to run the following script in a GCP Dataflow pipeline.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from typing import NamedTuple, Optional
from apache_beam.io.gcp.spanner import *
from past.builtins import unicode
import logging
class ItemRow(NamedTuple):
item_id: unicode
class LogResults(beam.DoFn):
"""Just log the results"""
def process(self, element):
logging.info("row: %s", element)
yield element
class SpannerToSpannerAndBigQueryPipelineOptions(PipelineOptions):
"""
Runtime Parameters given during template execution
path parameter is necessary for execution of pipeline
"""
#classmethod
def _add_argparse_args(cls, parser):
parser.add_argument(
'--SOURCE_SPANNER_PROJECT_ID', type=str, help='Source Spanner project ID',
default='project_id')
parser.add_argument(
'--SOURCE_SPANNER_DATASET_ID', type=str, help='Source Spanner dataset ID',
default='dataset_id')
parser.add_argument(
'--SOURCE_SPANNER_INSTANCE_ID', type=str, help='Source Spanner instance ID',
default='instance_id')
parser.add_argument(
'--SOURCE_QUERY', type=str, help='SQL to run in Source Spanner Instance',
required=True)
# Setup pipeline
def run():
beam.coders.registry.register_coder(ItemRow, beam.coders.RowCoder)
pipeline_options = PipelineOptions()
p = beam.Pipeline(options=pipeline_options)
importer_options = pipeline_options.view_as(
SpannerToSpannerAndBigQueryPipelineOptions)
rows = (
p
| "Read from source Spanner" >> ReadFromSpanner(
project_id=importer_options.SOURCE_SPANNER_PROJECT_ID,
instance_id=importer_options.SOURCE_SPANNER_INSTANCE_ID,
database_id=importer_options.SOURCE_SPANNER_DATASET_ID,
row_type=ItemRow,
sql='Select item_id from Items WHERE created_ts BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 5 SECOND) AND CURRENT_TIMESTAMP()',
timestamp_bound_mode=TimestampBoundMode.MAX_STALENESS,
staleness=3,
time_unit=TimeUnit.HOURS,
).with_output_types(ItemRow)
)
rows | 'Log results' >> beam.ParDo(LogResults())
result = p.run()
result.wait_until_finish()
if __name__ == "__main__":
run()
However, I've been running into issues for decoding the results obtained from Spanner. These are the output logs from my Dataflow job:
"An exception was raised when trying to execute the workitem 6665479626992209510 : Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 649, in do_work
work_executor.execute()
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/executor.py", line 179, in execute
op.start()
File "dataflow_worker/native_operations.py", line 38, in dataflow_worker.native_operations.NativeReadOperation.start
File "dataflow_worker/native_operations.py", line 39, in dataflow_worker.native_operations.NativeReadOperation.start
File "dataflow_worker/native_operations.py", line 44, in dataflow_worker.native_operations.NativeReadOperation.start
File "dataflow_worker/native_operations.py", line 48, in dataflow_worker.native_operations.NativeReadOperation.start
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/inmemory.py", line 108, in __iter__
yield self._source.coder.decode(value)
File "/usr/local/lib/python3.7/site-packages/apache_beam/coders/coders.py", line 468, in decode
return self.get_impl().decode(encoded)
File "apache_beam/coders/coder_impl.py", line 226, in apache_beam.coders.coder_impl.StreamCoderImpl.decode
File "apache_beam/coders/coder_impl.py", line 228, in apache_beam.coders.coder_impl.StreamCoderImpl.decode
File "apache_beam/coders/coder_impl.py", line 123, in apache_beam.coders.coder_impl.CoderImpl.decode_from_stream
File "/usr/local/lib/python3.7/site-packages/apache_beam/coders/row_coder.py", line 215, in decode_from_stream
is_null in zip(self.components, nulls)))
File "/usr/local/lib/python3.7/site-packages/apache_beam/coders/row_coder.py", line 215, in <genexpr>
is_null in zip(self.components, nulls)))
File "apache_beam/coders/coder_impl.py", line 259, in apache_beam.coders.coder_impl.CallbackCoderImpl.decode_from_stream
File "apache_beam/coders/coder_impl.py", line 261, in apache_beam.coders.coder_impl.CallbackCoderImpl.decode_from_stream
File "/usr/local/lib/python3.7/site-packages/apache_beam/coders/coders.py", line 414, in decode
return value.decode('utf-8')
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x83 in position 9: invalid start byte
"
I'm unsure as to how to solve this problem. I'm using this https://beam.apache.org/releases/pydoc/2.27.0/apache_beam.io.gcp.spanner.html?highlight=spanner#module-apache_beam.io.gcp.spanner example as a starting point. The issue appears to be in decoding the results obtained from Spanner. There is little to no documentation on how to specify the schema for the Spanner table/tables I'm trying to query.
There is also an experimental IO module for Spanner which does not use the Java expansion module. Is it recommended to switch to the experimental version?
Thanks
I could not run a pipeline using the apache_beam.io.gcp.spanner module so I ended up using the apache_beam.io.gcp.experimental.spannerio module instead.

flask form post : cannot submit Chinese charactor, only English words works

I cannot submit Chinese words in the form.
English word is OK.
How can I set the utf-8 ?
In the html or app.py? or html code?
Here is my page: http://shiqiu.pw/testpage
PS: I can submit Chinese word in my mac, no problem. But when I deploy my code to my server, Chinese word can not be submitted, it shows Internal Server Error
PPS:I send my post words to mysql, and then show mysql data in new page, perhaps there is something need to set with my mysql?
#myapp.py code part
with connection.cursor() as cursor:
# Create a new recrod
word = request.form.get('word')
print('word')
print(word)
meaning = request.form.get('meaning')
sql = 'INSERT INTO sqdict (word, meaning) VALUES (%s, %s)'
cursor.execute(sql, (word, meaning)) # execute
# connection is not autocommit by default. So you must commit to save your changes.
connection.commit()
with connection.cursor() as cursor:
sql = 'SELECT * from sqdict'
cursor.execute(sql)
# sqlresult = cursor.fetchone() # only show the first row
sqlresult = cursor.fetchall() # all rows
print('sqlresult')
print(sqlresult)
allwords = sqlresult
ERROR LOG BELOW
word
工作
ERROR:flask.app:Exception on /test [POST]
Traceback (most recent call last):
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/app.py", line 1993, in make_response
rv = self.response_class.force_type(rv, request.environ)
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/werkzeug/wrappers/base_response.py", line 269, in force_type
response = BaseResponse(*_run_wsgi_app(response, environ))
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/werkzeug/test.py", line 1119, in run_wsgi_app
app_rv = app(environ, start_response)
TypeError: 'InternalError' object is not callable
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/app.py", line 2311, in wsgi_app
response = self.full_dispatch_request()
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/app.py", line 1835, in full_dispatch_request
return self.finalize_request(rv)
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/app.py", line 1850, in finalize_request
response = self.make_response(rv)
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/app.py", line 2001, in make_response
reraise(TypeError, new_error, sys.exc_info()[2])
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/_compat.py", line 35, in reraise
raise value.with_traceback(tb)
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/app.py", line 1993, in make_response
rv = self.response_class.force_type(rv, request.environ)
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/werkzeug/wrappers/base_response.py", line 269, in force_type
response = BaseResponse(*_run_wsgi_app(response, environ))
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/werkzeug/test.py", line 1119, in run_wsgi_app
app_rv = app(environ, start_response)
TypeError: 'InternalError' object is not callable
The view function did not return a valid response. The return type must be a string, tuple, Response instance, or WSGI callable, but it was a InternalError.
My input Chinese charactor is showed as '工作'
I don't know which is the Internalerror object.
Full code is at github,code maybe not the same, but not big difference.
I have tried request.form.getunicode('word').
Also, I tried app.config['JSON_AS_ASCII' = False).
Or, set <meta charset="UTF-8"> in the html code.
BUT, no one works.
I hope the form can support Chinese words as value.

An error in my code to be a simple ftp

I met an error when running codes at the bottom. It's like a simple ftp.
I use python2.6.6 and CentOS release 6.8
In most linux server, it gets right results like this:(I'm very sorry that I have just sign up and couldn't )
Clinet:
[root#Test ftp]# python client.py
path:put|/home/aaa.txt
Server:
[root#Test ftp]# python server.py
connected...
pre_data:put|aaa.txt|4
cmd: put
file_name: aaa.txt
file_size: 4
upload successed.
But I get errors in some server(such as my own VM in my PC). I have done lots of tests(python2.6/python2.7, Centos6.5/Centos6.7) and found this error is not because them. Here is the error imformation:
[root#Lewis-VM ftp]# python server.py
connected...
pre_data:put|aaa.txt|7sdfsdf ###Here gets the wrong result, "sdfsdf" is the content of /home/aaa.txt and it shouldn't be sent here to 'file_size' and so it cause the "ValueError" below
cmd: put
file_name: aaa.txt
file_size: 7sdfsdf
----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 10699)
Traceback (most recent call last):
File "/usr/lib64/python2.6/SocketServer.py", line 570, in process_request_thread
self.finish_request(request, client_address)
File "/usr/lib64/python2.6/SocketServer.py", line 332, in finish_request
self.RequestHandlerClass(request, client_address, self)
File "/usr/lib64/python2.6/SocketServer.py", line 627, in __init__
self.handle()
File "server.py", line 30, in handle
if int(file_size)>recv_size:
ValueError: invalid literal for int() with base 10: '7sdfsdf\n'
What's more, I found that if I insert a time.sleep(1) between sk.send(cmd+"|"+file_name+'|'+str(file_size)) and sk.send(data) in client.py, the error will disappear. I have said that I did tests in different system and python versions and the error is not because them. So I guess that is it because of some system configs? I have check about socket.send() and socket.recv() in python.org but fount nothing helpful. So could somebody help me to explain why this happend?
The code are here:
#!/usr/bin/env python
#coding:utf-8
################
#This is server#
################
import SocketServer
import os
class MyServer(SocketServer.BaseRequestHandler):
def handle(self):
base_path = '/home/ftp/file'
conn = self.request
print 'connected...'
while True:
#####receive pre_data: we should get data like 'put|/home/aaa|7'
pre_data = conn.recv(1024)
print 'pre_data:' + pre_data
cmd,file_name,file_size = pre_data.split('|')
print 'cmd: ' + cmd
print 'file_name: '+ file_name
print 'file_size: '+ file_size
recv_size = 0
file_dir = os.path.join(base_path,file_name)
f = file(file_dir,'wb')
Flag = True
####receive 1024bytes each time
while Flag:
if int(file_size)>recv_size:
data = conn.recv(1024)
recv_size+=len(data)
else:
recv_size = 0
Flag = False
continue
f.write(data)
print 'upload successed.'
f.close()
instance = SocketServer.ThreadingTCPServer(('127.0.0.1',9999),MyServer)
instance.serve_forever()
#!/usr/bin/env python
#coding:utf-8
################
#This is client#
################
import socket
import sys
import os
ip_port = ('127.0.0.1',9999)
sk = socket.socket()
sk.connect(ip_port)
while True:
input = raw_input('path:')
#####we should input like 'put|/home/aaa.txt'
cmd,path = input.split('|')
file_name = os.path.basename(path)
file_size=os.stat(path).st_size
sk.send(cmd+"|"+file_name+'|'+str(file_size))
send_size = 0
f= file(path,'rb')
Flag = True
#####read 1024 bytes and send it to server each time
while Flag:
if send_size + 1024 >file_size:
data = f.read(file_size-send_size)
Flag = False
else:
data = f.read(1024)
send_size+=1024
sk.send(data)
f.close()
sk.close()
The TCP is a stream of data. That is the problem. TCP do not need to keep message boundaries. So when a client calls something like
connection.send("0123456789")
connection.send("ABCDEFGHIJ")
then a naive server like
while True;
data = conn.recv(1024)
print data + "_"
may print any of:
0123456789_ABCDEFGHIJ_
0123456789ABCDEFGHIJ_
0_1_2_3_4_5_6_7_8_9_A_B_C_D_E_F_G_H_I_J_
The server has no chance to recognize how many sends client called because the TCP stack at client side just inserted data to a stream and the server must be able to process the data received in different number of buffers than the client used.
Your server must contain a logic to separate the header and the data. All of application protocols based on TCP use a mechanism to identify application level boundaries. For example HTTP separates headers and body by an empty line and it informs about the body length in a separate header.
Your program works correctly when server receives a header with the command, name and size in a separate buffer it it fails when client is fast enough and push the data into stream quickly and the server reads header and data in one chunk.

OSError: cannot identify image file <_io.StringIO object at 0x00000000022810D8>

Use win8 and python3.4,I need to convert text to images.So I try to implement a himself.But I encounter a OSError.I try to use BytesIO instead of StringIO,it will pop error "OSError: cannot identify image file <_io.BytesIO object at xxxx>.
I still can't find the reason.
Codes as follow:
# -*- coding: utf-8 -*-
import os
import pygame
from io import StringIO,BytesIO
from PIL import Image
pygame.init()
text = u'This is a test text,test 123.'
font_path = "C:/windows/fonts/simsun.ttc"
im = Image.new("RGB",(300,50),(255,255,255))
font = pygame.font.Font(os.path.join(font_path),22)
rtext = font.render(text, True, (0,0,0),(255,255,255))
sio = StringIO()
print(sio.getvalue())
pygame.image.save(rtext, sio)
sio.seek(0)
#print(sio.getvalue())
line = Image.open(sio)
im.paste(line,(10,5))
im.show()
im.save("t1.png")
As is I get this error:
Traceback (most recent call last):
File "D:/mypython/learn/demo.py", line 19, in <module>
line = Image.open(sio)
File "D:\Python34\lib\site-packages\PIL\Image.py", line 2319, in open
% (filename if filename else fp))
OSError: cannot identify image file <_io.StringIO object at 0x00000000022810D8>
line = Image.open(sio)
As far as I am concerned, sio is still a StringIO(). If you are trying to open it as an image, try opening it with line = Image.open(name), where name is the actual name of the image, not a StringIO().

Issue with multiline TokenInputTransformer in IPython extension

I am trying to use a TokenInputTransformer within an IPython extension module, but it seems that there is something wrong with the standard implementation of token transformers with multiline input. Consider the following minimal extension:
from IPython.core.inputtransformer import TokenInputTransformer
#TokenInputTransformer.wrap
def test_transformer(tokens):
return tokens
def load_ipython_extension(ip):
for s in (ip.input_splitter, ip.input_transformer_manager):
s.python_line_transforms.extend([test_transformer()])
print "Test activated"
When I load the extension in IPython 1.1.0 I get a non-handled exception with multiline input:
In [1]: %load_ext test
Test activated
In [2]: abs(
...: 2
...: )
Traceback (most recent call last):
File "/Applications/anaconda/bin/ipython", line 6, in <module>
sys.exit(start_ipython())
File "/Applications/anaconda/lib/python2.7/site-packages/IPython/__init__.py", line 118, in start_ipython
return launch_new_instance(argv=argv, **kwargs)
File "/Applications/anaconda/lib/python2.7/site-packages/IPython/config/application.py", line 545, in launch_instance
app.start()
File "/Applications/anaconda/lib/python2.7/site-packages/IPython/terminal/ipapp.py", line 362, in start
self.shell.mainloop()
File "/Applications/anaconda/lib/python2.7/site-packages/IPython/terminal/interactiveshell.py", line 436, in mainloop
self.interact(display_banner=display_banner)
File "/Applications/anaconda/lib/python2.7/site-packages/IPython/terminal/interactiveshell.py", line 548, in interact
self.input_splitter.push(line)
File "/Applications/anaconda/lib/python2.7/site-packages/IPython/core/inputsplitter.py", line 620, in push
out = self.push_line(line)
File "/Applications/anaconda/lib/python2.7/site-packages/IPython/core/inputsplitter.py", line 655, in push_line
line = transformer.push(line)
File "/Applications/anaconda/lib/python2.7/site-packages/IPython/core/inputtransformer.py", line 152, in push
return self.output(tokens)
File "/Applications/anaconda/lib/python2.7/site-packages/IPython/core/inputtransformer.py", line 157, in output
return untokenize(self.func(tokens)).rstrip('\n')
File "/Applications/anaconda/lib/python2.7/site-packages/IPython/utils/_tokenize_py2.py", line 276, in untokenize
return ut.untokenize(iterable)
File "/Applications/anaconda/lib/python2.7/site-packages/IPython/utils/_tokenize_py2.py", line 214, in untokenize
self.add_whitespace(start)
File "/Applications/anaconda/lib/python2.7/site-packages/IPython/utils/_tokenize_py2.py", line 199, in add_whitespace
assert row >= self.prev_row
AssertionError
If you suspect this is an IPython bug, please report it at:
https://github.com/ipython/ipython/issues
or send an email to the mailing list at ipython-dev#scipy.org
You can print a more detailed traceback right now with "%tb", or use "%debug"
to interactively debug it.
Extra-detailed tracebacks for bug-reporting purposes can be enabled via:
%config Application.verbose_crash=True
Am I doing something wrong or is it really an IPython bug?
It is really an IPython bug, I think. Specifically, the way we handle tokenize fails when an expression involving brackets (()[]{}) is spread over more than one line. I'm trying to work out what we can do about this.
Kinda late answer but, I was trying to use it in my own extension and just had the same problem. I've solved it by simply removing NL from the list (it's not the same as NEWLINE token which end statement), NL token only appears inside [], (), {} so it should be safely removable.
from tokenize import NL
#TokenInputTransformer.wrap
def mat_transformer(tokens):
tokens = list(filter(lambda t: t.type != NL, tokens))
return tokens
If you are looking for full example, I've posted my goofy code there: https://github.com/Quinzel/pymat