Encoding - What is this function? - encoding

I'm porting and updating an old app and I came across this function. I'd like to know more about it, but I don't actually know what it's called. I'm assuming it has a popular name. Does anyone know?
This version is in Python, although it was originally in Java.
def encode(msg): # msg is a string
msg_len = len(msg)
j = (msg_len + 6) / 7
k = 0
cbytesOutput = [ctypes.c_byte(0)]*(msg_len + j) # return is msg length + j bytes long
for l in xrange(j):
i1 = l * 8
j1 = i1
byte0 = ctypes.c_byte(-128)
byte1 = ctypes.c_byte(1)
k1 = 0
while k1 < 7 and k < msg_len:
byte2 = ctypes.c_byte(ord(msg[k]))
if (byte2.value & 0xffffff80) != 0:
byte0 = ctypes.c_byte(byte0.value | byte1.value)
j1 += 1
cbytesOutput[j1] = ctypes.c_byte(byte2.value | 0xffffff80)
byte1 = ctypes.c_byte(byte1.value << 1)
k += 1
k1 += 1
cbytesOutput[i1] = byte0
return cbytesOutput
Any comments on the algorithm in general? I'm thinking of replacing it. Profiler says it's the worst function in the entire app (it's 60% of the time on the slowest code path) and it bloats the data as well.
Thanks

Related

How to convert from Pinescript version 2 to version 5

I have tried to convert the following from version 2 to version 5 without success. I have been getting various errors showing up. Also, the current converter on the Pine Script v5 User Manual doesn't go below Version 3. I would appreciate any help to do this. Much appreciated and thanks.
//#version=2
//Both fisher and macdl MTF
resCustom = input(title="Timeframe", type=resolution, defval="60" )
//--------macdl
src=close
shortLength = input(12, title="Fast Length")
longLength = input(26, title="Slow Length")
sigLength = input(9, title="Signal Length")
ma(s,l) => ema(s,l)
sema = ma( src, shortLength )
lema = ma( src, longLength )
i1 = sema + ma( src - sema, shortLength )
i2 = lema + ma( src - lema, longLength )
macdl = i1 - i2
macdl2 = security(tickerid, resCustom,macdl)
macd=sema-lema
//-------end
//---------fisher
len = input(34, minval=1, title="Fisher")
round_(val) => val > .99 ? .999 : val < -.99 ? -.999 : val
high_ = highest(hl2, len)
low_ = lowest(hl2, len)
value = round_(.66 * ((hl2 - low_) / max(high_ - low_, .001) - .5) + .67 * nz(value[1]))
fish1 = .5 * log((1 + value) / max(1 - value, .001)) + .5 * nz(fish1[1])
fish2 = security(tickerid, resCustom,fish1)
//------------end
sw1=iff(fish2<-6 and macdl2>macdl2[1],1,0)
sw2=iff(fish2>6 and macdl2<macdl2[1],-1,0)
final=sw1+sw2
swap=final==1 or final==-1?fuchsia:green
plot(fish2, color=swap, title="Fisher",style=histogram)
hline(0, color=orange)

Using zero_grad() after loss.backward(), but still receives RuntimeError: "Trying to backward through the graph a second time..."

Below is my implementation of a2c using PyTorch. Upon learning about backpropagation in PyTorch, I have known to zero_grad() the optimizer after each update iteration. However, there is still a RunTime error on second-time backpropagation.
def torchworker(number, model):
worker_env = gym.make("Taxi-v3").env
max_steps_per_episode = 2000
worker_opt = optim.Adam(lr=5e-4, params=model.parameters())
p_history = []
val_history = []
r_history = []
running_reward = 0
episode_count = 0
under = 0
start = time.time()
for i in range(2):
state = worker_env.reset()
episode_reward = 0
penalties = 0
drop = 0
print("Episode {} begins ({})".format(episode_count, number))
worker_env.render()
criterion = nn.SmoothL1Loss()
time_solve = 0
for _ in range(1, max_steps_per_episode):
#worker_env.render()
state = torch.tensor(state, dtype=torch.long)
action_probs = model.forward(state)[0]
critic_value = model.forward(state)[1]
val_history.append((state, critic_value[0]))
# Choose action
action = np.random.choice(6, p=action_probs.detach().numpy())
p_history.append(torch.log(action_probs[action]))
# Apply chosen action
state, reward, done, _ = worker_env.step(action)
r_history.append(reward)
episode_reward += reward
time_solve += 1
if reward == -10:
penalties += 1
elif reward == 20:
drop += 1
if done:
break
# Update running reward to check condition for solving
running_reward = (running_reward * (episode_count) + episode_reward) / (episode_count + 1)
# Calculate discounted returns
returns = deque(maxlen=3500)
discounted_sum = 0
for r in r_history[::-1]:
discounted_sum = r + gamma * discounted_sum
returns.appendleft(discounted_sum)
# Calculate actor losses and critic losses
loss_actor_value = 0
loss_critic_value = 0
history = zip(p_history, val_history, returns)
for log_prob, value, ret in history:
diff = ret - value[1]
loss_actor_value += -log_prob * diff
ret_tensor = torch.tensor(ret, dtype=torch.float32)
loss_critic_value += criterion(value[1], ret_tensor)
loss = loss_actor_value + 0.1 * loss_critic_value
print(loss)
# Update params
loss.backward()
worker_opt.step()
worker_opt.zero_grad()
# Log details
end = time.time()
episode_count += 1
if episode_count % 1 == 0:
worker_env.render()
if running_reward > -50: # Condition to consider the task solved
under += 1
if under > 5:
print("Solved at episode {} !".format(episode_count))
break
I believe there may be something to do with the architecture of my AC model, so I also include it here for reference.
class ActorCriticNetwork(nn.Module):
def __init__(self, num_inputs, num_hidden, num_actions):
super(ActorCriticNetwork, self).__init__()
self.embed = nn.Embedding(500, 10)
self.fc1 = nn.Linear(10, num_hidden * 2)
self.fc2 = nn.Linear(num_hidden * 2, num_hidden)
self.c = nn.Linear(num_hidden, 1)
self.fc3 = nn.Linear(num_hidden, num_hidden)
self.a = nn.Linear(num_hidden, num_actions)
def forward(self, x):
out = F.relu(self.embed(x))
out = F.relu(self.fc1(out))
out = F.relu(self.fc2(out))
critic = self.c(out)
out = F.relu(self.fc3(out.detach()))
actor = F.softmax(self.a(out), dim=-1)
return actor, critic
Would you please tell me what the mistake here is? Thank you in advance.
SOLVED: I forgot to clear the history of probabilities, action-values and rewards after iterations. It is clear why that would cause the issue, as the older elements would cause propagating through old dcgs.

MATLAB Execution Time Increasing

Here is my code. The intent is I have a Wireshark capture saved to a particularly formatted text file. The MATLAB code is supposed to go through the Packets, dissect them for different protocols, and then make tables based on those protocols. I currently have this programmed for ETHERNET/IP/UDP/MODBUS. In this case, it creates a column in MBTable each time it encounters a new register value, and each time it comes across a change to that register value, it updates the value in that line of the table. The first column of MBTable is time, the registers start with the second column.
MBTable is preallocated to over 100,000 Rows (nol is very large), 10 columns before this code is executed. The actual data from a file I'm pulling into the table gets to about 10,000 rows and 4 columns and the code execution is so slow I have to stop it. The tic/toc value is calculated every 1000 rows and continues to increase exponentially with every iteration. It is a large loop, but I can't see where anything is growing in such a way that it would cause it to run slower with each iteration.
All variables get initialized up top (left out to lessen amount of code.
The variables eth, eth.ip, eth.ip.udp, and eth.ip.udp.modbus are all of type struct as is eth.header and eth.ip.header. WSID is a file ID from a .txt file opened earlier.
MBTable = zeros(nol,10);
tval = tic;
while not(feof(WSID))
packline = packline + 1;
fl = fl + 1;
%Get the next line from the file
MBLine = fgetl(WSID);
%Make sure line is not blank or short
if length(MBLine) >= 3
%Split the line into 1. Line no, 2. Data, 3. ASCII
%MBAll = strsplit(MBLine,' ');
%First line of new packet, if headers included
if strcmp(MBLine(1:3),'No.')
newpack = true;
newtime = false;
newdata = false;
stoppack = false;
packline = 1;
end
%If packet has headers, 2nd line contains timestamp
if newpack
Ordered = false;
if packline == 2;
newtime = true;
%MBstrs = strsplit(MBAll{2},' ');
packno = int32(str2double(MBLine(1:8)));
t = str2double(MBLine(9:20));
et = t - lastt;
if lastt > 0 && et > 0
L = L + 1;
MBTable(L,1) = t;
end
%newpack = false;
end
if packline > 3
dataline = int16(str2double(MBLine(1:4)));
packdata = strcat(packdata,MBLine(7:53));
end
end
else
%if t >= st
if packline > 3
stoppack = true;
newpack = false;
end
if stoppack
invalid = false;
%eth = struct;
eth.pack = packdata(~isspace(packdata));
eth.length = length(eth.pack);
%Dissect the packet data
eth.stbyte = 1;
eth.ebyte = eth.length;
eth.header.stbyte = 1;
eth.header.ebyte = 28;
%Ethernet Packet Data
eth.header.pack = eth.pack(eth.stbyte:eth.stbyte+27);
eth.header.dest = eth.header.pack(eth.header.stbyte:eth.header.stbyte + 11);
eth.header.src = eth.header.pack(eth.header.stbyte + 12:eth.header.stbyte + 23);
eth.typecode = eth.header.pack(eth.header.stbyte + 24:eth.header.ebyte);
if strcmp(eth.typecode,'0800')
eth.type = 'IP';
%eth.ip = struct;
%IP Packet Data
eth.ip.stbyte = eth.header.ebyte + 1;
eth.ip.ver = eth.pack(eth.ip.stbyte);
%IP Header length
eth.ip.header.length = 4*int8(str2double(eth.pack(eth.ip.stbyte+1)));
eth.ip.header.ebyte = eth.ip.stbyte + eth.ip.header.length - 1;
%Differentiated Services Field
eth.ip.DSF = eth.pack(eth.ip.stbyte + 2:eth.ip.stbyte + 3);
%Total IP Packet Length
eth.ip.length = hex2dec(eth.pack(eth.ip.stbyte+4:eth.ip.stbyte+7));
eth.ip.ebyte = eth.ip.stbyte + max(eth.ip.length,46) - 1;
eth.ip.pack = eth.pack(eth.ip.stbyte:eth.ip.ebyte);
eth.ip.ID = eth.pack(eth.ip.stbyte+8:eth.ip.stbyte+11);
eth.ip.flags = eth.pack(eth.ip.stbyte+12:eth.ip.stbyte+13);
eth.ip.fragoff = eth.pack(eth.ip.stbyte+14:eth.ip.stbyte+15);
%Time to Live
eth.ip.ttl = hex2dec(eth.pack(eth.ip.stbyte+16:eth.ip.stbyte+17));
eth.ip.typecode = eth.pack(eth.ip.stbyte+18:eth.ip.stbyte+19);
eth.ip.checksum = eth.pack(eth.ip.stbyte+20:eth.ip.stbyte+23);
%eth.ip.src = eth.pack(eth.ip.stbyte+24:eth.ip.stbyte+31);
eth.ip.src = ...
[num2str(hex2dec(eth.pack(eth.ip.stbyte+24:eth.ip.stbyte+25))),'.', ...
num2str(hex2dec(eth.pack(eth.ip.stbyte+26:eth.ip.stbyte+27))),'.', ...
num2str(hex2dec(eth.pack(eth.ip.stbyte+28:eth.ip.stbyte+29))),'.', ...
num2str(hex2dec(eth.pack(eth.ip.stbyte+30:eth.ip.stbyte+31)))];
eth.ip.dest = ...
[num2str(hex2dec(eth.pack(eth.ip.stbyte+32:eth.ip.stbyte+33))),'.', ...
num2str(hex2dec(eth.pack(eth.ip.stbyte+34:eth.ip.stbyte+35))),'.', ...
num2str(hex2dec(eth.pack(eth.ip.stbyte+36:eth.ip.stbyte+37))),'.', ...
num2str(hex2dec(eth.pack(eth.ip.stbyte+38:eth.ip.stbyte+39)))];
if strcmp(eth.ip.typecode,'11')
eth.ip.type = 'UDP';
eth.ip.udp.stbyte = eth.ip.stbyte + 40;
eth.ip.udp.src = hex2dec(eth.pack(eth.ip.udp.stbyte:eth.ip.udp.stbyte + 3));
eth.ip.udp.dest = hex2dec(eth.pack(eth.ip.udp.stbyte+4:eth.ip.udp.stbyte+7));
eth.ip.udp.length = hex2dec(eth.pack(eth.ip.udp.stbyte+8:eth.ip.udp.stbyte+11));
eth.ip.udp.checksum = eth.pack(eth.ip.udp.stbyte+12:eth.ip.udp.stbyte+15);
eth.ip.udp.protoID = eth.pack(eth.ip.udp.stbyte+20:eth.ip.udp.stbyte+23);
if strcmp(eth.ip.udp.protoID,'0000')
eth.ip.udp.proto = 'MODBUS';
%eth.ip.udp.modbus = struct;
eth.ip.udp.modbus.stbyte = eth.ip.udp.stbyte+16;
eth.ip.udp.modbus.transID = eth.pack(eth.ip.udp.modbus.stbyte:eth.ip.udp.modbus.stbyte+3);
eth.ip.udp.modbus.protoID = eth.ip.udp.protoID;
eth.ip.udp.modbus.length = int16(str2double(eth.pack(eth.ip.udp.modbus.stbyte + 8:eth.ip.udp.modbus.stbyte + 11)));
eth.ip.udp.modbus.UID = eth.pack(eth.ip.udp.modbus.stbyte + 12:eth.ip.udp.modbus.stbyte + 13);
eth.ip.udp.modbus.func = hex2dec(eth.pack(eth.ip.udp.modbus.stbyte + 14:eth.ip.udp.modbus.stbyte+15));
eth.ip.udp.modbus.register = eth.pack(eth.ip.udp.modbus.stbyte + 16: eth.ip.udp.modbus.stbyte+19);
%Number of words to a register, or the number of registers
eth.ip.udp.modbus.words = hex2dec(eth.pack(eth.ip.udp.modbus.stbyte+20:eth.ip.udp.modbus.stbyte+23));
eth.ip.udp.modbus.bytes = hex2dec(eth.pack(eth.ip.udp.modbus.stbyte+24:eth.ip.udp.modbus.stbyte+25));
eth.ip.udp.modbus.data = eth.pack(eth.ip.udp.modbus.stbyte + 26:eth.ip.udp.modbus.stbyte + 26 + 2*eth.ip.udp.modbus.bytes - 1);
%If func 16 or 23, loop through data/registers and add to table
if eth.ip.udp.modbus.func == 16 || eth.ip.udp.modbus.func == 23
stp = eth.ip.udp.modbus.bytes*2/eth.ip.udp.modbus.words;
for n = 1:stp:eth.ip.udp.modbus.bytes*2;
%Check for existence of register as a key?
if ~isKey(MBMap,eth.ip.udp.modbus.register)
MBCol = MBCol + 1;
MBMap(eth.ip.udp.modbus.register) = MBCol;
end
MBTable(L,MBCol) = hex2dec(eth.ip.udp.modbus.data(n:n+stp-1));
eth.ip.udp.modbus.register = dec2hex(hex2dec(eth.ip.udp.modbus.register)+1);
end
lastt = t;
end
%If func 4, make sure it is the response, then put
%data into table for register column
elseif false
%need code to handle serial to UDP conversion box
else
invalid = true;
end
else
invalid = true;
end
else
invalid = true;
end
if ~invalid
end
end
%end
end
%Display Progress
if int64(fl/1000)*1000 == fl
for x = 1:length(mess);
fprintf('\b');
end
%fprintf('Lines parsed: %i',fl);
mess = sprintf('Lines parsed: %i / %i',fl,nol);
fprintf('%s',mess);
%Check execution time - getting slower:
%%{
ext = toc(tval);
mess = sprintf('\nExecution Time: %f\n',ext);
fprintf('%s',mess);
%%}
end
end
ext = toc - exst;
Update: I updated my code above to remove the overloaded operators (disp and lt were replaced with mess and lastt)
Was asked to use the profiler, so I limited to 2000 lines in the table (added && L >=2000 to the while loop) to limit the execution time, and here are the top results from the profiler:
SGAS_Wireshark_Parser_v0p7_fulleth 1 57.110 s 9.714 s
Strcat 9187 29.271 s 13.598 s
Blanks 9187 15.673 s 15.673 s
Uigetfile 1 12.226 s 0.009 s
uitools\private\uigetputfile_helper 1 12.212 s 0.031 s
FileChooser.FileChooser>FileChooser.show 1 12.085 s 0.006s
...er>FileChooser.showPeerAndBlockMATLAB 1 12.056 s 0.001s
...nChooser>FileOpenChooser.doShowDialog 1 12.049 s 12.049 s
hex2dec 44924 2.944 s 2.702 s
num2str 16336 1.139 s 0.550 s
str2double 17356 1.025 s 1.025 s
int2str 16336 0.589 s 0.589 s
fgetl 17356 0.488 s 0.488 s
dec2hex 6126 0.304 s 0.304 s
fliplr 44924 0.242 s 0.242 s
It appears to be strcat calls that are doing it. I only explicitly call strcat on one line. Are some of the other string manipulations I'm doing calling strcat indirectly?
Each loop should be calling strcat the same number of times though, so I still don't understand why it takes longer and longer the more it runs...
also, hex2dec is called a lot, but is not really affecting the time.
But anyway, are there any other methods I can use the combine the strings?
Here is the issue:
The string (an char array in MATLAB) packdata was being resized and reallocated over and over again. That's what was slowing down this code. I did the following steps:
I eliminated the redundant variable packdata and now only use eth.pack.
I preallocated eth.pack and a couple "helper variables" of known lengths by running blanks ONCE for each before the loop ever starts
eth.pack = blanks(604);
thisline = blanks(47);
smline = blanks(32);
(Note: 604 is the maximum possible size of packdata based on headers + MODBUS protocol)
Then I created a pointer variable to point to the location of the last char written to packdata.
pptr = 1;
...
dataline = int16(str2double(MBLine(1:4)));
thisline = MBLine(7:53); %Always 47 characters
smline = [thisline(~isspace(thisline)),blanks(32-sum(~isspace(thisline)))]; %Always 32 Characters
eth.pack(pptr:pptr+31) = smline;
pptr = pptr + 32;
The above was inside the 'if packline > 3' block in place of the 'packdata =' statement, then at the end of the 'if stoppack' block was the reset statement:
pptr = 1; %Reset Pointer
FYI, not surprisingly this brought out other flaws in my code which I've mostly fixed but still need to finish. Not a big issue now as this loop executes lightning fast with these changes. Thanks to Yvon for helping point me in the right direction.
I kept thinking my huge table, MBTable was the issue... but it had nothing to do with it.

Improve speed on joining multiple images

What I'm trying to do is take numerous(up to 48) 1024x768 images that are color coded images(weather maps, the precip overlay) and add up the precip to fall over the course of time. When I run into non-precip I want to take a box 5x5 around the pixel in question and average the value and use that value as the value of the pixel in question.
I can do this but it takes a long time to accomplish it. I have heard numpy could improve the speed but I still haven't been able to wrap my mind around how it going to improve the speed given the sequence of events that have to take place. It seems like I would still have to do it pixel by pixel. I've included an idea of the code I'm using to accomplish this SLOWLY.
I have this actually as two separate program, one to download the images and the other does the image processing(working up toward merging the two programs in the near future, just trying to get all the bugs worked out before the merger.) Hence some of the download coding may look a little strange. I figure I could probably write the file straight to a variable but I haven't been doing it that way so I stuck with a bit longer approach.
Is there anyway of increasing the speed? I don't see anyway of avoiding pixel by pixel due to the color coding scheme in place(look at the color bar in the lower left it shows the full color scheme...I only included part of it for demo purposes in the coding below.) Some of the coding may be a bit rough since I chopped from the two programs and put the important parts in here...it shows what I'm currently doing and gives the full idea of how I'm going about doing it.
Also, if you happen to see this three to four or more days after it was posted you would need to change the date in the download link to the current date. The files are only kept on the server for 3-4 days before they are removed.
from PIL import Image
import time
import urllib
import os
pathstr = '/'
url = 'http://mag.ncep.noaa.gov/GemPakTier/MagGemPakImages/gfs/20140216/00/gfs_namer_006_1000_500_thick.gif'
urllib.urlretrieve(url,str(pathstr + '20140216006.gif'))
url = 'http://mag.ncep.noaa.gov/GemPakTier/MagGemPakImages/gfs/20140216/00/gfs_namer_012_1000_500_thick.gif'
urllib.urlretrieve(url,str(pathstr + '20140216012.gif'))
url = 'http://mag.ncep.noaa.gov/GemPakTier/MagGemPakImages/gfs/20140216/00/gfs_namer_018_1000_500_thick.gif'
urllib.urlretrieve(url,str(pathstr + '20140216018.gif'))
url = 'http://mag.ncep.noaa.gov/GemPakTier/MagGemPakImages/gfs/20140216/00/gfs_namer_024_1000_500_thick.gif'
urllib.urlretrieve(url,str(pathstr + '20140216024.gif'))
class Convert():
def __init__(self):
self.colorscale2 = [(255,255,255),(127,255,0),(0,205,0),(145,44,238),(16,78,139),
(30,144,255),(0,178,238),(0,238,238),(137,104,205),(0,139,0),
(139,0,139),(139,0,0),(205,0,0),(238,64,0),(255,127,0),(205,133,0),
(255,215,0),(238,238,0),(255,255,0),(139,71,38),(255,0,0),(0,0,255),(0,0,0)]
self.x = 0
self.y = 0
self.grid = 0
self.moist = 0
self.scan = 0
self.turn = 0
self.precip = {}
start = time.time()
for i in range(6, 30, 6):
if i < 10:
filename = '/2014021600' + str(i) + '.gif'
else:
filename = '/201402160' + str(i) + '.gif'
self.im1 = Image.open(filename).convert('RGB')
self.image = self.im1.getdata()
self.size = width, height = self.im1.size
self.coordinates = self.x,self.y = width, height
self.getprecip()
self.turn = 1
print (time.time()-start)
def getprecip(self):
for self.x in range(81, 950):
for self.y in range(29, 749):
if self.turn == 0:
self.moist = 0
else:
self.moist = self.precip[self.x,self.y]
self.coordinates = self.x,self.y
self.scan = 0
self.imagescan()
if self.turn == 0:
self.precip[self.x,self.y] = self.moist
else:
self.precip[self.x,self.y] += self.moist
def imagescan(self):
if self.image[(self.y * 1024) + self.x] == self.colorscale2[0]:
self.moist =0
self.grid -=1
elif self.image[(self.y * 1024) + self.x] == self.colorscale2[1]:
self.moist =.01
elif self.image[(self.y * 1024) + self.x] == self.colorscale2[2]:
self.moist =.1
elif self.image[(self.y * 1024) + self.x] == self.colorscale2[3]:
self.moist =.25
elif self.image[(self.y * 1024) + self.x] == self.colorscale2[4]:
self.moist =.5
#on and on through self.colorscale2[18]
if self.scan == 1:
self.grid += 1
if self.scan == 0:
x = self.x
y = self.y
self.deliso540()
self.x = x
self.y = y
def deliso540(self):
self.grid = 1
self.scan = 1
for p in range(self.x-2,self.x+2):
for q in range(self.y-2,self.y+2):
self.x = p
self.y = q
self.imagescan()
self.moist = self.moist / self.grid

Extract numbers from specific image

I am involved in a project that I think you can help me. I have multiple images that you can see here Images to recognize. The goal here is to extract the numbers between the dashed lines. What is the best approach to do that? The idea that I have from the beginning is to find the coordinates of the dash lines and do the crop function, then is just run OCR software. But is not easy to find those coordinates, can you help me? Or if you have a better approach tell me.
Best regards,
Pedro Pimenta
You may start by looking at more obvious (bigger) objects in your images. The dashed lines are way too small in some images. Searching for the "euros milhoes" logo and the barcode will be easier and it will help you have an idea of the scale and rotation involved.
To find these objects without using match template you can binarize your image (watch out for the background texture) and use the Hu moments on the contours/blobs.
Don't expect a good OCR accuracy on images where the numbers are smaller than 8-10 pixels.
You can use python-tesseract https://code.google.com/p/python-tesseract/ ,it works with your image.What you need to do is to split the result string.I use your https://www.dropbox.com/sh/kcybs1i04w3ao97/u33YGH_Kv6#f:euro9.jpg to test.And source code is below.UPDATE
# -*- coding: utf-8 -*-
from PIL import Image
from PIL import ImageEnhance
import tesseract
im = Image.open('test.jpg')
enhancer = ImageEnhance.Contrast(im)
im = enhancer.enhance(4)
im = im.convert('1')
w, h = im.size
im = im.resize((w * (416 / h), 416))
pix = im.load()
LINE_CR = 0.01
WHITE_HEIGHT_CR = int(h * (20 / 416.0))
status = 0
white_line = []
for i in xrange(h):
line = []
for j in xrange(w):
line.append(pix[(j, i)])
p = line.count(0) / float(w)
if not p > LINE_CR:
white_line.append(i)
wp = None
for i in range(10, len(white_line) - WHITE_HEIGHT_CR):
k = white_line[i]
if white_line[i + WHITE_HEIGHT_CR] == k + WHITE_HEIGHT_CR:
wp = k
break
result = []
flag = 0
while 1:
if wp < 0:
result.append(wp)
break
line = []
for i in xrange(w):
line.append(pix[(i, wp)])
p = line.count(0) / float(w)
if flag == 0 and p > LINE_CR:
l = []
for xx in xrange(20):
l.append(pix[(xx, wp)])
if l.count(0) > 5:
break
l = []
for xx in xrange(416-1, 416-100-1, -1):
l.append(pix[(xx, wp)])
if l.count(0) > 17:
break
result.append(wp)
wp -= 1
flag = 1
continue
if flag == 1 and p < LINE_CR:
result.append(wp)
wp -= 1
flag = 0
continue
wp -= 1
result.reverse()
for i in range(1, len(result)):
if result[i] - result[i - 1] < 15:
result[i - 1] = -1
result = filter(lambda x: x >= 0, result)
im = im.crop((0, result[0], w, result[-1]))
im.save('test_converted.jpg')
api = tesseract.TessBaseAPI()
api.Init(".","eng",tesseract.OEM_DEFAULT)
api.SetVariable("tessedit_char_whitelist", "0123456789abcdefghijklmnopqrstuvwxyz")
api.SetPageSegMode(tesseract.PSM_AUTO)
mImgFile = "test_converted.jpg"
mBuffer=open(mImgFile,"rb").read()
result = tesseract.ProcessPagesBuffer(mBuffer,len(mBuffer),api)
print "result(ProcessPagesBuffer)=",result
Depends python 2.7 python-tesseract-win32 python-opencv numpy PIL,and be sure to follow python-tesseract's remember to .