character-wise confidence values using tesseract 3.01 - tesseract

i executed the following code to generate character-wise confidence values:
// Reads the image named by argv[1], runs recognition in single-word mode and
// prints the text and confidence of every symbol the result iterator yields.
// NOTE(review): argc is never checked before argv[1] is used.
int main(int argc, char **argv) {
const char *lang="eng";
const PIX *pixs;
if ((pixs = pixRead(argv[1])) == NULL) {
cout <<"Unsupported image type"<<endl;
exit(3);
}
TessBaseAPI api;
// NOTE(review): SetVariable, SetPageSegMode and SetImage are all issued
// before Init() below. The answer following this snippet attributes the
// identical per-word confidences to exactly this ordering.
api.SetVariable("save_blob_choices", "T");
api.SetPageSegMode(tesseract::PSM_SINGLE_WORD );
api.SetImage(pixs);
// rc is stored but never inspected.
int rc = api.Init(argv[0], lang);
api.Recognize(NULL);
ResultIterator* ri = api.GetIterator();
if(ri != 0)
{
// Walk the results one symbol at a time until Next() reports no more.
do
{
const char* symbol = ri->GetUTF8Text(RIL_SYMBOL);
if(symbol != 0)
{
float conf = ri->Confidence(RIL_SYMBOL);
cout<<"\nnext symbol: "<< symbol << " confidence: " << conf <<"\n" <<endl;
}
// The text buffer handed back by GetUTF8Text is released here.
delete[] symbol;
} while((ri->Next(RIL_SYMBOL)));
}
// NOTE(review): neither ri nor pixs is released before returning.
return 0;
}
link to image
the output obtained for the above image was:
next symbol: N confidence: 72.3563 next symbol: B confidence: 72.3563
next symbol: E confidence: 69.9937 next symbol: T confidence: 69.9937
next symbol: R confidence: 69.9937 next symbol: A confidence: 69.9937
next symbol: N confidence: 69.9937 next symbol: G confidence: 69.9937
next symbol: - confidence: 69.9937 next symbol: I confidence: 69.9937
As is evident, the confidence values for characters belonging to the same word are the same.
Is this the expected output? Shouldn't the confidence values be different for each character?
I tried executing the code for a word in which each character was in different font style..and yet, the confidence value was the same for characters belonging to the same word.

The issue is that you're calling Init after the SetVariable call.

Related

Composite grammars with ANTLR

(Environment is ANTLR 4 with Javascript)
I have a Test.g4 grammar, importing two other grammars (details omitted).
// Top-level grammar; pulls in the rules of Time.g4 and Basic.g4.
grammar Test;
import Time, Basic;
// A request is the PERIOD keyword, '=', and an interval (rule from Time.g4),
// labelled exp.
request: PERIOD '=' exp=interval;
// The word "period", matched case-insensitively letter by letter.
PERIOD: [Pp][Ee][Rr][Ii][Oo][Dd];
Time.g4 grammar is:
grammar Time;
// An epoch is a fixed-length run of single-digit Digit tokens.
epoch: Digit Digit Digit Digit Digit Digit Digit Digit Digit Digit Digit Digit Digit;
// An interval is two epochs in square brackets, labelled from and to.
interval: '[' from=epoch ',' to=epoch ']';
// Digit is a real token wrapping the DIGIT fragment: exactly one character 0-9.
Digit: DIGIT;
fragment DIGIT: [0-9];
Basic.g4 grammar is:
grammar Basic;
any: DECIMAL_LITERAL;
// NOTE(review): DECIMAL_LITERAL matches a whole run of digits. When this
// grammar is imported alongside Time.g4, the lexer presumably prefers this
// longer match over thirteen separate Digit tokens, which would be consistent
// with the "mismatched input ... expecting Digit" errors reported in the
// question. Confirm against the generated lexer.
DECIMAL_LITERAL: DIGIT+;
// Same fragment name as the one declared in Time.g4.
fragment DIGIT: [0-9];
There is a custom visitor for Time grammar:
/**
 * Mixin factory: given a visitor base, produces a subclass that knows how to
 * visit the `epoch` and `interval` rules of the Time grammar.
 */
const Time = (superclass) => {
  return class extends superclass {
    constructor() {
      super();
      // The base is additionally invoked as a plain function on this instance.
      superclass.call(this);
      return this;
    }

    /** Numeric value of the epoch's text. */
    visitEpoch(ctx) {
      const text = ctx.getText();
      return parseInt(text);
    }

    /** Visits both labelled bounds (from, then to) and returns them as numbers. */
    visitInterval(ctx) {
      const bounds = [ctx.from, ctx.to].map((child) => parseInt(this.visit(child)));
      return { from: bounds[0], to: bounds[1] };
    }
  };
};
exports.Time = Time;
The test code is:
#!/usr/bin/env node
// Driver script: parses one request string with the generated Test lexer and
// parser, then walks the tree with a visitor built from the Time mixin.
const antlr4 = require('antlr4');
const TestLexer = require('./TestLexer').TestLexer;
const TestParser = require('./TestParser').TestParser;
const TestVisitor = require('./TestVisitor').TestVisitor;
const Time = require('./time').Time;
// The generated TestVisitor, extended with the visitEpoch/visitInterval
// methods supplied by the Time mixin.
class Visitor extends Time(TestVisitor) {
// Lexes and parses `chars` starting at the `request` rule, visits the
// resulting tree and logs one element of the visit result.
run(chars) {
var stream = new antlr4.InputStream(chars);
var lexer = new TestLexer(stream);
var tokens = new antlr4.CommonTokenStream(lexer);
var parser = new TestParser(tokens);
parser.buildParseTrees = true;
var tree = parser.request();
var result = this.visitRequest(tree);
// NOTE(review): presumably index 2 is the interval, i.e. the third child of
// `request` after PERIOD and '='. Confirm what visitRequest returns.
console.log(result[2]);
}
}
var time = new Visitor();
time.run('period=[1234567890123,1234567890888]');
If I only import the Time grammar, everything works fine. However, if I import both the Time and Basic grammars, I get the following errors:
line 1:8 mismatched input '1234567890123' expecting Digit
line 1:22 mismatched input '1234567890888' expecting Digit
What am I doing wrong?
Thank you in advance,
RG

Cannot convert type "Int.Type" to Type "Int" - Function Parameter - Swift Equivalent of setw C++

I've been looking for some equivalent for std::setw in swift, and haven't been able to find one. Thus, I've written a quick function that more or less accomplishes the same goal, though when using toprint as a parameter, I'm getting a build error saying that Int.Type cannot be converted to Int. Any idea as to how to fix this?
// FUNCTION WITH ERROR
// Counts the decimal digits of `toprint` and prints the entry of `spaces`
// selected by (digit count - 1), without a trailing newline.
// NOTE(review): every entry of `spaces` reads as a single space here; the
// original literals may have had different widths. Confirm.
func space (toprint: Int) {
var spaces = [" ", " ", " ", " ", " ", " ", " "]
var digits = [Int] ()
var toprinttemp = toprint
// One array element is appended per decimal digit; only the count is used.
while toprinttemp >= 1 {
digits.append(toprint%10)
toprinttemp = toprinttemp/10
}
// NOTE(review): for toprint < 1 the loop never runs, so count is -1; for
// values with more than 7 digits count exceeds the last index of `spaces`.
let count = (digits.count - 1)
// NOTE(review): the output begins with a literal "(" before the padding.
print("(\(spaces[count])", terminator:"")
}
// Print top row
// Iterates counter over 0...(rows - 1). `rows`, `tlcorner` and `userinput`
// are declared outside this excerpt.
var counter = 0
while (counter) <= (rows - 1) {
if (tlcorner - counter) <= userinput {
var toprint:Int = (tlcorner - counter)
print("\(toprint)", terminator:"")
// Calling Function
// NOTE(review): this passes the type `Int` rather than the value `toprint`,
// which is the source of the "Int.Type cannot be converted to Int" error.
space(toprint:Int)
}
counter += 1
}
You need to pass an Int into the function, not the type Int.
Call it like this:
space(toprint: 5)
Note that toprint here is the name of the parameter for the space function, it is not the Int you want to pass in. In your case, you'll want this:
space(toprint: toprint)
Here the first toprint is the parameter label and the second is the name of your Int
If your goal is just to get a string with a variable number of spaces, you can just do this:
String(repeating: " ", count: 10)
Read Create a string with n blank spaces or other repeated character for other ways of doing this.

error: lvalue required

// getline : empty string array and max length as input
// stores input stream to array and return its length
#include<stdio.h>
#define LENGTH 100
int getline1(char* , int );
/* Allocates a LENGTH-byte buffer, fills it via getline1() and prints the
   text together with the length that getline1() returned. */
int main(){
char *s;
int i;
/* NOTE(review): malloc is used, but <stdlib.h> is not among the includes
   shown; the result is neither checked nor freed. */
s=(char*)malloc(LENGTH*sizeof(char));
i=getline1(s,LENGTH);
printf("%s %d",s,i);
return 0;
}
/* Reads characters from stdin into s (capacity lim), NUL-terminates the
   result and returns the number of characters stored. */
int getline1(char *s, int lim){
int c ,i;
i = 0;
/* NOTE(review): %u is given a pointer argument here. */
printf("%u",s);
/* NOTE(review): `c = '\n'` is an assignment, not a comparison. Because &&
   binds tighter than =, the whole `... && c` expression becomes the left
   operand of =, and that is not an lvalue. This is the likely origin of the
   "lvalue required" diagnostic; `c != '\n'` was presumably intended. */
while(--lim >= 0 && (c=getchar()) != EOF && c = '\n'){
*(s+i)=c; //error : lvalue required
i++;
}
/* A newline that ended the loop is stored as well. */
if(c=='\n'){
*(s+i)=c;
i++;
}
*(s+i)='\0';
return i;
}
I get the error on the line marked above. Can anybody tell me what's wrong? The code works fine if I use arrays.
(getline: takes an empty string array and a max length as input, stores the input stream in the array, and returns its length.)
It cannot assign the value to the *(s+i) expression. Understandably so since you dereference it, leaving the value at that position (which is a constant).
Try using s[i] instead.

Postgres COPY command - fields with commas, quoted with double quotes

I've searched and found a few posts relating to postgres csv imports, but nothing that solves my current problem.
I use the postgres copy command all the time to bring data from heterogeneous data sources into our system. I'm currently struggling with a 100-million-row .csv file, comma-quote delimited. The issue is with rows like this:
009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"
Fields enclosed in double-quotes with embedded commas. The fields are not correctly parsed and I get the error:
"ERROR: extra data after last expected column"
Usually when this arises I deal with the offending rows ad hoc, but this file is so huge I'm hoping for some more general way to defend against it. Asking for a revised data format is not a possibility.
copy mytable from '/path/to/file.csv' csv header quote '"'
That's malformed CSV. You double a double quote to embed a double quote inside a quote field; for example:
"where","is ""pancakes""","house?"
has three values:
where
is "pancakes"
house?
The row you're having trouble with has stray doubled double quotes:
009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"
^^ ^^
I don't think there is anything that COPY can do about this as the correct version is ambiguous: should it be "this one, well, is a problem" or should it be """this one, well, is a problem"""?
I think you'll have to fix it by hand. A quick sed one-liner should be able to do the job if you can uniquely identify the broken row.
For reference purposes, the closest thing I've seen to a CSV standard is RFC 4180 and section two has this to say:
5. Each field may or may not be enclosed in double quotes (however
some programs, such as Microsoft Excel, do not use double quotes
at all). If fields are not enclosed with double quotes, then
double quotes may not appear inside the fields. For example:
"aaa","bbb","ccc" CRLF
zzz,yyy,xxx
[...]
7. If double-quotes are used to enclose fields, then a double-quote
appearing inside a field must be escaped by preceding it with
another double quote. For example:
"aaa","b""bb","ccc"
Here is code based on the CSV code from The Practice of Programming by Kernighan and Pike that has been adapted to deal with your weird malformed CSV data. (It wasn't all that hard to do; I already had the main code working and packaged, so I just had to add the CSV output functions and modify the advquoted() function to handle the weird format in this question.)
csv2.h
/*
#(#)File: $RCSfile: csv2.h,v $
#(#)Version: $Revision: 2.1 $
#(#)Last changed: $Date: 2012/11/01 22:23:07 $
#(#)Purpose: Scanner for Comma Separated Variable (CSV) Data
#(#)Author: J Leffler
#(#)Origin: Kernighan & Pike, 'The Practice of Programming'
*/
/*TABSTOP=4*/
#ifndef CSV2_H
#define CSV2_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef MAIN_PROGRAM
#ifndef lint
/* Prevent over-aggressive optimizers from eliminating ID string */
const char jlss_id_csv2_h[] = "#(#)$Id: csv2.h,v 2.1 2012/11/01 22:23:07 jleffler Exp $";
#endif /* lint */
#endif /* MAIN_PROGRAM */
#include <stdio.h>
extern char *csvgetline(FILE *ifp); /* Read next input line */
extern char *csvgetfield(size_t n); /* Return field n */
extern size_t csvnfield(void); /* Return number of fields */
extern void csvreset(void); /* Release space used by CSV */
extern int csvputfield(FILE *ofp, const char *field); /* Write one field, quoting if needed */
extern int csvputline(FILE *ofp, char **fields, int nfields); /* Write fields as one CSV line */
extern void csvseteol(const char *eol); /* Set end-of-line string for csvputline */
#ifdef __cplusplus
}
#endif
#endif /* CSV2_H */
csv2.c
/*
#(#)File: $RCSfile: csv2.c,v $
#(#)Version: $Revision: 2.1 $
#(#)Last changed: $Date: 2012/11/01 22:23:07 $
#(#)Purpose: Scanner for Comma Separated Variable (CSV) Data
#(#)Modification: Deal with specific malformed CSV
#(#)Author: J Leffler
#(#)Origin: Kernighan & Pike, 'The Practice of Programming'
*/
/*TABSTOP=4*/
#ifndef lint
/* Prevent over-aggressive optimizers from eliminating ID string */
const char jlss_id_csv2_c[] = "#(#)$Id: csv2.c,v 2.1 2012/11/01 22:23:07 jleffler Exp $";
#endif /* lint */
/*
** See RFC 4180 (http://www.ietf.org/rfc/rfc4180.txt).
**
** Specific malformed CSV - see SO 13183644 (http://stackoverflow.com/questions/13183644).
** Data contains malformed CSV fields like: OK,""this is a problem"",OK
** Two (but not three) field quotes at the start extract as "this is a problem" (with the quotes).
*/
#include "csv2.h"
#include <stdlib.h>
#include <string.h>
/* Value split() returns when growing the field array fails */
enum { NOMEM = -2 };
static char *line = 0; /* Input line */
static char *sline = 0; /* Split line */
static size_t maxline = 0; /* Size of line[] and sline[] */
static char **field = 0; /* Field pointers */
static size_t maxfield = 0; /* Size of field[] */
static size_t nfield = 0; /* Number of fields */
static char fieldsep[]= ","; /* Field separator characters */
static char fieldquote = '"'; /* Quote character */
/* End-of-line string written by csvputline(); replaced via csvseteol() */
static char eolstr[8] = "\n";
/* Free every buffer the scanner owns and return all of its bookkeeping
   to the initial, unallocated state. */
void csvreset(void)
{
    free(field);
    free(sline);
    free(line);

    field = 0;
    sline = 0;
    line = 0;

    nfield = 0;
    maxfield = 0;
    maxline = 0;
}
static int endofline(FILE *ifp, int c)
{
int eol = (c == '\r' || c == '\n');
if (c == '\r')
{
c = getc(ifp);
if (c != '\n' && c != EOF)
ungetc(c, ifp);
}
return(eol);
}
/* Modified to deal with specific malformed CSV */
/*
** Process a quoted field in place.
** p points just past the field's opening quote (split() calls this with ++p).
** The unquoted text is left NUL-terminated at p, and the return value
** points at the character that ended the field: a separator or the
** terminating NUL of the line.
*/
static char *advquoted(char *p)
{
size_t i;
size_t j;
/* A second quote directly after the opening one, followed by neither a
   separator nor a third quote: candidate for the malformed ""text"" form */
if (p[0] == fieldquote && (p[1] != *fieldsep && p[1] != fieldquote))
{
/* Malformed CSV: ""some stuff"" --> "some stuff" */
/* Find "\"\"," or "\"\"\0" to mark end of field */
/* If we don't find it, drop through to 'regular' case */
char *eof = strstr(&p[2], "\"\"");
if (eof != 0 && (eof[2] == *fieldsep || eof[2] == '\0'))
{
/* p[eof + 1 - p] is eof[1]: overwrite the second closing quote so that
   one pair of quotes is kept around the text */
p[eof + 1 - p] = '\0';
return(eof + 2);
}
}
/* Regular case: i is the write index, j the read index */
for (i = j = 0; p[j] != '\0'; i++, j++)
{
/* A quote not followed by another quote closes the field; a doubled
   quote falls through and is copied once by the assignment below */
if (p[j] == fieldquote && p[++j] != fieldquote)
{
/* Keep any text between the closing quote and the next separator */
size_t k = strcspn(p+j, fieldsep);
memmove(p+i, p+j, k); // 1 -> i fixing transcription error
i += k;
j += k;
break;
}
p[i] = p[j];
}
p[i] = '\0';
return(p + j);
}
/*
** Split line[] into fields.
** The line is copied into sline[], separators there are overwritten with
** NULs, and field[] is filled with pointers into sline[].
** Returns the number of fields (0 for an empty line), or NOMEM if field[]
** could not be enlarged.
*/
static int split(void)
{
char *p;
char **newf;
char *sepp;
int sepc;
nfield = 0;
if (line[0] == '\0')
return(0);
strcpy(sline, line);
p = sline;
do
{
/* Double the field array whenever it is full */
if (nfield >= maxfield)
{
maxfield *= 2;
newf = (char **)realloc(field, maxfield * sizeof(field[0]));
if (newf == 0)
return NOMEM;
field = newf;
}
/* Quoted fields are unquoted in place; plain fields run to the next separator */
if (*p == fieldquote)
sepp = advquoted(++p);
else
sepp = p + strcspn(p, fieldsep);
/* Remember what ended the field, then terminate the field there */
sepc = sepp[0];
sepp[0] = '\0';
field[nfield++] = p;
p = sepp + 1;
/* NOTE(review): the loop test hard-codes ',' instead of consulting fieldsep */
} while (sepc == ',');
return(nfield);
}
/*
** Read the next line from ifp and split it into fields.
**
** The line and split buffers are allocated on the first call and doubled
** whenever the line outgrows them. The line terminator (LF, CR or CRLF)
** is consumed but not stored.
**
** Returns a pointer to the internal copy of the line, which stays valid
** until the next call or csvreset(). Returns NULL at end of input when
** no characters were read, or when memory runs out (in which case all
** internal storage has been released).
*/
char *csvgetline(FILE *ifp)
{
    size_t i;
    int c;

    if (line == NULL)
    {
        /* Allocate on first call */
        maxline = maxfield = 1;
        line = (char *)malloc(maxline);                        /*=C++=*/
        sline = (char *)malloc(maxline);                       /*=C++-*/
        field = (char **)malloc(maxfield * sizeof(field[0]));  /*=C++=*/
        if (line == NULL || sline == NULL || field == NULL)
        {
            csvreset();
            return(NULL);   /* out of memory */
        }
    }

    for (i = 0; (c = getc(ifp)) != EOF && !endofline(ifp, c); i++)
    {
        if (i >= maxline - 1)
        {
            /*
            ** Double both buffers. Each successful realloc() is recorded
            ** immediately: if one call succeeds and the other fails, the
            ** old block passed to the successful call no longer exists,
            ** and csvreset() must free the new pointer, not the stale one.
            */
            char *newl = (char *)realloc(line, maxline * 2);   /*=C++=*/
            if (newl != NULL)
                line = newl;
            char *news = (char *)realloc(sline, maxline * 2);  /*=C++-*/
            if (news != NULL)
                sline = news;
            if (newl == NULL || news == NULL)
            {
                csvreset();
                return(NULL);   /* out of memory */
            }
            maxline *= 2;
        }
        line[i] = (char)c;
    }
    line[i] = '\0';

    if (split() == NOMEM)
    {
        csvreset();
        return(NULL);
    }
    return((c == EOF && i == 0) ? NULL : line);
}
/* Return field number n (zero-based) of the most recently split line,
   or a null pointer when n is not less than the field count. */
char *csvgetfield(size_t n)
{
    return (n < nfield) ? field[n] : 0;
}
/* Number of fields found in the most recently split line. */
size_t csvnfield(void)
{
    const size_t count = nfield;
    return count;
}
int csvputfield(FILE *ofp, const char *ofield)
{
const char escapes[] = "\",\r\n";
if (strpbrk(ofield, escapes) != 0)
{
size_t len = strlen(ofield) + 2;
const char *pos = ofield;
while ((pos = strchr(pos, '"')) != 0)
{
len++;
pos++;
}
char *space = malloc(len+1);
if (space == 0)
return EOF;
char *cpy = space;
pos = ofield;
*cpy++ = '"';
char c;
while ((c = *pos++) != '\0')
{
if (c == '"')
*cpy++ = c;
*cpy++ = c;
}
*cpy++ = '"';
*cpy = '\0';
int rc = fputs(space, ofp);
free(space);
return rc;
}
else
return fputs(ofield, ofp);
}
/*
** Write nfields fields to ofp as one CSV line: fields separated by
** commas, each written with csvputfield(), followed by the current
** end-of-line string.
**
** Returns EOF as soon as any write fails (including the separator, whose
** result was previously ignored); otherwise the result of writing the
** end-of-line string.
*/
int csvputline(FILE *ofp, char **fields, int nfields)
{
    for (int i = 0; i < nfields; i++)
    {
        if (i > 0 && putc(',', ofp) == EOF)
            return EOF;
        if (csvputfield(ofp, fields[i]) == EOF)
            return EOF;
    }
    return(fputs(eolstr, ofp));
}
/* Replace the end-of-line string used by csvputline(). A string longer
   than the internal buffer allows is truncated to fit. */
void csvseteol(const char *eol)
{
    const size_t capacity = sizeof(eolstr) - 1;   /* room left after the NUL */
    size_t nbytes = strlen(eol);

    if (nbytes > capacity)
        nbytes = capacity;
    memmove(eolstr, eol, nbytes);
    eolstr[nbytes] = '\0';
}
#ifdef TEST
/*
** Test driver: reads CSV lines from standard input and, for each line,
** prints the raw line, every field both as extracted and as re-encoded by
** csvputfield(), and finally the whole line re-encoded by csvputline().
** A line with no fields (a blank line) prints only the raw line; that
** avoids a zero-length VLA and the wrap-around of n-1.
*/
int main(void)
{
    char *in_line;
    while ((in_line = csvgetline(stdin)) != 0)
    {
        size_t n = csvnfield();
        printf("line = '%s'\n", in_line);
        if (n > 0)
        {
            char *fields[n]; /* C99 VLA */
            for (size_t i = 0; i < n; i++)
            {
                printf("field[%zu] = '%s'\n", i, csvgetfield(i));
                printf("field[%zu] = [", i);
                csvputfield(stdout, csvgetfield(i));
                fputs("]\n", stdout);
                fields[i] = csvgetfield(i);
            }
            printf("fields[0..%zu] = ", n-1);
            csvputline(stdout, fields, (int)n);
        }
    }
    return(0);
}
#endif /* TEST */
Compile the code with -DTEST to create a program with the example main() function. You need a C99 compiler; the code in main() uses a VLA (variable length array). You could avoid that with dynamic memory allocation or with pessimistic (overkill) memory allocation (an array of a few thousand pointers isn't going to kill most systems these days, but few CSV files will have a few thousand fields per line).
Example Data
Based closely on the data in the question.
009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"
123458,1234561007,"anything","nothing else",""this one, well, is a problem"","dohicky
503458,1234598094,"nothing","everything else","""this one, well, it isn't a problem""","abelone"
610078,1236100794,"everything","anything else","this ""isn't a problem"", he said.","Orcas Rule"
Example Output
line = '009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"'
field[0] = '009098'
field[0] = [009098]
field[1] = '0981098094'
field[1] = [0981098094]
field[2] = 'something'
field[2] = [something]
field[3] = 'something else'
field[3] = [something else]
field[4] = '"this one, well, is a problem"'
field[4] = ["""this one, well, is a problem"""]
field[5] = ' "another thing"'
field[5] = [" ""another thing"""]
fields[0..5] = 009098,0981098094,something,something else,"""this one, well, is a problem"""," ""another thing"""
line = '123458,1234561007,"anything","nothing else",""this one, well, is a problem"","dohicky'
field[0] = '123458'
field[0] = [123458]
field[1] = '1234561007'
field[1] = [1234561007]
field[2] = 'anything'
field[2] = [anything]
field[3] = 'nothing else'
field[3] = [nothing else]
field[4] = '"this one, well, is a problem"'
field[4] = ["""this one, well, is a problem"""]
field[5] = 'dohicky'
field[5] = [dohicky]
fields[0..5] = 123458,1234561007,anything,nothing else,"""this one, well, is a problem""",dohicky
line = '503458,1234598094,"nothing","everything else","""this one, well, it isn't a problem""","abelone"'
field[0] = '503458'
field[0] = [503458]
field[1] = '1234598094'
field[1] = [1234598094]
field[2] = 'nothing'
field[2] = [nothing]
field[3] = 'everything else'
field[3] = [everything else]
field[4] = '"this one, well, it isn't a problem"'
field[4] = ["""this one, well, it isn't a problem"""]
field[5] = 'abelone'
field[5] = [abelone]
fields[0..5] = 503458,1234598094,nothing,everything else,"""this one, well, it isn't a problem""",abelone
line = '610078,1236100794,"everything","anything else","this ""isn't a problem"", he said.","Orcas Rule"'
field[0] = '610078'
field[0] = [610078]
field[1] = '1236100794'
field[1] = [1236100794]
field[2] = 'everything'
field[2] = [everything]
field[3] = 'anything else'
field[3] = [anything else]
field[4] = 'this "isn't a problem", he said.'
field[4] = ["this ""isn't a problem"", he said."]
field[5] = 'Orcas Rule'
field[5] = [Orcas Rule]
fields[0..5] = 610078,1236100794,everything,anything else,"this ""isn't a problem"", he said.",Orcas Rule
The fields are printed twice, once to test the field extraction, once to test the field printing. You'd simplify the output by removing the printing except for csvputline() to convert your file from malformed CSV to properly formed CSV.

How can I convert from a character string to a hexadecimal one?

If I have a character string, how can I convert the values to hexadecimal in Objective-C? Likewise, how can I convert from a hexadecimal string to a character string?
As an exercise and in case it helps, I wrote a program to demonstrate how I might do this in pure C, which is 100% legal in Objective-C. I used the string-formatting functions in stdio.h to do the actual conversions.
Note that this can (should?) be tweaked for your setting. It will create a string twice as long as the passed-in string when going char->hex (converting 'Z' to '5a' for instance), and a string half as long going the other way.
I wrote this code in such a way that you can simply copy/paste and then compile/run to play around with it. Here is my sample output:
My favorite way to include C in XCode is to make a .h file with the function declarations separate from the .c file with implementation. See the comments:
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
// Place these prototypes in a .h to #import from wherever you need 'em
// Do not import the .c file anywhere.
// Note: You must free() these char *s
//
// allocates space for strlen(arg) * 2 and fills
// that space with chars corresponding to the hex
// representations of the arg string
char *makeHexStringFromCharString(const char*);
//
// allocates space for about 1/2 strlen(arg)
// and fills it with the char representation
char *makeCharStringFromHexString(const char*);
// this is just sample code
// Sample driver: reads one whitespace-delimited word, prints its hex
// encoding, then reads a hex word and prints the decoded characters.
// Input is limited to the buffer size, and a failed read or conversion
// ends the program with a non-zero status.
int main() {
    char source[256];

    printf("Enter a Char string to convert to Hex:");
    if (scanf("%255s", source) != 1) {  // the width keeps input inside source[]
        return 1;
    }
    char *output = makeHexStringFromCharString(source);
    if (output == NULL) {
        return 1;
    }
    printf("converted '%s' TO: %s\n\n", source, output);
    free(output);

    printf("Enter a Hex string to convert to Char:");
    if (scanf("%255s", source) != 1) {
        return 1;
    }
    output = makeCharStringFromHexString(source);
    if (output == NULL) {
        return 1;
    }
    printf("converted '%s' TO: %s\n\n", source, output);
    free(output);
    return 0;
}
// Place these in a .c file (named same as .h above)
// and include it in your target's build settings
// (should happen by default if you create the file in Xcode)
// Encodes every byte of `input` as exactly two lowercase hex digits.
//
// Returns a newly allocated, NUL-terminated string of length
// strlen(input) * 2 that the caller must free(), or NULL if the
// allocation fails.
//
// Each byte is treated as unsigned and always yields two digits, so bytes
// below 0x10 keep their leading zero and bytes above 0x7f cannot expand
// into a sign-extended value that would overrun the buffer.
char *makeHexStringFromCharString(const char*input) {
    static const char hexDigits[] = "0123456789abcdef";
    const size_t length = strlen(input);
    char *output = (char *)malloc(length * 2 + 1);
    if (output == NULL) {
        return NULL;
    }
    for (size_t i = 0; i < length; i++) {
        const unsigned char byte = (unsigned char)input[i];
        output[i * 2] = hexDigits[byte >> 4];
        output[i * 2 + 1] = hexDigits[byte & 0x0F];
    }
    output[length * 2] = '\0';
    return output;
}
// Value of a single hexadecimal digit (either case), or -1 if `digit`
// is not a hexadecimal digit.
static int hexDigitValue(char digit) {
    if (digit >= '0' && digit <= '9') {
        return digit - '0';
    }
    if (digit >= 'a' && digit <= 'f') {
        return digit - 'a' + 10;
    }
    if (digit >= 'A' && digit <= 'F') {
        return digit - 'A' + 10;
    }
    return -1;
}

// Decodes a string of hex digit pairs into the bytes they represent.
//
// Returns a newly allocated, NUL-terminated string that the caller must
// free(), or NULL if the allocation fails.
//
// - A trailing unpaired digit is decoded as a byte on its own ("4" -> 0x04).
// - Decoding stops at the first character that is not a hex digit; the
//   bytes decoded up to that point are returned.
// - A decoded 00 byte is stored as-is and will therefore end the result
//   early when it is used as a C string.
//
// Each byte is written individually and the terminator is placed directly
// after the last decoded byte, so nothing is written outside the buffer.
char *makeCharStringFromHexString(const char*input) {
    const size_t length = strlen(input);
    // One byte per digit pair (rounded up for an odd length) plus the NUL.
    char *output = (char *)malloc((length + 1) / 2 + 1);
    size_t produced = 0;
    if (output == NULL) {
        return NULL;
    }
    for (size_t i = 0; i < length; i += 2) {
        int value = hexDigitValue(input[i]);
        if (value < 0) {
            break;
        }
        if (i + 1 < length) {
            const int low = hexDigitValue(input[i + 1]);
            if (low < 0) {
                break;
            }
            value = value * 16 + low;
        }
        output[produced++] = (char)value;
    }
    output[produced] = '\0';
    return output;
}