262 lines
8.3 KiB
Diff
262 lines
8.3 KiB
Diff
diff -r -u0 willuslib/ocrtess.c willuslib/ocrtess.c
|
|
--- willuslib/ocrtess.c 2018-12-31 19:59:58.000000000 +0100
|
|
+++ willuslib/ocrtess.c 2019-07-27 18:47:06.706765733 +0200
|
|
@@ -29,0 +30,258 @@
|
|
+
|
|
+
|
|
+/*
+** tess_capi_init() -- create and initialize a Tesseract engine (C API).
+**
+** datapath  Tesseract data folder handed to TessBaseAPIInit2().  If NULL,
+**           the TESSDATA_PREFIX environment variable is what gets reported
+**           in the initialization message.
+** language  Language(s) to load.  NULL or "" selects "eng".
+** ocr_type  Requested engine:
+**               ocr_type=0: OEM_DEFAULT
+**               ocr_type=1: OEM_TESSERACT_ONLY
+**               ocr_type=2: OEM_LSTM_ONLY
+**               ocr_type=3: OEM_TESSERACT_LSTM_COMBINED
+**           NOTE:  The value is currently overridden with 0 (OEM_DEFAULT).
+** out       If not NULL, the initialization message is printed to it.
+** initstr   If not NULL and maxlen>0, receives the initialization message,
+**           truncated to maxlen-1 characters and always NUL-terminated.
+** maxlen    Size of the initstr buffer in bytes.
+** status    If not NULL, receives the TessBaseAPIInit2() return value
+**           (0 = success), or -1 if the API object could not be created.
+**
+** Returns the engine handle (release it with tess_capi_end()), or NULL if
+** initialization failed.
+*/
+void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
+                     char *initstr,int maxlen,int *status)
+
+    {
+    char original_locale[256];
+    const char *cur_locale;
+    const char *lang;
+    const TessPageSegMode default_segmode = PSM_SINGLE_BLOCK;
+    TessBaseAPI *api;
+    int init_status;
+
+    if (status!=NULL)
+        (*status)=-1;
+    api=TessBaseAPICreate();
+    if (api==NULL)
+        return(NULL);
+
+    /* willus mod, 11-24-16 */
+    /* Tesseract needs "C" locale to correctly parse all data .traineddata files. */
+    /* setlocale() may return NULL, and the string it returns can be overwritten  */
+    /* by the next setlocale() call, so keep a private copy to restore later.     */
+    cur_locale=setlocale(LC_ALL,NULL);
+    snprintf(original_locale,sizeof(original_locale),"%s",
+             cur_locale!=NULL ? cur_locale : "C");
+    setlocale(LC_ALL,"C");
+
+    lang = (language!=NULL && language[0]!='\0') ? language : "eng";
+
+    /* v4.00 loads either the TESSERACT engine, the LSTM engine, or both.  No CUBE. */
+    ocr_type=0; /* Ignore specified and use default */
+    TessBaseAPISetOutputName(api,NULL);
+    init_status=TessBaseAPIInit2(api,datapath,lang,
+                     ocr_type==0 ? OEM_DEFAULT :
+                        (ocr_type==1 ? OEM_TESSERACT_ONLY :
+                           (ocr_type==2 ? OEM_LSTM_ONLY :
+                              (OEM_TESSERACT_LSTM_COMBINED))));
+    if (status!=NULL)
+        (*status)=init_status;
+    if (init_status!=0)
+        {
+        /* willus mod, 11-24-16 */
+        setlocale(LC_ALL,original_locale);
+        TessBaseAPIEnd(api);
+        TessBaseAPIDelete(api);
+        return(NULL);
+        }
+
+    /*
+    ** A config file read during init may have changed the page segmentation
+    ** mode, and that takes priority.  Our own default is applied only if the
+    ** mode is still Tesseract's default (PSM_SINGLE_BLOCK).  The mode cannot
+    ** be set before init.  With default_segmode equal to PSM_SINGLE_BLOCK
+    ** this is effectively a no-op; it is kept as the place to change the
+    ** wrapper's default.
+    */
+    if (TessBaseAPIGetPageSegMode(api)==PSM_SINGLE_BLOCK)
+        TessBaseAPISetPageSegMode(api,default_segmode);
+
+    /*
+    ** Initialization message.  Every append is bounded by the space left in
+    ** istr[], so a long language list is truncated instead of overflowing.
+    */
+    {
+    char istr[1024];
+    const char *datafolder;
+    char **languages;
+    size_t n;
+
+    datafolder = datapath==NULL ? getenv("TESSDATA_PREFIX") : datapath;
+    if (datafolder==NULL)
+        datafolder="(not set)";  /* Passing NULL to %s is undefined behavior */
+    snprintf(istr,sizeof(istr),"%s",TessVersion());
+    n=strlen(istr);
+    snprintf(&istr[n],sizeof(istr)-n,"\n    Tesseract data folder = '%s'",datafolder);
+    n=strlen(istr);
+    snprintf(&istr[n],sizeof(istr)-n,"\n    Tesseract languages: ");
+
+    /* The returned array of strings is terminated by a NULL entry. */
+    languages=TessBaseAPIGetAvailableLanguagesAsVector(api);
+    if (languages!=NULL)
+        {
+        char **l;
+        int eng;
+
+        eng=(int)TessBaseAPIOem(api);
+        for (l=languages;(*l)!=NULL;l++)
+            {
+            n=strlen(istr);
+            snprintf(&istr[n],sizeof(istr)-n,"%s%s [%s]",l==languages?"":", ",(*l),
+                     eng==2?"LSTM+Tess":(eng==1?"LSTM":"Tess"));
+            }
+        TessDeleteTextArray(languages);
+        }
+
+    if (out!=NULL)
+        fprintf(out,"%s\n",istr);
+    if (initstr!=NULL && maxlen>0)
+        snprintf(initstr,(size_t)maxlen,"%s",istr);
+    }
+
+    /* Turn off LSTM debugging output */
+    TessBaseAPISetVariable(api,"lstm_debug_level","0");
+#if (WILLUSDEBUG & 1)
+    TessBaseAPISetVariable(api,"lstm_debug_level","9");
+    TessBaseAPISetVariable(api,"paragraph_debug_level","9");
+    TessBaseAPISetVariable(api,"tessdata_manager_debug_level","9");
+    TessBaseAPISetVariable(api,"tosp_debug_level","9");
+    TessBaseAPISetVariable(api,"wordrec_debug_level","9");
+    TessBaseAPISetVariable(api,"segsearch_debug_level","9");
+#endif
+    /* willus mod, 11-24-16 */
+    setlocale(LC_ALL,original_locale);
+    return((void *)api);
+    }
|
|
+
|
|
+
|
|
+/*
+** tess_capi_get_ocr() -- OCR a bitmap and return the text as one UTF-8 string.
+**
+** vapi     Engine handle returned by tess_capi_init().
+** pix      Image to process.
+** outstr   Receives the recognized text, truncated to at most maxlen-1
+**          bytes (on a UTF-8 character boundary) and NUL-terminated.
+**          Left untouched on failure.
+** maxlen   Size of the outstr buffer in bytes.
+** segmode  Page segmentation mode (a TessPageSegMode value).  A negative
+**          value leaves the engine's current mode unchanged.
+** out      If not NULL, error messages are printed to it.
+**
+** Returns 0 on success, -1 on failure.
+*/
+int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out)
+
+    {
+    TessBaseAPI *api;
+    char *text;
+    size_t len;
+
+    api=(TessBaseAPI *)vapi;
+    if (api==NULL || pix==NULL || outstr==NULL || maxlen<=0)
+        return(-1);
+    /*
+    ** Always set the mode.  A function-local static cache of the last mode
+    ** goes stale as soon as a different handle, or another function using
+    ** the same handle, changes it.
+    */
+    if (segmode>=0)
+        TessBaseAPISetPageSegMode(api,(TessPageSegMode)segmode);
+    if (!TessBaseAPIProcessPage(api,pix,0,NULL,NULL,0,NULL))
+        {
+        if (out!=NULL)
+            fprintf(out,"tesscapi: Error during bitmap processing.\n");
+        TessBaseAPIClear(api);
+        return(-1);
+        }
+    text=TessBaseAPIGetUTF8Text(api);
+    if (text==NULL)
+        {
+        if (out!=NULL)
+            fprintf(out,"tesscapi: Error during bitmap processing.\n");
+        TessBaseAPIClear(api);
+        return(-1);
+        }
+    len=strlen(text);
+    if (len>(size_t)(maxlen-1))
+        {
+        len=(size_t)(maxlen-1);
+        /* Back up so a multi-byte UTF-8 sequence is not cut in half:  */
+        /* text[len] must not be a continuation byte (binary 10xxxxxx). */
+        while (len>0 && (((unsigned char)text[len])&0xC0)==0x80)
+            len--;
+        }
+    memcpy(outstr,text,len);
+    outstr[len]='\0';
+    TessDeleteText(text);
+    TessBaseAPIClear(api);
+    return(0);
+    }
|
|
+
|
|
+
|
|
+/*
+** tess_capi_get_ocr_multiword() -- OCR a bitmap and return every recognized
+** word together with its bounding box and baseline.
+**
+** vapi     Engine handle returned by tess_capi_init().
+** pix      Image to process.
+** segmode  Page segmentation mode (a TessPageSegMode value).  A negative
+**          value leaves the engine's current mode unchanged.
+** left, top, right, bottom, ybase
+**          On success each receives a pointer to an array of (*nw) ints:
+**          the bounding box of each word and the y value of the middle of
+**          its baseline.  All five arrays live in ONE allocation whose
+**          start is (*left); the caller releases them with free(*left).
+** text     On success receives a malloc'ed buffer holding the (*nw) words
+**          back to back, each NUL-terminated.  The caller frees it.
+** nw       Receives the number of words (0 on failure).
+** out      If not NULL, error messages are printed to it.
+**
+** Returns 0 on success, -1 on failure.  On failure all output pointers are
+** set to NULL.
+*/
+int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
+                                int **left,int **top,int **right,int **bottom,
+                                int **ybase,char **text,int *nw,
+                                FILE *out)
+
+    {
+    TessBaseAPI *api;
+    TessResultIterator *res_it;
+    int *x0,*y0,*x1,*y1,*ybaseline;
+    char *tutf8;
+    size_t totlen,it8;
+    int nwords,iword;
+
+    (*nw)=0;
+    (*left)=(*top)=(*right)=(*bottom)=(*ybase)=NULL;
+    (*text)=NULL;
+    api=(TessBaseAPI *)vapi;
+    if (api==NULL || pix==NULL)
+        return(-1);
+    /*
+    ** Always set the mode.  A function-local static cache of the last mode
+    ** goes stale as soon as a different handle, or another function using
+    ** the same handle, changes it.
+    */
+    if (segmode>=0)
+        TessBaseAPISetPageSegMode(api,(TessPageSegMode)segmode);
+    if (!TessBaseAPIProcessPage(api,pix,0,NULL,NULL,0,NULL))
+        {
+        if (out!=NULL)
+            fprintf(out,"tesscapi: Error during bitmap processing.\n");
+        TessBaseAPIClear(api);
+        return(-1);
+        }
+
+    /* May be NULL if there are no results to iterate over. */
+    res_it=TessBaseAPIGetIterator(api);
+
+    /*
+    ** Pass 1:  Count the words and the bytes needed to store them.
+    ** The iterator starts ON the first word, so the word must be read
+    ** before calling Next(); GetUTF8Text() returns NULL for an empty word.
+    */
+    nwords=0;
+    totlen=0;
+    if (res_it!=NULL)
+        {
+        do
+            {
+            char *word=TessResultIteratorGetUTF8Text(res_it,RIL_WORD);
+            if (word==NULL)
+                continue;
+            nwords++;
+            totlen+=strlen(word)+1;
+            TessDeleteText(word);
+            }
+        while (TessResultIteratorNext(res_it,RIL_WORD));
+        }
+
+    /* Never request zero bytes:  malloc(0) may legitimately return NULL. */
+    x0=(int *)malloc(sizeof(int)*5*(size_t)(nwords>0 ? nwords : 1));
+    tutf8=(char *)malloc(totlen>0 ? totlen : 1);
+    if (x0==NULL || tutf8==NULL)
+        {
+        free(x0);
+        free(tutf8);
+        if (res_it!=NULL)
+            TessResultIteratorDelete(res_it);
+        if (out!=NULL)
+            fprintf(out,"tesscapi: Out of memory.\n");
+        TessBaseAPIClear(api);
+        return(-1);
+        }
+    tutf8[0]='\0';
+    y0=&x0[nwords];
+    x1=&y0[nwords];
+    y1=&x1[nwords];
+    ybaseline=&y1[nwords];
+
+    /* Pass 2:  Rewind and store each word with its geometry. */
+    iword=0;
+    it8=0;
+    if (res_it!=NULL)
+        {
+        /* The result iterator is used through its page-iterator interface */
+        /* for rewinding and for the geometry queries.                     */
+        TessPageIterator *page_it=(TessPageIterator *)res_it;
+
+        TessPageIteratorBegin(page_it);
+        do
+            {
+            int bbleft=0,bbtop=0,bbright=0,bbbottom=0;
+            int u1=0,v1=0,u2=0,v2=0;
+            size_t len;
+            char *word=TessResultIteratorGetUTF8Text(res_it,RIL_WORD);
+
+            if (word==NULL)
+                continue;
+            len=strlen(word)+1;
+            /* Defensive:  never write past what pass 1 sized the buffers for. */
+            if (iword>=nwords || it8+len>totlen)
+                {
+                TessDeleteText(word);
+                break;
+                }
+            memcpy(&tutf8[it8],word,len);
+            it8+=len;
+            TessDeleteText(word);
+
+            TessPageIteratorBoundingBox(page_it,RIL_WORD,&bbleft,&bbtop,&bbright,&bbbottom);
+            /* Fall back to the bottom of the box if no baseline is available. */
+            if (!TessPageIteratorBaseline(page_it,RIL_WORD,&u1,&v1,&u2,&v2))
+                v1=v2=bbbottom;
+            x0[iword]=bbleft;
+            x1[iword]=bbright;
+            y0[iword]=bbtop;
+            y1[iword]=bbbottom;
+            ybaseline[iword]=(v1+v2)/2;
+            iword++;
+            }
+        while (TessResultIteratorNext(res_it,RIL_WORD));
+        TessResultIteratorDelete(res_it);
+        }
+
+    (*left)=x0;
+    (*top)=y0;
+    (*right)=x1;
+    (*bottom)=y1;
+    (*ybase)=ybaseline;
+    (*text)=tutf8;
+    (*nw)=iword;
+
+    TessBaseAPIClear(api);
+    return(0);
+    }
|
|
+
|
|
+
|
|
+/*
+** tess_capi_end() -- shut down and delete an engine handle.
+**
+** vapi  Handle to release.  A NULL handle is ignored.
+*/
+void tess_capi_end(void *vapi)
+
+    {
+    TessBaseAPI *handle=(TessBaseAPI *)vapi;
+
+    if (handle!=NULL)
+        {
+        /* End the engine first, then delete the API object itself. */
+        TessBaseAPIEnd(handle);
+        TessBaseAPIDelete(handle);
+        }
+    }
|