Merge pull request #93631 from danielfullmer/k2pdfopt-2.52
k2pdfopt: 2.51a -> 2.53
This commit is contained in:
commit
6b043b105e
|
@ -0,0 +1,49 @@
|
|||
From 2629af4ed00d7ca65359178203d80fb146901cdb Mon Sep 17 00:00:00 2001
|
||||
From: Daniel Fullmer <danielrf12@gmail.com>
|
||||
Date: Fri, 3 Jul 2020 21:00:45 -0700
|
||||
Subject: [PATCH 1/2] Fix CMakeLists
|
||||
|
||||
---
|
||||
CMakeLists.txt | 12 ++++++++----
|
||||
1 file changed, 8 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index e218279..4341de9 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -57,6 +57,7 @@ endif(JPEG_FOUND)
|
||||
include(FindJasper)
|
||||
if(JASPER_FOUND)
|
||||
set(HAVE_JASPER_LIB 1)
|
||||
+ set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${JASPER_LIBRARY})
|
||||
endif(JASPER_FOUND)
|
||||
|
||||
# paths from willuslib/wgs.c
|
||||
@@ -71,9 +72,12 @@ else()
|
||||
message(STATUS "Could NOT find ghostscript executable")
|
||||
endif(GHOSTSCRIPT_EXECUTABLE)
|
||||
|
||||
-# willus.h
|
||||
-# HAVE_GSL_LIB
|
||||
-
|
||||
+pkg_check_modules(GSL gsl)
|
||||
+if(GSL_FOUND)
|
||||
+ set(HAVE_GSL_LIB 1)
|
||||
+ include_directories(SYSTEM ${GSL_INCLUDEDIR})
|
||||
+ set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${GSL_LDFLAGS})
|
||||
+endif(GSL_FOUND)
|
||||
|
||||
# libfreetype6 (>= 2.3.9), libjbig2dec0, libjpeg8 (>= 8c), libx11-6, libxext6, zlib1g (>= 1:1.2.0)
|
||||
# MUPDF_STATIC_LDFLAGS misses mupdf-js-none, and doubles libs ...
|
||||
@@ -85,7 +89,7 @@ if(MUPDF_FOUND)
|
||||
include_directories(SYSTEM ${MUPDF_INCLUDEDIR})
|
||||
message(STATUS "mupdf libraries: ${MUPDF_LDFLAGS}")
|
||||
set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${MUPDF_LDFLAGS}
|
||||
- -lmupdf-js-none -lopenjpeg -ljbig2dec -ljpeg -lfreetype
|
||||
+
|
||||
)
|
||||
endif(MUPDF_FOUND)
|
||||
|
||||
--
|
||||
2.27.0
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
{ stdenv, fetchzip, fetchurl, fetchpatch, cmake, pkgconfig
|
||||
, zlib, libpng
|
||||
{ stdenv, runCommand, fetchzip, fetchurl, fetchpatch, fetchFromGitHub
|
||||
, cmake, pkgconfig, zlib, libpng, makeWrapper
|
||||
, enableGSL ? true, gsl
|
||||
, enableGhostScript ? true, ghostscript
|
||||
, enableMuPDF ? true, mupdf
|
||||
|
@ -11,44 +11,132 @@
|
|||
|
||||
with stdenv.lib;
|
||||
|
||||
stdenv.mkDerivation rec {
|
||||
pname = "k2pdfopt";
|
||||
version = "2.51a";
|
||||
# k2pdfopt is a pain to package. It requires modified versions of mupdf,
|
||||
# leptonica, and tesseract. Instead of shipping patches for these upstream
|
||||
# packages, k2pdfopt includes just the modified source files for these
|
||||
# packages. The individual files from the {mupdf,leptonica,tesseract}_mod/
|
||||
# directories are intended to replace the corresponding source files in the
|
||||
# upstream packages, for a particular version of that upstream package.
|
||||
#
|
||||
# There are a few ways we could approach packaging these modified versions of
|
||||
# mupdf, leptonica, and tesseract:
|
||||
# 1) Override the upstream source with a new derivation that involves copying
|
||||
# the modified source files from k2pdfopt and replacing the corresponding
|
||||
# source files in the upstream packages. Since the files are intended for a
|
||||
# particular version of the upstream package, this would not allow us to easily
|
||||
# use updates to those packages in nixpkgs.
|
||||
# 2) Manually produce patches which can be applied against the upstream
|
||||
# project, and have the same effect as replacing those files. This is what I
|
||||
# believe k2pdfopt should do for us anyway. The benefit of creating and
|
||||
# applying patches in this way is that minor updates (esp. security fixes) to
|
||||
# upstream packages might still allow these patches to apply successfully.
|
||||
# 3) Automatically produce these patches inside a nix derivation. This is the
|
||||
# approach taken here, using the "mkPatch" provided below. This has the
|
||||
# benefit of easier review and should hopefully be simpler to update in the
|
||||
# future.
|
||||
|
||||
src = (fetchzip {
|
||||
url = "http://www.willus.com/k2pdfopt/src/k2pdfopt_v2.51_src.zip";
|
||||
sha256 = "133l7xkvi67s6sfk8cfh7rmavbsf7ib5fyksk1ci6b6sch3z2sw9";
|
||||
});
|
||||
let
|
||||
# Create a patch against src based on changes applied in patchCommands
|
||||
mkPatch = { name, src, patchCommands }: runCommand "${name}-k2pdfopt.patch" { inherit src; } ''
|
||||
source $stdenv/setup
|
||||
unpackPhase
|
||||
|
||||
# Note: the v2.51a zip contains only files to be replaced in the v2.50 zip.
|
||||
v251a_src = (fetchzip {
|
||||
url = "http://www.willus.com/k2pdfopt/src/k2pdfopt_v2.51a_src.zip";
|
||||
sha256 = "0vvwblii7kgdwfxw8dzk6jbmz4dv94d7rkv18i60y8wkayj6yhl6";
|
||||
});
|
||||
orig=$sourceRoot
|
||||
new=$sourceRoot-modded
|
||||
cp -r $orig/. $new/
|
||||
|
||||
postUnpack = ''
|
||||
cp -r ${v251a_src}/* $sourceRoot
|
||||
pushd $new >/dev/null
|
||||
${patchCommands}
|
||||
popd >/dev/null
|
||||
|
||||
diff -Naur $orig $new > $out || true
|
||||
'';
|
||||
|
||||
patches = [ ./k2pdfopt.patch ./k2pdfopt-mupdf-1.16.1.patch ];
|
||||
pname = "k2pdfopt";
|
||||
version = "2.53";
|
||||
k2pdfopt_src = fetchzip {
|
||||
url = "http://www.willus.com/${pname}/src/${pname}_v${version}_src.zip";
|
||||
sha256 = "1fna8bg3pascjfc3hmc6xn0xi2yh7f1qp0d344mw9hqanbnykyy8";
|
||||
};
|
||||
in stdenv.mkDerivation rec {
|
||||
inherit pname version;
|
||||
src = k2pdfopt_src;
|
||||
|
||||
nativeBuildInputs = [ cmake pkgconfig ];
|
||||
patches = [
|
||||
./0001-Fix-CMakeLists.patch
|
||||
];
|
||||
|
||||
postPatch = ''
|
||||
substituteInPlace willuslib/bmpdjvu.c \
|
||||
--replace "<djvu.h>" "<libdjvu/ddjvuapi.h>"
|
||||
'';
|
||||
|
||||
nativeBuildInputs = [ cmake pkgconfig makeWrapper ];
|
||||
|
||||
buildInputs =
|
||||
let
|
||||
# The patches below were constructed by taking the files from k2pdfopt in
|
||||
# the {mupdf,leptonica,tesseract}_mod/ directories, replacing the
|
||||
# corresponding files in the respective source trees, resolving any errors
|
||||
# with more recent versions of these dependencies, and running diff.
|
||||
mupdf_modded = mupdf.overrideAttrs (attrs: {
|
||||
patches = attrs.patches ++ [ ./mupdf.patch ]; # Last verified with mupdf 1.16.1
|
||||
# We use specific versions of these sources below to match the versions
|
||||
# used in the k2pdfopt source. Note that this does _not_ need to match the
|
||||
# version used elsewhere in nixpkgs, since it is only used to create the
|
||||
# patch that can then be applied to the version in nixpkgs.
|
||||
mupdf_patch = mkPatch {
|
||||
name = "mupdf";
|
||||
src = fetchurl {
|
||||
url = "https://mupdf.com/downloads/archive/mupdf-1.17.0-source.tar.gz";
|
||||
sha256 = "13nl9nrcx2awz9l83mlv2psi1lmn3hdnfwxvwgwiwbxlkjl3zqq0";
|
||||
};
|
||||
patchCommands = ''
|
||||
cp ${k2pdfopt_src}/mupdf_mod/{filter-basic,font,stext-device,string}.c ./source/fitz/
|
||||
cp ${k2pdfopt_src}/mupdf_mod/pdf-* ./source/pdf/
|
||||
'';
|
||||
};
|
||||
mupdf_modded = mupdf.overrideAttrs ({ patches ? [], ... }: {
|
||||
patches = patches ++ [ mupdf_patch ];
|
||||
# This function is missing in font.c, see font-win32.c
|
||||
postPatch = ''
|
||||
echo "void pdf_install_load_system_font_funcs(fz_context *ctx) {}" >> source/fitz/font.c
|
||||
'';
|
||||
});
|
||||
leptonica_modded = leptonica.overrideAttrs (attrs: {
|
||||
patches = [ ./leptonica.patch ]; # Last verified with leptonica 1.78.0
|
||||
|
||||
leptonica_patch = mkPatch {
|
||||
name = "leptonica";
|
||||
src = fetchurl {
|
||||
url = "http://www.leptonica.org/source/leptonica-1.79.0.tar.gz";
|
||||
sha256 = "1n004gv1dj3pq1fcnfdclvvx5nang80336aa67nvs3nnqp4ncn84";
|
||||
};
|
||||
patchCommands = "cp -r ${k2pdfopt_src}/leptonica_mod/. ./src/";
|
||||
};
|
||||
leptonica_modded = leptonica.overrideAttrs ({ patches ? [], ... }: {
|
||||
patches = patches ++ [ leptonica_patch ];
|
||||
});
|
||||
|
||||
tesseract_patch = mkPatch {
|
||||
name = "tesseract";
|
||||
src = fetchFromGitHub {
|
||||
owner = "tesseract-ocr";
|
||||
repo = "tesseract";
|
||||
rev = "4.1.1";
|
||||
sha256 = "1ca27zbjpx35nxh9fha410z3jskwyj06i5hqiqdc08s2d7kdivwn";
|
||||
};
|
||||
patchCommands = ''
|
||||
cp ${k2pdfopt_src}/tesseract_mod/{baseapi,tesscapi,tesseract}.* src/api/
|
||||
cp ${k2pdfopt_src}/tesseract_mod/{tesscapi,tessedit,tesseract}.* src/ccmain/
|
||||
cp ${k2pdfopt_src}/tesseract_mod/dotproduct{avx,fma,sse}.* src/arch/
|
||||
cp ${k2pdfopt_src}/tesseract_mod/{intsimdmatrixsse,simddetect}.* src/arch/
|
||||
cp ${k2pdfopt_src}/tesseract_mod/{errcode,genericvector,mainblk,params,serialis,tessdatamanager,tess_version,tprintf,unicharset}.* src/ccutil/
|
||||
cp ${k2pdfopt_src}/tesseract_mod/{input,lstmrecognizer}.* src/lstm/
|
||||
cp ${k2pdfopt_src}/tesseract_mod/openclwrapper.* src/opencl/
|
||||
'';
|
||||
};
|
||||
tesseract_modded = tesseract4.override {
|
||||
tesseractBase = tesseract4.tesseractBase.overrideAttrs (_: {
|
||||
patches = [ ./tesseract.patch ]; # Last verified with tesseract 1.4
|
||||
tesseractBase = tesseract4.tesseractBase.overrideAttrs ({ patches ? [], ... }: {
|
||||
patches = patches ++ [ tesseract_patch ];
|
||||
# Additional compilation fixes
|
||||
postPatch = ''
|
||||
echo libtesseract_api_la_SOURCES += tesscapi.cpp >> src/api/Makefile.am
|
||||
substituteInPlace src/api/tesseract.h \
|
||||
--replace "#include <leptonica.h>" "//#include <leptonica.h>"
|
||||
'';
|
||||
});
|
||||
};
|
||||
in
|
||||
|
@ -71,6 +159,10 @@ stdenv.mkDerivation rec {
|
|||
install -D -m 755 k2pdfopt $out/bin/k2pdfopt
|
||||
'';
|
||||
|
||||
preFixup = optionalString enableTesseract ''
|
||||
wrapProgram $out/bin/k2pdfopt --set-default TESSDATA_PREFIX ${tesseract4}/share/tessdata
|
||||
'';
|
||||
|
||||
meta = with stdenv.lib; {
|
||||
description = "Optimizes PDF/DJVU files for mobile e-readers (e.g. the Kindle) and smartphones";
|
||||
homepage = "http://www.willus.com/k2pdfopt";
|
||||
|
|
|
@ -1,151 +0,0 @@
|
|||
diff --git a/willuslib/wmupdf.c b/willuslib/wmupdf.c
|
||||
index 81627ef..f14a96c 100644
|
||||
--- a/willuslib/wmupdf.c
|
||||
+++ b/willuslib/wmupdf.c
|
||||
@@ -189,8 +189,6 @@ int wmupdf_remake_pdf(char *infile,char *outfile,WPDFPAGEINFO *pageinfo,int use_
|
||||
pdf_write_opts.do_compress=1;
|
||||
pdf_write_opts.do_linear=0;
|
||||
pdf_write_opts.do_garbage=1; /* 2 and 3 don't work for this. */
|
||||
- pdf_write_opts.continue_on_error=0;
|
||||
- pdf_write_opts.errors=NULL;
|
||||
write_failed=0;
|
||||
wpdfpageinfo_sort(pageinfo);
|
||||
xref=NULL;
|
||||
@@ -1687,8 +1685,8 @@ WPDFOUTLINE *wpdfoutline_read_from_pdf_file(char *filename)
|
||||
/* Sumatra version of MuPDF v1.4 -- use locally installed fonts */
|
||||
pdf_install_load_system_font_funcs(ctx);
|
||||
fz_try(ctx) { doc=fz_open_document(ctx,filename); }
|
||||
- fz_catch(ctx)
|
||||
- {
|
||||
+ fz_catch(ctx)
|
||||
+ {
|
||||
fz_drop_context(ctx);
|
||||
return(NULL);
|
||||
}
|
||||
@@ -1890,5 +1888,5 @@ static pdf_obj *pdf_new_string_utf8(fz_context *ctx,char *string)
|
||||
willus_mem_free((double **)&utfbuf,funcname);
|
||||
return(pdfobj);
|
||||
}
|
||||
-
|
||||
+
|
||||
#endif /* HAVE_MUPDF_LIB */
|
||||
diff --git a/willuslib/wmupdfinfo.c b/willuslib/wmupdfinfo.c
|
||||
index 5c7f38c..9b9e6fd 100644
|
||||
--- a/willuslib/wmupdfinfo.c
|
||||
+++ b/willuslib/wmupdfinfo.c
|
||||
@@ -237,23 +237,22 @@ static void showglobalinfo(fz_context *ctx, globals *glo,char *filename)
|
||||
pdf_obj *robj;
|
||||
|
||||
robj=pdf_resolve_indirect(ctx,obj);
|
||||
- n=pdf_sprint_obj(ctx,NULL,0,robj,1);
|
||||
- buf=malloc(n+2);
|
||||
+ buf=pdf_sprint_obj(ctx,NULL,0,&n,robj,1,0);
|
||||
if (buf==NULL)
|
||||
{
|
||||
fz_write_printf(ctx,out,"Info object (%d %d R):\n",pdf_to_num(ctx,obj),pdf_to_gen(ctx,obj));
|
||||
- pdf_print_obj(ctx,out,robj,1);
|
||||
+ pdf_print_obj(ctx,out,robj,1,0);
|
||||
}
|
||||
else
|
||||
{
|
||||
- pdf_sprint_obj(ctx,buf,n+2,robj,1);
|
||||
+ pdf_sprint_obj(ctx,buf,n+2,&n,robj,1,0);
|
||||
display_pdf_field(ctx,out,buf,"Title","TITLE");
|
||||
display_pdf_field(ctx,out,buf,"CreationDate","CREATED");
|
||||
display_pdf_field(ctx,out,buf,"ModDate","LAST MODIFIED");
|
||||
display_pdf_field(ctx,out,buf,"Producer","PDF PRODUCER");
|
||||
display_pdf_field(ctx,out,buf,"Creator","CREATOR");
|
||||
display_file_size(ctx,out,filename);
|
||||
- free(buf);
|
||||
+ fz_free(ctx,buf);
|
||||
}
|
||||
}
|
||||
if (glo->dims==1)
|
||||
@@ -275,7 +274,7 @@ static void showglobalinfo(fz_context *ctx, globals *glo,char *filename)
|
||||
if (obj)
|
||||
{
|
||||
fz_write_printf(ctx,out, "\nEncryption object (%d %d R):\n", pdf_to_num(ctx,obj), pdf_to_gen(ctx,obj));
|
||||
- pdf_print_obj(ctx,out, pdf_resolve_indirect(ctx,obj), 1);
|
||||
+ pdf_print_obj(ctx,out, pdf_resolve_indirect(ctx,obj), 1, 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -396,7 +395,7 @@ gatherdimensions(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_
|
||||
if (j < glo->dims)
|
||||
return;
|
||||
|
||||
- glo->dim = fz_resize_array(ctx, glo->dim, glo->dims+1, sizeof(struct info));
|
||||
+ glo->dim = fz_realloc_array(ctx, glo->dim, glo->dims+1, struct info);
|
||||
glo->dims++;
|
||||
|
||||
glo->dim[glo->dims - 1].page = page;
|
||||
@@ -441,7 +440,7 @@ gatherfonts(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_obj *
|
||||
if (k < glo->fonts)
|
||||
continue;
|
||||
|
||||
- glo->font = fz_resize_array(ctx, glo->font, glo->fonts+1, sizeof(struct info));
|
||||
+ glo->font = fz_realloc_array(ctx, glo->font, glo->fonts+1, struct info);
|
||||
glo->fonts++;
|
||||
|
||||
glo->font[glo->fonts - 1].page = page;
|
||||
@@ -510,7 +509,7 @@ gatherimages(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_obj
|
||||
if (k < glo->images)
|
||||
continue;
|
||||
|
||||
- glo->image = fz_resize_array(ctx, glo->image, glo->images+1, sizeof(struct info));
|
||||
+ glo->image = fz_realloc_array(ctx, glo->image, glo->images+1, struct info);
|
||||
glo->images++;
|
||||
|
||||
glo->image[glo->images - 1].page = page;
|
||||
@@ -568,7 +567,7 @@ gatherforms(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_obj *
|
||||
if (k < glo->forms)
|
||||
continue;
|
||||
|
||||
- glo->form = fz_resize_array(ctx, glo->form, glo->forms+1, sizeof(struct info));
|
||||
+ glo->form = fz_realloc_array(ctx, glo->form, glo->forms+1, struct info);
|
||||
glo->forms++;
|
||||
|
||||
glo->form[glo->forms - 1].page = page;
|
||||
@@ -613,7 +612,7 @@ gatherpsobjs(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_obj
|
||||
if (k < glo->psobjs)
|
||||
continue;
|
||||
|
||||
- glo->psobj = fz_resize_array(ctx, glo->psobj, glo->psobjs+1, sizeof(struct info));
|
||||
+ glo->psobj = fz_realloc_array(ctx, glo->psobj, glo->psobjs+1, struct info);
|
||||
glo->psobjs++;
|
||||
|
||||
glo->psobj[glo->psobjs - 1].page = page;
|
||||
@@ -656,7 +655,7 @@ gathershadings(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_ob
|
||||
if (k < glo->shadings)
|
||||
continue;
|
||||
|
||||
- glo->shading = fz_resize_array(ctx, glo->shading, glo->shadings+1, sizeof(struct info));
|
||||
+ glo->shading = fz_realloc_array(ctx, glo->shading, glo->shadings+1, struct info);
|
||||
glo->shadings++;
|
||||
|
||||
glo->shading[glo->shadings - 1].page = page;
|
||||
@@ -724,7 +723,7 @@ gatherpatterns(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_ob
|
||||
if (k < glo->patterns)
|
||||
continue;
|
||||
|
||||
- glo->pattern = fz_resize_array(ctx, glo->pattern, glo->patterns+1, sizeof(struct info));
|
||||
+ glo->pattern = fz_realloc_array(ctx, glo->pattern, glo->patterns+1, struct info);
|
||||
glo->patterns++;
|
||||
|
||||
glo->pattern[glo->patterns - 1].page = page;
|
||||
@@ -1216,7 +1215,7 @@ void wmupdfinfo_get(char *filename,int *pagelist,char **buf)
|
||||
if (fout==NULL)
|
||||
return;
|
||||
*/
|
||||
-
|
||||
+
|
||||
ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
|
||||
if (!ctx)
|
||||
{
|
||||
@@ -1307,5 +1306,5 @@ static void date_convert(char *dst,char *src)
|
||||
else if (src[i]!='\0')
|
||||
sprintf(&dst[strlen(dst)]," %s",&src[i]);
|
||||
}
|
||||
-
|
||||
+
|
||||
#endif /* HAVE_MUPDF_LIB */
|
|
@ -1,99 +0,0 @@
|
|||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index 4a2378b..502c477 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -52,6 +52,7 @@ endif(JPEG_FOUND)
|
||||
include(FindJasper)
|
||||
if(JASPER_FOUND)
|
||||
set(HAVE_JASPER_LIB 1)
|
||||
+ set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${JASPER_LIBRARY})
|
||||
endif(JASPER_FOUND)
|
||||
|
||||
# paths from willuslib/wgs.c
|
||||
@@ -66,8 +67,12 @@ else()
|
||||
message(STATUS "Could NOT find ghostscript executable")
|
||||
endif(GHOSTSCRIPT_EXECUTABLE)
|
||||
|
||||
-# willus.h
|
||||
-# HAVE_GSL_LIB
|
||||
+pkg_check_modules(GSL gsl)
|
||||
+if(GSL_FOUND)
|
||||
+ set(HAVE_GSL_LIB 1)
|
||||
+ include_directories(SYSTEM ${GSL_INCLUDEDIR})
|
||||
+ set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${GSL_LDFLAGS})
|
||||
+endif(GSL_FOUND)
|
||||
|
||||
|
||||
# libfreetype6 (>= 2.3.9), libjbig2dec0, libjpeg8 (>= 8c), libx11-6, libxext6, zlib1g (>= 1:1.2.0)
|
||||
@@ -80,7 +85,7 @@ if(MUPDF_FOUND)
|
||||
include_directories(SYSTEM ${MUPDF_INCLUDEDIR})
|
||||
message(STATUS "mupdf libraries: ${MUPDF_LDFLAGS}")
|
||||
set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${MUPDF_LDFLAGS}
|
||||
- -lmupdf-js-none -lopenjpeg -ljbig2dec -ljpeg -lfreetype
|
||||
+
|
||||
)
|
||||
endif(MUPDF_FOUND)
|
||||
|
||||
@@ -91,9 +96,25 @@ if(DJVU_FOUND)
|
||||
set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${DJVU_LDFLAGS})
|
||||
endif(DJVU_FOUND)
|
||||
|
||||
-# HAVE_GOCR_LIB
|
||||
-# HAVE_LEPTONICA_LIB
|
||||
-# HAVE_TESSERACT_LIB
|
||||
+find_library(GOCR_LIB NAMES Pgm2asc)
|
||||
+if(GOCR_LIB)
|
||||
+ set(HAVE_GOCR_LIB 1)
|
||||
+ set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${GOCR_LIB})
|
||||
+endif(GOCR_LIB)
|
||||
+
|
||||
+pkg_check_modules(LEPTONICA lept)
|
||||
+if(LEPTONICA_FOUND)
|
||||
+ set(HAVE_LEPTONICA_LIB 1)
|
||||
+ include_directories(SYSTEM ${LEPTONICA_INCLUDEDIR})
|
||||
+ set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${LEPTONICA_LDFLAGS})
|
||||
+endif(LEPTONICA_FOUND)
|
||||
+
|
||||
+pkg_check_modules(TESSERACT tesseract)
|
||||
+if(TESSERACT_FOUND)
|
||||
+ set(HAVE_TESSERACT_LIB 1)
|
||||
+ include_directories(SYSTEM ${TESSERACT_INCLUDEDIR})
|
||||
+ set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${TESSERACT_LDFLAGS})
|
||||
+endif(TESSERACT_FOUND)
|
||||
|
||||
# ---- Describe project
|
||||
|
||||
diff --git a/willuslib/CMakeLists.txt b/willuslib/CMakeLists.txt
|
||||
index 463bbc9..8043db5 100644
|
||||
--- a/willuslib/CMakeLists.txt
|
||||
+++ b/willuslib/CMakeLists.txt
|
||||
@@ -6,7 +6,7 @@ include_directories(..)
|
||||
set(WILLUSLIB_SRC
|
||||
ansi.c array.c bmp.c bmpdjvu.c bmpmupdf.c dtcompress.c filelist.c
|
||||
fontdata.c fontrender.c gslpolyfit.c linux.c math.c mem.c ocr.c
|
||||
- ocrjocr.c ocrtess.c pdfwrite.c point2d.c render.c strbuf.c string.c
|
||||
+ ocrgocr.c ocrtess.c pdfwrite.c point2d.c render.c strbuf.c string.c
|
||||
token.c wfile.c wgs.c wgui.c willusversion.c win.c winbmp.c
|
||||
wincomdlg.c winmbox.c winshell.c wmupdf.c wmupdfinfo.c wpdf.c wsys.c
|
||||
wzfile.c wleptonica.c
|
||||
diff --git a/willuslib/ocrgocr.c b/willuslib/ocrgocr.c
|
||||
index 6027e9a..fbe10f0 100644
|
||||
--- a/willuslib/ocrgocr.c
|
||||
+++ b/willuslib/ocrgocr.c
|
||||
@@ -29,6 +29,8 @@
|
||||
#ifdef HAVE_GOCR_LIB
|
||||
#include <gocr.h>
|
||||
|
||||
+job_t *OCR_JOB;
|
||||
+
|
||||
/*
|
||||
** bmp8 must be grayscale
|
||||
** (x1,y1) and (x2,y2) from top left of bitmap
|
||||
@@ -63,6 +65,7 @@ void gocr_single_word_from_bmp8(char *text,int maxlen,WILLUSBITMAP *bmp8,
|
||||
h=y2-y1+1;
|
||||
dh=h+bw*2;
|
||||
job=&_job;
|
||||
+ OCR_JOB=job;
|
||||
job_init(job);
|
||||
job_init_image(job);
|
||||
// willus_mem_alloc_warn((void **)&job->src.p.p,w*h,funcname,10);
|
|
@ -1,254 +0,0 @@
|
|||
From 8c11a20925686855023df90ed477957c7d7fe91e Mon Sep 17 00:00:00 2001
|
||||
From: Daniel Fullmer <danielrf12@gmail.com>
|
||||
Date: Fri, 13 Sep 2019 15:54:21 -0400
|
||||
Subject: [PATCH] Willus mod for k2pdfopt
|
||||
|
||||
---
|
||||
src/allheaders.h | 4 ++
|
||||
src/dewarp2.c | 106 ++++++++++++++++++++++++++++++++++++++++++-----
|
||||
src/leptwin.c | 6 ++-
|
||||
3 files changed, 104 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/src/allheaders.h b/src/allheaders.h
|
||||
index e68eff1..b3cc729 100644
|
||||
--- a/src/allheaders.h
|
||||
+++ b/src/allheaders.h
|
||||
@@ -669,6 +669,10 @@ LEPT_DLL extern L_DEWARPA * dewarpaReadMem ( const l_uint8 *data, size_t size );
|
||||
LEPT_DLL extern l_ok dewarpaWrite ( const char *filename, L_DEWARPA *dewa );
|
||||
LEPT_DLL extern l_ok dewarpaWriteStream ( FILE *fp, L_DEWARPA *dewa );
|
||||
LEPT_DLL extern l_ok dewarpaWriteMem ( l_uint8 **pdata, size_t *psize, L_DEWARPA *dewa );
|
||||
+/* WILLUS MOD */
|
||||
+ LEPT_DLL extern l_int32 dewarpBuildPageModel_ex ( L_DEWARP *dew, const char *debugfile,l_int32 fit_order );
|
||||
+ LEPT_DLL extern l_int32 dewarpFindVertDisparity_ex ( L_DEWARP *dew, PTAA *ptaa, l_int32 rotflag,l_int32 fit_order );
|
||||
+ LEPT_DLL extern l_int32 dewarpBuildLineModel_ex ( L_DEWARP *dew, l_int32 opensize, const char *debugfile,l_int32 fit_order );
|
||||
LEPT_DLL extern l_ok dewarpBuildPageModel ( L_DEWARP *dew, const char *debugfile );
|
||||
LEPT_DLL extern l_ok dewarpFindVertDisparity ( L_DEWARP *dew, PTAA *ptaa, l_int32 rotflag );
|
||||
LEPT_DLL extern l_ok dewarpFindHorizDisparity ( L_DEWARP *dew, PTAA *ptaa );
|
||||
diff --git a/src/dewarp2.c b/src/dewarp2.c
|
||||
index 220eec1..2e29500 100644
|
||||
--- a/src/dewarp2.c
|
||||
+++ b/src/dewarp2.c
|
||||
@@ -144,9 +144,17 @@ static const l_float32 L_ALLOWED_W_FRACT = 0.05; /* no bigger */
|
||||
* longest textlines.
|
||||
* </pre>
|
||||
*/
|
||||
+/* WILLUS MOD */
|
||||
l_ok
|
||||
-dewarpBuildPageModel(L_DEWARP *dew,
|
||||
- const char *debugfile)
|
||||
+dewarpBuildPageModel(L_DEWARP *dew,const char *debugfile)
|
||||
+{
|
||||
+return(dewarpBuildPageModel_ex(dew,debugfile,2));
|
||||
+}
|
||||
+
|
||||
+l_ok
|
||||
+dewarpBuildPageModel_ex(L_DEWARP *dew,
|
||||
+ const char *debugfile,
|
||||
+ l_int32 fit_order)
|
||||
{
|
||||
l_int32 linecount, topline, botline, ret;
|
||||
PIX *pixs, *pix1, *pix2, *pix3;
|
||||
@@ -225,7 +233,7 @@ PTAA *ptaa1, *ptaa2;
|
||||
/* Get the sampled vertical disparity from the textline centers.
|
||||
* The disparity array will push pixels vertically so that each
|
||||
* textline is flat and centered at the y-position of the mid-point. */
|
||||
- if (dewarpFindVertDisparity(dew, ptaa2, 0) != 0) {
|
||||
+ if (dewarpFindVertDisparity_ex(dew, ptaa2, 0, fit_order) != 0) {
|
||||
L_WARNING("vertical disparity not built\n", procName);
|
||||
ptaaDestroy(&ptaa2);
|
||||
return 1;
|
||||
@@ -290,13 +298,24 @@ PTAA *ptaa1, *ptaa2;
|
||||
* a pdf. Non-pix debug output goes to /tmp.
|
||||
* </pre>
|
||||
*/
|
||||
+/* WILLUS MOD */
|
||||
l_ok
|
||||
dewarpFindVertDisparity(L_DEWARP *dew,
|
||||
PTAA *ptaa,
|
||||
l_int32 rotflag)
|
||||
{
|
||||
+return(dewarpFindVertDisparity_ex(dew,ptaa,rotflag,2));
|
||||
+}
|
||||
+/* WILLUS MOD -- add cubic and quartic fits and ..._ex functions */
|
||||
+l_int32
|
||||
+dewarpFindVertDisparity_ex(L_DEWARP *dew,
|
||||
+ PTAA *ptaa,
|
||||
+ l_int32 rotflag,
|
||||
+ l_int32 fit_order)
|
||||
+{
|
||||
l_int32 i, j, nlines, npts, nx, ny, sampling;
|
||||
-l_float32 c0, c1, c2, x, y, midy, val, medval, meddev, minval, maxval;
|
||||
+/* WILLUS MOD */
|
||||
+l_float32 c0, c1, c2, c3, c4, x, y, midy, val, medval, meddev, minval, maxval;
|
||||
l_float32 *famidys;
|
||||
NUMA *nax, *nafit, *nacurve0, *nacurve1, *nacurves;
|
||||
NUMA *namidy, *namidys, *namidysi;
|
||||
@@ -304,11 +323,22 @@ PIX *pix1, *pix2, *pixcirc, *pixdb;
|
||||
PTA *pta, *ptad, *ptacirc;
|
||||
PTAA *ptaa0, *ptaa1, *ptaa2, *ptaa3, *ptaa4, *ptaa5, *ptaat;
|
||||
FPIX *fpix;
|
||||
+/* WILLUS MOD */
|
||||
+l_int32 fit_order1,fit_order2;
|
||||
|
||||
PROCNAME("dewarpFindVertDisparity");
|
||||
|
||||
if (!dew)
|
||||
return ERROR_INT("dew not defined", procName, 1);
|
||||
+/* WILLUS MOD */
|
||||
+ if (fit_order < 10)
|
||||
+ fit_order1 = fit_order2 = fit_order;
|
||||
+ else
|
||||
+ {
|
||||
+ fit_order1=fit_order % 10;
|
||||
+ fit_order2=fit_order / 10;
|
||||
+ fit_order2=fit_order2 % 10;
|
||||
+ }
|
||||
dew->vsuccess = 0;
|
||||
if (!ptaa)
|
||||
return ERROR_INT("ptaa not defined", procName, 1);
|
||||
@@ -331,12 +361,32 @@ FPIX *fpix;
|
||||
pixdb = (rotflag) ? pixRotateOrth(dew->pixs, 1) : pixClone(dew->pixs);
|
||||
for (i = 0; i < nlines; i++) { /* for each line */
|
||||
pta = ptaaGetPta(ptaa, i, L_CLONE);
|
||||
- ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
|
||||
- numaAddNumber(nacurve0, c2);
|
||||
+/* WILLUS MOD */
|
||||
+if (fit_order1>3)
|
||||
+ {
|
||||
+ ptaGetQuarticLSF(pta, &c4, &c3, &c2, &c1, &c0, NULL);
|
||||
+ numaAddNumber(nacurve0, c4);
|
||||
+ }
|
||||
+else if (fit_order1==3)
|
||||
+ {
|
||||
+ ptaGetCubicLSF(pta, &c3, &c2, &c1, &c0, NULL);
|
||||
+ numaAddNumber(nacurve0, c3);
|
||||
+ }
|
||||
+else
|
||||
+ {
|
||||
+ ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
|
||||
+ numaAddNumber(nacurve0, c2);
|
||||
+ }
|
||||
ptad = ptaCreate(nx);
|
||||
for (j = 0; j < nx; j++) { /* uniformly sampled in x */
|
||||
x = j * sampling;
|
||||
- applyQuadraticFit(c2, c1, c0, x, &y);
|
||||
+/* WILLUS MOD */
|
||||
+if (fit_order1>3)
|
||||
+ applyQuarticFit(c4, c3, c2, c1, c0, x, &y);
|
||||
+else if (fit_order1==3)
|
||||
+ applyCubicFit(c3, c2, c1, c0, x, &y);
|
||||
+else
|
||||
+ applyQuadraticFit(c2, c1, c0, x, &y);
|
||||
ptaAddPt(ptad, x, y);
|
||||
}
|
||||
ptaaAddPta(ptaa0, ptad, L_INSERT);
|
||||
@@ -350,7 +400,13 @@ FPIX *fpix;
|
||||
for (i = 0; i < nlines; i++) {
|
||||
pta = ptaaGetPta(ptaa, i, L_CLONE);
|
||||
ptaGetArrays(pta, &nax, NULL);
|
||||
- ptaGetQuadraticLSF(pta, NULL, NULL, NULL, &nafit);
|
||||
+/* WILLUS MOD */
|
||||
+if (fit_order1>3)
|
||||
+ptaGetQuarticLSF(pta, NULL, NULL, NULL, NULL, NULL, &nafit);
|
||||
+else if (fit_order1==3)
|
||||
+ptaGetCubicLSF(pta, NULL, NULL, NULL, NULL, &nafit);
|
||||
+else
|
||||
+ptaGetQuadraticLSF(pta, NULL, NULL, NULL, &nafit);
|
||||
ptad = ptaCreateFromNuma(nax, nafit);
|
||||
ptaaAddPta(ptaat, ptad, L_INSERT);
|
||||
ptaDestroy(&pta);
|
||||
@@ -494,11 +550,24 @@ FPIX *fpix;
|
||||
ptaa5 = ptaaCreate(nx); /* uniformly sampled across full height of image */
|
||||
for (j = 0; j < nx; j++) { /* for each column */
|
||||
pta = ptaaGetPta(ptaa4, j, L_CLONE);
|
||||
- ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
|
||||
+/* WILLUS MOD */
|
||||
+/* Order higher than 2 can cause a little craziness here. */
|
||||
+if (fit_order2>3)
|
||||
+ ptaGetQuarticLSF(pta, &c4, &c3, &c2, &c1, &c0, NULL);
|
||||
+else if (fit_order2==3)
|
||||
+ ptaGetCubicLSF(pta, &c3, &c2, &c1, &c0, NULL);
|
||||
+else
|
||||
+ ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
|
||||
ptad = ptaCreate(ny);
|
||||
for (i = 0; i < ny; i++) { /* uniformly sampled in y */
|
||||
y = i * sampling;
|
||||
- applyQuadraticFit(c2, c1, c0, y, &val);
|
||||
+/* WILLUS MOD */
|
||||
+if (fit_order2>3)
|
||||
+ applyQuarticFit(c4, c3, c2, c1, c0, y, &val);
|
||||
+else if (fit_order2==3)
|
||||
+ applyCubicFit(c3, c2, c1, c0, y, &val);
|
||||
+else
|
||||
+ applyQuadraticFit(c2, c1, c0, y, &val);
|
||||
ptaAddPt(ptad, y, val);
|
||||
}
|
||||
ptaaAddPta(ptaa5, ptad, L_INSERT);
|
||||
@@ -1602,11 +1671,21 @@ FPIX *fpix;
|
||||
* See notes there.
|
||||
* </pre>
|
||||
*/
|
||||
+/* WILLUS MOD */
|
||||
l_ok
|
||||
dewarpBuildLineModel(L_DEWARP *dew,
|
||||
l_int32 opensize,
|
||||
const char *debugfile)
|
||||
{
|
||||
+return(dewarpBuildLineModel_ex(dew,opensize,debugfile,2));
|
||||
+}
|
||||
+
|
||||
+l_int32
|
||||
+dewarpBuildLineModel_ex(L_DEWARP *dew,
|
||||
+ l_int32 opensize,
|
||||
+ const char *debugfile,
|
||||
+ l_int32 fit_order)
|
||||
+{
|
||||
char buf[64];
|
||||
l_int32 i, j, bx, by, ret, nlines;
|
||||
BOXA *boxa;
|
||||
@@ -1695,6 +1774,8 @@ PTAA *ptaa1, *ptaa2;
|
||||
|
||||
/* Remove all lines that are not at least 0.75 times the length
|
||||
* of the longest line. */
|
||||
+/* WILLUS MOD */
|
||||
+/*
|
||||
ptaa2 = dewarpRemoveShortLines(pix, ptaa1, 0.75, DEBUG_SHORT_LINES);
|
||||
if (debugfile) {
|
||||
pix1 = pixConvertTo32(pix);
|
||||
@@ -1704,6 +1785,8 @@ PTAA *ptaa1, *ptaa2;
|
||||
pixDestroy(&pix1);
|
||||
pixDestroy(&pix2);
|
||||
}
|
||||
+*/
|
||||
+ptaa2=ptaa1;
|
||||
ptaaDestroy(&ptaa1);
|
||||
nlines = ptaaGetCount(ptaa2);
|
||||
if (nlines < dew->minlines) {
|
||||
@@ -1717,7 +1800,8 @@ PTAA *ptaa1, *ptaa2;
|
||||
* centers. The disparity array will push pixels vertically
|
||||
* so that each line is flat and centered at the y-position
|
||||
* of the mid-point. */
|
||||
- ret = dewarpFindVertDisparity(dew, ptaa2, 1 - i);
|
||||
+/* WILLUS MOD */
|
||||
+ ret = dewarpFindVertDisparity_ex(dew, ptaa2, 1 - i, fit_order);
|
||||
|
||||
/* If i == 0, move the result to the horizontal disparity,
|
||||
* rotating it back by -90 degrees. */
|
||||
diff --git a/src/leptwin.c b/src/leptwin.c
|
||||
index 72643a0..573d33e 100644
|
||||
--- a/src/leptwin.c
|
||||
+++ b/src/leptwin.c
|
||||
@@ -364,5 +364,9 @@ PIXCMAP *cmap;
|
||||
|
||||
return hBitmap;
|
||||
}
|
||||
-
|
||||
+#else
|
||||
+/* willus mod: Avoid weird issue with OS/X library archiver when there are no symbols */
|
||||
+int leptwin_my_empty_func(void);
|
||||
+int leptwin_my_empty_func(void)
|
||||
+{return(0);}
|
||||
#endif /* _WIN32 */
|
||||
--
|
||||
2.22.0
|
||||
|
File diff suppressed because it is too large
Load Diff
|
@ -1,675 +0,0 @@
|
|||
From 39aa8502eee7bb669a29d1a9b3bfe5c9595ad960 Mon Sep 17 00:00:00 2001
|
||||
From: Daniel Fullmer <danielrf12@gmail.com>
|
||||
Date: Fri, 13 Sep 2019 13:45:05 -0400
|
||||
Subject: [PATCH] Willus mod changes from k2pdfopt
|
||||
|
||||
---
|
||||
src/api/Makefile.am | 1 +
|
||||
src/api/baseapi.cpp | 87 +++++++++++
|
||||
src/api/baseapi.h | 3 +
|
||||
src/api/tesscapi.cpp | 311 +++++++++++++++++++++++++++++++++++++
|
||||
src/api/tesseract.h | 29 ++++
|
||||
src/ccmain/tessedit.cpp | 5 +-
|
||||
src/ccutil/ccutil.h | 7 +
|
||||
src/ccutil/genericvector.h | 21 ++-
|
||||
src/ccutil/mainblk.cpp | 17 +-
|
||||
src/ccutil/params.cpp | 3 +-
|
||||
src/ccutil/serialis.cpp | 3 +
|
||||
src/ccutil/serialis.h | 2 +
|
||||
src/lstm/input.cpp | 3 +
|
||||
13 files changed, 488 insertions(+), 4 deletions(-)
|
||||
create mode 100644 src/api/tesscapi.cpp
|
||||
create mode 100644 src/api/tesseract.h
|
||||
|
||||
diff --git a/src/api/Makefile.am b/src/api/Makefile.am
|
||||
index d9b76eb6..cd2dc30f 100644
|
||||
--- a/src/api/Makefile.am
|
||||
+++ b/src/api/Makefile.am
|
||||
@@ -39,6 +39,7 @@ libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
|
||||
libtesseract_api_la_SOURCES += pdfrenderer.cpp
|
||||
libtesseract_api_la_SOURCES += wordstrboxrenderer.cpp
|
||||
libtesseract_api_la_SOURCES += renderer.cpp
|
||||
+libtesseract_api_la_SOURCES += tesscapi.cpp
|
||||
|
||||
lib_LTLIBRARIES += libtesseract.la
|
||||
libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) $(libarchive_LIBS)
|
||||
diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp
|
||||
index 9245d07c..ea964ee6 100644
|
||||
--- a/src/api/baseapi.cpp
|
||||
+++ b/src/api/baseapi.cpp
|
||||
@@ -215,6 +215,14 @@ TessBaseAPI::TessBaseAPI()
|
||||
// Use the current locale if building debug code.
|
||||
std::locale::global(std::locale(""));
|
||||
#endif
|
||||
+ const char *locale;
|
||||
+ locale = std::setlocale(LC_ALL, nullptr);
|
||||
+/* willus mod Remove assertions--taken care of in tesscapi.cpp */
|
||||
+// ASSERT_HOST(!strcmp(locale, "C"));
|
||||
+ locale = std::setlocale(LC_CTYPE, nullptr);
|
||||
+// ASSERT_HOST(!strcmp(locale, "C"));
|
||||
+ locale = std::setlocale(LC_NUMERIC, nullptr);
|
||||
+// ASSERT_HOST(!strcmp(locale, "C"));
|
||||
}
|
||||
|
||||
TessBaseAPI::~TessBaseAPI() {
|
||||
@@ -1333,6 +1341,85 @@ static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level,
|
||||
text->add_str_int("\t", bottom - top);
|
||||
}
|
||||
|
||||
+/* willus mod */
|
||||
+int TessBaseAPI::GetOCRWords(int **x00,int **y00,int **x11,int **y11,int **ybaseline0,
|
||||
+ char **utf8words)
|
||||
+
|
||||
+ {
|
||||
+ int iword,nwords,totlen,it8;
|
||||
+ int *x0,*y0,*x1,*y1,*ybaseline;
|
||||
+ char *tutf8;
|
||||
+
|
||||
+ ResultIterator *res_it = GetIterator();
|
||||
+ /* Count words */
|
||||
+ iword=0;
|
||||
+ totlen=0;
|
||||
+ while (!res_it->Empty(RIL_BLOCK))
|
||||
+ {
|
||||
+ if (res_it->Empty(RIL_WORD))
|
||||
+ {
|
||||
+ res_it->Next(RIL_WORD);
|
||||
+ continue;
|
||||
+ }
|
||||
+ iword++;
|
||||
+ STRING textstr=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
|
||||
+ totlen+=strlen(textstr.string())+1;
|
||||
+ res_it->Next(RIL_WORD);
|
||||
+ }
|
||||
+ nwords=iword;
|
||||
+/*
|
||||
+printf("\nnwords=%d, totlen=%d\n",nwords,totlen);
|
||||
+*/
|
||||
+ x0=(*x00)=(int *)malloc(sizeof(int)*5*nwords);
|
||||
+ y0=(*y00)=&x0[nwords];
|
||||
+ x1=(*x11)=&y0[nwords];
|
||||
+ y1=(*y11)=&x1[nwords];
|
||||
+ ybaseline=(*ybaseline0)=&y1[nwords];
|
||||
+ tutf8=(*utf8words)=(char *)malloc(totlen);
|
||||
+ iword=0;
|
||||
+ it8=0;
|
||||
+ res_it->Begin();
|
||||
+ while (!res_it->Empty(RIL_BLOCK))
|
||||
+ {
|
||||
+ if (res_it->Empty(RIL_WORD))
|
||||
+ {
|
||||
+ res_it->Next(RIL_WORD);
|
||||
+ continue;
|
||||
+ }
|
||||
+ STRING textstr=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
|
||||
+ strcpy(&tutf8[it8],textstr.string());
|
||||
+ it8 += strlen(&tutf8[it8])+1;
|
||||
+ /*
|
||||
+ STRING textstr("");
|
||||
+ textstr += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
|
||||
+ */
|
||||
+/*
|
||||
+printf("Word %d: '%s'\n",iword,textstr.string());
|
||||
+*/
|
||||
+ int left, top, right, bottom;
|
||||
+ int u1,v1,u2,v2;
|
||||
+ res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
|
||||
+ res_it->Baseline(RIL_WORD, &u1, &v1, &u2, &v2);
|
||||
+ x0[iword]=left;
|
||||
+ x1[iword]=right;
|
||||
+ y0[iword]=top;
|
||||
+ y1[iword]=bottom;
|
||||
+ ybaseline[iword]=(v1+v2)/2;
|
||||
+ iword++;
|
||||
+/*
|
||||
+printf("BB: (%d,%d)-(%d,%d) BL: (%d,%d)-(%d,%d)\n",left,bottom,right,top,x1,y1,x2,y2);
|
||||
+*/
|
||||
+ res_it->Next(RIL_WORD);
|
||||
+ }
|
||||
+/*
|
||||
+printf("iword=%d\n",iword);
|
||||
+*/
|
||||
+ return(iword);
|
||||
+ }
|
||||
+
|
||||
+/* willus mod */
|
||||
+int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words);
|
||||
+
|
||||
/**
|
||||
* Make a TSV-formatted string from the internal data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
diff --git a/src/api/baseapi.h b/src/api/baseapi.h
|
||||
index 3724dd92..23be5920 100644
|
||||
--- a/src/api/baseapi.h
|
||||
+++ b/src/api/baseapi.h
|
||||
@@ -575,6 +575,9 @@ class TESS_API TessBaseAPI {
|
||||
*/
|
||||
char* GetHOCRText(ETEXT_DESC* monitor, int page_number);
|
||||
|
||||
+/* willus mod */
|
||||
+int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words);
|
||||
+
|
||||
/**
|
||||
* Make a HTML-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
diff --git a/src/api/tesscapi.cpp b/src/api/tesscapi.cpp
|
||||
new file mode 100644
|
||||
index 00000000..1752fafe
|
||||
--- /dev/null
|
||||
+++ b/src/api/tesscapi.cpp
|
||||
@@ -0,0 +1,311 @@
|
||||
+/*
|
||||
+** tesscapi.cpp willus.com attempt at C wrapper for tesseract.
|
||||
+** (Butchered from tesseractmain.cpp)
|
||||
+** Last updated 9-1-12
|
||||
+**
|
||||
+** Copyright (C) 2012 http://willus.com
|
||||
+**
|
||||
+** This program is free software: you can redistribute it and/or modify
|
||||
+** it under the terms of the GNU Affero General Public License as
|
||||
+** published by the Free Software Foundation, either version 3 of the
|
||||
+** License, or (at your option) any later version.
|
||||
+**
|
||||
+** This program is distributed in the hope that it will be useful,
|
||||
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+** GNU Affero General Public License for more details.
|
||||
+**
|
||||
+** You should have received a copy of the GNU Affero General Public License
|
||||
+** along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
+**
|
||||
+*/
|
||||
+
|
||||
+/*
|
||||
+#include "mfcpch.h"
|
||||
+*/
|
||||
+// #define USE_VLD //Uncomment for Visual Leak Detector.
|
||||
+#if (defined _MSC_VER && defined USE_VLD)
|
||||
+#include <vld.h>
|
||||
+#endif
|
||||
+
|
||||
+// Include automatically generated configuration file if running autoconf
|
||||
+#ifdef HAVE_CONFIG_H
|
||||
+#include "config_auto.h"
|
||||
+#endif
|
||||
+#include <locale.h>
|
||||
+#ifdef USING_GETTEXT
|
||||
+#include <libintl.h>
|
||||
+#define _(x) gettext(x)
|
||||
+#else
|
||||
+#define _(x) (x)
|
||||
+#endif
|
||||
+
|
||||
+#include "allheaders.h"
|
||||
+#include "baseapi.h"
|
||||
+#include "strngs.h"
|
||||
+#include "params.h"
|
||||
+#include "blobs.h"
|
||||
+#include "simddetect.h"
|
||||
+#include "tesseractclass.h"
|
||||
+/*
|
||||
+#include "notdll.h"
|
||||
+*/
|
||||
+
|
||||
+/* C Wrappers */
|
||||
+#include "tesseract.h"
|
||||
+
|
||||
+// static tesseract::TessBaseAPI api[4];
|
||||
+
|
||||
+/*
|
||||
+** ocr_type=0: OEM_DEFAULT
|
||||
+** ocr_type=1: OEM_TESSERACT_ONLY
|
||||
+** ocr_type=2: OEM_LSTM_ONLY
|
||||
+** ocr_type=3: OEM_TESSERACT_LSTM_COMBINED
|
||||
+*/
|
||||
+void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
|
||||
+ char *initstr,int maxlen,int *status)
|
||||
+
|
||||
+ {
|
||||
+ char original_locale[256];
|
||||
+ tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI;
|
||||
+/*
|
||||
+printf("@tess_capi_init\n");
|
||||
+printf(" datapath='%s'\n",datapath);
|
||||
+printf(" language='%s'\n",language);
|
||||
+printf(" ocr_type=%d\n",ocr_type);
|
||||
+*/
|
||||
+#ifdef USE_NLS
|
||||
+ setlocale (LC_ALL, "");
|
||||
+ bindtextdomain (PACKAGE, LOCALEDIR);
|
||||
+ textdomain (PACKAGE);
|
||||
+#endif
|
||||
+ /* willus mod, 11-24-16 */
|
||||
+ /* Tesseract needs "C" locale to correctly parse all data .traineddata files. */
|
||||
+/*
|
||||
+printf("locale='%s'\n",setlocale(LC_ALL,NULL));
|
||||
+printf("ctype='%s'\n",setlocale(LC_CTYPE,NULL));
|
||||
+printf("numeric='%s'\n",setlocale(LC_NUMERIC,NULL));
|
||||
+*/
|
||||
+ strncpy(original_locale,setlocale(LC_ALL,NULL),255);
|
||||
+ original_locale[255]='\0';
|
||||
+/*
|
||||
+printf("original_locale='%s'\n",original_locale);
|
||||
+*/
|
||||
+ setlocale(LC_ALL,"C");
|
||||
+/*
|
||||
+printf("new locale='%s'\n",setlocale(LC_ALL,NULL));
|
||||
+printf("new ctype='%s'\n",setlocale(LC_CTYPE,NULL));
|
||||
+printf("new numeric='%s'\n",setlocale(LC_NUMERIC,NULL));
|
||||
+*/
|
||||
+ // fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version());
|
||||
+ // Make the order of args a bit more forgiving than it used to be.
|
||||
+ const char* lang = "eng";
|
||||
+ tesseract::PageSegMode pagesegmode = tesseract::PSM_SINGLE_BLOCK;
|
||||
+ if (language!=NULL && language[0]!='\0')
|
||||
+ lang = language;
|
||||
+ /*
|
||||
+ if (output == NULL)
|
||||
+ {
|
||||
+ fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] "
|
||||
+ "[-psm pagesegmode] [configfile...]\n"), argv[0]);
|
||||
+ fprintf(stderr,
|
||||
+ _("pagesegmode values are:\n"
|
||||
+ "0 = Orientation and script detection (OSD) only.\n"
|
||||
+ "1 = Automatic page segmentation with OSD.\n"
|
||||
+ "2 = Automatic page segmentation, but no OSD, or OCR\n"
|
||||
+ "3 = Fully automatic page segmentation, but no OSD. (Default)\n"
|
||||
+ "4 = Assume a single column of text of variable sizes.\n"
|
||||
+ "5 = Assume a single uniform block of vertically aligned text.\n"
|
||||
+ "6 = Assume a single uniform block of text.\n"
|
||||
+ "7 = Treat the image as a single text line.\n"
|
||||
+ "8 = Treat the image as a single word.\n"
|
||||
+ "9 = Treat the image as a single word in a circle.\n"
|
||||
+ "10 = Treat the image as a single character.\n"));
|
||||
+ fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any"
|
||||
+ "configfile.\n"));
|
||||
+ exit(1);
|
||||
+ }
|
||||
+ */
|
||||
+/*
|
||||
+printf("SSE = %s\n",SIMDDetect::IsSSEAvailable() ? "AVAILABLE" : "NOT AVAILABLE");
|
||||
+printf("AVX = %s\n",SIMDDetect::IsAVXAvailable() ? "AVAILABLE" : "NOT AVAILABLE");
|
||||
+*/
|
||||
+/*
|
||||
+v4.00 loads either TESSERACT engine, LSTM engine, or both. No CUBE.
|
||||
+*/
|
||||
+ ocr_type=0; /* Ignore specified and use default */
|
||||
+ api->SetOutputName(NULL);
|
||||
+ (*status)=api->Init(datapath,lang,
|
||||
+ ocr_type==0 ? tesseract::OEM_DEFAULT :
|
||||
+ (ocr_type==1 ? tesseract::OEM_TESSERACT_ONLY :
|
||||
+ (ocr_type==2 ? tesseract::OEM_LSTM_ONLY :
|
||||
+ (tesseract::OEM_TESSERACT_LSTM_COMBINED))));
|
||||
+ if ((*status)!=0)
|
||||
+ {
|
||||
+ /* willus mod, 11-24-16 */
|
||||
+ setlocale(LC_ALL,original_locale);
|
||||
+ api->End();
|
||||
+ delete api;
|
||||
+ return(NULL);
|
||||
+ }
|
||||
+ /*
|
||||
+ api.Init("tesscapi",lang,tesseract::OEM_DEFAULT,
|
||||
+ &(argv[arg]), argc - arg, NULL, NULL, false);
|
||||
+ */
|
||||
+ // We have 2 possible sources of pagesegmode: a config file and
|
||||
+ // the command line. For backwards compatibility reasons, the
|
||||
+ // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
|
||||
+ // default for this program is tesseract::PSM_AUTO. We will let
|
||||
+ // the config file take priority, so the command-line default
|
||||
+ // can take priority over the tesseract default, so we use the
|
||||
+ // value from the command line only if the retrieved mode
|
||||
+ // is still tesseract::PSM_SINGLE_BLOCK, indicating no change
|
||||
+ // in any config file. Therefore the only way to force
|
||||
+ // tesseract::PSM_SINGLE_BLOCK is from the command line.
|
||||
+ // It would be simpler if we could set the value before Init,
|
||||
+ // but that doesn't work.
|
||||
+ if (api->GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK)
|
||||
+ api->SetPageSegMode(pagesegmode);
|
||||
+
|
||||
+ /*
|
||||
+ ** Initialization message
|
||||
+ */
|
||||
+ {
|
||||
+ char istr[1024];
|
||||
+ int sse,avx;
|
||||
+
|
||||
+// printf("tessedit_ocr_engine_mode = %d\n",tessedit_ocr_engine_mode);
|
||||
+ sprintf(istr,"%s",api->Version());
|
||||
+ sse=tesseract::SIMDDetect::IsSSEAvailable();
|
||||
+ avx=tesseract::SIMDDetect::IsAVXAvailable();
|
||||
+ if (sse || avx)
|
||||
+ sprintf(&istr[strlen(istr)]," [%s]",sse&&avx?"SSE+AVX":(sse?"SSE":"AVX"));
|
||||
+ sprintf(&istr[strlen(istr)],"\n Tesseract data folder = '%s'",datapath==NULL?getenv("TESSDATA_PREFIX"):datapath);
|
||||
+ strcat(istr,"\n Tesseract languages: ");
|
||||
+ GenericVector<STRING> languages;
|
||||
+ api->GetLoadedLanguagesAsVector(&languages);
|
||||
+/*
|
||||
+printf("OEM=%d\n",api->oem());
|
||||
+printf("Langs='%s'\n",api->GetInitLanguagesAsString());
|
||||
+printf("AnyTessLang()=%d\n",(int)api->tesseract()->AnyTessLang());
|
||||
+printf("AnyLSTMLang()=%d\n",(int)api->tesseract()->AnyLSTMLang());
|
||||
+printf("num_sub_langs()=%d\n",api->tesseract()->num_sub_langs());
|
||||
+printf("languages.size()=%d\n",(int)languages.size());
|
||||
+*/
|
||||
+
|
||||
+ for (int i=0;i<=api->tesseract()->num_sub_langs();i++)
|
||||
+ {
|
||||
+ tesseract::Tesseract *lang1;
|
||||
+ int eng;
|
||||
+ lang1 = i==0 ? api->tesseract() : api->tesseract()->get_sub_lang(i-1);
|
||||
+ eng=(int)lang1->tessedit_ocr_engine_mode;
|
||||
+ sprintf(&istr[strlen(istr)],"%s%s [%s]",i==0?"":", ",lang1->lang.string(),
|
||||
+ eng==2?"LSTM+Tess":(eng==1?"LSTM":"Tess"));
|
||||
+ }
|
||||
+/*
|
||||
+printf("%d. '%s'\n",i+1,languages[i].string());
|
||||
+printf(" sublang[%d].oem_engine = %d\n",i+1,(int)api->tesseract()->get_sub_lang(i)->tessedit_ocr_engine_mode);
|
||||
+*/
|
||||
+
|
||||
+ /*
|
||||
+ if (ocr_type==0 || ocr_type==3)
|
||||
+ sprintf(&istr[strlen(istr)],"[LSTM+] (lang=");
|
||||
+ else if (ocr_type==2)
|
||||
+ sprintf(&istr[strlen(istr)],"[LSTM] (lang=");
|
||||
+ strncpy(&istr[strlen(istr)],language,253-strlen(istr));
|
||||
+ istr[253]='\0';
|
||||
+ strcat(istr,")");
|
||||
+ */
|
||||
+ if (out!=NULL)
|
||||
+ fprintf(out,"%s\n",istr);
|
||||
+ if (initstr!=NULL)
|
||||
+ {
|
||||
+ strncpy(initstr,istr,maxlen-1);
|
||||
+ initstr[maxlen-1]='\0';
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+
|
||||
+ /* Turn off LSTM debugging output */
|
||||
+ api->SetVariable("lstm_debug_level","0");
|
||||
+#if (WILLUSDEBUG & 1)
|
||||
+ api->SetVariable("lstm_debug_level","9");
|
||||
+ api->SetVariable("paragraph_debug_level","9");
|
||||
+ api->SetVariable("tessdata_manager_debug_level","9");
|
||||
+ api->SetVariable("tosp_debug_level","9");
|
||||
+ api->SetVariable("wordrec_debug_level","9");
|
||||
+ api->SetVariable("segsearch_debug_level","9");
|
||||
+#endif
|
||||
+ /* willus mod, 11-24-16 */
|
||||
+ setlocale(LC_ALL,original_locale);
|
||||
+ return((void *)api);
|
||||
+ }
|
||||
+
|
||||
+
|
||||
+int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out)
|
||||
+
|
||||
+ {
|
||||
+ tesseract::TessBaseAPI *api;
|
||||
+ static int old_segmode=-1;
|
||||
+
|
||||
+ api=(tesseract::TessBaseAPI *)vapi;
|
||||
+ if (old_segmode != segmode)
|
||||
+ {
|
||||
+ old_segmode=segmode;
|
||||
+ api->SetPageSegMode((tesseract::PageSegMode)segmode);
|
||||
+ }
|
||||
+ if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL))
|
||||
+ {
|
||||
+ /* pixDestroy(&pix); */
|
||||
+ if (out!=NULL)
|
||||
+ fprintf(out,"tesscapi: Error during bitmap processing.\n");
|
||||
+ api->Clear();
|
||||
+ return(-1);
|
||||
+ }
|
||||
+ strncpy(outstr,api->GetUTF8Text(),maxlen-1);
|
||||
+ outstr[maxlen-1]='\0';
|
||||
+ api->Clear();
|
||||
+ return(0);
|
||||
+ }
|
||||
+
|
||||
+
|
||||
+int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
|
||||
+ int **left,int **top,int **right,int **bottom,
|
||||
+ int **ybase,char **text,int *nw,
|
||||
+ FILE *out)
|
||||
+
|
||||
+ {
|
||||
+ tesseract::TessBaseAPI *api;
|
||||
+ static int old_segmode=-1;
|
||||
+
|
||||
+ api=(tesseract::TessBaseAPI *)vapi;
|
||||
+ if (old_segmode != segmode)
|
||||
+ {
|
||||
+ old_segmode=segmode;
|
||||
+ api->SetPageSegMode((tesseract::PageSegMode)segmode);
|
||||
+ }
|
||||
+ if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL))
|
||||
+ {
|
||||
+ if (out!=NULL)
|
||||
+ fprintf(out,"tesscapi: Error during bitmap processing.\n");
|
||||
+ api->Clear();
|
||||
+ (*nw)=0;
|
||||
+ return(-1);
|
||||
+ }
|
||||
+ (*nw)=api->GetOCRWords(left,top,right,bottom,ybase,text);
|
||||
+ api->Clear();
|
||||
+ return(0);
|
||||
+ }
|
||||
+
|
||||
+
|
||||
+void tess_capi_end(void *vapi)
|
||||
+
|
||||
+ {
|
||||
+ tesseract::TessBaseAPI *api;
|
||||
+
|
||||
+ if (vapi==NULL)
|
||||
+ return;
|
||||
+ api=(tesseract::TessBaseAPI *)vapi;
|
||||
+ api->End();
|
||||
+ delete api;
|
||||
+ }
|
||||
diff --git a/src/api/tesseract.h b/src/api/tesseract.h
|
||||
new file mode 100644
|
||||
index 00000000..575948cc
|
||||
--- /dev/null
|
||||
+++ b/src/api/tesseract.h
|
||||
@@ -0,0 +1,29 @@
|
||||
+/*
|
||||
+** Willus.com's Tesseract C Wrappers
|
||||
+**
|
||||
+** 6-8-12
|
||||
+**
|
||||
+*/
|
||||
+
|
||||
+#ifndef _TESSERACT_H_
|
||||
+#define _TESSERACT_H_
|
||||
+
|
||||
+//#include <leptonica.h>
|
||||
+#ifdef __cplusplus
|
||||
+extern "C" {
|
||||
+#endif
|
||||
+
|
||||
+void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
|
||||
+ char *initstr,int maxlen,int *status);
|
||||
+int tess_capi_get_ocr(void *api,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out);
|
||||
+int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
|
||||
+ int **left,int **top,int **right,int **bottom,
|
||||
+ int **ybase,char **text,int *nw,
|
||||
+ FILE *out);
|
||||
+void tess_capi_end(void *api);
|
||||
+
|
||||
+#ifdef __cplusplus
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+#endif
|
||||
diff --git a/src/ccmain/tessedit.cpp b/src/ccmain/tessedit.cpp
|
||||
index 17f0951b..7af94ee2 100644
|
||||
--- a/src/ccmain/tessedit.cpp
|
||||
+++ b/src/ccmain/tessedit.cpp
|
||||
@@ -101,6 +101,10 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
" to your \"tessdata\" directory.\n");
|
||||
return false;
|
||||
}
|
||||
+ /* willus mod */
|
||||
+ TFile fp;
|
||||
+ strncpy(fp.tfile_filename,tessdata_path.string(),511);
|
||||
+ fp.tfile_filename[511]='\0';
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
if (oem == OEM_DEFAULT) {
|
||||
// Set the engine mode from availability, which can then be overridden by
|
||||
@@ -116,7 +120,6 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
// If a language specific config file (lang.config) exists, load it in.
|
||||
- TFile fp;
|
||||
if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
|
||||
ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp,
|
||||
this->params());
|
||||
diff --git a/src/ccutil/ccutil.h b/src/ccutil/ccutil.h
|
||||
index 71e89c60..bdeccc14 100644
|
||||
--- a/src/ccutil/ccutil.h
|
||||
+++ b/src/ccutil/ccutil.h
|
||||
@@ -80,6 +80,13 @@ class CCUtil {
|
||||
// Member parameters.
|
||||
// These have to be declared and initialized after params_ member, since
|
||||
// params_ should be initialized before parameters are added to it.
|
||||
+/* willus mod */
|
||||
+/*
|
||||
+ #ifdef _WIN32
|
||||
+ STRING_VAR_H(tessedit_module_name, WINDLLNAME,
|
||||
+ "Module colocated with tessdata dir");
|
||||
+ #endif
|
||||
+*/
|
||||
INT_VAR_H(ambigs_debug_level, 0, "Debug level for unichar ambiguities");
|
||||
BOOL_VAR_H(use_definite_ambigs_for_classifier, false,
|
||||
"Use definite ambiguities when running character classifier");
|
||||
diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h
|
||||
index 3556d153..3a5e8662 100644
|
||||
--- a/src/ccutil/genericvector.h
|
||||
+++ b/src/ccutil/genericvector.h
|
||||
@@ -382,7 +382,26 @@ inline bool LoadDataFromFile(const char* filename, GenericVector<char>* data) {
|
||||
// reserve an extra byte in case caller wants to append a '\0' character
|
||||
data->reserve(size + 1);
|
||||
data->resize_no_init(size);
|
||||
- result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
|
||||
+ /* willus mod Dec 2018--weird issue with Win XP and MinGW gcc 7.3.0 */
|
||||
+ /* Can't read entire file at once -- need to break up into smaller blocksize reads */
|
||||
+ {
|
||||
+ int frs,n;
|
||||
+ int blocksize;
|
||||
+ blocksize=1024*1024;
|
||||
+ for (n=0;1;)
|
||||
+ {
|
||||
+ int bs;
|
||||
+ bs= size-n > blocksize ? blocksize : size-n;
|
||||
+ frs=(int)fread(&(*data)[n],1,bs,fp);
|
||||
+ n+=frs;
|
||||
+ if (frs<bs || bs<blocksize || n>=size)
|
||||
+ break;
|
||||
+ }
|
||||
+ result = static_cast<long>((long)n==size);
|
||||
+ }
|
||||
+ /*
|
||||
+ result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
|
||||
+ */
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
diff --git a/src/ccutil/mainblk.cpp b/src/ccutil/mainblk.cpp
|
||||
index 52b04b04..80b26044 100644
|
||||
--- a/src/ccutil/mainblk.cpp
|
||||
+++ b/src/ccutil/mainblk.cpp
|
||||
@@ -55,8 +55,22 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {
|
||||
#if defined(_WIN32)
|
||||
} else if (datadir == nullptr || _access(datadir.string(), 0) != 0) {
|
||||
/* Look for tessdata in directory of executable. */
|
||||
+ /*
|
||||
+ char drive[_MAX_DRIVE];
|
||||
+ char dir[_MAX_DIR];
|
||||
+ */
|
||||
char path[_MAX_PATH];
|
||||
- DWORD length = GetModuleFileName(nullptr, path, sizeof(path));
|
||||
+ int i;
|
||||
+ /* DWORD length = */ GetModuleFileName(nullptr, path, sizeof(path));
|
||||
+ /* willus mod--avoid _splitpath_s -- not in XP */
|
||||
+ for (i=strlen(path)-1;i>=0 && path[i]!='/' && path[i]!='\\';i--);
|
||||
+ if (i>=0)
|
||||
+ {
|
||||
+ path[i]='\0';
|
||||
+ datadir=path;
|
||||
+ datadir += "/tessdata";
|
||||
+ }
|
||||
+ /*
|
||||
if (length > 0 && length < sizeof(path)) {
|
||||
char* separator = std::strrchr(path, '\\');
|
||||
if (separator != nullptr) {
|
||||
@@ -65,6 +79,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {
|
||||
datadir += "/tessdata";
|
||||
}
|
||||
}
|
||||
+ */
|
||||
#endif /* _WIN32 */
|
||||
#if defined(TESSDATA_PREFIX)
|
||||
} else {
|
||||
diff --git a/src/ccutil/params.cpp b/src/ccutil/params.cpp
|
||||
index 00bf2563..486c5ce0 100644
|
||||
--- a/src/ccutil/params.cpp
|
||||
+++ b/src/ccutil/params.cpp
|
||||
@@ -82,7 +82,8 @@ bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,
|
||||
|
||||
if (!foundit) {
|
||||
anyerr = true; // had an error
|
||||
- tprintf("Warning: Parameter not found: %s\n", line);
|
||||
+ /* willus mod */
|
||||
+ tprintf("Tesseract warning: Parameter %s not found in file %s.\n",line,fp->tfile_filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
diff --git a/src/ccutil/serialis.cpp b/src/ccutil/serialis.cpp
|
||||
index 7def011f..6107a494 100644
|
||||
--- a/src/ccutil/serialis.cpp
|
||||
+++ b/src/ccutil/serialis.cpp
|
||||
@@ -201,6 +201,9 @@ bool TFile::Open(const STRING& filename, FileReader reader) {
|
||||
offset_ = 0;
|
||||
is_writing_ = false;
|
||||
swap_ = false;
|
||||
+ /* willus mod */
|
||||
+ strncpy(tfile_filename,filename.string(),511);
|
||||
+ tfile_filename[511]='\0';
|
||||
if (reader == nullptr)
|
||||
return LoadDataFromFile(filename, data_);
|
||||
else
|
||||
diff --git a/src/ccutil/serialis.h b/src/ccutil/serialis.h
|
||||
index 095b9227..4cc8251e 100644
|
||||
--- a/src/ccutil/serialis.h
|
||||
+++ b/src/ccutil/serialis.h
|
||||
@@ -77,6 +77,8 @@ class TFile {
|
||||
public:
|
||||
TFile();
|
||||
~TFile();
|
||||
+ /* willus mod */
|
||||
+ char tfile_filename[512];
|
||||
|
||||
// All the Open methods load the whole file into memory for reading.
|
||||
// Opens a file with a supplied reader, or nullptr to use the default.
|
||||
diff --git a/src/lstm/input.cpp b/src/lstm/input.cpp
|
||||
index 73b584b3..0b0b54c3 100644
|
||||
--- a/src/lstm/input.cpp
|
||||
+++ b/src/lstm/input.cpp
|
||||
@@ -93,8 +93,11 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data,
|
||||
return nullptr;
|
||||
}
|
||||
if (width < min_width || height < min_width) {
|
||||
+ /* willus mod -- no warning */
|
||||
+ /*
|
||||
tprintf("Image too small to scale!! (%dx%d vs min width of %d)\n", width,
|
||||
height, min_width);
|
||||
+ */
|
||||
pixDestroy(&pix);
|
||||
return nullptr;
|
||||
}
|
||||
--
|
||||
2.22.0
|
||||
|
Loading…
Reference in New Issue