diff --git a/pkgs/applications/misc/k2pdfopt/default.nix b/pkgs/applications/misc/k2pdfopt/default.nix
index 9391fe88c5e..58bd200e713 100644
--- a/pkgs/applications/misc/k2pdfopt/default.nix
+++ b/pkgs/applications/misc/k2pdfopt/default.nix
@@ -36,67 +36,19 @@ stdenv.mkDerivation rec {
buildInputs =
let
+ # The patches below were constructed by taking the files from k2pdfopt in
+ # the {mupdf,leptonica,tesseract}_mod/ directories, replacing the
+ # corresponding files in the respective source trees, resolving any errors
+ # with more recent versions of these dependencies, and running diff.
mupdf_modded = mupdf.overrideAttrs (attrs: {
- # Excluded the pdf-*.c files, since they mostly just broke the #includes
- prePatch = ''
- cp ${src}/mupdf_mod/{font,stext-device,string}.c source/fitz/
- cp ${src}/mupdf_mod/font-win32.c source/pdf/
- '';
+ patches = attrs.patches ++ [ ./mupdf.patch ]; # Last verified with mupdf 1.14.0
});
-
leptonica_modded = leptonica.overrideAttrs (attrs: {
- name = "leptonica-1.74.4";
- # Modified source files apply to this particular version of leptonica
- version = "1.74.4";
-
- src = fetchurl {
- url = "http://www.leptonica.org/source/leptonica-1.74.4.tar.gz";
- sha256 = "0fw39amgyv8v6nc7x8a4c7i37dm04i6c5zn62d24bgqnlhk59hr9";
- };
-
- prePatch = ''
- cp ${src}/leptonica_mod/{allheaders.h,dewarp2.c,leptwin.c} src/
- '';
- patches = [
- # stripped down copy of upstream commit b88c821f8d347bce0aea86d606c710303919f3d2
- ./leptonica-CVE-2018-3836.patch
- (fetchpatch {
- # CVE-2018-7186
- url = "https://github.com/DanBloomberg/leptonica/commit/"
- + "ee301cb2029db8a6289c5295daa42bba7715e99a.patch";
- sha256 = "0cgb7mvz2px1rg5i80wk1wxxjvzjga617d8q6j7qygkp7jm6495d";
- })
- (fetchpatch {
- # CVE-2018-7247
- url = "https://github.com/DanBloomberg/leptonica/commit/"
- + "c1079bb8e77cdd426759e466729917ca37a3ed9f.patch";
- sha256 = "1z4iac5gwqggh7aa8cvyp6nl9fwd1v7wif26caxc9y5qr3jj34qf";
- })
- (fetchpatch {
- # CVE-2018-7440
- url = "https://github.com/DanBloomberg/leptonica/commit/"
- + "49ecb6c2dfd6ed5078c62f4a8eeff03e3beced3b.patch";
- sha256 = "1hjmva98iaw9xj7prg7aimykyayikcwnk4hk0380007hqb35lqmy";
- })
- ];
+ patches = [ ./leptonica.patch ]; # Last verified with leptonica 1.78.0
});
tesseract_modded = tesseract4.override {
tesseractBase = tesseract4.tesseractBase.overrideAttrs (_: {
- prePatch = ''
- cp ${src}/tesseract_mod/baseapi.{h,cpp} src/api/
- cp ${src}/tesseract_mod/ccutil.{h,cpp} src/ccutil/
- cp ${src}/tesseract_mod/genericvector.h src/ccutil/
- cp ${src}/tesseract_mod/input.cpp src/lstm/
- cp ${src}/tesseract_mod/lstmrecognizer.cpp src/lstm/
- cp ${src}/tesseract_mod/mainblk.cpp src/ccutil/
- cp ${src}/tesseract_mod/params.cpp src/ccutil/
- cp ${src}/tesseract_mod/serialis.{h,cpp} src/ccutil/
- cp ${src}/tesseract_mod/tesscapi.cpp src/api/
- cp ${src}/tesseract_mod/tessdatamanager.cpp src/ccstruct/
- cp ${src}/tesseract_mod/tessedit.cpp src/ccmain/
- cp ${src}/include_mod/{tesseract.h,leptonica.h} src/api/
- '';
- patches = [ ./tesseract.patch ];
+ patches = [ ./tesseract.patch ]; # Last verified with tesseract 4.1
});
};
in
diff --git a/pkgs/applications/misc/k2pdfopt/leptonica-CVE-2018-3836.patch b/pkgs/applications/misc/k2pdfopt/leptonica-CVE-2018-3836.patch
deleted file mode 100644
index f1b4170fbaa..00000000000
--- a/pkgs/applications/misc/k2pdfopt/leptonica-CVE-2018-3836.patch
+++ /dev/null
@@ -1,95 +0,0 @@
---- a/src/allheaders.h
-+++ b/src/allheaders.h
-@@ -2600,6 +2600,7 @@
- LEPT_DLL extern char * stringReverse ( const char *src );
- LEPT_DLL extern char * strtokSafe ( char *cstr, const char *seps, char **psaveptr );
- LEPT_DLL extern l_int32 stringSplitOnToken ( char *cstr, const char *seps, char **phead, char **ptail );
-+LEPT_DLL extern l_int32 stringCheckForChars ( const char *src, const char *chars, l_int32 *pfound );
- LEPT_DLL extern char * stringRemoveChars ( const char *src, const char *remchars );
- LEPT_DLL extern l_int32 stringFindSubstr ( const char *src, const char *sub, l_int32 *ploc );
- LEPT_DLL extern char * stringReplaceSubstr ( const char *src, const char *sub1, const char *sub2, l_int32 *pfound, l_int32 *ploc );
---- a/src/gplot.c
-+++ b/src/gplot.c
-@@ -141,9 +141,10 @@
- const char *xlabel,
- const char *ylabel)
- {
--char *newroot;
--char buf[L_BUF_SIZE];
--GPLOT *gplot;
-+char *newroot;
-+char buf[L_BUF_SIZE];
-+l_int32 badchar;
-+GPLOT *gplot;
-
- PROCNAME("gplotCreate");
-
-@@ -152,6 +153,9 @@
- if (outformat != GPLOT_PNG && outformat != GPLOT_PS &&
- outformat != GPLOT_EPS && outformat != GPLOT_LATEX)
- return (GPLOT *)ERROR_PTR("outformat invalid", procName, NULL);
-+ stringCheckForChars(rootname, "`;&|><\"?*", &badchar);
-+ if (badchar) /* danger of command injection */
-+ return (GPLOT *)ERROR_PTR("invalid rootname", procName, NULL);
-
- if ((gplot = (GPLOT *)LEPT_CALLOC(1, sizeof(GPLOT))) == NULL)
- return (GPLOT *)ERROR_PTR("gplot not made", procName, NULL);
---- a/src/utils2.c
-+++ b/src/utils2.c
-@@ -42,6 +42,7 @@
- * l_int32 stringSplitOnToken()
- *
- * Find and replace string and array procs
-+ * l_int32 stringCheckForChars()
- * char *stringRemoveChars()
- * l_int32 stringFindSubstr()
- * char *stringReplaceSubstr()
-@@ -701,6 +702,48 @@
- /*--------------------------------------------------------------------*
- * Find and replace procs *
- *--------------------------------------------------------------------*/
-+/*!
-+ * \brief stringCheckForChars()
-+ *
-+ * \param[in] src input string; can be of zero length
-+ * \param[in] chars string of chars to be searched for in %src
-+ * \param[out] pfound 1 if any characters are found; 0 otherwise
-+ * \return 0 if OK, 1 on error
-+ *
-+ *
-+ * Notes:
-+ * (1) This can be used to sanitize an operation by checking for
-+ * special characters that don't belong in a string.
-+ *
-+ */
-+l_int32
-+stringCheckForChars(const char *src,
-+ const char *chars,
-+ l_int32 *pfound)
-+{
-+char ch;
-+l_int32 i, n;
-+
-+ PROCNAME("stringCheckForChars");
-+
-+ if (!pfound)
-+ return ERROR_INT("&found not defined", procName, 1);
-+ *pfound = FALSE;
-+ if (!src || !chars)
-+ return ERROR_INT("src and chars not both defined", procName, 1);
-+
-+ n = strlen(src);
-+ for (i = 0; i < n; i++) {
-+ ch = src[i];
-+ if (strchr(chars, ch)) {
-+ *pfound = TRUE;
-+ break;
-+ }
-+ }
-+ return 0;
-+}
-+
-+
- /*!
- * \brief stringRemoveChars()
- *
diff --git a/pkgs/applications/misc/k2pdfopt/leptonica.patch b/pkgs/applications/misc/k2pdfopt/leptonica.patch
new file mode 100644
index 00000000000..dfab99fd013
--- /dev/null
+++ b/pkgs/applications/misc/k2pdfopt/leptonica.patch
@@ -0,0 +1,254 @@
+From 8c11a20925686855023df90ed477957c7d7fe91e Mon Sep 17 00:00:00 2001
+From: Daniel Fullmer
+Date: Fri, 13 Sep 2019 15:54:21 -0400
+Subject: [PATCH] Willus mod for k2pdfopt
+
+---
+ src/allheaders.h | 4 ++
+ src/dewarp2.c | 106 ++++++++++++++++++++++++++++++++++++++++++-----
+ src/leptwin.c | 6 ++-
+ 3 files changed, 104 insertions(+), 12 deletions(-)
+
+diff --git a/src/allheaders.h b/src/allheaders.h
+index e68eff1..b3cc729 100644
+--- a/src/allheaders.h
++++ b/src/allheaders.h
+@@ -669,6 +669,10 @@ LEPT_DLL extern L_DEWARPA * dewarpaReadMem ( const l_uint8 *data, size_t size );
+ LEPT_DLL extern l_ok dewarpaWrite ( const char *filename, L_DEWARPA *dewa );
+ LEPT_DLL extern l_ok dewarpaWriteStream ( FILE *fp, L_DEWARPA *dewa );
+ LEPT_DLL extern l_ok dewarpaWriteMem ( l_uint8 **pdata, size_t *psize, L_DEWARPA *dewa );
++/* WILLUS MOD */
++ LEPT_DLL extern l_int32 dewarpBuildPageModel_ex ( L_DEWARP *dew, const char *debugfile,l_int32 fit_order );
++ LEPT_DLL extern l_int32 dewarpFindVertDisparity_ex ( L_DEWARP *dew, PTAA *ptaa, l_int32 rotflag,l_int32 fit_order );
++ LEPT_DLL extern l_int32 dewarpBuildLineModel_ex ( L_DEWARP *dew, l_int32 opensize, const char *debugfile,l_int32 fit_order );
+ LEPT_DLL extern l_ok dewarpBuildPageModel ( L_DEWARP *dew, const char *debugfile );
+ LEPT_DLL extern l_ok dewarpFindVertDisparity ( L_DEWARP *dew, PTAA *ptaa, l_int32 rotflag );
+ LEPT_DLL extern l_ok dewarpFindHorizDisparity ( L_DEWARP *dew, PTAA *ptaa );
+diff --git a/src/dewarp2.c b/src/dewarp2.c
+index 220eec1..2e29500 100644
+--- a/src/dewarp2.c
++++ b/src/dewarp2.c
+@@ -144,9 +144,17 @@ static const l_float32 L_ALLOWED_W_FRACT = 0.05; /* no bigger */
+ * longest textlines.
+ *
+ */
++/* WILLUS MOD */
+ l_ok
+-dewarpBuildPageModel(L_DEWARP *dew,
+- const char *debugfile)
++dewarpBuildPageModel(L_DEWARP *dew,const char *debugfile)
++{
++return(dewarpBuildPageModel_ex(dew,debugfile,2));
++}
++
++l_ok
++dewarpBuildPageModel_ex(L_DEWARP *dew,
++ const char *debugfile,
++ l_int32 fit_order)
+ {
+ l_int32 linecount, topline, botline, ret;
+ PIX *pixs, *pix1, *pix2, *pix3;
+@@ -225,7 +233,7 @@ PTAA *ptaa1, *ptaa2;
+ /* Get the sampled vertical disparity from the textline centers.
+ * The disparity array will push pixels vertically so that each
+ * textline is flat and centered at the y-position of the mid-point. */
+- if (dewarpFindVertDisparity(dew, ptaa2, 0) != 0) {
++ if (dewarpFindVertDisparity_ex(dew, ptaa2, 0, fit_order) != 0) {
+ L_WARNING("vertical disparity not built\n", procName);
+ ptaaDestroy(&ptaa2);
+ return 1;
+@@ -290,13 +298,24 @@ PTAA *ptaa1, *ptaa2;
+ * a pdf. Non-pix debug output goes to /tmp.
+ *
+ */
++/* WILLUS MOD */
+ l_ok
+ dewarpFindVertDisparity(L_DEWARP *dew,
+ PTAA *ptaa,
+ l_int32 rotflag)
+ {
++return(dewarpFindVertDisparity_ex(dew,ptaa,rotflag,2));
++}
++/* WILLUS MOD -- add cubic and quartic fits and ..._ex functions */
++l_int32
++dewarpFindVertDisparity_ex(L_DEWARP *dew,
++ PTAA *ptaa,
++ l_int32 rotflag,
++ l_int32 fit_order)
++{
+ l_int32 i, j, nlines, npts, nx, ny, sampling;
+-l_float32 c0, c1, c2, x, y, midy, val, medval, meddev, minval, maxval;
++/* WILLUS MOD */
++l_float32 c0, c1, c2, c3, c4, x, y, midy, val, medval, meddev, minval, maxval;
+ l_float32 *famidys;
+ NUMA *nax, *nafit, *nacurve0, *nacurve1, *nacurves;
+ NUMA *namidy, *namidys, *namidysi;
+@@ -304,11 +323,22 @@ PIX *pix1, *pix2, *pixcirc, *pixdb;
+ PTA *pta, *ptad, *ptacirc;
+ PTAA *ptaa0, *ptaa1, *ptaa2, *ptaa3, *ptaa4, *ptaa5, *ptaat;
+ FPIX *fpix;
++/* WILLUS MOD */
++l_int32 fit_order1,fit_order2;
+
+ PROCNAME("dewarpFindVertDisparity");
+
+ if (!dew)
+ return ERROR_INT("dew not defined", procName, 1);
++/* WILLUS MOD */
++ if (fit_order < 10)
++ fit_order1 = fit_order2 = fit_order;
++ else
++ {
++ fit_order1=fit_order % 10;
++ fit_order2=fit_order / 10;
++ fit_order2=fit_order2 % 10;
++ }
+ dew->vsuccess = 0;
+ if (!ptaa)
+ return ERROR_INT("ptaa not defined", procName, 1);
+@@ -331,12 +361,32 @@ FPIX *fpix;
+ pixdb = (rotflag) ? pixRotateOrth(dew->pixs, 1) : pixClone(dew->pixs);
+ for (i = 0; i < nlines; i++) { /* for each line */
+ pta = ptaaGetPta(ptaa, i, L_CLONE);
+- ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
+- numaAddNumber(nacurve0, c2);
++/* WILLUS MOD */
++if (fit_order1>3)
++ {
++ ptaGetQuarticLSF(pta, &c4, &c3, &c2, &c1, &c0, NULL);
++ numaAddNumber(nacurve0, c4);
++ }
++else if (fit_order1==3)
++ {
++ ptaGetCubicLSF(pta, &c3, &c2, &c1, &c0, NULL);
++ numaAddNumber(nacurve0, c3);
++ }
++else
++ {
++ ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
++ numaAddNumber(nacurve0, c2);
++ }
+ ptad = ptaCreate(nx);
+ for (j = 0; j < nx; j++) { /* uniformly sampled in x */
+ x = j * sampling;
+- applyQuadraticFit(c2, c1, c0, x, &y);
++/* WILLUS MOD */
++if (fit_order1>3)
++ applyQuarticFit(c4, c3, c2, c1, c0, x, &y);
++else if (fit_order1==3)
++ applyCubicFit(c3, c2, c1, c0, x, &y);
++else
++ applyQuadraticFit(c2, c1, c0, x, &y);
+ ptaAddPt(ptad, x, y);
+ }
+ ptaaAddPta(ptaa0, ptad, L_INSERT);
+@@ -350,7 +400,13 @@ FPIX *fpix;
+ for (i = 0; i < nlines; i++) {
+ pta = ptaaGetPta(ptaa, i, L_CLONE);
+ ptaGetArrays(pta, &nax, NULL);
+- ptaGetQuadraticLSF(pta, NULL, NULL, NULL, &nafit);
++/* WILLUS MOD */
++if (fit_order1>3)
++ptaGetQuarticLSF(pta, NULL, NULL, NULL, NULL, NULL, &nafit);
++else if (fit_order1==3)
++ptaGetCubicLSF(pta, NULL, NULL, NULL, NULL, &nafit);
++else
++ptaGetQuadraticLSF(pta, NULL, NULL, NULL, &nafit);
+ ptad = ptaCreateFromNuma(nax, nafit);
+ ptaaAddPta(ptaat, ptad, L_INSERT);
+ ptaDestroy(&pta);
+@@ -494,11 +550,24 @@ FPIX *fpix;
+ ptaa5 = ptaaCreate(nx); /* uniformly sampled across full height of image */
+ for (j = 0; j < nx; j++) { /* for each column */
+ pta = ptaaGetPta(ptaa4, j, L_CLONE);
+- ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
++/* WILLUS MOD */
++/* Order higher than 2 can cause a little craziness here. */
++if (fit_order2>3)
++ ptaGetQuarticLSF(pta, &c4, &c3, &c2, &c1, &c0, NULL);
++else if (fit_order2==3)
++ ptaGetCubicLSF(pta, &c3, &c2, &c1, &c0, NULL);
++else
++ ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
+ ptad = ptaCreate(ny);
+ for (i = 0; i < ny; i++) { /* uniformly sampled in y */
+ y = i * sampling;
+- applyQuadraticFit(c2, c1, c0, y, &val);
++/* WILLUS MOD */
++if (fit_order2>3)
++ applyQuarticFit(c4, c3, c2, c1, c0, y, &val);
++else if (fit_order2==3)
++ applyCubicFit(c3, c2, c1, c0, y, &val);
++else
++ applyQuadraticFit(c2, c1, c0, y, &val);
+ ptaAddPt(ptad, y, val);
+ }
+ ptaaAddPta(ptaa5, ptad, L_INSERT);
+@@ -1602,11 +1671,21 @@ FPIX *fpix;
+ * See notes there.
+ *
+ */
++/* WILLUS MOD */
+ l_ok
+ dewarpBuildLineModel(L_DEWARP *dew,
+ l_int32 opensize,
+ const char *debugfile)
+ {
++return(dewarpBuildLineModel_ex(dew,opensize,debugfile,2));
++}
++
++l_int32
++dewarpBuildLineModel_ex(L_DEWARP *dew,
++ l_int32 opensize,
++ const char *debugfile,
++ l_int32 fit_order)
++{
+ char buf[64];
+ l_int32 i, j, bx, by, ret, nlines;
+ BOXA *boxa;
+@@ -1695,6 +1774,8 @@ PTAA *ptaa1, *ptaa2;
+
+ /* Remove all lines that are not at least 0.75 times the length
+ * of the longest line. */
++/* WILLUS MOD */
++/*
+ ptaa2 = dewarpRemoveShortLines(pix, ptaa1, 0.75, DEBUG_SHORT_LINES);
+ if (debugfile) {
+ pix1 = pixConvertTo32(pix);
+@@ -1704,6 +1785,8 @@ PTAA *ptaa1, *ptaa2;
+ pixDestroy(&pix1);
+ pixDestroy(&pix2);
+ }
++*/
++ptaa2=ptaa1;
+ ptaaDestroy(&ptaa1);
+ nlines = ptaaGetCount(ptaa2);
+ if (nlines < dew->minlines) {
+@@ -1717,7 +1800,8 @@ PTAA *ptaa1, *ptaa2;
+ * centers. The disparity array will push pixels vertically
+ * so that each line is flat and centered at the y-position
+ * of the mid-point. */
+- ret = dewarpFindVertDisparity(dew, ptaa2, 1 - i);
++/* WILLUS MOD */
++ ret = dewarpFindVertDisparity_ex(dew, ptaa2, 1 - i, fit_order);
+
+ /* If i == 0, move the result to the horizontal disparity,
+ * rotating it back by -90 degrees. */
+diff --git a/src/leptwin.c b/src/leptwin.c
+index 72643a0..573d33e 100644
+--- a/src/leptwin.c
++++ b/src/leptwin.c
+@@ -364,5 +364,9 @@ PIXCMAP *cmap;
+
+ return hBitmap;
+ }
+-
++#else
++/* willus mod: Avoid weird issue with OS/X library archiver when there are no symbols */
++int leptwin_my_empty_func(void);
++int leptwin_my_empty_func(void)
++{return(0);}
+ #endif /* _WIN32 */
+--
+2.22.0
+
diff --git a/pkgs/applications/misc/k2pdfopt/mupdf.patch b/pkgs/applications/misc/k2pdfopt/mupdf.patch
new file mode 100644
index 00000000000..f7c04d42a71
--- /dev/null
+++ b/pkgs/applications/misc/k2pdfopt/mupdf.patch
@@ -0,0 +1,1060 @@
+From 3d763f84872351c250ffea26150e73b02b8f4c6f Mon Sep 17 00:00:00 2001
+From: Daniel Fullmer
+Date: Fri, 13 Sep 2019 15:11:45 -0400
+Subject: [PATCH] Willus mod for k2pdfopt
+
+---
+ source/fitz/filter-basic.c | 3 +
+ source/fitz/font-win32.c | 866 +++++++++++++++++++++++++++++++++++++
+ source/fitz/font.c | 3 +
+ source/fitz/stext-device.c | 5 +
+ source/fitz/string.c | 5 +
+ source/pdf/pdf-annot.c | 14 +-
+ source/pdf/pdf-link.c | 3 +
+ source/pdf/pdf-parse.c | 5 +
+ source/pdf/pdf-xref.c | 9 +
+ 9 files changed, 912 insertions(+), 1 deletion(-)
+ create mode 100644 source/fitz/font-win32.c
+
+diff --git a/source/fitz/filter-basic.c b/source/fitz/filter-basic.c
+index 0713a62e7..b8ef4d292 100644
+--- a/source/fitz/filter-basic.c
++++ b/source/fitz/filter-basic.c
+@@ -259,7 +259,10 @@ look_for_endstream:
+ if (!state->warned)
+ {
+ state->warned = 1;
++/* willus mod -- no warning */
++/*
+ fz_warn(ctx, "PDF stream Length incorrect");
++*/
+ }
+ return *stm->rp++;
+ }
+diff --git a/source/fitz/font-win32.c b/source/fitz/font-win32.c
+new file mode 100644
+index 000000000..45de8cfd3
+--- /dev/null
++++ b/source/fitz/font-win32.c
+@@ -0,0 +1,866 @@
++/*
++** Routines to access MS Windows system fonts.
++** From sumatra PDF distro.
++** Modified for MuPDF v1.9a by willus.com
++*/
++#include "mupdf/pdf.h"
++
++/*
++ Which fonts are embedded is based on a few preprocessor definitions.
++
++ The base 14 fonts are always embedded.
++ For CJK font substitution we embed DroidSansFallback.
++
++ Set NOCJK to skip all CJK support (this also omits embedding the CJK CMaps)
++ Set NOCJKFONT to skip the embedded CJK font.
++ Set NOCJKFULL to embed a smaller CJK font without CJK Extension A support.
++*/
++
++#ifdef NOCJK
++#define NOCJKFONT
++#endif
++
++/* SumatraPDF: also load fonts included with Windows */
++#ifdef _WIN32
++
++#ifndef UNICODE
++#define UNICODE
++#endif
++#ifndef _UNICODE
++#define _UNICODE
++#endif
++
++#include <windows.h>
++
++// TODO: Use more of FreeType for TTF parsing (for performance reasons,
++// the fonts can't be parsed completely, though)
++#include <ft2build.h>
++#include FT_TRUETYPE_IDS_H
++#include FT_TRUETYPE_TAGS_H
++
++#define TTC_VERSION1 0x00010000
++#define TTC_VERSION2 0x00020000
++
++#define MAX_FACENAME 128
++
++// Note: the font face must be the first field so that the structure
++// can be treated like a simple string for searching
++typedef struct pdf_fontmapMS_s
++{
++ char fontface[MAX_FACENAME];
++ char fontpath[MAX_PATH];
++ int index;
++} pdf_fontmapMS;
++
++typedef struct pdf_fontlistMS_s
++{
++ pdf_fontmapMS *fontmap;
++ int len;
++ int cap;
++} pdf_fontlistMS;
++
++typedef struct _tagTT_OFFSET_TABLE
++{
++ ULONG uVersion;
++ USHORT uNumOfTables;
++ USHORT uSearchRange;
++ USHORT uEntrySelector;
++ USHORT uRangeShift;
++} TT_OFFSET_TABLE;
++
++typedef struct _tagTT_TABLE_DIRECTORY
++{
++ ULONG uTag; //table name
++ ULONG uCheckSum; //Check sum
++ ULONG uOffset; //Offset from beginning of file
++ ULONG uLength; //length of the table in bytes
++} TT_TABLE_DIRECTORY;
++
++typedef struct _tagTT_NAME_TABLE_HEADER
++{
++ USHORT uFSelector; //format selector. Always 0
++ USHORT uNRCount; //Name Records count
++ USHORT uStorageOffset; //Offset for strings storage, from start of the table
++} TT_NAME_TABLE_HEADER;
++
++typedef struct _tagTT_NAME_RECORD
++{
++ USHORT uPlatformID;
++ USHORT uEncodingID;
++ USHORT uLanguageID;
++ USHORT uNameID;
++ USHORT uStringLength;
++ USHORT uStringOffset; //from start of storage area
++} TT_NAME_RECORD;
++
++typedef struct _tagFONT_COLLECTION
++{
++ ULONG Tag;
++ ULONG Version;
++ ULONG NumFonts;
++} FONT_COLLECTION;
++
++static struct {
++ char *name;
++ char *pattern;
++} baseSubstitutes[] = {
++ { "Courier", "CourierNewPSMT" },
++ { "Courier-Bold", "CourierNewPS-BoldMT" },
++ { "Courier-Oblique", "CourierNewPS-ItalicMT" },
++ { "Courier-BoldOblique", "CourierNewPS-BoldItalicMT" },
++ { "Helvetica", "ArialMT" },
++ { "Helvetica-Bold", "Arial-BoldMT" },
++ { "Helvetica-Oblique", "Arial-ItalicMT" },
++ { "Helvetica-BoldOblique", "Arial-BoldItalicMT" },
++ { "Times-Roman", "TimesNewRomanPSMT" },
++ { "Times-Bold", "TimesNewRomanPS-BoldMT" },
++ { "Times-Italic", "TimesNewRomanPS-ItalicMT" },
++ { "Times-BoldItalic", "TimesNewRomanPS-BoldItalicMT" },
++ { "Symbol", "SymbolMT" },
++};
++static const char *base_font_names[][10] =
++{
++ { "Courier", "CourierNew", "CourierNewPSMT", NULL },
++ { "Courier-Bold", "CourierNew,Bold", "Courier,Bold",
++ "CourierNewPS-BoldMT", "CourierNew-Bold", NULL },
++ { "Courier-Oblique", "CourierNew,Italic", "Courier,Italic",
++ "CourierNewPS-ItalicMT", "CourierNew-Italic", NULL },
++ { "Courier-BoldOblique", "CourierNew,BoldItalic", "Courier,BoldItalic",
++ "CourierNewPS-BoldItalicMT", "CourierNew-BoldItalic", NULL },
++ { "Helvetica", "ArialMT", "Arial", NULL },
++ { "Helvetica-Bold", "Arial-BoldMT", "Arial,Bold", "Arial-Bold",
++ "Helvetica,Bold", NULL },
++ { "Helvetica-Oblique", "Arial-ItalicMT", "Arial,Italic", "Arial-Italic",
++ "Helvetica,Italic", "Helvetica-Italic", NULL },
++ { "Helvetica-BoldOblique", "Arial-BoldItalicMT",
++ "Arial,BoldItalic", "Arial-BoldItalic",
++ "Helvetica,BoldItalic", "Helvetica-BoldItalic", NULL },
++ { "Times-Roman", "TimesNewRomanPSMT", "TimesNewRoman",
++ "TimesNewRomanPS", NULL },
++ { "Times-Bold", "TimesNewRomanPS-BoldMT", "TimesNewRoman,Bold",
++ "TimesNewRomanPS-Bold", "TimesNewRoman-Bold", NULL },
++ { "Times-Italic", "TimesNewRomanPS-ItalicMT", "TimesNewRoman,Italic",
++ "TimesNewRomanPS-Italic", "TimesNewRoman-Italic", NULL },
++ { "Times-BoldItalic", "TimesNewRomanPS-BoldItalicMT",
++ "TimesNewRoman,BoldItalic", "TimesNewRomanPS-BoldItalic",
++ "TimesNewRoman-BoldItalic", NULL },
++ { "Symbol", "Symbol,Italic", "Symbol,Bold", "Symbol,BoldItalic",
++ "SymbolMT", "SymbolMT,Italic", "SymbolMT,Bold", "SymbolMT,BoldItalic", NULL },
++ { "ZapfDingbats", NULL }
++};
++
++static pdf_fontlistMS fontlistMS =
++{
++ NULL,
++ 0,
++ 0,
++};
++static int strcmp_ignore_space(const char *a, const char *b);
++static const char *clean_font_name(const char *fontname);
++static const char *pdf_clean_base14_name(const char *fontname);
++
++static inline USHORT BEtoHs(USHORT x)
++{
++ BYTE *data = (BYTE *)&x;
++ return (data[0] << 8) | data[1];
++}
++
++static inline ULONG BEtoHl(ULONG x) /* reads the bytes of x as a big-endian 32-bit value */
++{
++ BYTE *data = (BYTE *)&x;
++ return ((ULONG)data[0] << 24) | ((ULONG)data[1] << 16) | ((ULONG)data[2] << 8) | data[3]; /* widen first: a BYTE promotes to int, and 0x80 << 24 overflows int */
++}
++
++static int strcmp_ignore_space(const char *a, const char *b)
++{
++ while (1)
++ {
++ while (*a == ' ')
++ a++;
++ while (*b == ' ')
++ b++;
++ if (*a != *b)
++ return 1;
++ if (*a == 0)
++ return *a != *b;
++ if (*b == 0)
++ return *a != *b;
++ a++;
++ b++;
++ }
++}
++
++/* A little bit more sophisticated name matching so that e.g. "EurostileExtended"
++ matches "EurostileExtended-Roman" or "Tahoma-Bold,Bold" matches "Tahoma-Bold" */
++static int
++lookup_compare(const void *elem1, const void *elem2)
++{
++ const char *val1 = elem1;
++ const char *val2 = elem2;
++ int len1 = strlen(val1);
++ int len2 = strlen(val2);
++
++ if (len1 != len2)
++ {
++ const char *rest = len1 > len2 ? val1 + len2 : val2 + len1;
++ if (',' == *rest || !_stricmp(rest, "-roman"))
++ return _strnicmp(val1, val2, fz_mini(len1, len2));
++ }
++
++ return _stricmp(val1, val2);
++}
++
++static void
++remove_spaces(char *srcDest)
++{
++ char *dest;
++
++ for (dest = srcDest; *srcDest; srcDest++)
++ if (*srcDest != ' ')
++ *dest++ = *srcDest;
++ *dest = '\0';
++}
++
++static int
++str_ends_with(const char *str, const char *end)
++{
++ size_t len1 = strlen(str);
++ size_t len2 = strlen(end);
++
++ return len1 >= len2 && !strcmp(str + len1 - len2, end);
++}
++
++static pdf_fontmapMS *
++pdf_find_windows_font_path(const char *fontname)
++{
++ return bsearch(fontname, fontlistMS.fontmap, fontlistMS.len, sizeof(pdf_fontmapMS), lookup_compare);
++}
++
++/* source and dest can be same */
++static void
++decode_unicode_BE(fz_context *ctx, char *source, int sourcelen, char *dest, int destlen)
++{
++ WCHAR *tmp;
++ int converted, i;
++
++ if (sourcelen % 2 != 0)
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid unicode string");
++
++ tmp = fz_malloc_array(ctx, sourcelen / 2 + 1, sizeof(WCHAR));
++ for (i = 0; i < sourcelen / 2; i++)
++ tmp[i] = BEtoHs(((WCHAR *)source)[i]);
++ tmp[sourcelen / 2] = '\0';
++
++ converted = WideCharToMultiByte(CP_UTF8, 0, tmp, -1, dest, destlen, NULL, NULL);
++ fz_free(ctx, tmp);
++ if (!converted)
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid unicode string");
++}
++
++static void
++decode_platform_string(fz_context *ctx, int platform, int enctype, char *source, int sourcelen, char *dest, int destlen)
++{
++ switch (platform)
++ {
++ case TT_PLATFORM_APPLE_UNICODE:
++ switch (enctype)
++ {
++ case TT_APPLE_ID_DEFAULT:
++ case TT_APPLE_ID_UNICODE_2_0:
++ decode_unicode_BE(ctx, source, sourcelen, dest, destlen);
++ return;
++ }
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype);
++ case TT_PLATFORM_MACINTOSH:
++ switch (enctype)
++ {
++ case TT_MAC_ID_ROMAN:
++ if (sourcelen + 1 > destlen)
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : overlong fontname: %s", source);
++ // TODO: Convert to UTF-8 from what encoding?
++ memcpy(dest, source, sourcelen);
++ dest[sourcelen] = 0;
++ return;
++ }
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype);
++ case TT_PLATFORM_MICROSOFT:
++ switch (enctype)
++ {
++ case TT_MS_ID_SYMBOL_CS:
++ case TT_MS_ID_UNICODE_CS:
++ case TT_MS_ID_UCS_4:
++ decode_unicode_BE(ctx, source, sourcelen, dest, destlen);
++ return;
++ }
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype);
++ default:
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype);
++ }
++}
++
++static void
++grow_system_font_list(fz_context *ctx, pdf_fontlistMS *fl)
++{
++ int newcap;
++ pdf_fontmapMS *newitems;
++
++ if (fl->cap == 0)
++ newcap = 1024;
++ else
++ newcap = fl->cap * 2;
++
++ // use realloc/free for the fontmap, since the list can
++ // remain in memory even with all fz_contexts destroyed
++ newitems = realloc(fl->fontmap, newcap * sizeof(pdf_fontmapMS));
++ if (!newitems)
++ fz_throw(ctx, FZ_ERROR_GENERIC, "OOM in grow_system_font_list");
++ memset(newitems + fl->cap, 0, sizeof(pdf_fontmapMS) * (newcap - fl->cap));
++
++ fl->fontmap = newitems;
++ fl->cap = newcap;
++}
++
++static void
++append_mapping(fz_context *ctx, pdf_fontlistMS *fl, const char *facename, const char *path, int index)
++{
++ if (fl->len == fl->cap)
++ grow_system_font_list(ctx, fl);
++
++ if (fl->len >= fl->cap)
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : fontlist overflow");
++
++ fz_strlcpy(fl->fontmap[fl->len].fontface, facename, sizeof(fl->fontmap[0].fontface));
++ fz_strlcpy(fl->fontmap[fl->len].fontpath, path, sizeof(fl->fontmap[0].fontpath));
++ fl->fontmap[fl->len].index = index;
++
++ ++fl->len;
++}
++
++static void
++safe_read(fz_context *ctx, fz_stream *file, int offset, char *buf, int size)
++{
++ int n;
++ fz_seek(ctx, file, offset, 0);
++ n = fz_read(ctx, file, (unsigned char *)buf, size);
++ if (n != size)
++ fz_throw(ctx, FZ_ERROR_GENERIC, "safe_read: read %d, expected %d", n, size);
++}
++
++static void
++read_ttf_string(fz_context *ctx, fz_stream *file, int offset, TT_NAME_RECORD *ttRecordBE, char *buf, int size)
++{
++ char szTemp[MAX_FACENAME * 2];
++ // ignore empty and overlong strings
++ int stringLength = BEtoHs(ttRecordBE->uStringLength);
++ if (stringLength == 0 || stringLength >= sizeof(szTemp))
++ return;
++
++ safe_read(ctx, file, offset + BEtoHs(ttRecordBE->uStringOffset), szTemp, stringLength);
++ decode_platform_string(ctx, BEtoHs(ttRecordBE->uPlatformID),
++ BEtoHs(ttRecordBE->uEncodingID), szTemp, stringLength, buf, size);
++}
++
++static void
++makeFakePSName(char szName[MAX_FACENAME], const char *szStyle)
++{
++ // append the font's subfamily, unless it's a Regular font
++ if (*szStyle && _stricmp(szStyle, "Regular") != 0)
++ {
++ fz_strlcat(szName, "-", MAX_FACENAME);
++ fz_strlcat(szName, szStyle, MAX_FACENAME);
++ }
++ remove_spaces(szName);
++}
++
++static void
++parseTTF(fz_context *ctx, fz_stream *file, int offset, int index, const char *path)
++{
++ TT_OFFSET_TABLE ttOffsetTableBE;
++ TT_TABLE_DIRECTORY tblDirBE;
++ TT_NAME_TABLE_HEADER ttNTHeaderBE;
++ TT_NAME_RECORD ttRecordBE;
++
++ char szPSName[MAX_FACENAME] = { 0 };
++ char szTTName[MAX_FACENAME] = { 0 };
++ char szStyle[MAX_FACENAME] = { 0 };
++ char szCJKName[MAX_FACENAME] = { 0 };
++ int i, count, tblOffset;
++
++ safe_read(ctx, file, offset, (char *)&ttOffsetTableBE, sizeof(TT_OFFSET_TABLE));
++
++ // check if this is a TrueType font of version 1.0 or an OpenType font
++ if (BEtoHl(ttOffsetTableBE.uVersion) != TTC_VERSION1 &&
++ BEtoHl(ttOffsetTableBE.uVersion) != TTAG_OTTO)
++ {
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid font version %x", (unsigned int)BEtoHl(ttOffsetTableBE.uVersion));
++ }
++
++ // determine the name table's offset by iterating through the offset table
++ count = BEtoHs(ttOffsetTableBE.uNumOfTables);
++ for (i = 0; i < count; i++)
++ {
++ int entryOffset = offset + sizeof(TT_OFFSET_TABLE) + i * sizeof(TT_TABLE_DIRECTORY);
++ safe_read(ctx, file, entryOffset, (char *)&tblDirBE, sizeof(TT_TABLE_DIRECTORY));
++ if (!BEtoHl(tblDirBE.uTag) || BEtoHl(tblDirBE.uTag) == TTAG_name)
++ break;
++ }
++ if (count == i || !BEtoHl(tblDirBE.uTag))
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : nameless font");
++ tblOffset = BEtoHl(tblDirBE.uOffset);
++
++ // read the 'name' table for record count and offsets
++ safe_read(ctx, file, tblOffset, (char *)&ttNTHeaderBE, sizeof(TT_NAME_TABLE_HEADER));
++ offset = tblOffset + sizeof(TT_NAME_TABLE_HEADER);
++ tblOffset += BEtoHs(ttNTHeaderBE.uStorageOffset);
++
++ // read through the strings for PostScript name and font family
++ count = BEtoHs(ttNTHeaderBE.uNRCount);
++ for (i = 0; i < count; i++)
++ {
++ short langId, nameId;
++ BOOL isCJKName;
++
++ safe_read(ctx, file, offset + i * sizeof(TT_NAME_RECORD), (char *)&ttRecordBE, sizeof(TT_NAME_RECORD));
++
++ langId = BEtoHs(ttRecordBE.uLanguageID);
++ nameId = BEtoHs(ttRecordBE.uNameID);
++ isCJKName = TT_NAME_ID_FONT_FAMILY == nameId && LANG_CHINESE == PRIMARYLANGID(langId);
++
++ // ignore non-English strings (except for Chinese font names)
++ if (langId && langId != TT_MS_LANGID_ENGLISH_UNITED_STATES && !isCJKName)
++ continue;
++ // ignore names other than font (sub)family and PostScript name
++ fz_try(ctx)
++ {
++ if (isCJKName)
++ read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szCJKName, sizeof(szCJKName));
++ else if (TT_NAME_ID_FONT_FAMILY == nameId)
++ read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szTTName, sizeof(szTTName));
++ else if (TT_NAME_ID_FONT_SUBFAMILY == nameId)
++ read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szStyle, sizeof(szStyle));
++ else if (TT_NAME_ID_PS_NAME == nameId)
++ read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szPSName, sizeof(szPSName));
++ }
++ fz_catch(ctx)
++ {
++ fz_warn(ctx, "ignoring face name decoding fonterror");
++ }
++ }
++
++ // try to prevent non-Arial fonts from accidentally substituting Arial
++ if (!strcmp(szPSName, "ArialMT"))
++ {
++ // cf. https://code.google.com/p/sumatrapdf/issues/detail?id=2471
++ if (strcmp(szTTName, "Arial") != 0)
++ szPSName[0] = '\0';
++ // TODO: is there a better way to distinguish Arial Caps from Arial proper?
++ // cf. http://code.google.com/p/sumatrapdf/issues/detail?id=1290
++ else if (strstr(path, "caps") || strstr(path, "Caps"))
++ fz_throw(ctx, FZ_ERROR_GENERIC, "ignore %s, as it can't be distinguished from Arial,Regular", path);
++ }
++
++ if (szPSName[0])
++ append_mapping(ctx, &fontlistMS, szPSName, path, index);
++ if (szTTName[0])
++ {
++ // derive a PostScript-like name and add it, if it's different from the font's
++ // included PostScript name; cf. http://code.google.com/p/sumatrapdf/issues/detail?id=376
++ makeFakePSName(szTTName, szStyle);
++ // compare the two names before adding this one
++ if (lookup_compare(szTTName, szPSName))
++ append_mapping(ctx, &fontlistMS, szTTName, path, index);
++ }
++ if (szCJKName[0])
++ {
++ makeFakePSName(szCJKName, szStyle);
++ if (lookup_compare(szCJKName, szPSName) && lookup_compare(szCJKName, szTTName))
++ append_mapping(ctx, &fontlistMS, szCJKName, path, index);
++ }
++}
++
++static void
++parseTTFs(fz_context *ctx, const char *path)
++{
++ fz_stream *file = fz_open_file(ctx, path);
++ /* "fonterror : %s not found", path */
++ fz_try(ctx)
++ {
++ parseTTF(ctx, file, 0, 0, path);
++ }
++ fz_always(ctx)
++ {
++ fz_drop_stream(ctx,file);
++ }
++ fz_catch(ctx)
++ {
++ fz_rethrow(ctx);
++ }
++}
++
++static void
++parseTTCs(fz_context *ctx, const char *path)
++{
++ FONT_COLLECTION fontcollectionBE;
++ ULONG i, numFonts, *offsettableBE = NULL;
++
++ fz_stream *file = fz_open_file(ctx, path);
++ /* Font collection: check the TTAG_ttcf tag and the version, read the per-font offset table, */
++ /* then pass every member (offset, index) to parseTTF; table and stream are released on all paths */
++ fz_var(offsettableBE);
++
++ fz_try(ctx)
++ {
++ safe_read(ctx, file, 0, (char *)&fontcollectionBE, sizeof(FONT_COLLECTION));
++ if (BEtoHl(fontcollectionBE.Tag) != TTAG_ttcf)
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : wrong format %x", (unsigned int)BEtoHl(fontcollectionBE.Tag));
++ if (BEtoHl(fontcollectionBE.Version) != TTC_VERSION1 &&
++ BEtoHl(fontcollectionBE.Version) != TTC_VERSION2)
++ {
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid version %x", (unsigned int)BEtoHl(fontcollectionBE.Version));
++ }
++
++ numFonts = BEtoHl(fontcollectionBE.NumFonts);
++ offsettableBE = fz_malloc_array(ctx, numFonts, sizeof(ULONG));
++
++ safe_read(ctx, file, sizeof(FONT_COLLECTION), (char *)offsettableBE, numFonts * sizeof(ULONG));
++ for (i = 0; i < numFonts; i++)
++ parseTTF(ctx, file, BEtoHl(offsettableBE[i]), i, path);
++ }
++ fz_always(ctx)
++ {
++ fz_free(ctx, offsettableBE);
++ fz_drop_stream(ctx,file);
++ }
++ fz_catch(ctx)
++ {
++ fz_rethrow(ctx);
++ }
++}
++
++static void
++extend_system_font_list(fz_context *ctx, const WCHAR *path)
++{
++ WCHAR szPath[MAX_PATH], *lpFileName;
++ WIN32_FIND_DATA FileData;
++ HANDLE hList;
++
++ GetFullPathName(path, nelem(szPath), szPath, &lpFileName);
++
++ hList = FindFirstFile(szPath, &FileData);
++ if (hList == INVALID_HANDLE_VALUE)
++ {
++ // Don't complain about missing directories
++ if (GetLastError() == ERROR_FILE_NOT_FOUND)
++ return;
++ fz_throw(ctx, FZ_ERROR_GENERIC, "extend_system_font_list: unknown error %d", (int)GetLastError());
++ }
++ do
++ {
++ if (!(FileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY))
++ {
++ char szPathUtf8[MAX_PATH], *fileExt;
++ int res;
++ lstrcpyn(lpFileName, FileData.cFileName, szPath + MAX_PATH - lpFileName);
++ res = WideCharToMultiByte(CP_UTF8, 0, szPath, -1, szPathUtf8, sizeof(szPathUtf8), NULL, NULL);
++ if (!res)
++ {
++ fz_warn(ctx, "WideCharToMultiByte failed for %S", szPath);
++ continue;
++ }
++ fileExt = szPathUtf8 + strlen(szPathUtf8) - 4;
++ fz_try(ctx)
++ {
++ if (!_stricmp(fileExt, ".ttc"))
++ parseTTCs(ctx, szPathUtf8);
++ else if (!_stricmp(fileExt, ".ttf") || !_stricmp(fileExt, ".otf"))
++ parseTTFs(ctx, szPathUtf8);
++ }
++ fz_catch(ctx)
++ {
++ // ignore errors occurring while parsing a given font file
++ }
++ }
++ } while (FindNextFile(hList, &FileData));
++ FindClose(hList);
++}
++
++static void
++destroy_system_font_list(void)
++{
++ free(fontlistMS.fontmap);
++ memset(&fontlistMS, 0, sizeof(fontlistMS));
++}
++
++static void
++create_system_font_list(fz_context *ctx)
++{
++ WCHAR szFontDir[MAX_PATH];
++ UINT cch;
++
++ cch = GetWindowsDirectory(szFontDir, nelem(szFontDir) - 12);
++ if (0 < cch && cch < nelem(szFontDir) - 12)
++ {
++ /* willus.com edit--Win XP default MSVCRT.DLL doesn't have wcscat_s */
++#ifdef _WIN64
++ wcscat_s(szFontDir, MAX_PATH, L"\\Fonts\\*.?t?");
++#else
++ wcscat(szFontDir,L"\\Fonts\\*.?t?");
++#endif
++ extend_system_font_list(ctx, szFontDir);
++ }
++
++ if (fontlistMS.len == 0)
++ fz_warn(ctx, "couldn't find any usable system fonts");
++
++#ifdef NOCJKFONT
++ {
++ // If no CJK fallback font is builtin but one has been shipped separately (in the same
++ // directory as the main executable), add it to the list of loadable system fonts
++ WCHAR szFile[MAX_PATH], *lpFileName;
++ GetModuleFileName(0, szFontDir, MAX_PATH);
++ GetFullPathName(szFontDir, MAX_PATH, szFile, &lpFileName);
++ lstrcpyn(lpFileName, L"DroidSansFallback.ttf", szFile + MAX_PATH - lpFileName);
++ extend_system_font_list(ctx, szFile);
++ }
++#endif
++
++ // sort the font list, so that it can be searched binarily
++ qsort(fontlistMS.fontmap, fontlistMS.len, sizeof(pdf_fontmapMS), _stricmp);
++
++#ifdef DEBUG
++ // allow to overwrite system fonts for debugging purposes
++ // (either pass a full path or a search pattern such as "fonts\*.ttf")
++ cch = GetEnvironmentVariable(L"MUPDF_FONTS_PATTERN", szFontDir, nelem(szFontDir));
++ if (0 < cch && cch < nelem(szFontDir))
++ {
++ int i, prev_len = fontlistMS.len;
++ extend_system_font_list(ctx, szFontDir);
++ for (i = prev_len; i < fontlistMS.len; i++)
++ {
++ pdf_fontmapMS *entry = bsearch(fontlistMS.fontmap[i].fontface, fontlistMS.fontmap, prev_len, sizeof(pdf_fontmapMS), lookup_compare);
++ if (entry)
++ *entry = fontlistMS.fontmap[i];
++ }
++ qsort(fontlistMS.fontmap, fontlistMS.len, sizeof(pdf_fontmapMS), _stricmp);
++ }
++#endif
++
++ // make sure to clean up after ourselves
++ atexit(destroy_system_font_list);
++}
++
++static fz_font *
++pdf_load_windows_font_by_name(fz_context *ctx, const char *orig_name)
++{
++ pdf_fontmapMS *found = NULL;
++ char *comma, *fontname;
++ fz_font *font;
++
++ /* WILLUS MOD--not multi-threaded for k2pdfopt */
++ /* fz_synchronize_begin(); */
++ if (fontlistMS.len == 0)
++ {
++ fz_try(ctx)
++ {
++ create_system_font_list(ctx);
++ }
++ fz_catch(ctx) { }
++ }
++ /* WILLUS MOD--not multi-threaded for k2pdfopt */
++ /* fz_synchronize_end(); */
++ if (fontlistMS.len == 0)
++ fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror: couldn't find any fonts");
++
++ // work on a normalized copy of the font name
++ fontname = fz_strdup(ctx, orig_name);
++ remove_spaces(fontname);
++
++ // first, try to find the exact font name (including appended style information)
++ comma = strchr(fontname, ',');
++ if (comma)
++ {
++ *comma = '-';
++ found = pdf_find_windows_font_path(fontname);
++ *comma = ',';
++ }
++ // second, substitute the font name with a known PostScript name
++ else
++ {
++ int i;
++ for (i = 0; i < nelem(baseSubstitutes) && !found; i++)
++ if (!strcmp(fontname, baseSubstitutes[i].name))
++ found = pdf_find_windows_font_path(baseSubstitutes[i].pattern);
++ }
++ // third, search for the font name without additional style information
++ if (!found)
++ found = pdf_find_windows_font_path(fontname);
++ // fourth, try to separate style from basename for prestyled fonts (e.g. "ArialBold")
++ if (!found && !comma && (str_ends_with(fontname, "Bold") || str_ends_with(fontname, "Italic")))
++ {
++ int styleLen = str_ends_with(fontname, "Bold") ? 4 : str_ends_with(fontname, "BoldItalic") ? 10 : 6;
++ fontname = fz_resize_array(ctx, fontname, strlen(fontname) + 2, sizeof(char));
++ comma = fontname + strlen(fontname) - styleLen;
++ memmove(comma + 1, comma, styleLen + 1);
++ *comma = '-';
++ found = pdf_find_windows_font_path(fontname);
++ *comma = ',';
++ if (!found)
++ found = pdf_find_windows_font_path(fontname);
++ }
++ // fifth, try to convert the font name from the common Chinese codepage 936
++ if (!found && fontname[0] < 0)
++ {
++ WCHAR cjkNameW[MAX_FACENAME];
++ char cjkName[MAX_FACENAME];
++ if (MultiByteToWideChar(936, MB_ERR_INVALID_CHARS, fontname, -1, cjkNameW, nelem(cjkNameW)) &&
++ WideCharToMultiByte(CP_UTF8, 0, cjkNameW, -1, cjkName, nelem(cjkName), NULL, NULL))
++ {
++ comma = strchr(cjkName, ',');
++ if (comma)
++ {
++ *comma = '-';
++ found = pdf_find_windows_font_path(cjkName);
++ *comma = ',';
++ }
++ if (!found)
++ found = pdf_find_windows_font_path(cjkName);
++ }
++ }
++
++ fz_free(ctx, fontname);
++ if (!found)
++ fz_throw(ctx, FZ_ERROR_GENERIC, "couldn't find system font '%s'", orig_name);
++
++ /*
++ fz_warn(ctx, "loading non-embedded font '%s' from '%s'", orig_name, found->fontpath);
++ */
++
++ font = fz_new_font_from_file(ctx, orig_name, found->fontpath, found->index,
++ strcmp(found->fontface, "DroidSansFallback") != 0);
++ /* willus mod for MuPDF v1.10, 10-21-2016 */
++ {
++ fz_font_flags_t *flags;
++ flags=fz_font_flags(font);
++ if (flags!=NULL)
++ flags->ft_substitute = 1;
++ }
++ return font;
++}
++
++static fz_font *
++pdf_load_windows_font(fz_context *ctx, const char *fontname, int bold, int italic, int needs_exact_metrics)
++{
++ if (needs_exact_metrics)
++ {
++ const char *clean_name;
++ /* WILLUS: Declare pdf_clean_base14_name() */
++ extern const char *pdf_clean_base14_name(const char *fontname);
++
++ /* TODO: the metrics for Times-Roman and Courier don't match
++ those of Windows' Times New Roman and Courier New; for
++ some reason, Poppler doesn't seem to have this problem */
++ int len;
++ if (fz_lookup_builtin_font(ctx,fontname, bold, italic, &len))
++ return NULL;
++
++ /* cf. http://code.google.com/p/sumatrapdf/issues/detail?id=2173 */
++ clean_name = pdf_clean_base14_name(fontname);
++ if (clean_name != fontname && !strncmp(clean_name, "Times-", 6))
++ return NULL;
++ }
++
++ // TODO: unset font->ft_substitute for base14/needs_exact_metrics?
++ return pdf_load_windows_font_by_name(ctx, fontname);
++}
++
++static const char *clean_font_name(const char *fontname)
++{
++ int i, k;
++ for (i = 0; i < nelem(base_font_names); i++)
++ for (k = 0; base_font_names[i][k]; k++)
++ if (!strcmp_ignore_space(base_font_names[i][k], fontname))
++ return base_font_names[i][0];
++ return fontname;
++}
++
++
++/* SumatraPDF: expose clean_font_name */
++static const char * pdf_clean_base14_name(const char *fontname)
++{
++ return clean_font_name(fontname);
++}
++
++static fz_font *
++pdf_load_windows_cjk_font(fz_context *ctx, const char *fontname, int ros, int serif)
++{
++ fz_font *font;
++
++ font=NULL; /* WILLUS: Avoid compiler warning */
++ /* try to find a matching system font before falling back to an approximate one */
++ fz_try(ctx)
++ {
++ font = pdf_load_windows_font_by_name(ctx, fontname);
++ }
++ fz_catch(ctx)
++ {
++ font = NULL;
++ }
++ if (font)
++ return font;
++
++ /* try to fall back to a reasonable system font */
++ fz_try(ctx)
++ {
++ if (serif)
++ {
++ switch (ros)
++ {
++ case FZ_ADOBE_CNS: font = pdf_load_windows_font_by_name(ctx, "MingLiU"); break;
++ case FZ_ADOBE_GB: font = pdf_load_windows_font_by_name(ctx, "SimSun"); break;
++ case FZ_ADOBE_JAPAN: font = pdf_load_windows_font_by_name(ctx, "MS-Mincho"); break;
++ case FZ_ADOBE_KOREA: font = pdf_load_windows_font_by_name(ctx, "Batang"); break;
++ default: fz_throw(ctx, FZ_ERROR_GENERIC, "invalid serif ros");
++ }
++ }
++ else
++ {
++ switch (ros)
++ {
++ case FZ_ADOBE_CNS: font = pdf_load_windows_font_by_name(ctx, "DFKaiShu-SB-Estd-BF"); break;
++ case FZ_ADOBE_GB:
++ fz_try(ctx)
++ {
++ font = pdf_load_windows_font_by_name(ctx, "KaiTi");
++ }
++ fz_catch(ctx)
++ {
++ font = pdf_load_windows_font_by_name(ctx, "KaiTi_GB2312");
++ }
++ break;
++ case FZ_ADOBE_JAPAN: font = pdf_load_windows_font_by_name(ctx, "MS-Gothic"); break;
++ case FZ_ADOBE_KOREA: font = pdf_load_windows_font_by_name(ctx, "Gulim"); break;
++ default: fz_throw(ctx, FZ_ERROR_GENERIC, "invalid sans-serif ros");
++ }
++ }
++ }
++ fz_catch(ctx)
++ {
++#ifdef NOCJKFONT
++ /* If no CJK fallback font is builtin, maybe one has been shipped separately */
++ font = pdf_load_windows_font_by_name(ctx, "DroidSansFallback");
++#else
++ fz_rethrow(ctx);
++#endif
++ }
++
++ return font;
++}
++
++#endif
++
++void pdf_install_load_system_font_funcs(fz_context *ctx)
++{
++#ifdef _WIN32
++ fz_install_load_system_font_funcs(ctx, pdf_load_windows_font, pdf_load_windows_cjk_font, NULL);
++#endif
++}
+diff --git a/source/fitz/font.c b/source/fitz/font.c
+index 733d91dae..69c46d968 100644
+--- a/source/fitz/font.c
++++ b/source/fitz/font.c
+@@ -5,8 +5,11 @@
+ #include "draw-imp.h"
+
+ #include <ft2build.h>
++/* willus mod -- remove hb includes */
++/*
+ #include "hb.h"
+ #include "hb-ft.h"
++*/
+
+ #include <assert.h>
+
+diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c
+index 0ba944d44..3c05c51ac 100644
+--- a/source/fitz/stext-device.c
++++ b/source/fitz/stext-device.c
+@@ -692,6 +692,11 @@ fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options
+ dev->trm = fz_identity;
+ dev->lastchar = ' ';
+ dev->curdir = 1;
++ /* willus mod -- seems like this should be here, but not sure. */
++ if (opts)
++ dev->flags = opts->flags;
++ else
++ dev->flags = 0;
+
+ return (fz_device*)dev;
+ }
+diff --git a/source/fitz/string.c b/source/fitz/string.c
+index e70ae6e6e..b310463f4 100644
+--- a/source/fitz/string.c
++++ b/source/fitz/string.c
+@@ -448,6 +448,10 @@ fz_utflen(const char *s)
+
+ float fz_atof(const char *s)
+ {
++/* willus mod: atof(s), #if-#else-#endif */
++#if (!defined(__SSE__))
++ return(atof(s));
++#else
+ float result;
+
+ errno = 0;
+@@ -457,6 +461,7 @@ float fz_atof(const char *s)
+ return 1;
+ result = fz_clamp(result, -FLT_MAX, FLT_MAX);
+ return result;
++#endif
+ }
+
+ int fz_atoi(const char *s)
+diff --git a/source/pdf/pdf-annot.c b/source/pdf/pdf-annot.c
+index 68de8898a..5d43485bd 100644
+--- a/source/pdf/pdf-annot.c
++++ b/source/pdf/pdf-annot.c
+@@ -4,8 +4,20 @@
+ #include <string.h>
+ #include <time.h>
+
++/* willus mod--don't use _mkgmtime--not available in Win XP */
+ #ifdef _WIN32
+-#define timegm _mkgmtime
++/* timegm() stand-in: mktime() reads *date as local time, then mktime() of the epoch's gmtime() breakdown
++   (i.e. the local offset) is subtracted.  NOTE(review): DST in effect at *date looks uncompensated -- confirm. */
++static time_t timegm(struct tm *date)
++ {
++ time_t t,z;
++ struct tm gmz;
++
++ z=(time_t)0;
++ gmz=(*gmtime(&z));
++ t=mktime(date)-mktime(&gmz);
++ return(t);
++ }
+ #endif
+
+ #define TEXT_ANNOT_SIZE (25.0f)
+diff --git a/source/pdf/pdf-link.c b/source/pdf/pdf-link.c
+index ae5beaa35..b5a52a000 100644
+--- a/source/pdf/pdf-link.c
++++ b/source/pdf/pdf-link.c
+@@ -351,6 +351,9 @@ pdf_resolve_link(fz_context *ctx, pdf_document *doc, const char *uri, float *xp,
+ }
+ return page;
+ }
++/* willus mod -- be quiet */
++/*
+ fz_warn(ctx, "unknown link uri '%s'", uri);
++*/
+ return -1;
+ }
+diff --git a/source/pdf/pdf-parse.c b/source/pdf/pdf-parse.c
+index 501c5626a..927ba6cd5 100644
+--- a/source/pdf/pdf-parse.c
++++ b/source/pdf/pdf-parse.c
+@@ -586,9 +586,14 @@ pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc,
+ if (c == '\r')
+ {
+ c = fz_peek_byte(ctx, file);
++/* willus mod -- no warning */
++/*
+ if (c != '\n')
+ fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
+ else
++*/
++if (c=='\n')
++/* willus mod -- end */
+ fz_read_byte(ctx, file);
+ }
+ stm_ofs = fz_tell(ctx, file);
+diff --git a/source/pdf/pdf-xref.c b/source/pdf/pdf-xref.c
+index 2475b6e86..bc163563a 100644
+--- a/source/pdf/pdf-xref.c
++++ b/source/pdf/pdf-xref.c
+@@ -707,8 +707,11 @@ pdf_xref_size_from_old_trailer(fz_context *ctx, pdf_document *doc, pdf_lexbuf *b
+ if (!s)
+ fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length missing");
+ len = fz_atoi(fz_strsep(&s, " "));
++/* willus mod -- no warning */
++/*
+ if (len < 0)
+ fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length must be positive");
++*/
+
+ /* broken pdfs where the section is not on a separate line */
+ if (s && *s != '\0')
+@@ -1372,7 +1375,10 @@ pdf_init_document(fz_context *ctx, pdf_document *doc)
+ {
+ pdf_drop_xref_sections(ctx, doc);
+ fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
++/* willus mod -- be quiet */
++/*
+ fz_warn(ctx, "trying to repair broken xref");
++*/
+ repaired = 1;
+ }
+
+@@ -1496,7 +1502,10 @@ pdf_drop_document_imp(fz_context *ctx, pdf_document *doc)
+ /* Swallow error, but continue dropping */
+ }
+
++/* willus mod -- no pdf_drop_js */
++/*
+ pdf_drop_js(ctx, doc->js);
++*/
+
+ pdf_drop_xref_sections(ctx, doc);
+ fz_free(ctx, doc->xref_index);
+--
+2.22.0
+
diff --git a/pkgs/applications/misc/k2pdfopt/tesseract.patch b/pkgs/applications/misc/k2pdfopt/tesseract.patch
index b882f5b949c..adfee9ae282 100644
--- a/pkgs/applications/misc/k2pdfopt/tesseract.patch
+++ b/pkgs/applications/misc/k2pdfopt/tesseract.patch
@@ -1,13 +1,675 @@
+From 39aa8502eee7bb669a29d1a9b3bfe5c9595ad960 Mon Sep 17 00:00:00 2001
+From: Daniel Fullmer
+Date: Fri, 13 Sep 2019 13:45:05 -0400
+Subject: [PATCH] Willus mod changes from k2pdfopt
+
+---
+ src/api/Makefile.am | 1 +
+ src/api/baseapi.cpp | 87 +++++++++++
+ src/api/baseapi.h | 3 +
+ src/api/tesscapi.cpp | 311 +++++++++++++++++++++++++++++++++++++
+ src/api/tesseract.h | 29 ++++
+ src/ccmain/tessedit.cpp | 5 +-
+ src/ccutil/ccutil.h | 7 +
+ src/ccutil/genericvector.h | 21 ++-
+ src/ccutil/mainblk.cpp | 17 +-
+ src/ccutil/params.cpp | 3 +-
+ src/ccutil/serialis.cpp | 3 +
+ src/ccutil/serialis.h | 2 +
+ src/lstm/input.cpp | 3 +
+ 13 files changed, 488 insertions(+), 4 deletions(-)
+ create mode 100644 src/api/tesscapi.cpp
+ create mode 100644 src/api/tesseract.h
+
diff --git a/src/api/Makefile.am b/src/api/Makefile.am
-index d8c1e54..46ead13 100644
+index d9b76eb6..cd2dc30f 100644
--- a/src/api/Makefile.am
+++ b/src/api/Makefile.am
-@@ -42,7 +42,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
- if VISIBILITY
- libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
- endif
--libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp
-+libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp tesscapi.cpp
+@@ -39,6 +39,7 @@ libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
+ libtesseract_api_la_SOURCES += pdfrenderer.cpp
+ libtesseract_api_la_SOURCES += wordstrboxrenderer.cpp
+ libtesseract_api_la_SOURCES += renderer.cpp
++libtesseract_api_la_SOURCES += tesscapi.cpp
lib_LTLIBRARIES += libtesseract.la
- libtesseract_la_LDFLAGS =
+ libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) $(libarchive_LIBS)
+diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp
+index 9245d07c..ea964ee6 100644
+--- a/src/api/baseapi.cpp
++++ b/src/api/baseapi.cpp
+@@ -215,6 +215,14 @@ TessBaseAPI::TessBaseAPI()
+ // Use the current locale if building debug code.
+ std::locale::global(std::locale(""));
+ #endif
++ const char *locale;
++ locale = std::setlocale(LC_ALL, nullptr);
++/* willus mod Remove assertions--taken care of in tesscapi.cpp */
++// ASSERT_HOST(!strcmp(locale, "C"));
++ locale = std::setlocale(LC_CTYPE, nullptr);
++// ASSERT_HOST(!strcmp(locale, "C"));
++ locale = std::setlocale(LC_NUMERIC, nullptr);
++// ASSERT_HOST(!strcmp(locale, "C"));
+ }
+
+ TessBaseAPI::~TessBaseAPI() {
+@@ -1333,6 +1341,85 @@ static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level,
+ text->add_str_int("\t", bottom - top);
+ }
+
++/* willus mod -- collect box, baseline and UTF-8 text of every recognized word.
++** (*x00) receives one malloc'd array of 5*nwords ints; (*y00), (*x11), (*y11) and
++** (*ybaseline0) point into that same array (left, top, right, bottom, mean baseline y).
++** (*utf8words) receives a malloc'd buffer with the words back to back, each
++** NUL-terminated.  Returns the number of words (0, with NULL outputs, if no results). */
++int TessBaseAPI::GetOCRWords(int **x00,int **y00,int **x11,int **y11,int **ybaseline0,
++ char **utf8words)
++
++ {
++ int iword,nwords,totlen,it8;
++ int *x0,*y0,*x1,*y1,*ybaseline;
++ char *tutf8;
++
++ ResultIterator *res_it = GetIterator();
++ /* No recognition results: hand back empty outputs instead of dereferencing NULL */
++ if (res_it==nullptr)
++ {
++ (*x00)=(*y00)=(*x11)=(*y11)=(*ybaseline0)=NULL;
++ (*utf8words)=NULL;
++ return(0);
++ }
++ /* Count words */
++ iword=0;
++ totlen=0;
++ while (!res_it->Empty(RIL_BLOCK))
++ {
++ if (res_it->Empty(RIL_WORD))
++ {
++ res_it->Next(RIL_WORD);
++ continue;
++ }
++ iword++;
++ STRING textstr=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
++ totlen+=strlen(textstr.string())+1;
++ res_it->Next(RIL_WORD);
++ }
++ nwords=iword;
++/*
++printf("\nnwords=%d, totlen=%d\n",nwords,totlen);
++*/
++ x0=(*x00)=(int *)malloc(sizeof(int)*5*nwords);
++ y0=(*y00)=&x0[nwords];
++ x1=(*x11)=&y0[nwords];
++ y1=(*y11)=&x1[nwords];
++ ybaseline=(*ybaseline0)=&y1[nwords];
++ tutf8=(*utf8words)=(char *)malloc(totlen);
++ iword=0;
++ it8=0;
++ res_it->Begin();
++ while (!res_it->Empty(RIL_BLOCK))
++ {
++ if (res_it->Empty(RIL_WORD))
++ {
++ res_it->Next(RIL_WORD);
++ continue;
++ }
++ STRING textstr=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
++ strcpy(&tutf8[it8],textstr.string());
++ it8 += strlen(&tutf8[it8])+1;
++ int left, top, right, bottom;
++ int u1,v1,u2,v2;
++ res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
++ res_it->Baseline(RIL_WORD, &u1, &v1, &u2, &v2);
++ x0[iword]=left;
++ x1[iword]=right;
++ y0[iword]=top;
++ y1[iword]=bottom;
++ ybaseline[iword]=(v1+v2)/2;
++ iword++;
++ res_it->Next(RIL_WORD);
++ }
++ /* The iterator returned by GetIterator() belongs to the caller and was never released */
++ delete res_it;
++ return(iword);
++ }
++
++/* willus mod */
++int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words);
++
+ /**
+ * Make a TSV-formatted string from the internal data structures.
+ * page_number is 0-based but will appear in the output as 1-based.
+diff --git a/src/api/baseapi.h b/src/api/baseapi.h
+index 3724dd92..23be5920 100644
+--- a/src/api/baseapi.h
++++ b/src/api/baseapi.h
+@@ -575,6 +575,9 @@ class TESS_API TessBaseAPI {
+ */
+ char* GetHOCRText(ETEXT_DESC* monitor, int page_number);
+
++/* willus mod */
++int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words);
++
+ /**
+ * Make a HTML-formatted string with hOCR markup from the internal
+ * data structures.
+diff --git a/src/api/tesscapi.cpp b/src/api/tesscapi.cpp
+new file mode 100644
+index 00000000..1752fafe
+--- /dev/null
++++ b/src/api/tesscapi.cpp
+@@ -0,0 +1,311 @@
++/*
++** tesscapi.cpp willus.com attempt at C wrapper for tesseract.
++** (Butchered from tesseractmain.cpp)
++** Last updated 9-1-12
++**
++** Copyright (C) 2012 http://willus.com
++**
++** This program is free software: you can redistribute it and/or modify
++** it under the terms of the GNU Affero General Public License as
++** published by the Free Software Foundation, either version 3 of the
++** License, or (at your option) any later version.
++**
++** This program is distributed in the hope that it will be useful,
++** but WITHOUT ANY WARRANTY; without even the implied warranty of
++** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++** GNU Affero General Public License for more details.
++**
++** You should have received a copy of the GNU Affero General Public License
++** along with this program. If not, see <http://www.gnu.org/licenses/>.
++**
++*/
++
++/*
++#include "mfcpch.h"
++*/
++// #define USE_VLD //Uncomment for Visual Leak Detector.
++#if (defined _MSC_VER && defined USE_VLD)
++#include <vld.h>
++#endif
++
++// Include automatically generated configuration file if running autoconf
++#ifdef HAVE_CONFIG_H
++#include "config_auto.h"
++#endif
++#include <locale.h>
++#ifdef USING_GETTEXT
++#include <libintl.h>
++#define _(x) gettext(x)
++#else
++#define _(x) (x)
++#endif
++
++#include "allheaders.h"
++#include "baseapi.h"
++#include "strngs.h"
++#include "params.h"
++#include "blobs.h"
++#include "simddetect.h"
++#include "tesseractclass.h"
++/*
++#include "notdll.h"
++*/
++
++/* C Wrappers */
++#include "tesseract.h"
++
++// static tesseract::TessBaseAPI api[4];
++/*
++** Create and initialize a TessBaseAPI handle; returns it as void *, or NULL if Init() fails.
++** (*status) receives the api->Init() result.  The init banner is written to out and copied
++** to initstr (at most maxlen bytes including the NUL) when those are non-NULL.
++** ocr_type: 0=OEM_DEFAULT, 1=OEM_TESSERACT_ONLY, 2=OEM_LSTM_ONLY,
++**           3=OEM_TESSERACT_LSTM_COMBINED -- currently overridden to 0 below.
++*/
++void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
++ char *initstr,int maxlen,int *status)
++
++ {
++ char original_locale[256];
++ tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI;
++/*
++printf("@tess_capi_init\n");
++printf(" datapath='%s'\n",datapath);
++printf(" language='%s'\n",language);
++printf(" ocr_type=%d\n",ocr_type);
++*/
++#ifdef USE_NLS
++ setlocale (LC_ALL, "");
++ bindtextdomain (PACKAGE, LOCALEDIR);
++ textdomain (PACKAGE);
++#endif
++ /* willus mod, 11-24-16 */
++ /* Tesseract needs "C" locale to correctly parse all data .traineddata files. */
++/*
++printf("locale='%s'\n",setlocale(LC_ALL,NULL));
++printf("ctype='%s'\n",setlocale(LC_CTYPE,NULL));
++printf("numeric='%s'\n",setlocale(LC_NUMERIC,NULL));
++*/
++ strncpy(original_locale,setlocale(LC_ALL,NULL),255);
++ original_locale[255]='\0';
++/*
++printf("original_locale='%s'\n",original_locale);
++*/
++ setlocale(LC_ALL,"C");
++/*
++printf("new locale='%s'\n",setlocale(LC_ALL,NULL));
++printf("new ctype='%s'\n",setlocale(LC_CTYPE,NULL));
++printf("new numeric='%s'\n",setlocale(LC_NUMERIC,NULL));
++*/
++ // fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version());
++ // Make the order of args a bit more forgiving than it used to be.
++ const char* lang = "eng";
++ tesseract::PageSegMode pagesegmode = tesseract::PSM_SINGLE_BLOCK;
++ if (language!=NULL && language[0]!='\0')
++ lang = language;
++ /*
++ if (output == NULL)
++ {
++ fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] "
++ "[-psm pagesegmode] [configfile...]\n"), argv[0]);
++ fprintf(stderr,
++ _("pagesegmode values are:\n"
++ "0 = Orientation and script detection (OSD) only.\n"
++ "1 = Automatic page segmentation with OSD.\n"
++ "2 = Automatic page segmentation, but no OSD, or OCR\n"
++ "3 = Fully automatic page segmentation, but no OSD. (Default)\n"
++ "4 = Assume a single column of text of variable sizes.\n"
++ "5 = Assume a single uniform block of vertically aligned text.\n"
++ "6 = Assume a single uniform block of text.\n"
++ "7 = Treat the image as a single text line.\n"
++ "8 = Treat the image as a single word.\n"
++ "9 = Treat the image as a single word in a circle.\n"
++ "10 = Treat the image as a single character.\n"));
++ fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any"
++ "configfile.\n"));
++ exit(1);
++ }
++ */
++/*
++printf("SSE = %s\n",SIMDDetect::IsSSEAvailable() ? "AVAILABLE" : "NOT AVAILABLE");
++printf("AVX = %s\n",SIMDDetect::IsAVXAvailable() ? "AVAILABLE" : "NOT AVAILABLE");
++*/
++/*
++v4.00 loads either TESSERACT engine, LSTM engine, or both. No CUBE.
++*/
++ ocr_type=0; /* Ignore specified and use default */
++ api->SetOutputName(NULL);
++ (*status)=api->Init(datapath,lang,
++ ocr_type==0 ? tesseract::OEM_DEFAULT :
++ (ocr_type==1 ? tesseract::OEM_TESSERACT_ONLY :
++ (ocr_type==2 ? tesseract::OEM_LSTM_ONLY :
++ (tesseract::OEM_TESSERACT_LSTM_COMBINED))));
++ if ((*status)!=0)
++ {
++ /* willus mod, 11-24-16 */
++ setlocale(LC_ALL,original_locale);
++ api->End();
++ delete api;
++ return(NULL);
++ }
++ /*
++ api.Init("tesscapi",lang,tesseract::OEM_DEFAULT,
++ &(argv[arg]), argc - arg, NULL, NULL, false);
++ */
++ // We have 2 possible sources of pagesegmode: a config file and
++ // the command line. For backwards compatibility reasons, the
++ // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
++ // default for this program is tesseract::PSM_AUTO. We will let
++ // the config file take priority, so the command-line default
++ // can take priority over the tesseract default, so we use the
++ // value from the command line only if the retrieved mode
++ // is still tesseract::PSM_SINGLE_BLOCK, indicating no change
++ // in any config file. Therefore the only way to force
++ // tesseract::PSM_SINGLE_BLOCK is from the command line.
++ // It would be simpler if we could set the value before Init,
++ // but that doesn't work.
++ if (api->GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK)
++ api->SetPageSegMode(pagesegmode);
++
++ /*
++ ** Initialization message
++ */
++ {
++ char istr[1024];
++ int sse,avx;
++ const char *folder=(datapath==NULL ? getenv("TESSDATA_PREFIX") : datapath); /* may still be NULL */
++// printf("tessedit_ocr_engine_mode = %d\n",tessedit_ocr_engine_mode);
++ snprintf(istr,sizeof(istr),"%s",api->Version());
++ sse=tesseract::SIMDDetect::IsSSEAvailable();
++ avx=tesseract::SIMDDetect::IsAVXAvailable();
++ if (sse || avx)
++ snprintf(&istr[strlen(istr)],sizeof(istr)-strlen(istr)," [%s]",sse&&avx?"SSE+AVX":(sse?"SSE":"AVX"));
++ snprintf(&istr[strlen(istr)],sizeof(istr)-strlen(istr),"\n Tesseract data folder = '%s'",folder==NULL?"(null)":folder);
++ strncat(istr,"\n Tesseract languages: ",sizeof(istr)-strlen(istr)-1);
++ GenericVector<STRING> languages;
++ api->GetLoadedLanguagesAsVector(&languages);
++/*
++printf("OEM=%d\n",api->oem());
++printf("Langs='%s'\n",api->GetInitLanguagesAsString());
++printf("AnyTessLang()=%d\n",(int)api->tesseract()->AnyTessLang());
++printf("AnyLSTMLang()=%d\n",(int)api->tesseract()->AnyLSTMLang());
++printf("num_sub_langs()=%d\n",api->tesseract()->num_sub_langs());
++printf("languages.size()=%d\n",(int)languages.size());
++*/
++
++ for (int i=0;i<=api->tesseract()->num_sub_langs();i++)
++ {
++ tesseract::Tesseract *lang1;
++ int eng;
++ lang1 = i==0 ? api->tesseract() : api->tesseract()->get_sub_lang(i-1);
++ eng=(int)lang1->tessedit_ocr_engine_mode;
++ snprintf(&istr[strlen(istr)],sizeof(istr)-strlen(istr),"%s%s [%s]",i==0?"":", ",lang1->lang.string(),
++ eng==2?"LSTM+Tess":(eng==1?"LSTM":"Tess"));
++ }
++/*
++printf("%d. '%s'\n",i+1,languages[i].string());
++printf(" sublang[%d].oem_engine = %d\n",i+1,(int)api->tesseract()->get_sub_lang(i)->tessedit_ocr_engine_mode);
++*/
++
++ /*
++ if (ocr_type==0 || ocr_type==3)
++ sprintf(&istr[strlen(istr)],"[LSTM+] (lang=");
++ else if (ocr_type==2)
++ sprintf(&istr[strlen(istr)],"[LSTM] (lang=");
++ strncpy(&istr[strlen(istr)],language,253-strlen(istr));
++ istr[253]='\0';
++ strcat(istr,")");
++ */
++ if (out!=NULL)
++ fprintf(out,"%s\n",istr);
++ if (initstr!=NULL && maxlen>0)
++ {
++ strncpy(initstr,istr,maxlen-1);
++ initstr[maxlen-1]='\0';
++ }
++ }
++
++
++ /* Turn off LSTM debugging output */
++ api->SetVariable("lstm_debug_level","0");
++#if (WILLUSDEBUG & 1)
++ api->SetVariable("lstm_debug_level","9");
++ api->SetVariable("paragraph_debug_level","9");
++ api->SetVariable("tessdata_manager_debug_level","9");
++ api->SetVariable("tosp_debug_level","9");
++ api->SetVariable("wordrec_debug_level","9");
++ api->SetVariable("segsearch_debug_level","9");
++#endif
++ /* willus mod, 11-24-16 */
++ setlocale(LC_ALL,original_locale);
++ return((void *)api);
++ }
++
++/* OCR pix with handle vapi; copy the UTF-8 text to outstr (at most maxlen bytes incl. NUL). Returns 0, or -1 on failure. */
++int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out)
++
++ {
++ tesseract::TessBaseAPI *api;
++ char *utf8;
++
++ api=(tesseract::TessBaseAPI *)vapi;
++ /* Ask the handle for its current mode: a function-static cache goes stale with several handles */
++ if (api->GetPageSegMode() != (tesseract::PageSegMode)segmode)
++ api->SetPageSegMode((tesseract::PageSegMode)segmode);
++ if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL))
++ {
++ if (out!=NULL)
++ fprintf(out,"tesscapi: Error during bitmap processing.\n");
++ api->Clear();
++ return(-1);
++ }
++ /* GetUTF8Text() returns a new[] buffer (or NULL) that the caller must release */
++ utf8=api->GetUTF8Text();
++ if (maxlen>0)
++ { strncpy(outstr,utf8!=NULL ? utf8 : "",maxlen-1); outstr[maxlen-1]='\0'; }
++ delete [] utf8;
++ api->Clear();
++ return(0);
++ }
++
++/* OCR pix with handle vapi and return per-word results through left/top/right/bottom/
++** ybase/text exactly as TessBaseAPI::GetOCRWords() fills them; (*nw) gets the word
++** count.  Returns 0 on success, -1 (with (*nw)=0) if ProcessPage() fails. */
++int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
++ int **left,int **top,int **right,int **bottom,
++ int **ybase,char **text,int *nw,
++ FILE *out)
++
++ {
++ tesseract::TessBaseAPI *api;
++
++ api=(tesseract::TessBaseAPI *)vapi;
++ /* Ask the handle for its current mode: a function-static cache goes stale (and */
++ /* skips SetPageSegMode on a fresh handle) once more than one handle is in use */
++ if (api->GetPageSegMode() != (tesseract::PageSegMode)segmode)
++ api->SetPageSegMode((tesseract::PageSegMode)segmode);
++ if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL))
++ {
++ if (out!=NULL)
++ fprintf(out,"tesscapi: Error during bitmap processing.\n");
++ api->Clear();
++ (*nw)=0;
++ return(-1);
++ }
++ (*nw)=api->GetOCRWords(left,top,right,bottom,ybase,text);
++ api->Clear();
++ return(0);
++ }
++
++
++void tess_capi_end(void *vapi)
++/* Ends and deletes the TessBaseAPI behind vapi; a NULL handle is ignored. */
++ {
++ tesseract::TessBaseAPI *api;
++
++ if (vapi==NULL)
++ return;
++ api=(tesseract::TessBaseAPI *)vapi;
++ api->End();
++ delete api;
++ }
+diff --git a/src/api/tesseract.h b/src/api/tesseract.h
+new file mode 100644
+index 00000000..575948cc
+--- /dev/null
++++ b/src/api/tesseract.h
+@@ -0,0 +1,29 @@
++/*
++** Willus.com's Tesseract C Wrappers
++**
++** 6-8-12
++**
++*/
++
++#ifndef _TESSERACT_H_
++#define _TESSERACT_H_
++
++//#include
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
++ char *initstr,int maxlen,int *status);
++int tess_capi_get_ocr(void *api,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out);
++int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
++ int **left,int **top,int **right,int **bottom,
++ int **ybase,char **text,int *nw,
++ FILE *out);
++void tess_capi_end(void *api);
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif
+diff --git a/src/ccmain/tessedit.cpp b/src/ccmain/tessedit.cpp
+index 17f0951b..7af94ee2 100644
+--- a/src/ccmain/tessedit.cpp
++++ b/src/ccmain/tessedit.cpp
+@@ -101,6 +101,10 @@ bool Tesseract::init_tesseract_lang_data(
+ " to your \"tessdata\" directory.\n");
+ return false;
+ }
++ /* willus mod */
++ TFile fp;
++ strncpy(fp.tfile_filename,tessdata_path.string(),511);
++ fp.tfile_filename[511]='\0';
+ #ifndef DISABLED_LEGACY_ENGINE
+ if (oem == OEM_DEFAULT) {
+ // Set the engine mode from availability, which can then be overridden by
+@@ -116,7 +120,6 @@ bool Tesseract::init_tesseract_lang_data(
+ #endif // ndef DISABLED_LEGACY_ENGINE
+
+ // If a language specific config file (lang.config) exists, load it in.
+- TFile fp;
+ if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
+ ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp,
+ this->params());
+diff --git a/src/ccutil/ccutil.h b/src/ccutil/ccutil.h
+index 71e89c60..bdeccc14 100644
+--- a/src/ccutil/ccutil.h
++++ b/src/ccutil/ccutil.h
+@@ -80,6 +80,13 @@ class CCUtil {
+ // Member parameters.
+ // These have to be declared and initialized after params_ member, since
+ // params_ should be initialized before parameters are added to it.
++/* willus mod */
++/*
++ #ifdef _WIN32
++ STRING_VAR_H(tessedit_module_name, WINDLLNAME,
++ "Module colocated with tessdata dir");
++ #endif
++*/
+ INT_VAR_H(ambigs_debug_level, 0, "Debug level for unichar ambiguities");
+ BOOL_VAR_H(use_definite_ambigs_for_classifier, false,
+ "Use definite ambiguities when running character classifier");
+diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h
+index 3556d153..3a5e8662 100644
+--- a/src/ccutil/genericvector.h
++++ b/src/ccutil/genericvector.h
+@@ -382,7 +382,26 @@ inline bool LoadDataFromFile(const char* filename, GenericVector<char>* data) {
+ // reserve an extra byte in case caller wants to append a '\0' character
+ data->reserve(size + 1);
+ data->resize_no_init(size);
+- result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
++ /* willus mod Dec 2018--weird issue with Win XP and MinGW gcc 7.3.0 */
++ /* Can't read entire file at once -- need to break up into smaller blocksize reads */
++ {
++ int frs,n;
++ int blocksize;
++ blocksize=1024*1024;
++ for (n=0;1;)
++ {
++ int bs;
++ bs= size-n > blocksize ? blocksize : size-n;
++ frs=(int)fread(&(*data)[n],1,bs,fp);
++ n+=frs;
++ if (frs<bs || bs<blocksize || n>=size)
++ break;
++ }
++ result = static_cast<long>((long)n==size);
++ }
++ /*
++ result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
++ */
+ }
+ fclose(fp);
+ }
+diff --git a/src/ccutil/mainblk.cpp b/src/ccutil/mainblk.cpp
+index 52b04b04..80b26044 100644
+--- a/src/ccutil/mainblk.cpp
++++ b/src/ccutil/mainblk.cpp
+@@ -55,8 +55,22 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {
+ #if defined(_WIN32)
+ } else if (datadir == nullptr || _access(datadir.string(), 0) != 0) {
+ /* Look for tessdata in directory of executable. */
++ /*
++ char drive[_MAX_DRIVE];
++ char dir[_MAX_DIR];
++ */
+ char path[_MAX_PATH];
+- DWORD length = GetModuleFileName(nullptr, path, sizeof(path));
++ int i;
++ /* DWORD length = */ GetModuleFileName(nullptr, path, sizeof(path));
++ /* willus mod--avoid _splitpath_s -- not in XP */
++ for (i=strlen(path)-1;i>=0 && path[i]!='/' && path[i]!='\\';i--);
++ if (i>=0)
++ {
++ path[i]='\0';
++ datadir=path;
++ datadir += "/tessdata";
++ }
++ /*
+ if (length > 0 && length < sizeof(path)) {
+ char* separator = std::strrchr(path, '\\');
+ if (separator != nullptr) {
+@@ -65,6 +79,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {
+ datadir += "/tessdata";
+ }
+ }
++ */
+ #endif /* _WIN32 */
+ #if defined(TESSDATA_PREFIX)
+ } else {
+diff --git a/src/ccutil/params.cpp b/src/ccutil/params.cpp
+index 00bf2563..486c5ce0 100644
+--- a/src/ccutil/params.cpp
++++ b/src/ccutil/params.cpp
+@@ -82,7 +82,8 @@ bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,
+
+ if (!foundit) {
+ anyerr = true; // had an error
+- tprintf("Warning: Parameter not found: %s\n", line);
++ /* willus mod */
++ tprintf("Tesseract warning: Parameter %s not found in file %s.\n",line,fp->tfile_filename);
+ }
+ }
+ }
+diff --git a/src/ccutil/serialis.cpp b/src/ccutil/serialis.cpp
+index 7def011f..6107a494 100644
+--- a/src/ccutil/serialis.cpp
++++ b/src/ccutil/serialis.cpp
+@@ -201,6 +201,9 @@ bool TFile::Open(const STRING& filename, FileReader reader) {
+ offset_ = 0;
+ is_writing_ = false;
+ swap_ = false;
++ /* willus mod */
++ strncpy(tfile_filename,filename.string(),511);
++ tfile_filename[511]='\0';
+ if (reader == nullptr)
+ return LoadDataFromFile(filename, data_);
+ else
+diff --git a/src/ccutil/serialis.h b/src/ccutil/serialis.h
+index 095b9227..4cc8251e 100644
+--- a/src/ccutil/serialis.h
++++ b/src/ccutil/serialis.h
+@@ -77,6 +77,8 @@ class TFile {
+ public:
+ TFile();
+ ~TFile();
++ /* willus mod */
++ char tfile_filename[512];
+
+ // All the Open methods load the whole file into memory for reading.
+ // Opens a file with a supplied reader, or nullptr to use the default.
+diff --git a/src/lstm/input.cpp b/src/lstm/input.cpp
+index 73b584b3..0b0b54c3 100644
+--- a/src/lstm/input.cpp
++++ b/src/lstm/input.cpp
+@@ -93,8 +93,11 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data,
+ return nullptr;
+ }
+ if (width < min_width || height < min_width) {
++ /* willus mod -- no warning */
++ /*
+ tprintf("Image too small to scale!! (%dx%d vs min width of %d)\n", width,
+ height, min_width);
++ */
+ pixDestroy(&pix);
+ return nullptr;
+ }
+--
+2.22.0
+