diff --git a/pkgs/development/python-modules/pyocr/default.nix b/pkgs/development/python-modules/pyocr/default.nix index 65a8c741d6e..54caba10189 100644 --- a/pkgs/development/python-modules/pyocr/default.nix +++ b/pkgs/development/python-modules/pyocr/default.nix @@ -4,30 +4,35 @@ buildPythonPackage rec { name = "pyocr-${version}"; - version = "0.4.6"; + version = "0.4.7"; # Don't fetch from PYPI because it doesn't contain tests. src = fetchFromGitHub { owner = "jflesch"; repo = "pyocr"; rev = version; - sha256 = "0amyhkkm400qzbw65ivyzrzxl2r7vxqgsgqm7ml95m7gwkwhnzz0"; + sha256 = "1iw73r8yrgjf8g00yzpz62ymqbf89cqhyhl9g430srmsrq7mn2yd"; }; - patches = [ ./tesseract.patch ]; + NIX_CUNEIFORM_CMD = "${cuneiform}/bin/cuneiform"; + NIX_CUNEIFORM_DATA = "${cuneiform}/share/cuneiform"; + NIX_LIBTESSERACT_PATH = "${tesseract}/lib/libtesseract.so"; + NIX_TESSDATA_PREFIX = "${tesseract}/share/tessdata"; + NIX_TESSERACT_CMD = "${tesseract}/bin/tesseract"; + + patches = [ ./paths.patch ]; postPatch = '' - sed -i \ - -e 's,^\(TESSERACT_CMD *= *\).*,\1"${tesseract}/bin/tesseract",' \ - -e 's,^\(CUNEIFORM_CMD *= *\).*,\1"${cuneiform}/bin/cuneiform",' \ - -e '/^CUNIFORM_POSSIBLE_PATHS *= *\[/,/^\]$/ { - c CUNIFORM_POSSIBLE_PATHS = ["${cuneiform}/share/cuneiform"] - }' src/pyocr/{tesseract,cuneiform}.py + substituteInPlace src/pyocr/cuneiform.py \ + --subst-var NIX_CUNEIFORM_CMD \ + --subst-var NIX_CUNEIFORM_CMD - sed -i -r \ - -e 's,"libtesseract\.so\.3","${tesseract}/lib/libtesseract.so",' \ - -e 's,^(TESSDATA_PREFIX *=).*,\1 "${tesseract}/share/tessdata",' \ - src/pyocr/libtesseract/tesseract_raw.py + substituteInPlace src/pyocr/tesseract.py \ + --subst-var NIX_TESSERACT_CMD + + substituteInPlace src/pyocr/libtesseract/tesseract_raw.py \ + --subst-var NIX_TESSDATA_PREFIX \ + --subst-var NIX_LIBTESSERACT_PATH # Disable specific tests that are probably failing because of this issue: # https://github.com/jflesch/pyocr/issues/52 diff --git a/pkgs/development/python-modules/pyocr/paths.patch b/pkgs/development/python-modules/pyocr/paths.patch new file mode 100644 index 00000000000..3fe11598b7d --- /dev/null +++ b/pkgs/development/python-modules/pyocr/paths.patch @@ -0,0 +1,111 @@ +diff --git a/src/pyocr/cuneiform.py b/src/pyocr/cuneiform.py +index a461d92..1f2b914 100644 +--- a/src/pyocr/cuneiform.py ++++ b/src/pyocr/cuneiform.py +@@ -27,13 +27,9 @@ from . import error + from . import util + + +-# CHANGE THIS IF CUNEIFORM IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY +-CUNEIFORM_CMD = 'cuneiform' ++CUNEIFORM_CMD = '@NIX_CUNEIFORM_CMD@' + +-CUNEIFORM_DATA_POSSIBLE_PATHS = [ +- "/usr/local/share/cuneiform", +- "/usr/share/cuneiform", +-] ++CUNEIFORM_DATA_POSSIBLE_PATHS = ['@NIX_CUNEIFORM_DATA@'] + + LANGUAGES_LINE_PREFIX = "Supported languages: " + LANGUAGES_SPLIT_RE = re.compile("[^a-z]") +diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py +index b4e7bda..47505f7 100644 +--- a/src/pyocr/libtesseract/tesseract_raw.py ++++ b/src/pyocr/libtesseract/tesseract_raw.py +@@ -1,55 +1,13 @@ + import ctypes + import logging + import os +-import sys + + from ..error import TesseractError + + + logger = logging.getLogger(__name__) + +-TESSDATA_PREFIX = os.getenv('TESSDATA_PREFIX', None) +-libnames = [] +- +-if getattr(sys, 'frozen', False): +- # Pyinstaller integration +- libnames += [os.path.join(sys._MEIPASS, "libtesseract-4.dll")] +- libnames += [os.path.join(sys._MEIPASS, "libtesseract-3.dll")] +- tessdata = os.path.join(sys._MEIPASS, "data") +- if not os.path.exists(os.path.join(tessdata, "tessdata")): +- logger.warning( +- "Running from container, but no tessdata ({}) found !".format( +- tessdata +- ) +- ) +- else: +- TESSDATA_PREFIX = tessdata +- +- +-if sys.platform[:3] == "win": +- libnames += [ +- # Jflesch> Don't they have the equivalent of LD_LIBRARY_PATH on +- # Windows ? +- "../vs2010/DLL_Release/libtesseract302.dll", +- "libtesseract302.dll", +- "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-4.dll", +- "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-3.dll", +- ] +-else: +- libnames += [ +- "libtesseract.so.4", +- "libtesseract.so.3", +- ] +- +- +-g_libtesseract = None +- +-for libname in libnames: +- try: +- g_libtesseract = ctypes.cdll.LoadLibrary(libname) +- break +- except OSError: +- pass ++g_libtesseract = ctypes.cdll.LoadLibrary('@NIX_LIBTESSERACT_PATH@') + + + class PageSegMode(object): +@@ -326,12 +284,11 @@ def init(lang=None): + try: + if lang: + lang = lang.encode("utf-8") +- prefix = None +- if TESSDATA_PREFIX: +- prefix = TESSDATA_PREFIX.encode("utf-8") ++ prefix = os.getenv('TESSDATA_PREFIX', '@NIX_TESSDATA_PREFIX@') ++ os.environ['TESSDATA_PREFIX'] = prefix + g_libtesseract.TessBaseAPIInit3( + ctypes.c_void_p(handle), +- ctypes.c_char_p(prefix), ++ ctypes.c_char_p(prefix.encode('utf-8')), + ctypes.c_char_p(lang) + ) + g_libtesseract.TessBaseAPISetVariable( +diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py +index c935881..7139ffe 100755 +--- a/src/pyocr/tesseract.py ++++ b/src/pyocr/tesseract.py +@@ -31,8 +31,7 @@ from .builders import DigitBuilder # backward compatibility + from .error import TesseractError # backward compatibility + from .util import digits_only + +-# CHANGE THIS IF TESSERACT IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY +-TESSERACT_CMD = 'tesseract.exe' if os.name == 'nt' else 'tesseract' ++TESSERACT_CMD = '@NIX_TESSERACT_CMD@' + + TESSDATA_EXTENSION = ".traineddata" + diff --git a/pkgs/development/python-modules/pyocr/tesseract.patch b/pkgs/development/python-modules/pyocr/tesseract.patch deleted file mode 100644 index d09a7c57352..00000000000 --- a/pkgs/development/python-modules/pyocr/tesseract.patch +++ /dev/null @@ -1,316 +0,0 @@ -This patch is required for pyocr to work with Tesseract version 3.05.00 -and has been submitted upstream at the following URL: - -https://github.com/jflesch/pyocr/pull/62 - -diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py -index 73c964d..20f390c 100644 ---- a/src/pyocr/builders.py -+++ b/src/pyocr/builders.py -@@ -240,8 +240,10 @@ class BaseBuilder(object): - cuneiform_args : Arguments passed to the Cuneiform command line. - """ - -- def __init__(self, file_extensions, tesseract_configs, cuneiform_args): -+ def __init__(self, file_extensions, tesseract_flags, tesseract_configs, -+ cuneiform_args): - self.file_extensions = file_extensions -+ self.tesseract_flags = tesseract_flags - self.tesseract_configs = tesseract_configs - self.cuneiform_args = cuneiform_args - -@@ -298,7 +300,7 @@ class TextBuilder(BaseBuilder): - def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False, - cuneiform_fax=False, cuneiform_singlecolumn=False): - file_ext = ["txt"] -- tess_conf = ["-psm", str(tesseract_layout)] -+ tess_flags = ["-psm", str(tesseract_layout)] - cun_args = ["-f", "text"] - # Add custom cuneiform parameters if needed - for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"), -@@ -306,7 +308,7 @@ class TextBuilder(BaseBuilder): - (cuneiform_singlecolumn, "--singlecolumn")]: - if par: - cun_args.append(arg) -- super(TextBuilder, self).__init__(file_ext, tess_conf, cun_args) -+ super(TextBuilder, self).__init__(file_ext, tess_flags, [], cun_args) - self.tesseract_layout = tesseract_layout - self.built_text = [] - -@@ -540,9 +542,11 @@ class WordBoxBuilder(BaseBuilder): - - def __init__(self, tesseract_layout=1): - file_ext = ["html", "hocr"] -- tess_conf = ["hocr", "-psm", str(tesseract_layout)] -+ tess_flags = ["-psm", str(tesseract_layout)] -+ tess_conf = ["hocr"] - cun_args = ["-f", "hocr"] -- super(WordBoxBuilder, self).__init__(file_ext, tess_conf, cun_args) -+ super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, -+ cun_args) - self.word_boxes = [] - self.tesseract_layout = tesseract_layout - -@@ -614,9 +618,11 @@ class LineBoxBuilder(BaseBuilder): - - def __init__(self, tesseract_layout=1): - file_ext = ["html", "hocr"] -- tess_conf = ["hocr", "-psm", str(tesseract_layout)] -+ tess_flags = ["-psm", str(tesseract_layout)] -+ tess_conf = ["hocr"] - cun_args = ["-f", "hocr"] -- super(LineBoxBuilder, self).__init__(file_ext, tess_conf, cun_args) -+ super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, -+ cun_args) - self.lines = [] - self.tesseract_layout = tesseract_layout - -diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py -index 0c2259a..f7ab309 100644 ---- a/src/pyocr/libtesseract/tesseract_raw.py -+++ b/src/pyocr/libtesseract/tesseract_raw.py -@@ -263,11 +263,22 @@ if g_libtesseract: - ] - g_libtesseract.TessDeleteText.restype = None - -- g_libtesseract.TessBaseAPIDetectOS.argtypes = [ -- ctypes.c_void_p, # TessBaseAPI* -- ctypes.POINTER(OSResults), -- ] -- g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool -+ if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'): -+ g_libtesseract.TessBaseAPIDetectOrientationScript.argtypes = [ -+ ctypes.c_void_p, # TessBaseAPI* -+ ctypes.POINTER(ctypes.c_int), # orient_deg -+ ctypes.POINTER(ctypes.c_float), # orient_conf -+ ctypes.POINTER(ctypes.c_char_p), # script_name -+ ctypes.POINTER(ctypes.c_float), # script_conf -+ ] -+ g_libtesseract.TessBaseAPIDetectOrientationScript.restype = \ -+ ctypes.c_bool -+ else: -+ g_libtesseract.TessBaseAPIDetectOS.argtypes = [ -+ ctypes.c_void_p, # TessBaseAPI* -+ ctypes.POINTER(OSResults), -+ ] -+ g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool - - - def init(lang=None): -@@ -526,15 +537,37 @@ def detect_os(handle): - global g_libtesseract - assert(g_libtesseract) - -- results = OSResults() -- r = g_libtesseract.TessBaseAPIDetectOS( -- ctypes.c_void_p(handle), -- ctypes.pointer(results) -- ) -- if not r: -- raise TesseractError("detect_orientation failed", -- "TessBaseAPIDetectOS() failed") -- return { -- "orientation": results.best_orientation_id, -- "confidence": results.best_oconfidence, -- } -+ # Use the new API function if it is available, because since Tesseract -+ # 3.05.00 the old API function _always_ returns False. -+ if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'): -+ orientation_deg = ctypes.c_int(0) -+ orientation_confidence = ctypes.c_float(0.0) -+ -+ r = g_libtesseract.TessBaseAPIDetectOrientationScript( -+ ctypes.c_void_p(handle), -+ ctypes.byref(orientation_deg), -+ ctypes.byref(orientation_confidence), -+ None, # script_name -+ None # script_confidence -+ ) -+ -+ if not r: -+ raise TesseractError("detect_orientation failed", -+ "TessBaseAPIDetectOrientationScript() failed") -+ return { -+ "orientation": round(orientation_deg.value / 90), -+ "confidence": orientation_confidence.value, -+ } -+ else: # old API (before Tesseract 3.05.00) -+ results = OSResults() -+ r = g_libtesseract.TessBaseAPIDetectOS( -+ ctypes.c_void_p(handle), -+ ctypes.pointer(results) -+ ) -+ if not r: -+ raise TesseractError("detect_orientation failed", -+ "TessBaseAPIDetectOS() failed") -+ return { -+ "orientation": results.best_orientation_id, -+ "confidence": results.best_oconfidence, -+ } -diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py -index 99b0121..658c96b 100755 ---- a/src/pyocr/tesseract.py -+++ b/src/pyocr/tesseract.py -@@ -22,6 +22,8 @@ import os - import subprocess - import sys - import tempfile -+import contextlib -+import shutil - - from . import builders - from . import error -@@ -62,9 +64,11 @@ class CharBoxBuilder(builders.BaseBuilder): - - def __init__(self): - file_ext = ["box"] -+ tess_flags = [] - tess_conf = ["batch.nochop", "makebox"] - cun_args = [] -- super(CharBoxBuilder, self).__init__(file_ext, tess_conf, cun_args) -+ super(CharBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, -+ cun_args) - self.tesseract_layout = 1 - - @staticmethod -@@ -173,18 +177,19 @@ def detect_orientation(image, lang=None): - TesseractError --- if no script detected on the image - """ - _set_environment() -- with temp_file(".bmp") as input_file: -- command = [TESSERACT_CMD, input_file.name, 'stdout', "-psm", "0"] -+ with temp_dir() as tmpdir: -+ command = [TESSERACT_CMD, "input.bmp", 'stdout', "-psm", "0"] - if lang is not None: - command += ['-l', lang] - - if image.mode != "RGB": - image = image.convert("RGB") -- image.save(input_file.name) -+ image.save(os.path.join(tmpdir, "input.bmp")) - - proc = subprocess.Popen(command, stdin=subprocess.PIPE, shell=False, - startupinfo=g_subprocess_startup_info, - creationflags=g_creation_flags, -+ cwd=tmpdir, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - proc.stdin.close() -@@ -224,8 +229,8 @@ def get_available_builders(): - ] - - --def run_tesseract(input_filename, output_filename_base, lang=None, -- configs=None): -+def run_tesseract(input_filename, output_filename_base, cwd=None, lang=None, -+ flags=None, configs=None): - ''' - Runs Tesseract: - `TESSERACT_CMD` \ -@@ -238,6 +243,8 @@ def run_tesseract(input_filename, output_filename_base, lang=None, - input_filename --- image to read - output_filename_base --- file name in which must be stored the result - (without the extension) -+ cwd --- Run Tesseract in the specified working directory or use current -+ one if None - lang --- Tesseract language to use (if None, none will be specified) - config --- List of Tesseract configs to use (if None, none will be - specified) -@@ -252,10 +259,13 @@ def run_tesseract(input_filename, output_filename_base, lang=None, - if lang is not None: - command += ['-l', lang] - -+ if flags is not None: -+ command += flags -+ - if configs is not None: - command += configs - -- proc = subprocess.Popen(command, -+ proc = subprocess.Popen(command, cwd=cwd, - startupinfo=g_subprocess_startup_info, - creationflags=g_creation_flags, - stdout=subprocess.PIPE, -@@ -301,11 +311,18 @@ class ReOpenableTempfile(object): - self.name = None - - --def temp_file(suffix): -- ''' Returns a temporary file ''' -- if os.name == 'nt': # Windows -- return ReOpenableTempfile(suffix) -- return tempfile.NamedTemporaryFile(prefix='tess_', suffix=suffix) -+@contextlib.contextmanager -+def temp_dir(): -+ """ -+ A context manager for maintaining a temporary directory -+ """ -+ # NOTE: Drop this as soon as we don't support Python 2.7 anymore, because -+ # since Python 3.2 there is a context manager called TemporaryDirectory(). -+ path = tempfile.mkdtemp(prefix='tess_') -+ try: -+ yield path -+ finally: -+ shutil.rmtree(path) - - - def image_to_string(image, lang=None, builder=None): -@@ -329,23 +346,20 @@ def image_to_string(image, lang=None, builder=None): - - if builder is None: - builder = builders.TextBuilder() -- with temp_file(".bmp") as input_file: -- with temp_file('') as output_file: -- output_file_name_base = output_file.name -- -+ with temp_dir() as tmpdir: - if image.mode != "RGB": - image = image.convert("RGB") -- image.save(input_file.name) -- (status, errors) = run_tesseract(input_file.name, -- output_file_name_base, -+ image.save(os.path.join(tmpdir, "input.bmp")) -+ (status, errors) = run_tesseract("input.bmp", "output", cwd=tmpdir, - lang=lang, -+ flags=builder.tesseract_flags, - configs=builder.tesseract_configs) - if status: - raise TesseractError(status, errors) - - output_file_name = "ERROR" - for file_extension in builder.file_extensions: -- output_file_name = ('%s.%s' % (output_file_name_base, -+ output_file_name = ('%s.%s' % (os.path.join(tmpdir, "output"), - file_extension)) - if not os.access(output_file_name, os.F_OK): - continue -diff --git a/tests/tests_libtesseract.py b/tests/tests_libtesseract.py -index ccddd07..1ac2a4b 100644 ---- a/tests/tests_libtesseract.py -+++ b/tests/tests_libtesseract.py -@@ -33,8 +33,9 @@ class TestContext(unittest.TestCase): - (3, 3, 0), - (3, 4, 0), - (3, 4, 1), -+ (3, 5, 0), - ), ("Tesseract does not have the expected version" -- " (3.4.0) ! Some tests will be skipped !")) -+ " (3.5.0) ! Some tests will be skipped !")) - - def test_langs(self): - langs = libtesseract.get_available_languages() -diff --git a/tests/tests_tesseract.py b/tests/tests_tesseract.py -index e29c512..fa4d483 100644 ---- a/tests/tests_tesseract.py -+++ b/tests/tests_tesseract.py -@@ -27,8 +27,9 @@ class TestContext(unittest.TestCase): - (3, 3, 0), - (3, 4, 0), - (3, 4, 1), -+ (3, 5, 0), - ), ("Tesseract does not have the expected version" -- " (3.4.0) ! Some tests will be skipped !")) -+ " (3.5.0) ! Some tests will be skipped !")) - - def test_langs(self): - langs = tesseract.get_available_languages()