python/pyocr: Move package into python-modules
We already have a patch feeling lonely inside the python-modules directory, and to have everything in one place, let's actually move pyocr into its own dedicated directory so it's easier to patch it up (which we're going to do). Right now, the package fails to build because of a few test failures, so I haven't tested this beyond evaluating it. Signed-off-by: aszlig <aszlig@redmoonstudios.org>
This commit is contained in:
64
pkgs/development/python-modules/pyocr/default.nix
Normal file
64
pkgs/development/python-modules/pyocr/default.nix
Normal file
@@ -0,0 +1,64 @@
|
||||
# Nix expression for the pyocr Python package. It is a function that receives
# its dependencies as arguments and returns the result of buildPythonPackage.
{ lib, fetchFromGitHub, buildPythonPackage, pillow, six
|
||||
, tesseract, cuneiform
|
||||
}:
|
||||
|
||||
buildPythonPackage rec {
|
||||
name = "pyocr-${version}";
|
||||
version = "0.4.6";
|
||||
|
||||
# Fetch from GitHub rather than PyPI because the PyPI release doesn't
# contain the tests.
|
||||
src = fetchFromGitHub {
|
||||
owner = "jflesch";
|
||||
repo = "pyocr";
|
||||
# The upstream tag name is the bare version string.
rev = version;
|
||||
sha256 = "0amyhkkm400qzbw65ivyzrzxl2r7vxqgsgqm7ml95m7gwkwhnzz0";
|
||||
};
|
||||
|
||||
# According to the header of tesseract.patch, it is required for pyocr to
# work with Tesseract 3.05.00 and has been submitted upstream.
patches = [ ./tesseract.patch ];
|
||||
|
||||
# After patching:
#  1. Replace the TESSERACT_CMD / CUNEIFORM_CMD assignments and the whole
#     CUNIFORM_POSSIBLE_PATHS list in src/pyocr/{tesseract,cuneiform}.py
#     with paths inside the tesseract and cuneiform packages.
#  2. Point the "libtesseract.so.3" string and TESSDATA_PREFIX in
#     src/pyocr/libtesseract/tesseract_raw.py at the tesseract package.
#  3. For every entry of disabledTests, append a
#     "<name> = unittest.skip(<name>)" line to the matching tests file.
postPatch = ''
|
||||
sed -i \
|
||||
-e 's,^\(TESSERACT_CMD *= *\).*,\1"${tesseract}/bin/tesseract",' \
|
||||
-e 's,^\(CUNEIFORM_CMD *= *\).*,\1"${cuneiform}/bin/cuneiform",' \
|
||||
-e '/^CUNIFORM_POSSIBLE_PATHS *= *\[/,/^\]$/ {
|
||||
c CUNIFORM_POSSIBLE_PATHS = ["${cuneiform}/share/cuneiform"]
|
||||
}' src/pyocr/{tesseract,cuneiform}.py
|
||||
|
||||
sed -i -r \
|
||||
-e 's,"libtesseract\.so\.3","${tesseract}/lib/libtesseract.so",' \
|
||||
-e 's,^(TESSDATA_PREFIX *=).*,\1 "${tesseract}/share/tessdata",' \
|
||||
src/pyocr/libtesseract/tesseract_raw.py
|
||||
|
||||
# Disable specific tests that are probably failing because of this issue:
|
||||
# https://github.com/jflesch/pyocr/issues/52
|
||||
for test in $disabledTests; do
|
||||
file="''${test%%:*}"
|
||||
fun="''${test#*:}"
|
||||
echo "$fun = unittest.skip($fun)" >> "tests/tests_$file.py"
|
||||
done
|
||||
'';
|
||||
|
||||
# Consumed by the loop in postPatch. Each entry has the form
# "<file>:<Class>.<method>": the part before the first colon selects
# tests/tests_<file>.py, the part after it is the name that gets wrapped
# in unittest.skip().
disabledTests = [
|
||||
"cuneiform:TestTxt.test_basic"
|
||||
"cuneiform:TestTxt.test_european"
|
||||
"cuneiform:TestTxt.test_french"
|
||||
"cuneiform:TestWordBox.test_basic"
|
||||
"cuneiform:TestWordBox.test_european"
|
||||
"cuneiform:TestWordBox.test_french"
|
||||
"libtesseract:TestBasicDoc.test_basic"
|
||||
"libtesseract:TestDigitLineBox.test_digits"
|
||||
"libtesseract:TestLineBox.test_japanese"
|
||||
"libtesseract:TestTxt.test_japanese"
|
||||
"libtesseract:TestWordBox.test_japanese"
|
||||
"tesseract:TestDigitLineBox.test_digits"
|
||||
"tesseract:TestTxt.test_japanese"
|
||||
];
|
||||
|
||||
propagatedBuildInputs = [ pillow six ];
|
||||
|
||||
meta = {
|
||||
homepage = "https://github.com/jflesch/pyocr";
|
||||
description = "A Python wrapper for Tesseract and Cuneiform";
|
||||
license = lib.licenses.gpl3Plus;
|
||||
};
|
||||
}
|
||||
316
pkgs/development/python-modules/pyocr/tesseract.patch
Normal file
316
pkgs/development/python-modules/pyocr/tesseract.patch
Normal file
@@ -0,0 +1,316 @@
|
||||
This patch is required for pyocr to work with Tesseract version 3.05.00
|
||||
and has been submitted upstream at the following URL:
|
||||
|
||||
https://github.com/jflesch/pyocr/pull/62
|
||||
|
||||
diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py
|
||||
index 73c964d..20f390c 100644
|
||||
--- a/src/pyocr/builders.py
|
||||
+++ b/src/pyocr/builders.py
|
||||
@@ -240,8 +240,10 @@ class BaseBuilder(object):
|
||||
cuneiform_args : Arguments passed to the Cuneiform command line.
|
||||
"""
|
||||
|
||||
- def __init__(self, file_extensions, tesseract_configs, cuneiform_args):
|
||||
+ def __init__(self, file_extensions, tesseract_flags, tesseract_configs,
|
||||
+ cuneiform_args):
|
||||
self.file_extensions = file_extensions
|
||||
+ self.tesseract_flags = tesseract_flags
|
||||
self.tesseract_configs = tesseract_configs
|
||||
self.cuneiform_args = cuneiform_args
|
||||
|
||||
@@ -298,7 +300,7 @@ class TextBuilder(BaseBuilder):
|
||||
def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False,
|
||||
cuneiform_fax=False, cuneiform_singlecolumn=False):
|
||||
file_ext = ["txt"]
|
||||
- tess_conf = ["-psm", str(tesseract_layout)]
|
||||
+ tess_flags = ["-psm", str(tesseract_layout)]
|
||||
cun_args = ["-f", "text"]
|
||||
# Add custom cuneiform parameters if needed
|
||||
for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"),
|
||||
@@ -306,7 +308,7 @@ class TextBuilder(BaseBuilder):
|
||||
(cuneiform_singlecolumn, "--singlecolumn")]:
|
||||
if par:
|
||||
cun_args.append(arg)
|
||||
- super(TextBuilder, self).__init__(file_ext, tess_conf, cun_args)
|
||||
+ super(TextBuilder, self).__init__(file_ext, tess_flags, [], cun_args)
|
||||
self.tesseract_layout = tesseract_layout
|
||||
self.built_text = []
|
||||
|
||||
@@ -540,9 +542,11 @@ class WordBoxBuilder(BaseBuilder):
|
||||
|
||||
def __init__(self, tesseract_layout=1):
|
||||
file_ext = ["html", "hocr"]
|
||||
- tess_conf = ["hocr", "-psm", str(tesseract_layout)]
|
||||
+ tess_flags = ["-psm", str(tesseract_layout)]
|
||||
+ tess_conf = ["hocr"]
|
||||
cun_args = ["-f", "hocr"]
|
||||
- super(WordBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
|
||||
+ super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
|
||||
+ cun_args)
|
||||
self.word_boxes = []
|
||||
self.tesseract_layout = tesseract_layout
|
||||
|
||||
@@ -614,9 +618,11 @@ class LineBoxBuilder(BaseBuilder):
|
||||
|
||||
def __init__(self, tesseract_layout=1):
|
||||
file_ext = ["html", "hocr"]
|
||||
- tess_conf = ["hocr", "-psm", str(tesseract_layout)]
|
||||
+ tess_flags = ["-psm", str(tesseract_layout)]
|
||||
+ tess_conf = ["hocr"]
|
||||
cun_args = ["-f", "hocr"]
|
||||
- super(LineBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
|
||||
+ super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
|
||||
+ cun_args)
|
||||
self.lines = []
|
||||
self.tesseract_layout = tesseract_layout
|
||||
|
||||
diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py
|
||||
index 0c2259a..f7ab309 100644
|
||||
--- a/src/pyocr/libtesseract/tesseract_raw.py
|
||||
+++ b/src/pyocr/libtesseract/tesseract_raw.py
|
||||
@@ -263,11 +263,22 @@ if g_libtesseract:
|
||||
]
|
||||
g_libtesseract.TessDeleteText.restype = None
|
||||
|
||||
- g_libtesseract.TessBaseAPIDetectOS.argtypes = [
|
||||
- ctypes.c_void_p, # TessBaseAPI*
|
||||
- ctypes.POINTER(OSResults),
|
||||
- ]
|
||||
- g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
|
||||
+ if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
|
||||
+ g_libtesseract.TessBaseAPIDetectOrientationScript.argtypes = [
|
||||
+ ctypes.c_void_p, # TessBaseAPI*
|
||||
+ ctypes.POINTER(ctypes.c_int), # orient_deg
|
||||
+ ctypes.POINTER(ctypes.c_float), # orient_conf
|
||||
+ ctypes.POINTER(ctypes.c_char_p), # script_name
|
||||
+ ctypes.POINTER(ctypes.c_float), # script_conf
|
||||
+ ]
|
||||
+ g_libtesseract.TessBaseAPIDetectOrientationScript.restype = \
|
||||
+ ctypes.c_bool
|
||||
+ else:
|
||||
+ g_libtesseract.TessBaseAPIDetectOS.argtypes = [
|
||||
+ ctypes.c_void_p, # TessBaseAPI*
|
||||
+ ctypes.POINTER(OSResults),
|
||||
+ ]
|
||||
+ g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
|
||||
|
||||
|
||||
def init(lang=None):
|
||||
@@ -526,15 +537,37 @@ def detect_os(handle):
|
||||
global g_libtesseract
|
||||
assert(g_libtesseract)
|
||||
|
||||
- results = OSResults()
|
||||
- r = g_libtesseract.TessBaseAPIDetectOS(
|
||||
- ctypes.c_void_p(handle),
|
||||
- ctypes.pointer(results)
|
||||
- )
|
||||
- if not r:
|
||||
- raise TesseractError("detect_orientation failed",
|
||||
- "TessBaseAPIDetectOS() failed")
|
||||
- return {
|
||||
- "orientation": results.best_orientation_id,
|
||||
- "confidence": results.best_oconfidence,
|
||||
- }
|
||||
+ # Use the new API function if it is available, because since Tesseract
|
||||
+ # 3.05.00 the old API function _always_ returns False.
|
||||
+ if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
|
||||
+ orientation_deg = ctypes.c_int(0)
|
||||
+ orientation_confidence = ctypes.c_float(0.0)
|
||||
+
|
||||
+ r = g_libtesseract.TessBaseAPIDetectOrientationScript(
|
||||
+ ctypes.c_void_p(handle),
|
||||
+ ctypes.byref(orientation_deg),
|
||||
+ ctypes.byref(orientation_confidence),
|
||||
+ None, # script_name
|
||||
+ None # script_confidence
|
||||
+ )
|
||||
+
|
||||
+ if not r:
|
||||
+ raise TesseractError("detect_orientation failed",
|
||||
+ "TessBaseAPIDetectOrientationScript() failed")
|
||||
+ return {
|
||||
+ "orientation": round(orientation_deg.value / 90),
|
||||
+ "confidence": orientation_confidence.value,
|
||||
+ }
|
||||
+ else: # old API (before Tesseract 3.05.00)
|
||||
+ results = OSResults()
|
||||
+ r = g_libtesseract.TessBaseAPIDetectOS(
|
||||
+ ctypes.c_void_p(handle),
|
||||
+ ctypes.pointer(results)
|
||||
+ )
|
||||
+ if not r:
|
||||
+ raise TesseractError("detect_orientation failed",
|
||||
+ "TessBaseAPIDetectOS() failed")
|
||||
+ return {
|
||||
+ "orientation": results.best_orientation_id,
|
||||
+ "confidence": results.best_oconfidence,
|
||||
+ }
|
||||
diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py
|
||||
index 99b0121..658c96b 100755
|
||||
--- a/src/pyocr/tesseract.py
|
||||
+++ b/src/pyocr/tesseract.py
|
||||
@@ -22,6 +22,8 @@ import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
+import contextlib
|
||||
+import shutil
|
||||
|
||||
from . import builders
|
||||
from . import error
|
||||
@@ -62,9 +64,11 @@ class CharBoxBuilder(builders.BaseBuilder):
|
||||
|
||||
def __init__(self):
|
||||
file_ext = ["box"]
|
||||
+ tess_flags = []
|
||||
tess_conf = ["batch.nochop", "makebox"]
|
||||
cun_args = []
|
||||
- super(CharBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
|
||||
+ super(CharBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
|
||||
+ cun_args)
|
||||
self.tesseract_layout = 1
|
||||
|
||||
@staticmethod
|
||||
@@ -173,18 +177,19 @@ def detect_orientation(image, lang=None):
|
||||
TesseractError --- if no script detected on the image
|
||||
"""
|
||||
_set_environment()
|
||||
- with temp_file(".bmp") as input_file:
|
||||
- command = [TESSERACT_CMD, input_file.name, 'stdout', "-psm", "0"]
|
||||
+ with temp_dir() as tmpdir:
|
||||
+ command = [TESSERACT_CMD, "input.bmp", 'stdout', "-psm", "0"]
|
||||
if lang is not None:
|
||||
command += ['-l', lang]
|
||||
|
||||
if image.mode != "RGB":
|
||||
image = image.convert("RGB")
|
||||
- image.save(input_file.name)
|
||||
+ image.save(os.path.join(tmpdir, "input.bmp"))
|
||||
|
||||
proc = subprocess.Popen(command, stdin=subprocess.PIPE, shell=False,
|
||||
startupinfo=g_subprocess_startup_info,
|
||||
creationflags=g_creation_flags,
|
||||
+ cwd=tmpdir,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT)
|
||||
proc.stdin.close()
|
||||
@@ -224,8 +229,8 @@ def get_available_builders():
|
||||
]
|
||||
|
||||
|
||||
-def run_tesseract(input_filename, output_filename_base, lang=None,
|
||||
- configs=None):
|
||||
+def run_tesseract(input_filename, output_filename_base, cwd=None, lang=None,
|
||||
+ flags=None, configs=None):
|
||||
'''
|
||||
Runs Tesseract:
|
||||
`TESSERACT_CMD` \
|
||||
@@ -238,6 +243,8 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
|
||||
input_filename --- image to read
|
||||
output_filename_base --- file name in which must be stored the result
|
||||
(without the extension)
|
||||
+ cwd --- Run Tesseract in the specified working directory or use current
|
||||
+ one if None
|
||||
lang --- Tesseract language to use (if None, none will be specified)
|
||||
config --- List of Tesseract configs to use (if None, none will be
|
||||
specified)
|
||||
@@ -252,10 +259,13 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
|
||||
if lang is not None:
|
||||
command += ['-l', lang]
|
||||
|
||||
+ if flags is not None:
|
||||
+ command += flags
|
||||
+
|
||||
if configs is not None:
|
||||
command += configs
|
||||
|
||||
- proc = subprocess.Popen(command,
|
||||
+ proc = subprocess.Popen(command, cwd=cwd,
|
||||
startupinfo=g_subprocess_startup_info,
|
||||
creationflags=g_creation_flags,
|
||||
stdout=subprocess.PIPE,
|
||||
@@ -301,11 +311,18 @@ class ReOpenableTempfile(object):
|
||||
self.name = None
|
||||
|
||||
|
||||
-def temp_file(suffix):
|
||||
- ''' Returns a temporary file '''
|
||||
- if os.name == 'nt': # Windows
|
||||
- return ReOpenableTempfile(suffix)
|
||||
- return tempfile.NamedTemporaryFile(prefix='tess_', suffix=suffix)
|
||||
+@contextlib.contextmanager
|
||||
+def temp_dir():
|
||||
+ """
|
||||
+ A context manager for maintaining a temporary directory
|
||||
+ """
|
||||
+ # NOTE: Drop this as soon as we don't support Python 2.7 anymore, because
|
||||
+ # since Python 3.2 there is a context manager called TemporaryDirectory().
|
||||
+ path = tempfile.mkdtemp(prefix='tess_')
|
||||
+ try:
|
||||
+ yield path
|
||||
+ finally:
|
||||
+ shutil.rmtree(path)
|
||||
|
||||
|
||||
def image_to_string(image, lang=None, builder=None):
|
||||
@@ -329,23 +346,20 @@ def image_to_string(image, lang=None, builder=None):
|
||||
|
||||
if builder is None:
|
||||
builder = builders.TextBuilder()
|
||||
- with temp_file(".bmp") as input_file:
|
||||
- with temp_file('') as output_file:
|
||||
- output_file_name_base = output_file.name
|
||||
-
|
||||
+ with temp_dir() as tmpdir:
|
||||
if image.mode != "RGB":
|
||||
image = image.convert("RGB")
|
||||
- image.save(input_file.name)
|
||||
- (status, errors) = run_tesseract(input_file.name,
|
||||
- output_file_name_base,
|
||||
+ image.save(os.path.join(tmpdir, "input.bmp"))
|
||||
+ (status, errors) = run_tesseract("input.bmp", "output", cwd=tmpdir,
|
||||
lang=lang,
|
||||
+ flags=builder.tesseract_flags,
|
||||
configs=builder.tesseract_configs)
|
||||
if status:
|
||||
raise TesseractError(status, errors)
|
||||
|
||||
output_file_name = "ERROR"
|
||||
for file_extension in builder.file_extensions:
|
||||
- output_file_name = ('%s.%s' % (output_file_name_base,
|
||||
+ output_file_name = ('%s.%s' % (os.path.join(tmpdir, "output"),
|
||||
file_extension))
|
||||
if not os.access(output_file_name, os.F_OK):
|
||||
continue
|
||||
diff --git a/tests/tests_libtesseract.py b/tests/tests_libtesseract.py
|
||||
index ccddd07..1ac2a4b 100644
|
||||
--- a/tests/tests_libtesseract.py
|
||||
+++ b/tests/tests_libtesseract.py
|
||||
@@ -33,8 +33,9 @@ class TestContext(unittest.TestCase):
|
||||
(3, 3, 0),
|
||||
(3, 4, 0),
|
||||
(3, 4, 1),
|
||||
+ (3, 5, 0),
|
||||
), ("Tesseract does not have the expected version"
|
||||
- " (3.4.0) ! Some tests will be skipped !"))
|
||||
+ " (3.5.0) ! Some tests will be skipped !"))
|
||||
|
||||
def test_langs(self):
|
||||
langs = libtesseract.get_available_languages()
|
||||
diff --git a/tests/tests_tesseract.py b/tests/tests_tesseract.py
|
||||
index e29c512..fa4d483 100644
|
||||
--- a/tests/tests_tesseract.py
|
||||
+++ b/tests/tests_tesseract.py
|
||||
@@ -27,8 +27,9 @@ class TestContext(unittest.TestCase):
|
||||
(3, 3, 0),
|
||||
(3, 4, 0),
|
||||
(3, 4, 1),
|
||||
+ (3, 5, 0),
|
||||
), ("Tesseract does not have the expected version"
|
||||
- " (3.4.0) ! Some tests will be skipped !"))
|
||||
+ " (3.5.0) ! Some tests will be skipped !"))
|
||||
|
||||
def test_langs(self):
|
||||
langs = tesseract.get_available_languages()
|
||||
Reference in New Issue
Block a user