317 lines
12 KiB
Diff
317 lines
12 KiB
Diff
This patch is required for pyocr to work with Tesseract version 3.05.00
|
|
and has been submitted upstream at the following URL:
|
|
|
|
https://github.com/jflesch/pyocr/pull/62
|
|
|
|
diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py
|
|
index 73c964d..20f390c 100644
|
|
--- a/src/pyocr/builders.py
|
|
+++ b/src/pyocr/builders.py
|
|
@@ -240,8 +240,10 @@ class BaseBuilder(object):
|
|
cuneiform_args : Arguments passed to the Cuneiform command line.
|
|
"""
|
|
|
|
- def __init__(self, file_extensions, tesseract_configs, cuneiform_args):
|
|
+ def __init__(self, file_extensions, tesseract_flags, tesseract_configs,
|
|
+ cuneiform_args):
|
|
self.file_extensions = file_extensions
|
|
+ self.tesseract_flags = tesseract_flags
|
|
self.tesseract_configs = tesseract_configs
|
|
self.cuneiform_args = cuneiform_args
|
|
|
|
@@ -298,7 +300,7 @@ class TextBuilder(BaseBuilder):
|
|
def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False,
|
|
cuneiform_fax=False, cuneiform_singlecolumn=False):
|
|
file_ext = ["txt"]
|
|
- tess_conf = ["-psm", str(tesseract_layout)]
|
|
+ tess_flags = ["-psm", str(tesseract_layout)]
|
|
cun_args = ["-f", "text"]
|
|
# Add custom cuneiform parameters if needed
|
|
for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"),
|
|
@@ -306,7 +308,7 @@ class TextBuilder(BaseBuilder):
|
|
(cuneiform_singlecolumn, "--singlecolumn")]:
|
|
if par:
|
|
cun_args.append(arg)
|
|
- super(TextBuilder, self).__init__(file_ext, tess_conf, cun_args)
|
|
+ super(TextBuilder, self).__init__(file_ext, tess_flags, [], cun_args)
|
|
self.tesseract_layout = tesseract_layout
|
|
self.built_text = []
|
|
|
|
@@ -540,9 +542,11 @@ class WordBoxBuilder(BaseBuilder):
|
|
|
|
def __init__(self, tesseract_layout=1):
|
|
file_ext = ["html", "hocr"]
|
|
- tess_conf = ["hocr", "-psm", str(tesseract_layout)]
|
|
+ tess_flags = ["-psm", str(tesseract_layout)]
|
|
+ tess_conf = ["hocr"]
|
|
cun_args = ["-f", "hocr"]
|
|
- super(WordBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
|
|
+ super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
|
|
+ cun_args)
|
|
self.word_boxes = []
|
|
self.tesseract_layout = tesseract_layout
|
|
|
|
@@ -614,9 +618,11 @@ class LineBoxBuilder(BaseBuilder):
|
|
|
|
def __init__(self, tesseract_layout=1):
|
|
file_ext = ["html", "hocr"]
|
|
- tess_conf = ["hocr", "-psm", str(tesseract_layout)]
|
|
+ tess_flags = ["-psm", str(tesseract_layout)]
|
|
+ tess_conf = ["hocr"]
|
|
cun_args = ["-f", "hocr"]
|
|
- super(LineBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
|
|
+ super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
|
|
+ cun_args)
|
|
self.lines = []
|
|
self.tesseract_layout = tesseract_layout
|
|
|
|
diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py
|
|
index 0c2259a..f7ab309 100644
|
|
--- a/src/pyocr/libtesseract/tesseract_raw.py
|
|
+++ b/src/pyocr/libtesseract/tesseract_raw.py
|
|
@@ -263,11 +263,22 @@ if g_libtesseract:
|
|
]
|
|
g_libtesseract.TessDeleteText.restype = None
|
|
|
|
- g_libtesseract.TessBaseAPIDetectOS.argtypes = [
|
|
- ctypes.c_void_p, # TessBaseAPI*
|
|
- ctypes.POINTER(OSResults),
|
|
- ]
|
|
- g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
|
|
+ if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
|
|
+ g_libtesseract.TessBaseAPIDetectOrientationScript.argtypes = [
|
|
+ ctypes.c_void_p, # TessBaseAPI*
|
|
+ ctypes.POINTER(ctypes.c_int), # orient_deg
|
|
+ ctypes.POINTER(ctypes.c_float), # orient_conf
|
|
+ ctypes.POINTER(ctypes.c_char_p), # script_name
|
|
+ ctypes.POINTER(ctypes.c_float), # script_conf
|
|
+ ]
|
|
+ g_libtesseract.TessBaseAPIDetectOrientationScript.restype = \
|
|
+ ctypes.c_bool
|
|
+ else:
|
|
+ g_libtesseract.TessBaseAPIDetectOS.argtypes = [
|
|
+ ctypes.c_void_p, # TessBaseAPI*
|
|
+ ctypes.POINTER(OSResults),
|
|
+ ]
|
|
+ g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
|
|
|
|
|
|
def init(lang=None):
|
|
@@ -526,15 +537,37 @@ def detect_os(handle):
|
|
global g_libtesseract
|
|
assert(g_libtesseract)
|
|
|
|
- results = OSResults()
|
|
- r = g_libtesseract.TessBaseAPIDetectOS(
|
|
- ctypes.c_void_p(handle),
|
|
- ctypes.pointer(results)
|
|
- )
|
|
- if not r:
|
|
- raise TesseractError("detect_orientation failed",
|
|
- "TessBaseAPIDetectOS() failed")
|
|
- return {
|
|
- "orientation": results.best_orientation_id,
|
|
- "confidence": results.best_oconfidence,
|
|
- }
|
|
+ # Use the new API function if it is available, because since Tesseract
|
|
+ # 3.05.00 the old API function _always_ returns False.
|
|
+ if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
|
|
+ orientation_deg = ctypes.c_int(0)
|
|
+ orientation_confidence = ctypes.c_float(0.0)
|
|
+
|
|
+ r = g_libtesseract.TessBaseAPIDetectOrientationScript(
|
|
+ ctypes.c_void_p(handle),
|
|
+ ctypes.byref(orientation_deg),
|
|
+ ctypes.byref(orientation_confidence),
|
|
+ None, # script_name
|
|
+ None # script_confidence
|
|
+ )
|
|
+
|
|
+ if not r:
|
|
+ raise TesseractError("detect_orientation failed",
|
|
+ "TessBaseAPIDetectOrientationScript() failed")
|
|
+ return {
|
|
+ "orientation": round(orientation_deg.value / 90),
|
|
+ "confidence": orientation_confidence.value,
|
|
+ }
|
|
+ else: # old API (before Tesseract 3.05.00)
|
|
+ results = OSResults()
|
|
+ r = g_libtesseract.TessBaseAPIDetectOS(
|
|
+ ctypes.c_void_p(handle),
|
|
+ ctypes.pointer(results)
|
|
+ )
|
|
+ if not r:
|
|
+ raise TesseractError("detect_orientation failed",
|
|
+ "TessBaseAPIDetectOS() failed")
|
|
+ return {
|
|
+ "orientation": results.best_orientation_id,
|
|
+ "confidence": results.best_oconfidence,
|
|
+ }
|
|
diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py
|
|
index 99b0121..658c96b 100755
|
|
--- a/src/pyocr/tesseract.py
|
|
+++ b/src/pyocr/tesseract.py
|
|
@@ -22,6 +22,8 @@ import os
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
+import contextlib
|
|
+import shutil
|
|
|
|
from . import builders
|
|
from . import error
|
|
@@ -62,9 +64,11 @@ class CharBoxBuilder(builders.BaseBuilder):
|
|
|
|
def __init__(self):
|
|
file_ext = ["box"]
|
|
+ tess_flags = []
|
|
tess_conf = ["batch.nochop", "makebox"]
|
|
cun_args = []
|
|
- super(CharBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
|
|
+ super(CharBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
|
|
+ cun_args)
|
|
self.tesseract_layout = 1
|
|
|
|
@staticmethod
|
|
@@ -173,18 +177,19 @@ def detect_orientation(image, lang=None):
|
|
TesseractError --- if no script detected on the image
|
|
"""
|
|
_set_environment()
|
|
- with temp_file(".bmp") as input_file:
|
|
- command = [TESSERACT_CMD, input_file.name, 'stdout', "-psm", "0"]
|
|
+ with temp_dir() as tmpdir:
|
|
+ command = [TESSERACT_CMD, "input.bmp", 'stdout', "-psm", "0"]
|
|
if lang is not None:
|
|
command += ['-l', lang]
|
|
|
|
if image.mode != "RGB":
|
|
image = image.convert("RGB")
|
|
- image.save(input_file.name)
|
|
+ image.save(os.path.join(tmpdir, "input.bmp"))
|
|
|
|
proc = subprocess.Popen(command, stdin=subprocess.PIPE, shell=False,
|
|
startupinfo=g_subprocess_startup_info,
|
|
creationflags=g_creation_flags,
|
|
+ cwd=tmpdir,
|
|
stdout=subprocess.PIPE,
|
|
stderr=subprocess.STDOUT)
|
|
proc.stdin.close()
|
|
@@ -224,8 +229,8 @@ def get_available_builders():
|
|
]
|
|
|
|
|
|
-def run_tesseract(input_filename, output_filename_base, lang=None,
|
|
- configs=None):
|
|
+def run_tesseract(input_filename, output_filename_base, cwd=None, lang=None,
|
|
+ flags=None, configs=None):
|
|
'''
|
|
Runs Tesseract:
|
|
`TESSERACT_CMD` \
|
|
@@ -238,6 +243,8 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
|
|
input_filename --- image to read
|
|
output_filename_base --- file name in which must be stored the result
|
|
(without the extension)
|
|
+ cwd --- Run Tesseract in the specified working directory or use current
|
|
+ one if None
|
|
lang --- Tesseract language to use (if None, none will be specified)
|
|
config --- List of Tesseract configs to use (if None, none will be
|
|
specified)
|
|
@@ -252,10 +259,13 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
|
|
if lang is not None:
|
|
command += ['-l', lang]
|
|
|
|
+ if flags is not None:
|
|
+ command += flags
|
|
+
|
|
if configs is not None:
|
|
command += configs
|
|
|
|
- proc = subprocess.Popen(command,
|
|
+ proc = subprocess.Popen(command, cwd=cwd,
|
|
startupinfo=g_subprocess_startup_info,
|
|
creationflags=g_creation_flags,
|
|
stdout=subprocess.PIPE,
|
|
@@ -301,11 +311,18 @@ class ReOpenableTempfile(object):
|
|
self.name = None
|
|
|
|
|
|
-def temp_file(suffix):
|
|
- ''' Returns a temporary file '''
|
|
- if os.name == 'nt': # Windows
|
|
- return ReOpenableTempfile(suffix)
|
|
- return tempfile.NamedTemporaryFile(prefix='tess_', suffix=suffix)
|
|
+@contextlib.contextmanager
|
|
+def temp_dir():
|
|
+ """
|
|
+ A context manager for maintaining a temporary directory
|
|
+ """
|
|
+ # NOTE: Drop this as soon as we don't support Python 2.7 anymore, because
|
|
+ # Python 3.2+ ships tempfile.TemporaryDirectory(), which does the same.
|
|
+ path = tempfile.mkdtemp(prefix='tess_')
|
|
+ try:
|
|
+ yield path
|
|
+ finally:
|
|
+ shutil.rmtree(path)
|
|
|
|
|
|
def image_to_string(image, lang=None, builder=None):
|
|
@@ -329,23 +346,20 @@ def image_to_string(image, lang=None, builder=None):
|
|
|
|
if builder is None:
|
|
builder = builders.TextBuilder()
|
|
- with temp_file(".bmp") as input_file:
|
|
- with temp_file('') as output_file:
|
|
- output_file_name_base = output_file.name
|
|
-
|
|
+ with temp_dir() as tmpdir:
|
|
if image.mode != "RGB":
|
|
image = image.convert("RGB")
|
|
- image.save(input_file.name)
|
|
- (status, errors) = run_tesseract(input_file.name,
|
|
- output_file_name_base,
|
|
+ image.save(os.path.join(tmpdir, "input.bmp"))
|
|
+ (status, errors) = run_tesseract("input.bmp", "output", cwd=tmpdir,
|
|
lang=lang,
|
|
+ flags=builder.tesseract_flags,
|
|
configs=builder.tesseract_configs)
|
|
if status:
|
|
raise TesseractError(status, errors)
|
|
|
|
output_file_name = "ERROR"
|
|
for file_extension in builder.file_extensions:
|
|
- output_file_name = ('%s.%s' % (output_file_name_base,
|
|
+ output_file_name = ('%s.%s' % (os.path.join(tmpdir, "output"),
|
|
file_extension))
|
|
if not os.access(output_file_name, os.F_OK):
|
|
continue
|
|
diff --git a/tests/tests_libtesseract.py b/tests/tests_libtesseract.py
|
|
index ccddd07..1ac2a4b 100644
|
|
--- a/tests/tests_libtesseract.py
|
|
+++ b/tests/tests_libtesseract.py
|
|
@@ -33,8 +33,9 @@ class TestContext(unittest.TestCase):
|
|
(3, 3, 0),
|
|
(3, 4, 0),
|
|
(3, 4, 1),
|
|
+ (3, 5, 0),
|
|
), ("Tesseract does not have the expected version"
|
|
- " (3.4.0) ! Some tests will be skipped !"))
|
|
+ " (3.5.0) ! Some tests will be skipped !"))
|
|
|
|
def test_langs(self):
|
|
langs = libtesseract.get_available_languages()
|
|
diff --git a/tests/tests_tesseract.py b/tests/tests_tesseract.py
|
|
index e29c512..fa4d483 100644
|
|
--- a/tests/tests_tesseract.py
|
|
+++ b/tests/tests_tesseract.py
|
|
@@ -27,8 +27,9 @@ class TestContext(unittest.TestCase):
|
|
(3, 3, 0),
|
|
(3, 4, 0),
|
|
(3, 4, 1),
|
|
+ (3, 5, 0),
|
|
), ("Tesseract does not have the expected version"
|
|
- " (3.4.0) ! Some tests will be skipped !"))
|
|
+ " (3.5.0) ! Some tests will be skipped !"))
|
|
|
|
def test_langs(self):
|
|
langs = tesseract.get_available_languages()
|