tesseract: add a wrapper to set up languages
Tesseract is now decoupled from the tessdata language corpus. This avoids recompilation when building Tesseract with a custom set of languages. Update k2pdfopt to use the new wrapper interface.
This commit is contained in:
parent
45d2a2dd91
commit
aaaed13077
@ -1,11 +1,18 @@
|
|||||||
{ callPackage, lowPrio }:
|
{ callPackage, lowPrio }:
|
||||||
|
|
||||||
let
|
let
|
||||||
tesseract3 = callPackage ./tesseract3.nix {};
|
base3 = callPackage ./tesseract3.nix {};
|
||||||
tesseract4 = callPackage ./tesseract4.nix {};
|
base4 = callPackage ./tesseract4.nix {};
|
||||||
|
languages = callPackage ./languages.nix {};
|
||||||
in
|
in
|
||||||
{
|
{
|
||||||
tesseract = tesseract3;
|
tesseract = callPackage ./wrapper.nix {
|
||||||
|
tesseractBase = base3;
|
||||||
|
languages = languages.v3;
|
||||||
|
};
|
||||||
|
|
||||||
tesseract_4 = lowPrio tesseract4;
|
tesseract_4 = lowPrio (callPackage ./wrapper.nix {
|
||||||
|
tesseractBase = base4;
|
||||||
|
languages = languages.v4;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
43
pkgs/applications/graphics/tesseract/languages.nix
Normal file
43
pkgs/applications/graphics/tesseract/languages.nix
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
{ stdenv, lib, fetchurl, fetchFromGitHub }:
|
||||||
|
|
||||||
|
rec {
|
||||||
|
makeLanguages = { tessdataRev, tessdata ? null, all ? null, languages ? {} }:
|
||||||
|
let
|
||||||
|
tessdataSrc = fetchFromGitHub {
|
||||||
|
owner = "tesseract-ocr";
|
||||||
|
repo = "tessdata";
|
||||||
|
rev = tessdataRev;
|
||||||
|
sha256 = tessdata;
|
||||||
|
};
|
||||||
|
|
||||||
|
languageFile = lang: sha256: fetchurl {
|
||||||
|
url = "https://github.com/tesseract-ocr/tessdata/raw/${tessdataRev}/${lang}.traineddata";
|
||||||
|
inherit sha256;
|
||||||
|
};
|
||||||
|
in
|
||||||
|
{
|
||||||
|
all = stdenv.mkDerivation {
|
||||||
|
name = "all";
|
||||||
|
buildCommand = ''
|
||||||
|
mkdir $out
|
||||||
|
cd ${tessdataSrc}
|
||||||
|
cp *.traineddata $out
|
||||||
|
'';
|
||||||
|
outputHashMode = "recursive";
|
||||||
|
outputHashAlgo = "sha256";
|
||||||
|
outputHash = all;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
v3 = makeLanguages {
|
||||||
|
tessdataRev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
|
||||||
|
tessdata = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
|
||||||
|
all = "0yj6h9n6h0kzzcqsn3z87vsi8pa60szp0yiayb0znd0v9my0dqhn";
|
||||||
|
};
|
||||||
|
|
||||||
|
v4 = makeLanguages {
|
||||||
|
tessdataRev = "4.0.0";
|
||||||
|
tessdata = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28";
|
||||||
|
all = "0dqgkp369rcvq72yhgnzj1pj8yrv7kqzc7y6sqs7nzcq7l5qazlg";
|
||||||
|
};
|
||||||
|
}
|
@ -1,37 +1,5 @@
|
|||||||
{ stdenv, fetchFromGitHub, autoreconfHook, pkgconfig
|
{ stdenv, fetchurl, fetchFromGitHub, autoreconfHook, pkgconfig
|
||||||
, leptonica, libpng, libtiff, icu, pango, opencl-headers
|
, leptonica, libpng, libtiff, icu, pango, opencl-headers }:
|
||||||
# Supported list of languages or `null' for all available languages
|
|
||||||
, enableLanguages ? null
|
|
||||||
# if you want just a specific list of languages, optionally specify a hash
|
|
||||||
# to make tessdata a fixed output derivation.
|
|
||||||
, enableLanguagesHash ? (if enableLanguages == null # all languages
|
|
||||||
then "1h48xfzabhn0ldbx5ib67cp9607pr0zpblsy8z6fs4knn0zznfnw"
|
|
||||||
else null)
|
|
||||||
}:
|
|
||||||
|
|
||||||
let tessdata = stdenv.mkDerivation ({
|
|
||||||
name = "tessdata";
|
|
||||||
src = fetchFromGitHub {
|
|
||||||
owner = "tesseract-ocr";
|
|
||||||
repo = "tessdata";
|
|
||||||
rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
|
|
||||||
# when updating, don't forget to update the default value of enableLanguagesHash
|
|
||||||
sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
|
|
||||||
};
|
|
||||||
buildCommand = ''
|
|
||||||
cd $src;
|
|
||||||
for lang in ${if enableLanguages==null then "*.traineddata" else stdenv.lib.concatMapStringsSep " " (x: x+".traineddata") enableLanguages} ; do
|
|
||||||
install -Dt $out/share/tessdata $src/$lang ;
|
|
||||||
done;
|
|
||||||
'';
|
|
||||||
preferLocalBuild = true;
|
|
||||||
} // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
|
|
||||||
# when a hash is given, we make this a fixed output derivation.
|
|
||||||
outputHashMode = "recursive";
|
|
||||||
outputHashAlgo = "sha256";
|
|
||||||
outputHash = enableLanguagesHash;
|
|
||||||
}));
|
|
||||||
in
|
|
||||||
|
|
||||||
stdenv.mkDerivation rec {
|
stdenv.mkDerivation rec {
|
||||||
name = "tesseract-${version}";
|
name = "tesseract-${version}";
|
||||||
@ -51,17 +19,11 @@ stdenv.mkDerivation rec {
|
|||||||
|
|
||||||
LIBLEPT_HEADERSDIR = "${leptonica}/include";
|
LIBLEPT_HEADERSDIR = "${leptonica}/include";
|
||||||
|
|
||||||
postInstall = ''
|
|
||||||
for i in ${tessdata}/share/tessdata/*; do
|
|
||||||
ln -s $i $out/share/tessdata;
|
|
||||||
done
|
|
||||||
'';
|
|
||||||
|
|
||||||
meta = {
|
meta = {
|
||||||
description = "OCR engine";
|
description = "OCR engine";
|
||||||
homepage = https://github.com/tesseract-ocr/tesseract;
|
homepage = https://github.com/tesseract-ocr/tesseract;
|
||||||
license = stdenv.lib.licenses.asl20;
|
license = stdenv.lib.licenses.asl20;
|
||||||
maintainers = with stdenv.lib.maintainers; [viric];
|
maintainers = with stdenv.lib.maintainers; [ viric earvstedt ];
|
||||||
platforms = with stdenv.lib.platforms; linux ++ darwin;
|
platforms = with stdenv.lib.platforms; linux ++ darwin;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -1,9 +1,5 @@
|
|||||||
{ stdenv, fetchFromGitHub, autoreconfHook, autoconf-archive, pkgconfig
|
{ stdenv, fetchFromGitHub, autoreconfHook, autoconf-archive, pkgconfig
|
||||||
, leptonica, libpng, libtiff, icu, pango, opencl-headers
|
, leptonica, libpng, libtiff, icu, pango, opencl-headers }:
|
||||||
|
|
||||||
# Supported list of languages or `null' for all available languages
|
|
||||||
, enableLanguages ? null
|
|
||||||
}:
|
|
||||||
|
|
||||||
stdenv.mkDerivation rec {
|
stdenv.mkDerivation rec {
|
||||||
name = "tesseract-${version}";
|
name = "tesseract-${version}";
|
||||||
@ -16,46 +12,16 @@ stdenv.mkDerivation rec {
|
|||||||
sha256 = "1b5fi2vibc4kk9b30kkk4ais4bw8fbbv24bzr5709194hb81cav8";
|
sha256 = "1b5fi2vibc4kk9b30kkk4ais4bw8fbbv24bzr5709194hb81cav8";
|
||||||
};
|
};
|
||||||
|
|
||||||
tessdata = fetchFromGitHub {
|
enableParallelBuilding = true;
|
||||||
owner = "tesseract-ocr";
|
|
||||||
repo = "tessdata";
|
|
||||||
rev = version;
|
|
||||||
sha256 = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28";
|
|
||||||
};
|
|
||||||
|
|
||||||
nativeBuildInputs = [ pkgconfig autoreconfHook autoconf-archive ];
|
nativeBuildInputs = [ pkgconfig autoreconfHook autoconf-archive ];
|
||||||
buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
|
buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
|
||||||
|
|
||||||
# Copy the .traineddata files of the languages specified in enableLanguages
|
|
||||||
# into `$out/share/tessdata' and check afterwards if copying was successful.
|
|
||||||
postInstall = let
|
|
||||||
mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
|
|
||||||
mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;
|
|
||||||
findLangArgs = if enableLanguages != null
|
|
||||||
then "\\( ${mkFindArgs enableLanguages} \\)"
|
|
||||||
else "-iname '*.traineddata'";
|
|
||||||
in ''
|
|
||||||
numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
|
|
||||||
${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"
|
|
||||||
|
|
||||||
${if enableLanguages != null then ''
|
|
||||||
expected=${toString (builtins.length enableLanguages)}
|
|
||||||
'' else ''
|
|
||||||
expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
|
|
||||||
''}
|
|
||||||
|
|
||||||
if [ "$numLangs" -ne "$expected" ]; then
|
|
||||||
echo "Expected $expected languages, but $numLangs" \
|
|
||||||
"were copied to \`$out/share/tessdata'" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
'';
|
|
||||||
|
|
||||||
meta = {
|
meta = {
|
||||||
description = "OCR engine";
|
description = "OCR engine";
|
||||||
homepage = https://github.com/tesseract-ocr/tesseract;
|
homepage = https://github.com/tesseract-ocr/tesseract;
|
||||||
license = stdenv.lib.licenses.asl20;
|
license = stdenv.lib.licenses.asl20;
|
||||||
maintainers = with stdenv.lib.maintainers; [viric];
|
maintainers = with stdenv.lib.maintainers; [ viric earvstedt ];
|
||||||
platforms = with stdenv.lib.platforms; linux;
|
platforms = with stdenv.lib.platforms; linux ++ darwin;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
69
pkgs/applications/graphics/tesseract/wrapper.nix
Normal file
69
pkgs/applications/graphics/tesseract/wrapper.nix
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
{ stdenv, makeWrapper, tesseractBase, languages
|
||||||
|
|
||||||
|
# A list of languages like [ "eng" "spa" … ] or `null` for all available languages
|
||||||
|
, enableLanguages ? null
|
||||||
|
|
||||||
|
# A list of files or a directory containing files
|
||||||
|
, tessdata ? (
|
||||||
|
if enableLanguages == null then
|
||||||
|
languages.all
|
||||||
|
else stdenv.mkDerivation ({
|
||||||
|
name = "tessdata";
|
||||||
|
buildCommand = ''
|
||||||
|
for lang in ${stdenv.lib.concatMapStringsSep " " (x: x + ".traineddata") enableLanguages}; do
|
||||||
|
install -Dt $out ${languages.all}/$lang
|
||||||
|
done
|
||||||
|
'';
|
||||||
|
preferLocalBuild = true;
|
||||||
|
} // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
|
||||||
|
# when a hash is given, we make this a fixed output derivation.
|
||||||
|
outputHashMode = "recursive";
|
||||||
|
outputHashAlgo = "sha256";
|
||||||
|
outputHash = enableLanguagesHash;
|
||||||
|
}))
|
||||||
|
)
|
||||||
|
|
||||||
|
, enableLanguagesHash ? null
|
||||||
|
}:
|
||||||
|
|
||||||
|
let
|
||||||
|
passthru = { inherit tesseractBase languages tessdata; };
|
||||||
|
|
||||||
|
tesseractWithData = tesseractBase.overrideAttrs (_: {
|
||||||
|
inherit tesseractBase tessdata;
|
||||||
|
|
||||||
|
buildInputs = [ makeWrapper ];
|
||||||
|
|
||||||
|
buildCommand = ''
|
||||||
|
makeWrapper {$tesseractBase,$out}/bin/tesseract --set-default TESSDATA_PREFIX $out/share/tessdata
|
||||||
|
|
||||||
|
# Recursively link include, share
|
||||||
|
cp -rs --no-preserve=mode $tesseractBase/{include,share} $out
|
||||||
|
|
||||||
|
cp -r --no-preserve=mode $tesseractBase/lib $out
|
||||||
|
# Fix up the store paths in lib so that the tessdata from this derivation is used.
|
||||||
|
if (( ''${#tesseractBase} != ''${#out} )); then
|
||||||
|
echo "Can't replace store paths due to differing lengths"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
find $out/lib -type f -exec sed -i "s|$tesseractBase|$out|g" {} \;
|
||||||
|
|
||||||
|
if [[ -d "$tessdata" ]]; then
|
||||||
|
ln -s $tessdata/* $out/share/tessdata
|
||||||
|
else
|
||||||
|
for lang in $tessdata; do
|
||||||
|
ln -s $lang $out/share/tessdata/''${lang#/nix/store*-}
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ! -e $out/share/tessdata/eng.traineddata ]]; then
|
||||||
|
# This is a bug in Tesseract's internal tessdata discovery mechanism
|
||||||
|
echo "eng.traineddata must be present in tessdata for Tesseract to work"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
'';
|
||||||
|
});
|
||||||
|
|
||||||
|
tesseract = (if enableLanguages == [] then tesseractBase else tesseractWithData) // passthru;
|
||||||
|
in
|
||||||
|
tesseract
|
@ -75,19 +75,21 @@ stdenv.mkDerivation rec {
|
|||||||
cp ${src}/leptonica_mod/* src/
|
cp ${src}/leptonica_mod/* src/
|
||||||
'';
|
'';
|
||||||
});
|
});
|
||||||
tesseract_modded = tesseract.overrideAttrs (attrs: {
|
tesseract_modded = tesseract.override {
|
||||||
prePatch = ''
|
tesseractBase = tesseract.tesseractBase.overrideAttrs (_: {
|
||||||
cp ${src}/tesseract_mod/{ambigs.cpp,ccutil.h,ccutil.cpp} ccutil/
|
prePatch = ''
|
||||||
cp ${src}/tesseract_mod/dawg.cpp api/
|
cp ${src}/tesseract_mod/{ambigs.cpp,ccutil.h,ccutil.cpp} ccutil/
|
||||||
cp ${src}/tesseract_mod/{imagedata.cpp,tessdatamanager.cpp} ccstruct/
|
cp ${src}/tesseract_mod/dawg.cpp api/
|
||||||
cp ${src}/tesseract_mod/openclwrapper.h opencl/
|
cp ${src}/tesseract_mod/{imagedata.cpp,tessdatamanager.cpp} ccstruct/
|
||||||
cp ${src}/tesseract_mod/{tessedit.cpp,thresholder.cpp} ccmain/
|
cp ${src}/tesseract_mod/openclwrapper.h opencl/
|
||||||
cp ${src}/tesseract_mod/tess_lang_mod_edge.h cube/
|
cp ${src}/tesseract_mod/{tessedit.cpp,thresholder.cpp} ccmain/
|
||||||
cp ${src}/tesseract_mod/tesscapi.cpp api/
|
cp ${src}/tesseract_mod/tess_lang_mod_edge.h cube/
|
||||||
cp ${src}/include_mod/{tesseract.h,leptonica.h} api/
|
cp ${src}/tesseract_mod/tesscapi.cpp api/
|
||||||
'';
|
cp ${src}/include_mod/{tesseract.h,leptonica.h} api/
|
||||||
patches = [ ./tesseract.patch ];
|
'';
|
||||||
});
|
patches = [ ./tesseract.patch ];
|
||||||
|
});
|
||||||
|
};
|
||||||
in
|
in
|
||||||
[ zlib libpng ] ++
|
[ zlib libpng ] ++
|
||||||
optional enableGSL gsl ++
|
optional enableGSL gsl ++
|
||||||
|
Loading…
x
Reference in New Issue
Block a user