tesseract: make tessdata a fixed-output derivation (#41227)
The full tessdata is nearly a GB, so avoiding a fresh copy every time tesseract has to be rebuilt without tessdata itself being updated is worth it.
This commit is contained in:
parent
8807039549
commit
b30d52905e
|
@ -1,10 +1,38 @@
|
|||
{ stdenv, fetchFromGitHub, autoreconfHook, pkgconfig
|
||||
, leptonica, libpng, libtiff, icu, pango, opencl-headers
|
||||
|
||||
# Supported list of languages or `null' for all available languages
|
||||
, enableLanguages ? null
|
||||
# if you want just a specific list of languages, optionally specify a hash
|
||||
# to make tessdata a fixed output derivation.
|
||||
, enableLanguagesHash ? (if enableLanguages == null # all languages
|
||||
then "1h48xfzabhn0ldbx5ib67cp9607pr0zpblsy8z6fs4knn0zznfnw"
|
||||
else null)
|
||||
}:
|
||||
|
||||
let tessdata = stdenv.mkDerivation ({
|
||||
name = "tessdata";
|
||||
src = fetchFromGitHub {
|
||||
owner = "tesseract-ocr";
|
||||
repo = "tessdata";
|
||||
rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
|
||||
# when updating, don't forget to update the default value of enableLanguagesHash
|
||||
sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
|
||||
};
|
||||
buildCommand = ''
|
||||
cd $src;
|
||||
for lang in ${if enableLanguages==null then "*.traineddata" else stdenv.lib.concatMapStringsSep " " (x: x+".traineddata") enableLanguages} ; do
|
||||
install -Dt $out/share/tessdata $src/$lang ;
|
||||
done;
|
||||
'';
|
||||
preferLocalBuild = true;
|
||||
} // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
|
||||
# when a hash is given, we make this a fixed output derivation.
|
||||
outputHashMode = "recursive";
|
||||
outputHashAlgo = "sha256";
|
||||
outputHash = enableLanguagesHash;
|
||||
}));
|
||||
in
|
||||
|
||||
stdenv.mkDerivation rec {
|
||||
name = "tesseract-${version}";
|
||||
version = "3.05.00";
|
||||
|
@ -16,41 +44,17 @@ stdenv.mkDerivation rec {
|
|||
sha256 = "11wrpcfl118wxsv2c3w2scznwb48c4547qml42s2bpdz079g8y30";
|
||||
};
|
||||
|
||||
tessdata = fetchFromGitHub {
|
||||
owner = "tesseract-ocr";
|
||||
repo = "tessdata";
|
||||
rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
|
||||
sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
|
||||
};
|
||||
enableParallelBuilding = true;
|
||||
|
||||
nativeBuildInputs = [ pkgconfig autoreconfHook ];
|
||||
buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
|
||||
|
||||
LIBLEPT_HEADERSDIR = "${leptonica}/include";
|
||||
|
||||
# Copy the .traineddata files of the languages specified in enableLanguages
|
||||
# into `$out/share/tessdata' and check afterwards if copying was successful.
|
||||
postInstall = let
|
||||
mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
|
||||
mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;
|
||||
findLangArgs = if enableLanguages != null
|
||||
then "\\( ${mkFindArgs enableLanguages} \\)"
|
||||
else "-iname '*.traineddata'";
|
||||
in ''
|
||||
numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
|
||||
${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"
|
||||
|
||||
${if enableLanguages != null then ''
|
||||
expected=${toString (builtins.length enableLanguages)}
|
||||
'' else ''
|
||||
expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
|
||||
''}
|
||||
|
||||
if [ "$numLangs" -ne "$expected" ]; then
|
||||
echo "Expected $expected languages, but $numLangs" \
|
||||
"were copied to \`$out/share/tessdata'" >&2
|
||||
exit 1
|
||||
fi
|
||||
postInstall = ''
|
||||
for i in ${tessdata}/share/tessdata/*; do
|
||||
ln -s $i $out/share/tessdata;
|
||||
done
|
||||
'';
|
||||
|
||||
meta = {
|
||||
|
|
Loading…
Reference in New Issue