From b30d52905e618c6ae3e5ef0cea41777ee72be835 Mon Sep 17 00:00:00 2001
From: symphorien
Date: Mon, 18 Jun 2018 22:03:48 +0000
Subject: [PATCH] tesseract: make tessdata a fixed-output derivation (#41227)

the full tessdata is nearly a GB, so sparing a copy each time we need to
rebuild tesseract without updating tessdata is worth it.
---
 .../graphics/tesseract/default.nix            | 67 ++++++++++---------
 1 file changed, 37 insertions(+), 30 deletions(-)

diff --git a/pkgs/applications/graphics/tesseract/default.nix b/pkgs/applications/graphics/tesseract/default.nix
index eb9a6383861..7940079d099 100644
--- a/pkgs/applications/graphics/tesseract/default.nix
+++ b/pkgs/applications/graphics/tesseract/default.nix
@@ -1,10 +1,40 @@
 { stdenv, fetchFromGitHub, autoreconfHook, pkgconfig
 , leptonica, libpng, libtiff, icu, pango, opencl-headers
-
 # Supported list of languages or `null' for all available languages
 , enableLanguages ? null
+# if you want just a specific list of languages, optionally specify a hash
+# to make tessdata a fixed output derivation.
+, enableLanguagesHash ? (if enableLanguages == null # all languages
+                          then "1h48xfzabhn0ldbx5ib67cp9607pr0zpblsy8z6fs4knn0zznfnw"
+                          else null)
 }:
 
+let tessdata = stdenv.mkDerivation ({
+  name = "tessdata";
+  src = fetchFromGitHub {
+    owner = "tesseract-ocr";
+    repo = "tessdata";
+    rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
+    # when updating don't forget to update the default value of enableLanguagesHash
+    sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
+  };
+  # Install the selected .traineddata files (all of them when enableLanguages
+  # is null). Language names come from an argument, so they are shell-escaped.
+  buildCommand = ''
+    cd "$src";
+    for lang in ${if enableLanguages==null then "*.traineddata" else stdenv.lib.concatMapStringsSep " " (x: stdenv.lib.escapeShellArg (x+".traineddata")) enableLanguages} ; do
+      install -Dt "$out/share/tessdata" "$src/$lang" ;
+    done;
+  '';
+  preferLocalBuild = true;
+} // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
+  # when a hash is given, we make this a fixed output derivation.
+  outputHashMode = "recursive";
+  outputHashAlgo = "sha256";
+  outputHash = enableLanguagesHash;
+}));
+in
+
 stdenv.mkDerivation rec {
   name = "tesseract-${version}";
   version = "3.05.00";
@@ -16,41 +46,18 @@ stdenv.mkDerivation rec {
     sha256 = "11wrpcfl118wxsv2c3w2scznwb48c4547qml42s2bpdz079g8y30";
   };
 
-  tessdata = fetchFromGitHub {
-    owner = "tesseract-ocr";
-    repo = "tessdata";
-    rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
-    sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
-  };
+  enableParallelBuilding = true;
 
   nativeBuildInputs = [ pkgconfig autoreconfHook ];
   buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
 
   LIBLEPT_HEADERSDIR = "${leptonica}/include";
 
-  # Copy the .traineddata files of the languages specified in enableLanguages
-  # into `$out/share/tessdata' and check afterwards if copying was successful.
-  postInstall = let
-    mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
-    mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;
-    findLangArgs = if enableLanguages != null
-                   then "\\( ${mkFindArgs enableLanguages} \\)"
-                   else "-iname '*.traineddata'";
-  in ''
-    numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
-      ${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"
-
-    ${if enableLanguages != null then ''
-      expected=${toString (builtins.length enableLanguages)}
-    '' else ''
-      expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
-    ''}
-
-    if [ "$numLangs" -ne "$expected" ]; then
-      echo "Expected $expected languages, but $numLangs" \
-           "were copied to \`$out/share/tessdata'" >&2
-      exit 1
-    fi
+  # Symlink the language files from the tessdata derivation instead of copying.
+  postInstall = ''
+    for i in ${tessdata}/share/tessdata/*; do
+      ln -s "$i" "$out/share/tessdata";
+    done
   '';
 
   meta = {