diff --git a/pkgs/development/python-modules/tokenizers/default.nix b/pkgs/development/python-modules/tokenizers/default.nix
new file mode 100644
index 00000000000..348df4ae95e
--- /dev/null
+++ b/pkgs/development/python-modules/tokenizers/default.nix
@@ -0,0 +1,138 @@
+# Python bindings for the Hugging Face tokenizers library. The Rust
+# sources are built into a wheel with maturin, which is then installed
+# with the pip install hook.
+{ stdenv
+, rustPlatform
+, fetchFromGitHub
+, fetchurl
+, maturin
+, pipInstallHook
+, pytest
+, python
+, requests
+}:
+
+let
+  # Vocabulary and merges files used by the test suite. They are fetched
+  # here because the tests would otherwise attempt network access.
+  robertaVocab = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
+    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
+  };
+  robertaMerges = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
+    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
+  };
+  bertVocab = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
+    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
+  };
+  openaiVocab = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
+    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
+  };
+  openaiMerges = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
+    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
+  };
+in rustPlatform.buildRustPackage rec {
+  pname = "tokenizers";
+  version = "0.8.0";
+
+  src = fetchFromGitHub {
+    owner = "huggingface";
+    repo = pname;
+    rev = "python-v${version}";
+    sha256 = "0f5r1wm5ybyk3jvihj1g98y7ihq0iklg0pwkaa11pk1gv0k869w3";
+  };
+
+  cargoSha256 = "131bvf35q5n65mq6zws1rp5fn2qkfwfg9sbxi5y6if24n8fpdz4m";
+
+  # Build from the Python bindings subdirectory of the repository.
+  sourceRoot = "source/bindings/python";
+
+  nativeBuildInputs = [
+    maturin
+    pipInstallHook
+  ];
+
+  propagatedBuildInputs = [
+    python
+  ];
+
+  # tokenizers uses pyo3, which requires Rust nightly.
+  RUSTC_BOOTSTRAP = 1;
+
+  # The tests are run in installCheckPhase (after installation) rather
+  # than in the regular check phase.
+  doCheck = false;
+  doInstallCheck = true;
+
+  postUnpack = ''
+    # Add data files for tests, otherwise tests attempt network access.
+    mkdir $sourceRoot/tests/data
+    ( cd $sourceRoot/tests/data
+      ln -s ${robertaVocab} roberta-base-vocab.json
+      ln -s ${robertaMerges} roberta-base-merges.txt
+      ln -s ${bertVocab} bert-base-uncased-vocab.txt
+      ln -s ${openaiVocab} openai-gpt-vocab.json
+      ln -s ${openaiMerges} openai-gpt-merges.txt )
+  '';
+
+  postPatch = ''
+    # pyo3's build check verifies that Rust is a nightly
+    # version. Disable this check.
+    substituteInPlace $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs \
+      --replace "check_rustc_version()?;" ""
+
+    # Patching the vendored dependency invalidates the file
+    # checksums, so remove them. This should be safe, since
+    # this is just a copy of the vendored dependencies and
+    # the integrity of the vendored dependencies is validated
+    # by cargoSha256.
+    sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
+      $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json
+
+    # Maturin uses the crate name as the wheel name.
+    substituteInPlace Cargo.toml \
+      --replace "tokenizers-python" "tokenizers"
+  '';
+
+  # A custom phase replaces the default one, so the pre/post hooks
+  # have to be run explicitly.
+  buildPhase = ''
+    runHook preBuild
+    maturin build --release --manylinux off
+    runHook postBuild
+  '';
+
+  # No explicit runHook calls here; NOTE(review): pipInstallPhase is
+  # expected to run preInstall/postInstall itself -- confirm against
+  # the pip install hook before adding them.
+  installPhase = ''
+    # Put the wheels where the pip install hook can find them.
+    install -Dm644 -t dist target/wheels/*.whl
+    pipInstallPhase
+  '';
+
+  installCheckInputs = [
+    pytest
+    requests
+  ];
+
+  installCheckPhase = ''
+    runHook preInstallCheck
+    # Append paths, or the binding's tokenizer module will be
+    # used, since the test directories have __init__.py
+    pytest --import-mode=append
+    runHook postInstallCheck
+  '';
+
+  meta = with stdenv.lib; {
+    homepage = "https://github.com/huggingface/tokenizers";
+    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
+    license = licenses.asl20;
+    platforms = platforms.unix;
+    maintainers = with maintainers; [ danieldk ];
+  };
+}
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index dc40a0668a7..b2b3f85e716 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -1502,6 +1502,9 @@ in {
     inherit (pkgs) sentencepiece pkgconfig;
   };
 
+  tokenizers = disabledIf (!isPy3k)
+    (toPythonModule (callPackage ../development/python-modules/tokenizers { }));
+
   transformers = callPackage ../development/python-modules/transformers { };
 
   transforms3d = callPackage ../development/python-modules/transforms3d { };