Daniël de Kok 2020-11-10 18:16:13 +01:00 committed by Jonathan Ringer
parent 5c737382f3
commit c67382b02f

View File

@ -32,6 +32,14 @@ let
url = "https://norvig.com/big.txt"; url = "https://norvig.com/big.txt";
sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps"; sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
}; };
# Fixed-output fetch of the "doc-pipeline" tokenizer.json from the
# models.huggingface.co S3 bucket. The shell snippet further down symlinks it
# as bert-wiki.json (presumably so it is available without network access
# inside the build sandbox -- confirm against the upstream test setup).
docPipelineTokenizer = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
# Content hash written as an SRI-style "sha256-<base64>" string.
hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
};
# Fixed-output fetch of the "doc-quicktour" tokenizer.json from the
# models.huggingface.co S3 bucket. The shell snippet further down symlinks it
# as tokenizer-wiki.json (presumably so it is available without network access
# inside the build sandbox -- confirm against the upstream test setup).
docQuicktourTokenizer = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
# Content hash written as an SRI-style "sha256-<base64>" string.
hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
};
openaiVocab = fetchurl { openaiVocab = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"; url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x"; sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
@ -42,16 +50,16 @@ let
}; };
in rustPlatform.buildRustPackage rec { in rustPlatform.buildRustPackage rec {
pname = "tokenizers"; pname = "tokenizers";
version = "0.9.2"; version = "0.9.4";
src = fetchFromGitHub { src = fetchFromGitHub {
owner = "huggingface"; owner = "huggingface";
repo = pname; repo = pname;
rev = "python-v${version}"; rev = "python-v${version}";
sha256 = "0rsm1g5zfq3ygdb3s8v9xqqpgfzvvkc4n5ik3ahy8sw7pyjljb4m"; hash = "sha256-JXoH9yfhMIFg5qDY5zrF6iWb7XKugjMfk1NxSizfaWg=";
}; };
cargoSha256 = "0yn699dq9hdjh7fyci99ni8mmd5qdhzrsi80grzgf5cch8g38rbi"; cargoSha256 = "sha256-u9qitrOxJSABs0VjwHUZgmw7VTQXNbp6l8fKKE/RQ7M=";
sourceRoot = "source/bindings/python"; sourceRoot = "source/bindings/python";
@ -82,6 +90,8 @@ in rustPlatform.buildRustPackage rec {
ln -s ${robertaMerges} roberta-base-merges.txt ln -s ${robertaMerges} roberta-base-merges.txt
ln -s ${albertVocab} albert-base-v1-tokenizer.json ln -s ${albertVocab} albert-base-v1-tokenizer.json
ln -s ${bertVocab} bert-base-uncased-vocab.txt ln -s ${bertVocab} bert-base-uncased-vocab.txt
ln -s ${docPipelineTokenizer} bert-wiki.json
ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
ln -s ${norvigBig} big.txt ln -s ${norvigBig} big.txt
ln -s ${openaiVocab} openai-gpt-vocab.json ln -s ${openaiVocab} openai-gpt-vocab.json
ln -s ${openaiMerges} openai-gpt-merges.txt ) ln -s ${openaiMerges} openai-gpt-merges.txt )