34 # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
35 # about URLs and file names
36 test-data = linkFarm "tokenizers-test-data" {
37 "roberta-base-vocab.json" = fetchurl {
38 url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
39 hash = "sha256-nn9jwtFdZmtS4h0lDS5RO4fJtxPPpph6gu2J5eblBlU=";
41 "roberta-base-merges.txt" = fetchurl {
42 url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
43 hash = "sha256-HOFmR3PFDz4MyIQmGak+3EYkUltyixiKngvjO3cmrcU=";
45 "albert-base-v1-tokenizer.json" = fetchurl {
46 url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
47 hash = "sha256-biqj1cpMaEG8NqUCgXnLTWPBKZMfoY/OOP2zjOxNKsM=";
49 "bert-base-uncased-vocab.txt" = fetchurl {
50 url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
51 hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
53 "big.txt" = fetchurl {
54 url = "https://norvig.com/big.txt";
55 hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
57 "bert-wiki.json" = fetchurl {
58 url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
59 hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
61 "tokenizer-wiki.json" = fetchurl {
62 url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
63 hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
65 "openai-gpt-vocab.json" = fetchurl {
66 url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
67 hash = "sha256-/fSbGefeI2hSCR2gm4Sno81eew55kWN2z0X2uBJ7gHg=";
69 "openai-gpt-merges.txt" = fetchurl {
70 url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
71 hash = "sha256-Dqm1GuaVBzzYceA1j3AWMR1nGn/zlj42fVI2Ui8pRyU=";
75 buildPythonPackage rec {
80 src = fetchFromGitHub {
81 owner = "huggingface";
83 rev = "refs/tags/v${version}";
84 hash = "sha256-QTe1QdmJHSoosNG9cCJS7uQNdoMwgL+CJHQQUX5VtSY=";
87 cargoDeps = rustPlatform.importCargoLock { lockFile = ./Cargo.lock; };
89 sourceRoot = "${src.name}/bindings/python";
90 maturinBuildFlags = [ "--interpreter ${python.executable}" ];
95 rustPlatform.cargoSetupHook
96 rustPlatform.maturinBuildHook
103 ++ lib.optionals stdenv.hostPlatform.isDarwin [
113 nativeCheckInputs = [
121 # Add data files for tests, otherwise tests attempt network access
122 mkdir $sourceRoot/tests/data
123 ln -s ${test-data}/* $sourceRoot/tests/data/
127 export HOME=$(mktemp -d);
130 pythonImportsCheck = [ "tokenizers" ];
133 # Downloads data using the datasets module
134 "test_encode_special_tokens"
136 "TestTrainFromIterators"
138 # These tests require more data
139 "test_from_pretrained"
140 "test_from_pretrained_revision"
141 "test_continuing_prefix_trainer_mistmatch"
144 disabledTestPaths = [
145 # Fails with: fixture 'model' not found
146 "benches/test_tiktoken.py"
150 description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
151 homepage = "https://github.com/huggingface/tokenizers";
152 changelog = "https://github.com/huggingface/tokenizers/releases/tag/v${version}";
153 license = lib.licenses.asl20;
154 maintainers = with lib.maintainers; [ GaetanLepage ];
155 platforms = lib.platforms.unix;