{
  lib,
  stdenv,
  linkFarm,
  fetchurl,
  buildPythonPackage,
  fetchFromGitHub,
  python,

  # nativeBuildInputs
  pkg-config,
  setuptools-rust,
  rustPlatform,
  cargo,
  rustc,

  # buildInputs
  openssl,
  libiconv,
  Security,

  # dependencies
  huggingface-hub,
  numpy,

  # tests
  datasets,
  pytestCheckHook,
  requests,
  tiktoken,
}:

let
  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
  # about URLs and file names
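  # linkFarm assembles these pre-fetched files into a single directory of symlinks,
  # which postUnpack below links into tests/data so the test suite can run offline.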
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
      hash = "sha256-nn9jwtFdZmtS4h0lDS5RO4fJtxPPpph6gu2J5eblBlU=";
    };
    "roberta-base-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
      hash = "sha256-HOFmR3PFDz4MyIQmGak+3EYkUltyixiKngvjO3cmrcU=";
    };
    "albert-base-v1-tokenizer.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
      hash = "sha256-biqj1cpMaEG8NqUCgXnLTWPBKZMfoY/OOP2zjOxNKsM=";
    };
    "bert-base-uncased-vocab.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
    };
    "bert-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
      hash = "sha256-/fSbGefeI2hSCR2gm4Sno81eew55kWN2z0X2uBJ7gHg=";
    };
    "openai-gpt-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
      hash = "sha256-Dqm1GuaVBzzYceA1j3AWMR1nGn/zlj42fVI2Ui8pRyU=";
    };
  };
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.20.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    rev = "refs/tags/v${version}";
    hash = "sha256-QTe1QdmJHSoosNG9cCJS7uQNdoMwgL+CJHQQUX5VtSY=";
  };

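  # Vendor the Rust crate dependencies pinned by the Cargo.lock kept alongside this file.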
  cargoDeps = rustPlatform.importCargoLock { lockFile = ./Cargo.lock; };

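  # The Python bindings live in a subdirectory of the upstream repository; build the
  # wheel only for this derivation's Python interpreter.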
  sourceRoot = "${src.name}/bindings/python";
  maturinBuildFlags = [ "--interpreter ${python.executable}" ];

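  # cargoSetupHook points cargo at the vendored dependencies from cargoDeps;
  # maturinBuildHook drives the actual wheel build.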
  nativeBuildInputs = [
    pkg-config
    setuptools-rust
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    cargo
    rustc
  ];

  buildInputs =
    [ openssl ]
    ++ lib.optionals stdenv.hostPlatform.isDarwin [
      libiconv
      Security
    ];

  dependencies = [
    huggingface-hub
    numpy
  ];

  nativeCheckInputs = [
    datasets
    pytestCheckHook
    requests
    tiktoken
  ];

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access
    mkdir $sourceRoot/tests/data
    ln -s ${test-data}/* $sourceRoot/tests/data/
  '';

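  # The tests need a writable HOME (e.g. for the Hugging Face cache).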
  preCheck = ''
    export HOME=$(mktemp -d);
  '';

  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # These tests download data using the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"

    # These tests require additional data files
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  disabledTestPaths = [
    # fixture 'model' not found
    "benches/test_tiktoken.py"
  ];

  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    changelog = "https://github.com/huggingface/tokenizers/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ GaetanLepage ];
    platforms = lib.platforms.unix;
  };
}