btrbk: add mainProgram
[NixPkgs.git] / pkgs / by-name / da / datatrove / package.nix
blob61ac911cc8e458ff83ebefe7e575d1eb1743bacd
2   lib,
3   fetchFromGitHub,
4   python3Packages,
5 }:
6 let
7   version = "0.2.0";
8 in
9 python3Packages.buildPythonPackage {
10   pname = "datatrove";
11   inherit version;
12   pyproject = true;
14   src = fetchFromGitHub {
15     owner = "huggingface";
16     repo = "datatrove";
17     rev = "refs/tags/v${version}";
18     hash = "sha256-2NJja2yWeHOgo1pCuwHN6SgYnsimuZdK0jE8ucTH4r8=";
19   };
21   nativeBuildInputs = with python3Packages; [ setuptools ];
23   propagatedBuildInputs = with python3Packages; [
24     dill
25     fsspec
26     huggingface-hub
27     tokenizers
28     humanize
29     loguru
30     multiprocess
31     numpy
32     rich
33   ];
35   nativeCheckInputs = with python3Packages; [ pytestCheckHook ];
36   dependencies = with python3Packages; [
37     boto3
38     fasteners
39     huggingface-hub
40     moto
41     nltk
42     s3fs
43     xxhash
44   ];
46   disabledTestPaths = [
47     "tests/executor/test_local.py"
48     "tests/pipeline/test_filters.py"
49     "tests/pipeline/test_bloom_filter.py"
50     "tests/pipeline/test_minhash.py"
51     "tests/pipeline/test_sentence_deduplication.py"
52     "tests/pipeline/test_tokenization.py"
53     "tests/pipeline/test_exact_substrings.py"
54   ];
56   pythonImportsCheck = [ "datatrove" ];
57   meta = {
58     description = "Set of platform-agnostic customizable pipeline processing blocks for data processing";
59     homepage = "https://github.com/huggingface/datatrove";
60     changelog = "https://github.com/huggingface/datatrove/releases/tag/v${version}";
61     license = lib.licenses.asl20;
62     maintainers = with lib.maintainers; [ luftmensch-luftmensch ];
63     platforms = lib.platforms.all;
64   };