9 python3Packages.buildPythonPackage {
14 src = fetchFromGitHub {
15 owner = "huggingface";
17 rev = "refs/tags/v${version}";
18 hash = "sha256-2NJja2yWeHOgo1pCuwHN6SgYnsimuZdK0jE8ucTH4r8=";
21 nativeBuildInputs = with python3Packages; [ setuptools ];
23 propagatedBuildInputs = with python3Packages; [
35 nativeCheckInputs = with python3Packages; [ pytestCheckHook ];
36 dependencies = with python3Packages; [
47 "tests/executor/test_local.py"
48 "tests/pipeline/test_filters.py"
49 "tests/pipeline/test_bloom_filter.py"
50 "tests/pipeline/test_minhash.py"
51 "tests/pipeline/test_sentence_deduplication.py"
52 "tests/pipeline/test_tokenization.py"
53 "tests/pipeline/test_exact_substrings.py"
56 pythonImportsCheck = [ "datatrove" ];
58 description = "Set of platform-agnostic customizable pipeline processing blocks for data processing";
59 homepage = "https://github.com/huggingface/datatrove";
60 changelog = "https://github.com/huggingface/datatrove/releases/tag/v${version}";
61 license = lib.licenses.asl20;
62 maintainers = with lib.maintainers; [ luftmensch-luftmensch ];
63 platforms = lib.platforms.all;