pytrainer: unpin python 3.10
[NixPkgs.git] / pkgs / development / python-modules / unstructured / default.nix
blob6625de40c87c6a5d047cb400511d0ca901fb6088
2   lib,
3   buildPythonPackage,
4   fetchFromGitHub,
5   # propagated build inputs
6   chardet,
7   filetype,
8   lxml,
9   msg-parser,
10   nltk,
11   openpyxl,
12   pandas,
13   pdf2image,
14   pdfminer-six,
15   pillow,
16   pypandoc,
17   python-docx,
18   python-pptx,
19   python-magic,
20   markdown,
21   requests,
22   tabulate,
23   xlrd,
24   # optional-dependencies
25   langdetect,
26   sacremoses,
27   sentencepiece,
28   torch,
29   transformers,
30   unstructured-inference,
31   s3fs,
32   fsspec,
33   adlfs,
34   # , discord-py
35   pygithub,
36   python-gitlab,
37   praw,
38   slack-sdk,
39   wikipedia,
40   google-api-python-client,
41   # , gcsfs
42   elasticsearch8,
43   jq,
44   # , dropboxdrivefs
45   atlassian-python-api,
46   # test dependencies
47   pytestCheckHook,
48   black,
49   coverage,
50   click,
51   freezegun,
52   # , label-studio-sdk
53   mypy,
54   pytest-cov,
55   pytest-mock,
56   vcrpy,
57   grpcio,
59 let
60   version = "0.15.14";
61   optional-dependencies = {
62     huggingflace = [
63       langdetect
64       sacremoses
65       sentencepiece
66       torch
67       transformers
68     ];
69     local-inference = [ unstructured-inference ];
70     s3 = [
71       s3fs
72       fsspec
73     ];
74     azure = [
75       adlfs
76       fsspec
77     ];
78     discord = [ ]; # discord-py
79     github = [ pygithub ];
80     gitlab = [ python-gitlab ];
81     reddit = [ praw ];
82     slack = [ slack-sdk ];
83     wikipedia = [ wikipedia ];
84     google-drive = [ google-api-python-client ];
85     gcs = [ ]; # gcsfs fsspec
86     elasticsearch = [
87       elasticsearch8
88       jq
89     ];
90     dropbox = [ ]; # dropboxdrivefs fsspec
91     confluence = [ atlassian-python-api ];
92   };
94 buildPythonPackage {
95   pname = "unstructured";
96   inherit version;
97   format = "setuptools";
99   src = fetchFromGitHub {
100     owner = "Unstructured-IO";
101     repo = "unstructured";
102     rev = "refs/tags/${version}";
103     hash = "sha256-cIMKaSKG4T832rpiJeiwftqVrcMezD9ICfislGPV/TQ=";
104   };
106   propagatedBuildInputs = [
107     chardet
108     filetype
109     lxml
110     msg-parser
111     nltk
112     openpyxl
113     pandas
114     pdf2image
115     pdfminer-six
116     pillow
117     pypandoc
118     python-docx
119     python-pptx
120     python-magic
121     markdown
122     requests
123     tabulate
124     xlrd
125   ];
127   pythonImportsCheck = [ "unstructured" ];
129   # test try to download punkt from nltk
130   # figure out how to make it available to enable the tests
131   doCheck = false;
133   nativeCheckInputs = [
134     pytestCheckHook
135     black
136     coverage
137     click
138     freezegun
139     mypy
140     pytest-cov
141     pytest-mock
142     vcrpy
143     grpcio
144   ];
146   optional-dependencies = optional-dependencies;
148   meta = with lib; {
149     description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
150     mainProgram = "unstructured-ingest";
151     homepage = "https://github.com/Unstructured-IO/unstructured";
152     changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
153     license = licenses.asl20;
154     maintainers = with maintainers; [ happysalada ];
155   };