keepalived add meta.mainProgram (#380296)
[NixPkgs.git] / pkgs / development / python-modules / unstructured / default.nix
blobeac7e5e0422462b68e6c931ee8f343b900702a3c
2   lib,
3   buildPythonPackage,
4   fetchFromGitHub,
6   # core networking and async dependencies
7   anyio,
8   backoff,
9   certifi,
10   httpcore,
11   httpx,
12   h11,
13   nest-asyncio,
14   requests,
15   requests-toolbelt,
16   sniffio,
17   urllib3,
19   # core parsing and processing
20   beautifulsoup4,
21   chardet,
22   charset-normalizer,
23   emoji,
24   filetype,
25   html5lib,
26   idna,
27   joblib,
28   # jsonpath-python,
29   nltk,
30   olefile,
31   orderly-set,
32   python-dateutil,
33   # python-iso639,
34   python-magic,
35   # python-oxmsg,
36   rapidfuzz,
37   regex,
38   soupsieve,
39   webencodings,
41   # core data handling
42   dataclasses-json,
43   deepdiff,
44   marshmallow,
45   mypy-extensions,
46   packaging,
47   typing-extensions,
48   typing-inspect,
50   # core system utilities
51   cffi,
52   cryptography,
53   psutil,
54   pycparser,
55   six,
56   tqdm,
57   wrapt,
59   # document format support
60   markdown,
61   pdfminer-six,
62   pdfplumber,
63   # pi-heif,
64   pikepdf,
65   pypandoc,
66   pypdf,
67   python-docx,
68   # unstructured-client,
69   # unstructured-pytesseract,
70   # optional dependencies
71   # csv
72   pytz,
73   tzdata,
74   # markdown
75   importlib-metadata,
76   zipp,
77   # pdf
78   opencv-python,
79   paddlepaddle,
80   pdf2image,
81   # unstructured-paddleocr,
82   # pptx
83   lxml,
84   pillow,
85   python-pptx,
86   xlsxwriter,
87   # xlsx
88   et-xmlfile,
89   networkx,
90   numpy,
91   openpyxl,
92   pandas,
93   xlrd,
94   # huggingface
95   langdetect,
96   sacremoses,
97   sentencepiece,
98   torch,
99   transformers,
100   # local-inference
101   unstructured-inference,
102   # test dependencies
103   pytestCheckHook,
104   black,
105   coverage,
106   click,
107   freezegun,
108   # , label-studio-sdk
109   mypy,
110   pytest-cov-stub,
111   pytest-mock,
112   vcrpy,
113   grpcio,
116   version = "0.16.15";
118 buildPythonPackage {
119   pname = "unstructured";
120   inherit version;
121   format = "setuptools";
123   src = fetchFromGitHub {
        # Source is fetched from GitHub by tag; the tag is the bare version
        # string, so `version` is passed through unchanged.
124     owner = "Unstructured-IO";
125     repo = "unstructured";
126     tag = version;
        # Fixed-output hash of the fetched tree; refresh it whenever `version` changes.
127     hash = "sha256-Wp51LOgM/zE81324Qzu83XGupUMAzz2wn+COmNq95H8=";
128   };
130   propagatedBuildInputs = [
131     # Base dependencies, listed alphabetically.
        # Entries that are commented out (jsonpath-python, python-iso639,
        # python-oxmsg, unstructured-client) are not passed to the build.
        # NOTE(review): presumably these are not packaged yet; confirm before enabling.
132     anyio
133     backoff
134     beautifulsoup4
135     certifi
136     cffi
137     chardet
138     charset-normalizer
139     click
140     cryptography
141     dataclasses-json
142     deepdiff
143     emoji
144     filetype
145     h11
146     html5lib
147     httpcore
148     httpx
149     idna
150     joblib
151     # jsonpath-python
152     langdetect
153     lxml
154     marshmallow
155     mypy-extensions
156     nest-asyncio
157     nltk
158     numpy
159     olefile
160     orderly-set
161     packaging
162     psutil
163     pycparser
164     pypdf
165     python-dateutil
166     # python-iso639
167     python-magic
168     # python-oxmsg
169     rapidfuzz
170     regex
171     requests
172     requests-toolbelt
173     six
174     sniffio
175     soupsieve
176     tqdm
177     typing-extensions
178     typing-inspect
179     # unstructured-client
180     urllib3
181     webencodings
182     wrapt
183   ];
185   optional-dependencies = rec {
        # `rec` is required so that `all-docs` can refer to the sibling lists below.
        # `all-docs` concatenates the per-format lists only; `paddleocr` and
        # `huggingface` are separate groups and are not part of it.
186     all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx;
187     csv = [
188       numpy
189       pandas
190       python-dateutil
191       pytz
192       tzdata
193     ];
194     docx = [
195       lxml
196       python-docx
197       typing-extensions
198     ];
199     epub = [ pypandoc ];
        # Markdown support group (named `req-markdown` in this set).
200     req-markdown = [
201       importlib-metadata
202       markdown
203       zipp
204     ];
205     odt = [
206       lxml
207       pypandoc
208       python-docx
209       typing-extensions
210     ];
211     org = [
212       pypandoc
213     ];
        # Incomplete group: two of its entries are commented out below.
214     paddleocr = [
215       opencv-python
216       # paddlepaddle # 3.12 not supported for now
217       pdf2image
218       # unstructured-paddleocr
219     ];
220     pdf = [
221       pdf2image
222       pdfminer-six
223       pdfplumber
224       # pi-heif
225       pikepdf
226       pypdf
227       unstructured-inference
228       # unstructured-pytesseract
229     ];
230     pptx = [
231       lxml
232       pillow
233       python-pptx
234       xlsxwriter
235     ];
236     xlsx = [
237       et-xmlfile
238       networkx
239       numpy
240       openpyxl
241       pandas
242       xlrd
243     ];
244     huggingface = [
245       langdetect
246       sacremoses
247       sentencepiece
248       torch
249       transformers
250     ];
251   };
253   pythonImportsCheck = [ "unstructured" ];
255   # The tests try to download punkt from nltk.
256   # TODO: figure out how to make it available so the tests can be enabled.
257   doCheck = false;
      # Only consumed by the check phase, which `doCheck = false` currently disables.
259   nativeCheckInputs = [
260     pytestCheckHook
261     black
262     coverage
263     click
264     freezegun
265     mypy
266     pytest-cov-stub
267     pytest-mock
268     vcrpy
269     grpcio
270   ];
272   meta = with lib; {
273     description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
274     mainProgram = "unstructured-ingest";
275     homepage = "https://github.com/Unstructured-IO/unstructured";
276     changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
277     license = licenses.asl20;
278     maintainers = with maintainers; [ happysalada ];
279   };