6 # core networking and async dependencies
19 # core parsing and processing
50 # core system utilities
59 # document format support
68 # unstructured-client,
69 # unstructured-pytesseract,
70 # optional dependencies
81 # unstructured-paddleocr,
101 unstructured-inference,
119 pname = "unstructured";
121 format = "setuptools";
123 src = fetchFromGitHub {
124 owner = "Unstructured-IO";
125 repo = "unstructured";
127 hash = "sha256-Wp51LOgM/zE81324Qzu83XGupUMAzz2wn+COmNq95H8=";
130 propagatedBuildInputs = [
179 # unstructured-client
185 optional-dependencies = rec {
186 all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx;
216 # paddlepaddle # Python 3.12 is not supported for now
218 # unstructured-paddleocr
227 unstructured-inference
228 # unstructured-pytesseract
253 pythonImportsCheck = [ "unstructured" ];
255 # the tests try to download punkt from nltk
256 # TODO: figure out how to make it available so the tests can be enabled
259 nativeCheckInputs = [
273 description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
274 mainProgram = "unstructured-ingest";
275 homepage = "https://github.com/Unstructured-IO/unstructured";
276 changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
277 license = licenses.asl20;
278 maintainers = with maintainers; [ happysalada ];