{ lib
, stdenv
, botocore
, buildPythonPackage
, cryptography
, cssselect
, fetchPypi
, fetchpatch
, glibcLocales
, installShellFiles
, itemadapter
, itemloaders
, jmespath
, lxml
, packaging
, parsel
, pexpect
, protego
, pydispatcher
, pyopenssl
, pytestCheckHook
, pythonOlder
, queuelib
, service-identity
, sybil
, testfixtures
, tldextract
, twisted
, w3lib
, zope_interface
}:

buildPythonPackage rec {
  pname = "scrapy";
  version = "2.11.0";
  format = "setuptools";

  disabled = pythonOlder "3.8";

  src = fetchPypi {
    inherit version;
    pname = "Scrapy";
    hash = "sha256-PL3tzgw/DgSC1hvi10WGg758188UsO5q37rduA9bNqU=";
  };

  patches = [
    # Fix compatibility with Twisted>=23.8. Remove with the next release.
    (fetchpatch {
      url = "https://github.com/scrapy/scrapy/commit/aa95ada42cdf570f840f55c463375f8a81b303f8.patch";
      hash = "sha256-LuhA5BqtjSUgkotplvUCtvGNYOTrl0MJRCXiSBMDFzY=";
      excludes = [
        "tests/CrawlerProcess/sleeping.py"
        "tests/test_crawler.py"
      ];
    })
  ];

  nativeBuildInputs = [
    installShellFiles
  ];

  propagatedBuildInputs = [
    cryptography
    cssselect
    itemadapter
    itemloaders
    lxml
    packaging
    parsel
    protego
    pydispatcher
    pyopenssl
    queuelib
    service-identity
    tldextract
    twisted
    w3lib
    zope_interface
  ];

  nativeCheckInputs = [
    botocore
    glibcLocales
    jmespath
    pexpect
    pytestCheckHook
    sybil
    testfixtures
  ];
  LC_ALL = "en_US.UTF-8";

  disabledTestPaths = [
    "tests/test_proxy_connect.py"
    "tests/test_utils_display.py"
    "tests/test_command_check.py"
    # Don't test the documentation
    "docs"
  ];

  disabledTests = [
    # It's unclear if the failures are related to libxml2, https://github.com/NixOS/nixpkgs/pull/123890
    "test_nested_css"
    "test_nested_xpath"
    "test_flavor_detection"
    "test_follow_whitespace"
    # Requires network access
    "AnonymousFTPTestCase"
    "FTPFeedStorageTest"
    "FeedExportTest"
    "test_custom_asyncio_loop_enabled_true"
    "test_custom_loop_asyncio"
    "test_custom_loop_asyncio_deferred_signal"
    "FileFeedStoragePreFeedOptionsTest"  # https://github.com/scrapy/scrapy/issues/5157
    "test_persist"
    "test_timeout_download_from_spider_nodata_rcvd"
    "test_timeout_download_from_spider_server_hangs"
    "test_unbounded_response"
    "CookiesMiddlewareTest"
    # Depends on uvloop
    "test_asyncio_enabled_reactor_different_loop"
    "test_asyncio_enabled_reactor_same_loop"
    # Fails with AssertionError
    "test_peek_fifo"
    "test_peek_one_element"
    "test_peek_lifo"
    "test_callback_kwargs"
    # Test fails on Hydra
    "test_start_requests_laziness"
  ] ++ lib.optionals stdenv.isDarwin [
    "test_xmliter_encoding"
    "test_download"
    "test_reactor_default_twisted_reactor_select"
    "URIParamsSettingTest"
    "URIParamsFeedOptionTest"
    # Flaky on darwin-aarch64
    "test_fixed_delay"
    "test_start_requests_laziness"
  ];
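
  # Install the man page and shell completions shipped in extras/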
  postInstall = ''
    installManPage extras/scrapy.1
    installShellCompletion --cmd scrapy \
      --zsh extras/scrapy_zsh_completion \
      --bash extras/scrapy_bash_completion
  '';

  pythonImportsCheck = [
    "scrapy"
  ];
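
  # The test suite starts local servers; allow localhost networking in the Darwin sandbox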
  __darwinAllowLocalNetworking = true;

  meta = with lib; {
    description = "High-level web crawling and web scraping framework";
    longDescription = ''
      Scrapy is a fast, high-level web crawling and web scraping framework used
      to crawl websites and extract structured data from their pages. It can be
      used for a wide range of purposes, from data mining to monitoring and
      automated testing.
    '';
    homepage = "https://scrapy.org/";
    changelog = "https://github.com/scrapy/scrapy/raw/${version}/docs/news.rst";
    license = licenses.bsd3;
    maintainers = with maintainers; [ marsam ];
  };
}