yt_dlp/networking/_requests.py

   1 from __future__ import annotations
   2
   3 import contextlib
   4 import functools
   5 import http.client
   6 import logging
   7 import re
   8 import socket
   9 import warnings
  10
  11 from ..dependencies import brotli, requests, urllib3
  12 from ..utils import bug_reports_message, int_or_none, variadic
  13 from ..utils.networking import normalize_url
  14
# Import-time guards: refuse to load this module when either optional
# dependency is missing (they compare equal to None here; presumably
# `..dependencies` exposes None when the import failed - confirm there).
if requests is None:
    raise ImportError('requests module is not installed')

if urllib3 is None:
    raise ImportError('urllib3 module is not installed')

# Dotted version string turned into a tuple so it can be compared against
# tuples below. `int_or_none(..., default=0)` is defined elsewhere; presumably
# non-numeric components (e.g. pre-release tags) become 0 - confirm.
urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.'))

if urllib3_version < (1, 26, 17):
    raise ImportError('Only urllib3 >= 1.26.17 is supported')

# `requests.__build__` is compared as an integer; 0x023202 corresponds to
# version 2.32.2 as stated in the error message.
if requests.__build__ < 0x023202:
    raise ImportError('Only requests >= 2.32.2 is supported')
  28
  29 import requests.adapters
  30 import requests.utils
  31 import urllib3.connection
  32 import urllib3.exceptions
  33 import urllib3.util
  34
  35 from ._helper import (
  36     InstanceStoreMixin,
  37     add_accept_encoding_header,
  38     create_connection,
  39     create_socks_proxy_socket,
  40     get_redirect_method,
  41     make_socks_proxy_opts,
  42     select_proxy,
  43 )
  44 from .common import (
  45     Features,
  46     RequestHandler,
  47     Response,
  48     register_preference,
  49     register_rh,
  50 )
  51 from .exceptions import (
  52     CertificateVerifyError,
  53     HTTPError,
  54     IncompleteRead,
  55     ProxyError,
  56     RequestError,
  57     SSLError,
  58     TransportError,
  59 )
  60 from ..socks import ProxyError as SocksProxyError
  61
  62 SUPPORTED_ENCODINGS = [
  63     'gzip', 'deflate',
  64 ]
  65
  66 if brotli is not None:
  67     SUPPORTED_ENCODINGS.append('br')
  68
  69 '''
  70 Override urllib3's behavior to not convert lower-case percent-encoded characters
  71 to upper-case during url normalization process.
  72
RFC3986 defines that the lower or upper case percent-encoded hexadecimal characters are equivalent
  74 and normalizers should convert them to uppercase for consistency [1].
  75
  76 However, some sites may have an incorrect implementation where they provide
  77 a percent-encoded url that is then compared case-sensitively.[2]
  78
While this is a very rare case, since urllib does not do this normalization step, it
is best to avoid it in requests too for compatibility reasons.
  81
  82 1: https://tools.ietf.org/html/rfc3986#section-2.1
  83 2: https://github.com/streamlink/streamlink/pull/4003
  84 '''
  85
  86
  87 class Urllib3PercentREOverride:
  88     def __init__(self, r: re.Pattern):
  89         self.re = r
  90
  91     # pass through all other attribute calls to the original re
  92     def __getattr__(self, item):
  93         return self.re.__getattribute__(item)
  94
  95     def subn(self, repl, string, *args, **kwargs):
  96         return string, self.re.subn(repl, string, *args, **kwargs)[1]
  97
  98
# urllib3 >= 1.25.8 uses subn:
# https://github.com/urllib3/urllib3/commit/a2697e7c6b275f05879b60f593c5854a816489f0
import urllib3.util.url

# Replace the module-level pattern with the pass-through wrapper. The attribute
# is probed under both names because it is spelled differently across urllib3
# versions (public name first, underscore-prefixed name second).
if hasattr(urllib3.util.url, 'PERCENT_RE'):
    urllib3.util.url.PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url.PERCENT_RE)
elif hasattr(urllib3.util.url, '_PERCENT_RE'):  # urllib3 >= 2.0.0
    urllib3.util.url._PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url._PERCENT_RE)
else:
    # Neither attribute exists: nothing is patched and only a warning is emitted,
    # so importing this module still succeeds.
    warnings.warn('Failed to patch PERCENT_RE in urllib3 (does the attribute exist?)' + bug_reports_message())
 109
 110 '''
Workaround for issue in urllib3.util.ssl_: ssl_wrap_socket does not pass
server_hostname to SSLContext.wrap_socket if server_hostname is an IP,
however this is an issue because we set check_hostname to True in our SSLContext.

Monkey-patching IS_SECURETRANSPORT forces ssl_wrap_socket to pass server_hostname regardless.
 116
 117 This has been fixed in urllib3 2.0+.
 118 See: https://github.com/urllib3/urllib3/issues/517
 119 '''
 120
# Only applied to urllib3 1.x
if urllib3_version < (2, 0, 0):
    # Best effort: any exception raised while patching is deliberately ignored
    with contextlib.suppress(Exception):
        # The flag is set on both module objects in a single chained assignment
        urllib3.util.IS_SECURETRANSPORT = urllib3.util.ssl_.IS_SECURETRANSPORT = True
 124
 125
# Requests will not automatically handle no_proxy by default
# due to buggy no_proxy handling with proxy dict [1].
# 1. https://github.com/psf/requests/issues/5000
# Monkey-patch: rebind the `select_proxy` name inside `requests.adapters`
# to our own implementation imported from `._helper`.
requests.adapters.select_proxy = select_proxy
 130
 131
 132 class RequestsResponseAdapter(Response):
 133     def __init__(self, res: requests.models.Response):
 134         super().__init__(
 135             fp=res.raw, headers=res.headers, url=res.url,
 136             status=res.status_code, reason=res.reason)
 137
 138         self._requests_response = res
 139
 140     def read(self, amt: int | None = None):
 141         try:
 142             # Interact with urllib3 response directly.
 143             return self.fp.read(amt, decode_content=True)
 144
 145         # See urllib3.response.HTTPResponse.read() for exceptions raised on read
 146         except urllib3.exceptions.SSLError as e:
 147             raise SSLError(cause=e) from e
 148
 149         except urllib3.exceptions.ProtocolError as e:
 150             # IncompleteRead is always contained within ProtocolError
 151             # See urllib3.response.HTTPResponse._error_catcher()
 152             ir_err = next(
 153                 (err for err in (e.__context__, e.__cause__, *variadic(e.args))
 154                  if isinstance(err, http.client.IncompleteRead)), None)
 155             if ir_err is not None:
 156                 # `urllib3.exceptions.IncompleteRead` is subclass of `http.client.IncompleteRead`
 157                 # but uses an `int` for its `partial` property.
 158                 partial = ir_err.partial if isinstance(ir_err.partial, int) else len(ir_err.partial)
 159                 raise IncompleteRead(partial=partial, expected=ir_err.expected) from e
 160             raise TransportError(cause=e) from e
 161
 162         except urllib3.exceptions.HTTPError as e:
 163             # catch-all for any other urllib3 response exceptions
 164             raise TransportError(cause=e) from e
 165
 166
 167 class RequestsHTTPAdapter(requests.adapters.HTTPAdapter):
 168     def __init__(self, ssl_context=None, proxy_ssl_context=None, source_address=None, **kwargs):
 169         self._pm_args = {}
 170         if ssl_context:
 171             self._pm_args['ssl_context'] = ssl_context
 172         if source_address:
 173             self._pm_args['source_address'] = (source_address, 0)
 174         self._proxy_ssl_context = proxy_ssl_context or ssl_context
 175         super().__init__(**kwargs)
 176
 177     def init_poolmanager(self, *args, **kwargs):
 178         return super().init_poolmanager(*args, **kwargs, **self._pm_args)
 179
 180     def proxy_manager_for(self, proxy, **proxy_kwargs):
 181         extra_kwargs = {}
 182         if not proxy.lower().startswith('socks') and self._proxy_ssl_context:
 183             extra_kwargs['proxy_ssl_context'] = self._proxy_ssl_context
 184         return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs)
 185
 186     # Skip `requests` internal verification; we use our own SSLContext
 187     def cert_verify(*args, **kwargs):
 188         pass
 189
 190     # requests 2.32.2+: Reimplementation without `_urllib3_request_context`
 191     def get_connection_with_tls_context(self, request, verify, proxies=None, cert=None):
 192         url = urllib3.util.parse_url(request.url).url
 193
 194         manager = self.poolmanager
 195         if proxy := select_proxy(url, proxies):
 196             manager = self.proxy_manager_for(proxy)
 197
 198         return manager.connection_from_url(url)
 199
 200
 201 class RequestsSession(requests.sessions.Session):
 202     """
 203     Ensure unified redirect method handling with our urllib redirect handler.
 204     """
 205
 206     def rebuild_method(self, prepared_request, response):
 207         new_method = get_redirect_method(prepared_request.method, response.status_code)
 208
 209         # HACK: requests removes headers/body on redirect unless code was a 307/308.
 210         if new_method == prepared_request.method:
 211             response._real_status_code = response.status_code
 212             response.status_code = 308
 213
 214         prepared_request.method = new_method
 215
 216         # Requests fails to resolve dot segments on absolute redirect locations
 217         # See: https://github.com/yt-dlp/yt-dlp/issues/9020
 218         prepared_request.url = normalize_url(prepared_request.url)
 219
 220     def rebuild_auth(self, prepared_request, response):
 221         # HACK: undo status code change from rebuild_method, if applicable.
 222         # rebuild_auth runs after requests would remove headers/body based on status code
 223         if hasattr(response, '_real_status_code'):
 224             response.status_code = response._real_status_code
 225             del response._real_status_code
 226         return super().rebuild_auth(prepared_request, response)
 227
 228
 229 class Urllib3LoggingFilter(logging.Filter):
 230
 231     def filter(self, record):
 232         # Ignore HTTP request messages since HTTPConnection prints those
 233         return record.msg != '%s://%s:%s "%s %s %s" %s %s'
 234
 235
 236 class Urllib3LoggingHandler(logging.Handler):
 237     """Redirect urllib3 logs to our logger"""
 238
 239     def __init__(self, logger, *args, **kwargs):
 240         super().__init__(*args, **kwargs)
 241         self._logger = logger
 242
 243     def emit(self, record):
 244         try:
 245             msg = self.format(record)
 246             if record.levelno >= logging.ERROR:
 247                 self._logger.error(msg)
 248             else:
 249                 self._logger.stdout(msg)
 250
 251         except Exception:
 252             self.handleError(record)
 253
 254
 255 @register_rh
 256 class RequestsRH(RequestHandler, InstanceStoreMixin):
 257
 258     """Requests RequestHandler
 259     https://github.com/psf/requests
 260     """
 261     _SUPPORTED_URL_SCHEMES = ('http', 'https')
 262     _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS)
 263     _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
 264     _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
 265     RH_NAME = 'requests'
 266
 267     def __init__(self, *args, **kwargs):
 268         super().__init__(*args, **kwargs)
 269
 270         # Forward urllib3 debug messages to our logger
 271         logger = logging.getLogger('urllib3')
 272         self.__logging_handler = Urllib3LoggingHandler(logger=self._logger)
 273         self.__logging_handler.setFormatter(logging.Formatter('requests: %(message)s'))
 274         self.__logging_handler.addFilter(Urllib3LoggingFilter())
 275         logger.addHandler(self.__logging_handler)
 276         # TODO: Use a logger filter to suppress pool reuse warning instead
 277         logger.setLevel(logging.ERROR)
 278
 279         if self.verbose:
 280             # Setting this globally is not ideal, but is easier than hacking with urllib3.
 281             # It could technically be problematic for scripts embedding yt-dlp.
 282             # However, it is unlikely debug traffic is used in that context in a way this will cause problems.
 283             urllib3.connection.HTTPConnection.debuglevel = 1
 284             logger.setLevel(logging.DEBUG)
 285         # this is expected if we are using --no-check-certificate
 286         urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 287
 288     def close(self):
 289         self._clear_instances()
 290         # Remove the logging handler that contains a reference to our logger
 291         # See: https://github.com/yt-dlp/yt-dlp/issues/8922
 292         logging.getLogger('urllib3').removeHandler(self.__logging_handler)
 293
 294     def _check_extensions(self, extensions):
 295         super()._check_extensions(extensions)
 296         extensions.pop('cookiejar', None)
 297         extensions.pop('timeout', None)
 298         extensions.pop('legacy_ssl', None)
 299
 300     def _create_instance(self, cookiejar, legacy_ssl_support=None):
 301         session = RequestsSession()
 302         http_adapter = RequestsHTTPAdapter(
 303             ssl_context=self._make_sslcontext(legacy_ssl_support=legacy_ssl_support),
 304             source_address=self.source_address,
 305             max_retries=urllib3.util.retry.Retry(False),
 306         )
 307         session.adapters.clear()
 308         session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'})
 309         session.mount('https://', http_adapter)
 310         session.mount('http://', http_adapter)
 311         session.cookies = cookiejar
 312         session.trust_env = False  # no need, we already load proxies from env
 313         return session
 314
 315     def _send(self, request):
 316
 317         headers = self._merge_headers(request.headers)
 318         add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
 319
 320         max_redirects_exceeded = False
 321
 322         session = self._get_instance(
 323             cookiejar=self._get_cookiejar(request),
 324             legacy_ssl_support=request.extensions.get('legacy_ssl'),
 325         )
 326
 327         try:
 328             requests_res = session.request(
 329                 method=request.method,
 330                 url=request.url,
 331                 data=request.data,
 332                 headers=headers,
 333                 timeout=self._calculate_timeout(request),
 334                 proxies=self._get_proxies(request),
 335                 allow_redirects=True,
 336                 stream=True,
 337             )
 338
 339         except requests.exceptions.TooManyRedirects as e:
 340             max_redirects_exceeded = True
 341             requests_res = e.response
 342
 343         except requests.exceptions.SSLError as e:
 344             if 'CERTIFICATE_VERIFY_FAILED' in str(e):
 345                 raise CertificateVerifyError(cause=e) from e
 346             raise SSLError(cause=e) from e
 347
 348         except requests.exceptions.ProxyError as e:
 349             raise ProxyError(cause=e) from e
 350
 351         except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
 352             raise TransportError(cause=e) from e
 353
 354         except urllib3.exceptions.HTTPError as e:
 355             # Catch any urllib3 exceptions that may leak through
 356             raise TransportError(cause=e) from e
 357
 358         except requests.exceptions.RequestException as e:
 359             # Miscellaneous Requests exceptions. May not necessary be network related e.g. InvalidURL
 360             raise RequestError(cause=e) from e
 361
 362         res = RequestsResponseAdapter(requests_res)
 363
 364         if not 200 <= res.status < 300:
 365             raise HTTPError(res, redirect_loop=max_redirects_exceeded)
 366
 367         return res
 368
 369
@register_preference(RequestsRH)
def requests_preference(rh, request):
    """Return a constant preference of 100 for RequestsRH, regardless of the request.

    NOTE(review): how the value is weighed against other handlers is decided
    by `register_preference`, which is defined elsewhere - confirm there.
    """
    return 100
 373
 374
 375 # Use our socks proxy implementation with requests to avoid an extra dependency.
 376 class SocksHTTPConnection(urllib3.connection.HTTPConnection):
 377     def __init__(self, _socks_options, *args, **kwargs):  # must use _socks_options to pass PoolKey checks
 378         self._proxy_args = _socks_options
 379         super().__init__(*args, **kwargs)
 380
 381     def _new_conn(self):
 382         try:
 383             return create_connection(
 384                 address=(self._proxy_args['addr'], self._proxy_args['port']),
 385                 timeout=self.timeout,
 386                 source_address=self.source_address,
 387                 _create_socket_func=functools.partial(
 388                     create_socks_proxy_socket, (self.host, self.port), self._proxy_args))
 389         except (socket.timeout, TimeoutError) as e:
 390             raise urllib3.exceptions.ConnectTimeoutError(
 391                 self, f'Connection to {self.host} timed out. (connect timeout={self.timeout})') from e
 392         except SocksProxyError as e:
 393             raise urllib3.exceptions.ProxyError(str(e), e) from e
 394         except OSError as e:
 395             raise urllib3.exceptions.NewConnectionError(
 396                 self, f'Failed to establish a new connection: {e}') from e
 397
 398
class SocksHTTPSConnection(SocksHTTPConnection, urllib3.connection.HTTPSConnection):
    # HTTPS variant: adds nothing of its own, it only combines the SOCKS-aware
    # connection (listed first in the bases) with urllib3's HTTPSConnection.
    pass
 401
 402
class SocksHTTPConnectionPool(urllib3.HTTPConnectionPool):
    # Connection class for this pool: our SOCKS-aware HTTP connection
    ConnectionCls = SocksHTTPConnection
 405
 406
class SocksHTTPSConnectionPool(urllib3.HTTPSConnectionPool):
    # Connection class for this pool: our SOCKS-aware HTTPS connection
    ConnectionCls = SocksHTTPSConnection
 409
 410
 411 class SocksProxyManager(urllib3.PoolManager):
 412
 413     def __init__(self, socks_proxy, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw):
 414         connection_pool_kw['_socks_options'] = make_socks_proxy_opts(socks_proxy)
 415         super().__init__(num_pools, headers, **connection_pool_kw)
 416         self.pool_classes_by_scheme = {
 417             'http': SocksHTTPConnectionPool,
 418             'https': SocksHTTPSConnectionPool,
 419         }
 420
 421
# Monkey-patch: rebind the `SOCKSProxyManager` name inside `requests.adapters`
# to our implementation (presumably what HTTPAdapter instantiates for socks
# proxy URLs - confirm against requests.adapters).
requests.adapters.SOCKSProxyManager = SocksProxyManager