yt_dlp/networking/_requests.py

   1 import contextlib
   2 import functools
   3 import http.client
   4 import logging
   5 import re
   6 import socket
   7 import warnings
   8
   9 from ..dependencies import brotli, requests, urllib3
  10 from ..utils import bug_reports_message, int_or_none, variadic
  11 from ..utils.networking import normalize_url
  12
# Refuse to import this module when either optional dependency is unavailable.
if requests is None:
    raise ImportError('requests module is not installed')

if urllib3 is None:
    raise ImportError('urllib3 module is not installed')

# Turn the dotted version string into a tuple so it can be compared component-wise.
# Presumably int_or_none yields 0 (the given default) for non-numeric components - confirm.
urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.'))

if urllib3_version < (1, 26, 17):
    raise ImportError('Only urllib3 >= 1.26.17 is supported')

# requests.__build__ is compared against 0x023100, i.e. version 2.31.0 per the message below
if requests.__build__ < 0x023100:
    raise ImportError('Only requests >= 2.31.0 is supported')
  26
  27 import requests.adapters
  28 import requests.utils
  29 import urllib3.connection
  30 import urllib3.exceptions
  31
  32 from ._helper import (
  33     InstanceStoreMixin,
  34     add_accept_encoding_header,
  35     create_connection,
  36     create_socks_proxy_socket,
  37     get_redirect_method,
  38     make_socks_proxy_opts,
  39     select_proxy,
  40 )
  41 from .common import (
  42     Features,
  43     RequestHandler,
  44     Response,
  45     register_preference,
  46     register_rh,
  47 )
  48 from .exceptions import (
  49     CertificateVerifyError,
  50     HTTPError,
  51     IncompleteRead,
  52     ProxyError,
  53     RequestError,
  54     SSLError,
  55     TransportError,
  56 )
  57 from ..socks import ProxyError as SocksProxyError
  58
  59 SUPPORTED_ENCODINGS = [
  60     'gzip', 'deflate'
  61 ]
  62
  63 if brotli is not None:
  64     SUPPORTED_ENCODINGS.append('br')
  65
  66 """
  67 Override urllib3's behavior to not convert lower-case percent-encoded characters
  68 to upper-case during url normalization process.
  69
RFC3986 defines that the lower or upper case percent-encoded hexadecimal characters are equivalent
and normalizers should convert them to uppercase for consistency [1].
  72
  73 However, some sites may have an incorrect implementation where they provide
  74 a percent-encoded url that is then compared case-sensitively.[2]
  75
While this is a very rare case, since urllib does not do this normalization step, it
is best to avoid it in requests too for compatibility reasons.
  78
  79 1: https://tools.ietf.org/html/rfc3986#section-2.1
  80 2: https://github.com/streamlink/streamlink/pull/4003
  81 """
  82
  83
  84 class Urllib3PercentREOverride:
  85     def __init__(self, r: re.Pattern):
  86         self.re = r
  87
  88     # pass through all other attribute calls to the original re
  89     def __getattr__(self, item):
  90         return self.re.__getattribute__(item)
  91
  92     def subn(self, repl, string, *args, **kwargs):
  93         return string, self.re.subn(repl, string, *args, **kwargs)[1]
  94
  95
# urllib3 >= 1.25.8 uses subn:
# https://github.com/urllib3/urllib3/commit/a2697e7c6b275f05879b60f593c5854a816489f0
import urllib3.util.url  # noqa: E305

# Wrap whichever percent-encoding pattern attribute this urllib3 exposes in
# Urllib3PercentREOverride, whose subn() returns the input string unchanged.
if hasattr(urllib3.util.url, 'PERCENT_RE'):
    urllib3.util.url.PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url.PERCENT_RE)
elif hasattr(urllib3.util.url, '_PERCENT_RE'):  # urllib3 >= 2.0.0
    urllib3.util.url._PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url._PERCENT_RE)
else:
    # Neither known attribute exists: warn instead of failing the import
    warnings.warn('Failed to patch PERCENT_RE in urllib3 (does the attribute exist?)' + bug_reports_message())
 106
 107 """
Workaround for issue in urllib3/util/ssl_.py: ssl_wrap_socket does not pass
server_hostname to SSLContext.wrap_socket if server_hostname is an IP,
however this is an issue because we set check_hostname to True in our SSLContext.

Monkey-patching IS_SECURETRANSPORT forces ssl_wrap_socket to pass server_hostname regardless.
 113
 114 This has been fixed in urllib3 2.0+.
 115 See: https://github.com/urllib3/urllib3/issues/517
 116 """
 117
# Only applied to urllib3 versions older than 2.0.0
if urllib3_version < (2, 0, 0):
    # Best effort: any exception raised while patching is deliberately ignored
    with contextlib.suppress(Exception):
        urllib3.util.IS_SECURETRANSPORT = urllib3.util.ssl_.IS_SECURETRANSPORT = True
 121
 122
# Requests will not automatically handle no_proxy by default
# due to buggy no_proxy handling with proxy dict [1].
# Replace the selector in requests.adapters with the select_proxy imported from ._helper.
# 1. https://github.com/psf/requests/issues/5000
requests.adapters.select_proxy = select_proxy
 127
 128
 129 class RequestsResponseAdapter(Response):
 130     def __init__(self, res: requests.models.Response):
 131         super().__init__(
 132             fp=res.raw, headers=res.headers, url=res.url,
 133             status=res.status_code, reason=res.reason)
 134
 135         self._requests_response = res
 136
 137     def read(self, amt: int = None):
 138         try:
 139             # Interact with urllib3 response directly.
 140             return self.fp.read(amt, decode_content=True)
 141
 142         # See urllib3.response.HTTPResponse.read() for exceptions raised on read
 143         except urllib3.exceptions.SSLError as e:
 144             raise SSLError(cause=e) from e
 145
 146         except urllib3.exceptions.ProtocolError as e:
 147             # IncompleteRead is always contained within ProtocolError
 148             # See urllib3.response.HTTPResponse._error_catcher()
 149             ir_err = next(
 150                 (err for err in (e.__context__, e.__cause__, *variadic(e.args))
 151                  if isinstance(err, http.client.IncompleteRead)), None)
 152             if ir_err is not None:
 153                 # `urllib3.exceptions.IncompleteRead` is subclass of `http.client.IncompleteRead`
 154                 # but uses an `int` for its `partial` property.
 155                 partial = ir_err.partial if isinstance(ir_err.partial, int) else len(ir_err.partial)
 156                 raise IncompleteRead(partial=partial, expected=ir_err.expected) from e
 157             raise TransportError(cause=e) from e
 158
 159         except urllib3.exceptions.HTTPError as e:
 160             # catch-all for any other urllib3 response exceptions
 161             raise TransportError(cause=e) from e
 162
 163
 164 class RequestsHTTPAdapter(requests.adapters.HTTPAdapter):
 165     def __init__(self, ssl_context=None, proxy_ssl_context=None, source_address=None, **kwargs):
 166         self._pm_args = {}
 167         if ssl_context:
 168             self._pm_args['ssl_context'] = ssl_context
 169         if source_address:
 170             self._pm_args['source_address'] = (source_address, 0)
 171         self._proxy_ssl_context = proxy_ssl_context or ssl_context
 172         super().__init__(**kwargs)
 173
 174     def init_poolmanager(self, *args, **kwargs):
 175         return super().init_poolmanager(*args, **kwargs, **self._pm_args)
 176
 177     def proxy_manager_for(self, proxy, **proxy_kwargs):
 178         extra_kwargs = {}
 179         if not proxy.lower().startswith('socks') and self._proxy_ssl_context:
 180             extra_kwargs['proxy_ssl_context'] = self._proxy_ssl_context
 181         return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs)
 182
 183     def cert_verify(*args, **kwargs):
 184         # lean on SSLContext for cert verification
 185         pass
 186
 187
 188 class RequestsSession(requests.sessions.Session):
 189     """
 190     Ensure unified redirect method handling with our urllib redirect handler.
 191     """
 192
 193     def rebuild_method(self, prepared_request, response):
 194         new_method = get_redirect_method(prepared_request.method, response.status_code)
 195
 196         # HACK: requests removes headers/body on redirect unless code was a 307/308.
 197         if new_method == prepared_request.method:
 198             response._real_status_code = response.status_code
 199             response.status_code = 308
 200
 201         prepared_request.method = new_method
 202
 203         # Requests fails to resolve dot segments on absolute redirect locations
 204         # See: https://github.com/yt-dlp/yt-dlp/issues/9020
 205         prepared_request.url = normalize_url(prepared_request.url)
 206
 207     def rebuild_auth(self, prepared_request, response):
 208         # HACK: undo status code change from rebuild_method, if applicable.
 209         # rebuild_auth runs after requests would remove headers/body based on status code
 210         if hasattr(response, '_real_status_code'):
 211             response.status_code = response._real_status_code
 212             del response._real_status_code
 213         return super().rebuild_auth(prepared_request, response)
 214
 215
 216 class Urllib3LoggingFilter(logging.Filter):
 217
 218     def filter(self, record):
 219         # Ignore HTTP request messages since HTTPConnection prints those
 220         if record.msg == '%s://%s:%s "%s %s %s" %s %s':
 221             return False
 222         return True
 223
 224
 225 class Urllib3LoggingHandler(logging.Handler):
 226     """Redirect urllib3 logs to our logger"""
 227
 228     def __init__(self, logger, *args, **kwargs):
 229         super().__init__(*args, **kwargs)
 230         self._logger = logger
 231
 232     def emit(self, record):
 233         try:
 234             msg = self.format(record)
 235             if record.levelno >= logging.ERROR:
 236                 self._logger.error(msg)
 237             else:
 238                 self._logger.stdout(msg)
 239
 240         except Exception:
 241             self.handleError(record)
 242
 243
 244 @register_rh
 245 class RequestsRH(RequestHandler, InstanceStoreMixin):
 246
 247     """Requests RequestHandler
 248     https://github.com/psf/requests
 249     """
 250     _SUPPORTED_URL_SCHEMES = ('http', 'https')
 251     _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS)
 252     _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
 253     _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
 254     RH_NAME = 'requests'
 255
 256     def __init__(self, *args, **kwargs):
 257         super().__init__(*args, **kwargs)
 258
 259         # Forward urllib3 debug messages to our logger
 260         logger = logging.getLogger('urllib3')
 261         self.__logging_handler = Urllib3LoggingHandler(logger=self._logger)
 262         self.__logging_handler.setFormatter(logging.Formatter('requests: %(message)s'))
 263         self.__logging_handler.addFilter(Urllib3LoggingFilter())
 264         logger.addHandler(self.__logging_handler)
 265         # TODO: Use a logger filter to suppress pool reuse warning instead
 266         logger.setLevel(logging.ERROR)
 267
 268         if self.verbose:
 269             # Setting this globally is not ideal, but is easier than hacking with urllib3.
 270             # It could technically be problematic for scripts embedding yt-dlp.
 271             # However, it is unlikely debug traffic is used in that context in a way this will cause problems.
 272             urllib3.connection.HTTPConnection.debuglevel = 1
 273             logger.setLevel(logging.DEBUG)
 274         # this is expected if we are using --no-check-certificate
 275         urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 276
 277     def close(self):
 278         self._clear_instances()
 279         # Remove the logging handler that contains a reference to our logger
 280         # See: https://github.com/yt-dlp/yt-dlp/issues/8922
 281         logging.getLogger('urllib3').removeHandler(self.__logging_handler)
 282
 283     def _check_extensions(self, extensions):
 284         super()._check_extensions(extensions)
 285         extensions.pop('cookiejar', None)
 286         extensions.pop('timeout', None)
 287
 288     def _create_instance(self, cookiejar):
 289         session = RequestsSession()
 290         http_adapter = RequestsHTTPAdapter(
 291             ssl_context=self._make_sslcontext(),
 292             source_address=self.source_address,
 293             max_retries=urllib3.util.retry.Retry(False),
 294         )
 295         session.adapters.clear()
 296         session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'})
 297         session.mount('https://', http_adapter)
 298         session.mount('http://', http_adapter)
 299         session.cookies = cookiejar
 300         session.trust_env = False  # no need, we already load proxies from env
 301         return session
 302
 303     def _send(self, request):
 304
 305         headers = self._merge_headers(request.headers)
 306         add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
 307
 308         max_redirects_exceeded = False
 309
 310         session = self._get_instance(
 311             cookiejar=request.extensions.get('cookiejar') or self.cookiejar)
 312
 313         try:
 314             requests_res = session.request(
 315                 method=request.method,
 316                 url=request.url,
 317                 data=request.data,
 318                 headers=headers,
 319                 timeout=float(request.extensions.get('timeout') or self.timeout),
 320                 proxies=request.proxies or self.proxies,
 321                 allow_redirects=True,
 322                 stream=True
 323             )
 324
 325         except requests.exceptions.TooManyRedirects as e:
 326             max_redirects_exceeded = True
 327             requests_res = e.response
 328
 329         except requests.exceptions.SSLError as e:
 330             if 'CERTIFICATE_VERIFY_FAILED' in str(e):
 331                 raise CertificateVerifyError(cause=e) from e
 332             raise SSLError(cause=e) from e
 333
 334         except requests.exceptions.ProxyError as e:
 335             raise ProxyError(cause=e) from e
 336
 337         except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
 338             raise TransportError(cause=e) from e
 339
 340         except urllib3.exceptions.HTTPError as e:
 341             # Catch any urllib3 exceptions that may leak through
 342             raise TransportError(cause=e) from e
 343
 344         except requests.exceptions.RequestException as e:
 345             # Miscellaneous Requests exceptions. May not necessary be network related e.g. InvalidURL
 346             raise RequestError(cause=e) from e
 347
 348         res = RequestsResponseAdapter(requests_res)
 349
 350         if not 200 <= res.status < 300:
 351             raise HTTPError(res, redirect_loop=max_redirects_exceeded)
 352
 353         return res
 354
 355
@register_preference(RequestsRH)
def requests_preference(rh, request):
    """Return a constant preference of 100 for RequestsRH, regardless of `rh` or `request`.

    NOTE(review): presumably a higher value makes this handler preferred - confirm
    against register_preference.
    """
    return 100
 359
 360
 361 # Use our socks proxy implementation with requests to avoid an extra dependency.
 362 class SocksHTTPConnection(urllib3.connection.HTTPConnection):
 363     def __init__(self, _socks_options, *args, **kwargs):  # must use _socks_options to pass PoolKey checks
 364         self._proxy_args = _socks_options
 365         super().__init__(*args, **kwargs)
 366
 367     def _new_conn(self):
 368         try:
 369             return create_connection(
 370                 address=(self._proxy_args['addr'], self._proxy_args['port']),
 371                 timeout=self.timeout,
 372                 source_address=self.source_address,
 373                 _create_socket_func=functools.partial(
 374                     create_socks_proxy_socket, (self.host, self.port), self._proxy_args))
 375         except (socket.timeout, TimeoutError) as e:
 376             raise urllib3.exceptions.ConnectTimeoutError(
 377                 self, f'Connection to {self.host} timed out. (connect timeout={self.timeout})') from e
 378         except SocksProxyError as e:
 379             raise urllib3.exceptions.ProxyError(str(e), e) from e
 380         except OSError as e:
 381             raise urllib3.exceptions.NewConnectionError(
 382                 self, f'Failed to establish a new connection: {e}') from e
 383
 384
class SocksHTTPSConnection(SocksHTTPConnection, urllib3.connection.HTTPSConnection):
    # Combines SocksHTTPConnection with urllib3's HTTPSConnection.
    # SocksHTTPConnection is listed first, so its methods take precedence in the MRO.
    pass
 387
 388
class SocksHTTPConnectionPool(urllib3.HTTPConnectionPool):
    # Overrides the pool's connection class with SocksHTTPConnection
    ConnectionCls = SocksHTTPConnection
 391
 392
class SocksHTTPSConnectionPool(urllib3.HTTPSConnectionPool):
    # Overrides the pool's connection class with SocksHTTPSConnection
    ConnectionCls = SocksHTTPSConnection
 395
 396
 397 class SocksProxyManager(urllib3.PoolManager):
 398
 399     def __init__(self, socks_proxy, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw):
 400         connection_pool_kw['_socks_options'] = make_socks_proxy_opts(socks_proxy)
 401         super().__init__(num_pools, headers, **connection_pool_kw)
 402         self.pool_classes_by_scheme = {
 403             'http': SocksHTTPConnectionPool,
 404             'https': SocksHTTPSConnectionPool
 405         }
 406
 407
# Point the SOCKSProxyManager name in requests.adapters at our own implementation
requests.adapters.SOCKSProxyManager = SocksProxyManager