# yt_dlp/networking/_urllib.py
from __future__ import annotations

import functools
import http.client
import io
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    create_connection,
    create_socks_proxy_socket,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..utils import update_url_query
from ..utils.networking import normalize_url

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
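
# Editorial note: these two lists drive content negotiation. UrllibRH._send()
# below advertises SUPPORTED_ENCODINGS via add_accept_encoding_header(), so a
# request typically carries 'Accept-Encoding: gzip, deflate' (plus 'br' when
# the optional brotli dependency is available), while CONTENT_DECODE_ERRORS
# is what handle_response_read_exceptions() treats as a transport failure.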


def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if hasattr(hc, '_create_connection'):
        hc._create_connection = create_connection

    if source_address is not None:
        hc.source_address = (source_address, 0)

    return hc


class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            # Try a raw deflate stream first (no zlib header); some servers
            # send these despite the spec
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a zlib-wrapped deflate stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added to the end of the file.
        # We ignore it by only ever decoding a single gzip payload.
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however,
        # this is not always respected by websites, and some hand out URLs with
        # non-percent-encoded non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991).
        # To work around this issue, we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround has been moved here from YoutubeDL.urlopen().
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute the URL if escaping changed it
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode the redirect URL in the Location header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
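

# Editorial sketch (not part of yt-dlp): how the static decoders above undo a
# layered Content-Encoding. RFC 9110 lists encodings in the order they were
# applied, so decoding walks the list in reverse, mirroring http_response().
def _example_decode_content(data: bytes, content_encoding: str) -> bytes:
    for encoding in (e.strip() for e in reversed(content_encoding.split(','))):
        if encoding == 'gzip':
            data = HTTPHandler.gz(data)
        elif encoding == 'deflate':
            data = HTTPHandler.deflate(data)
        elif encoding == 'br' and brotli:
            data = HTTPHandler.brotli(data)
    return data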


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        _create_connection = create_connection

        def connect(self):
            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=functools.partial(
                    create_socks_proxy_socket, (self.host, self.port), proxy_args))
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
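

# Editorial sketch (hypothetical proxy URL): HTTPHandler._make_conn_class()
# calls the factory above whenever ProxyHandler has attached a
# 'Ytdl-socks-proxy' header; the resulting class dials the SOCKS proxy first,
# then (for HTTPS) performs the TLS handshake over the tunneled socket.
def _example_socks_conn_class():
    return make_socks_conn_class(
        http.client.HTTPSConnection, 'socks5://127.0.0.1:1080')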


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and what browsers tend to do [2][3].

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove the payload if the method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
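

# Editorial note: get_redirect_method() (from ._helper) implements the
# browser-style rules used above: a 303 is retried as GET (unless the
# original method was HEAD), 301/302 downgrade only POST to GET, and
# 307/308 never change the method, so the payload survives. For example:
#
#   get_redirect_method('POST', 301)  # -> 'GET', body dropped above
#   get_redirect_method('POST', 307)  # -> 'POST', body kept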


class ProxyHandler(urllib.request.BaseHandler):
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers wrap the socket with socks
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)


class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
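

# Editorial sketch (hypothetical URL): update_Request() clones a request while
# preserving its HTTP method via the PUTRequest/HEADRequest subclasses above.
def _example_update_request():
    req = HEADRequest('https://example.com/feed')
    new_req = update_Request(req, query={'page': '2'})
    assert new_req.get_method() == 'HEAD'
    return new_req.get_full_url()  # 'https://example.com/feed?page=2'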


class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e


def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
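

# Editorial note: the two helpers above funnel every low-level read failure
# into the framework's exception hierarchy, roughly:
#
#   http.client.IncompleteRead   -> IncompleteRead (yt-dlp's)
#   ssl.SSLCertVerificationError -> CertificateVerifyError
#   other ssl.SSLError           -> SSLError
#   OSError / EOFError / http.client.HTTPException / CONTENT_DECODE_ERRORS
#                                -> TransportError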


@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)
        extensions.pop('legacy_ssl', None)

    def _create_instance(self, proxies, cookiejar, legacy_ssl_support=None):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(legacy_ssl_support=legacy_ssl_support),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=self._get_proxies(request),
            cookiejar=self._get_cookiejar(request),
            legacy_ssl_support=request.extensions.get('legacy_ssl'),
        )
        try:
            res = opener.open(urllib_req, timeout=self._calculate_timeout(request))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent the file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
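

# Editorial sketch (hypothetical URL; Request imported from .common): a
# minimal one-off use of this handler outside a RequestDirector, assuming
# RequestHandler's context-manager support and its send() wrapper around
# _send(). Not executed at import time.
def _example_fetch(url='https://example.com'):
    from .common import Request
    with UrllibRH() as rh:
        return rh.send(Request(url)).read()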