# yt_dlp/networking/_urllib.py
from __future__ import annotations

import functools
import http.client
import io
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    create_connection,
    create_socks_proxy_socket,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..utils import update_url_query
from ..utils.networking import normalize_url

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
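
# Editorial note: these two lists drive content negotiation. UrllibRH._send()
# below advertises SUPPORTED_ENCODINGS via add_accept_encoding_header(), so a
# request typically carries 'Accept-Encoding: gzip, deflate' (plus 'br' when
# the optional brotli dependency is available), while CONTENT_DECODE_ERRORS
# is what handle_response_read_exceptions() treats as a transport failure.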


def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if hasattr(hc, '_create_connection'):
        hc._create_connection = create_connection

    if source_address is not None:
        hc.source_address = (source_address, 0)

    return hc


class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            # Try a raw deflate stream first (no zlib header); some servers
            # send these despite the spec
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a zlib-wrapped deflate stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added to the end of the file.
        # We ignore it by only ever decoding a single gzip payload.
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however,
        # this is not always respected by websites, and some hand out URLs with
        # non-percent-encoded non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991).
        # To work around this issue, we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround has been moved here from YoutubeDL.urlopen().
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute the URL if escaping changed it
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode the redirect URL in the Location header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
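

# Editorial sketch (not part of yt-dlp): how the static decoders above undo a
# layered Content-Encoding. RFC 9110 lists encodings in the order they were
# applied, so decoding walks the list in reverse, mirroring http_response().
def _example_decode_content(data: bytes, content_encoding: str) -> bytes:
    for encoding in (e.strip() for e in reversed(content_encoding.split(','))):
        if encoding == 'gzip':
            data = HTTPHandler.gz(data)
        elif encoding == 'deflate':
            data = HTTPHandler.deflate(data)
        elif encoding == 'br' and brotli:
            data = HTTPHandler.brotli(data)
    return data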


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        _create_connection = create_connection

        def connect(self):
            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=functools.partial(
                    create_socks_proxy_socket, (self.host, self.port), proxy_args))
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
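

# Editorial sketch (hypothetical proxy URL): HTTPHandler._make_conn_class()
# calls the factory above whenever ProxyHandler has attached a
# 'Ytdl-socks-proxy' header; the resulting class dials the SOCKS proxy first,
# then (for HTTPS) performs the TLS handshake over the tunneled socket.
def _example_socks_conn_class():
    return make_socks_conn_class(
        http.client.HTTPSConnection, 'socks5://127.0.0.1:1080')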


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and what browsers tend to do [2][3].

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove the payload if the method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
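

# Editorial note: get_redirect_method() (from ._helper) implements the
# browser-style rules used above: a 303 is retried as GET (unless the
# original method was HEAD), 301/302 downgrade only POST to GET, and
# 307/308 never change the method, so the payload survives. For example:
#
#   get_redirect_method('POST', 301)  # -> 'GET', body dropped above
#   get_redirect_method('POST', 307)  # -> 'POST', body kept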


class ProxyHandler(urllib.request.BaseHandler):
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers wrap the socket with socks
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)


class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
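

# Editorial sketch (hypothetical URL): update_Request() clones a request while
# preserving its HTTP method via the PUTRequest/HEADRequest subclasses above.
def _example_update_request():
    req = HEADRequest('https://example.com/feed')
    new_req = update_Request(req, query={'page': '2'})
    assert new_req.get_method() == 'HEAD'
    return new_req.get_full_url()  # 'https://example.com/feed?page=2'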


class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e


def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
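

# Editorial note: the two helpers above funnel every low-level read failure
# into the framework's exception hierarchy, roughly:
#
#   http.client.IncompleteRead   -> IncompleteRead (yt-dlp's)
#   ssl.SSLCertVerificationError -> CertificateVerifyError
#   other ssl.SSLError           -> SSLError
#   OSError / EOFError / http.client.HTTPException / CONTENT_DECODE_ERRORS
#                                -> TransportError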


@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)
        extensions.pop('legacy_ssl', None)

    def _create_instance(self, proxies, cookiejar, legacy_ssl_support=None):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(legacy_ssl_support=legacy_ssl_support),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=self._get_proxies(request),
            cookiejar=self._get_cookiejar(request),
            legacy_ssl_support=request.extensions.get('legacy_ssl'),
        )
        try:
            res = opener.open(urllib_req, timeout=self._calculate_timeout(request))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent the file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
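

# Editorial sketch (hypothetical URL; Request imported from .common): a
# minimal one-off use of this handler outside a RequestDirector, assuming
# RequestHandler's context-manager support and its send() wrapper around
# _send(). Not executed at import time.
def _example_fetch(url='https://example.com'):
    from .common import Request
    with UrllibRH() as rh:
        return rh.send(Request(url)).read()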