# Repository browser header (not part of the module):
#   last commit: [ie/mlbtv] Fix extractor (#10515)
#   path: [yt-dlp.git] / yt_dlp / networking / common.py
#   blob: e8951c7e7d9f9966b63692fe46ac2d27dd88ffef
1 from __future__ import annotations
3 import abc
4 import copy
5 import enum
6 import functools
7 import io
8 import typing
9 import urllib.parse
10 import urllib.request
11 import urllib.response
12 from collections.abc import Iterable, Mapping
13 from email.message import Message
14 from http import HTTPStatus
16 from ._helper import make_ssl_context, wrap_request_errors
17 from .exceptions import (
18 NoSupportingHandlers,
19 RequestError,
20 TransportError,
21 UnsupportedRequest,
23 from ..compat.types import NoneType
24 from ..cookies import YoutubeDLCookieJar
25 from ..utils import (
26 bug_reports_message,
27 classproperty,
28 deprecation_warning,
29 error_to_str,
30 update_url_query,
32 from ..utils.networking import HTTPHeaderDict, normalize_url
# Fallback timeout used by RequestHandler when no (or a falsy) timeout is supplied.
DEFAULT_TIMEOUT = 20
def register_preference(*handlers: type[RequestHandler]):
    """Return a decorator that registers a preference function in `_RH_PREFERENCES`.

    The decorated function is wrapped so that it is only consulted for handler
    instances of the given `handlers` classes, or for every handler when no
    classes are given. For any other handler the wrapper returns 0.

    @param handlers: RequestHandler subclasses the preference applies to.
    @returns: a decorator that takes a preference function in the form of
              func(handler, request) -> int and returns the registered wrapper.
    """
    assert all(issubclass(handler, RequestHandler) for handler in handlers)

    def outer(preference: Preference):
        @functools.wraps(preference)
        def inner(handler, *args, **kwargs):
            if not handlers or isinstance(handler, handlers):
                return preference(handler, *args, **kwargs)
            # Preference does not apply to this handler: contribute nothing to its score
            return 0
        _RH_PREFERENCES.add(inner)
        return inner
    return outer
class RequestDirector:
    """RequestDirector class

    Helper class that, when given a request, forwards it to a RequestHandler that supports it.

    Preference functions in the form of func(handler, request) -> int
    can be registered into the `preferences` set. These are used to sort handlers
    in order of preference.

    @param logger: Logger instance.
    @param verbose: Print debug request information to stdout.
    """

    def __init__(self, logger, verbose=False):
        # Registered handlers, keyed by their RH_KEY
        self.handlers: dict[str, RequestHandler] = {}
        self.preferences: set[Preference] = set()
        self.logger = logger  # TODO(Grub4k): default logger
        self.verbose = verbose

    def close(self):
        """Close every registered handler and forget them all."""
        for handler in self.handlers.values():
            handler.close()
        self.handlers.clear()

    def add_handler(self, handler: RequestHandler):
        """Add a handler. If a handler of the same RH_KEY exists, it will overwrite it"""
        assert isinstance(handler, RequestHandler), 'handler must be a RequestHandler'
        self.handlers[handler.RH_KEY] = handler

    def _get_handlers(self, request: Request) -> list[RequestHandler]:
        """Sorts handlers by preference, given a request"""
        # A handler's score is the sum of all preference functions evaluated for it
        preferences = {
            rh: sum(pref(rh, request) for pref in self.preferences)
            for rh in self.handlers.values()
        }
        self._print_verbose('Handler preferences for this request: {}'.format(', '.join(
            f'{rh.RH_NAME}={pref}' for rh, pref in preferences.items())))
        # Highest score first
        return sorted(self.handlers.values(), key=preferences.get, reverse=True)

    def _print_verbose(self, msg):
        """Write a debug message to the logger's stdout when verbose mode is on."""
        if self.verbose:
            self.logger.stdout(f'director: {msg}')

    def send(self, request: Request) -> Response:
        """
        Passes a request onto a suitable RequestHandler

        Handlers are tried in order of preference. A handler that raises
        UnsupportedRequest during validation, or an unexpected (non-RequestError)
        exception while sending, is skipped and the next one is tried.

        @raises RequestError: if no handlers are configured, or re-raised from the handler.
        @raises NoSupportingHandlers: if every handler was skipped.
        """
        if not self.handlers:
            raise RequestError('No request handlers configured')

        assert isinstance(request, Request)

        unexpected_errors = []
        unsupported_errors = []
        for handler in self._get_handlers(request):
            self._print_verbose(f'Checking if "{handler.RH_NAME}" supports this request.')
            try:
                handler.validate(request)
            except UnsupportedRequest as e:
                self._print_verbose(
                    f'"{handler.RH_NAME}" cannot handle this request (reason: {error_to_str(e)})')
                unsupported_errors.append(e)
                continue

            self._print_verbose(f'Sending request via "{handler.RH_NAME}"')
            try:
                response = handler.send(request)
            except RequestError:
                # Expected failure of the request itself: do not fall back to another handler
                raise
            except Exception as e:
                # Anything else is treated as a handler issue: report it and try the next handler
                self.logger.error(
                    f'[{handler.RH_NAME}] Unexpected error: {error_to_str(e)}{bug_reports_message()}',
                    is_error=False)
                unexpected_errors.append(e)
                continue

            assert isinstance(response, Response)
            return response

        raise NoSupportingHandlers(unsupported_errors, unexpected_errors)
# Registry of RequestHandler classes, keyed by RH_KEY. Populated by register_rh().
_REQUEST_HANDLERS = {}
def register_rh(handler):
    """Register a RequestHandler class

    Stores the class in `_REQUEST_HANDLERS` under its RH_KEY and returns it
    unchanged, so this can be used as a class decorator.
    Each RH_KEY may only be registered once.
    """
    assert issubclass(handler, RequestHandler), f'{handler} must be a subclass of RequestHandler'
    assert handler.RH_KEY not in _REQUEST_HANDLERS, f'RequestHandler {handler.RH_KEY} already registered'
    _REQUEST_HANDLERS[handler.RH_KEY] = handler
    return handler
class Features(enum.Enum):
    """Optional features a RequestHandler may declare in `_SUPPORTED_FEATURES`."""
    # Handler accepts the `all` key of the proxies dict
    ALL_PROXY = enum.auto()
    # Handler accepts the `no` key of the proxies dict
    NO_PROXY = enum.auto()
class RequestHandler(abc.ABC):

    """Request Handler class

    Request handlers are classes that, given a Request,
    process the request from start to finish and return a Response.

    Concrete subclasses need to redefine the _send(request) method,
    which handles the underlying request logic and returns a Response.

    RH_NAME class variable may contain a display name for the RequestHandler.
    By default, this is generated from the class name.

    The concrete request handler MUST have "RH" as the suffix in the class name.

    All exceptions raised by a RequestHandler should be an instance of RequestError.
    Any other exception raised will be treated as a handler issue.

    If a Request is not supported by the handler, an UnsupportedRequest
    should be raised with a reason.

    By default, some checks are done on the request in _validate() based on the following class variables:
    - `_SUPPORTED_URL_SCHEMES`: a tuple of supported url schemes.
        Any Request with an url scheme not in this list will raise an UnsupportedRequest.

    - `_SUPPORTED_PROXY_SCHEMES`: a tuple of supported proxy url schemes. Any Request that contains
        a proxy url with an url scheme not in this list will raise an UnsupportedRequest.

    - `_SUPPORTED_FEATURES`: a tuple of supported features, as defined in Features enum.

    The above may be set to None to disable the checks.

    Parameters:
    @param logger: logger instance
    @param headers: HTTP Headers to include when sending requests.
    @param cookiejar: Cookiejar to use for requests.
    @param timeout: Socket timeout to use when sending requests.
    @param proxies: Proxies to use for sending requests.
    @param source_address: Client-side IP address to bind to for requests.
    @param verbose: Print debug request and traffic information to stdout.
    @param prefer_system_certs: Whether to prefer system certificates over other means (e.g. certifi).
    @param client_cert: SSL client certificate configuration.
            dict with {client_certificate, client_certificate_key, client_certificate_password}
    @param verify: Verify SSL certificates
    @param legacy_ssl_support: Enable legacy SSL options such as legacy server connect and older cipher support.

    Some configuration options may be available for individual Requests too. In this case,
    either the Request configuration option takes precedence or they are merged.

    Requests may have additional optional parameters defined as extensions.
     RequestHandler subclasses may choose to support custom extensions.

    If an extension is supported, subclasses should extend _check_extensions(extensions)
    to pop and validate the extension.
    - Extensions left in `extensions` are treated as unsupported and UnsupportedRequest will be raised.

    The following extensions are defined for RequestHandler:
    - `cookiejar`: Cookiejar to use for this request.
    - `timeout`: socket timeout to use for this request.
    - `legacy_ssl`: Enable legacy SSL options for this request. See legacy_ssl_support.
    To enable these, add extensions.pop('<extension>', None) to _check_extensions

    Apart from the url protocol, proxies dict may contain the following keys:
    - `all`: proxy to use for all protocols. Used as a fallback if no proxy is set for a specific protocol.
    - `no`: comma separated list of hostnames (optionally with port) to not use a proxy for.
    Note: a RequestHandler may not support these, as defined in `_SUPPORTED_FEATURES`.

    """

    _SUPPORTED_URL_SCHEMES = ()
    _SUPPORTED_PROXY_SCHEMES = ()
    _SUPPORTED_FEATURES = ()

    def __init__(
        self, *,
        logger,  # TODO(Grub4k): default logger
        headers: HTTPHeaderDict = None,
        cookiejar: YoutubeDLCookieJar = None,
        timeout: float | int | None = None,
        proxies: dict | None = None,
        source_address: str | None = None,
        verbose: bool = False,
        prefer_system_certs: bool = False,
        client_cert: dict[str, str | None] | None = None,
        verify: bool = True,
        legacy_ssl_support: bool = False,
        **_,
    ):

        self._logger = logger
        self.headers = headers or {}
        self.cookiejar = cookiejar if cookiejar is not None else YoutubeDLCookieJar()
        # A falsy timeout (None or 0) falls back to DEFAULT_TIMEOUT
        self.timeout = float(timeout or DEFAULT_TIMEOUT)
        self.proxies = proxies or {}
        self.source_address = source_address
        self.verbose = verbose
        self.prefer_system_certs = prefer_system_certs
        self._client_cert = client_cert or {}
        self.verify = verify
        self.legacy_ssl_support = legacy_ssl_support
        super().__init__()

    def _make_sslcontext(self, legacy_ssl_support=None):
        """Build an SSL context from the handler configuration.

        @param legacy_ssl_support: overrides the handler-wide `legacy_ssl_support` when not None.
        """
        return make_ssl_context(
            verify=self.verify,
            legacy_support=legacy_ssl_support if legacy_ssl_support is not None else self.legacy_ssl_support,
            use_certifi=not self.prefer_system_certs,
            **self._client_cert,
        )

    def _merge_headers(self, request_headers):
        """Return the handler headers combined with `request_headers` as a new HTTPHeaderDict."""
        return HTTPHeaderDict(self.headers, request_headers)

    def _calculate_timeout(self, request):
        """Return the `timeout` extension of the request if truthy, else the handler timeout."""
        return float(request.extensions.get('timeout') or self.timeout)

    def _get_cookiejar(self, request):
        """Return the `cookiejar` extension of the request if set, else the handler cookiejar."""
        cookiejar = request.extensions.get('cookiejar')
        return self.cookiejar if cookiejar is None else cookiejar

    def _get_proxies(self, request):
        """Return a copy of the request proxies, or of the handler proxies if the request has none."""
        return (request.proxies or self.proxies).copy()

    def _check_url_scheme(self, request: Request):
        """Raise UnsupportedRequest if the url scheme is not supported; return the lowercased scheme."""
        scheme = urllib.parse.urlparse(request.url).scheme.lower()
        if self._SUPPORTED_URL_SCHEMES is not None and scheme not in self._SUPPORTED_URL_SCHEMES:
            raise UnsupportedRequest(f'Unsupported url scheme: "{scheme}"')
        return scheme  # for further processing

    def _check_proxies(self, proxies):
        """Raise UnsupportedRequest if `proxies` contains a proxy this handler cannot use."""
        for proxy_key, proxy_url in proxies.items():
            if proxy_url is None:
                continue
            if proxy_key == 'no':
                if self._SUPPORTED_FEATURES is not None and Features.NO_PROXY not in self._SUPPORTED_FEATURES:
                    raise UnsupportedRequest('"no" proxy is not supported')
                continue
            if (
                proxy_key == 'all'
                and self._SUPPORTED_FEATURES is not None
                and Features.ALL_PROXY not in self._SUPPORTED_FEATURES
            ):
                raise UnsupportedRequest('"all" proxy is not supported')

            # Unlikely this handler will use this proxy, so ignore.
            # This is to allow a case where a proxy may be set for a protocol
            # for one handler in which such protocol (and proxy) is not supported by another handler.
            if self._SUPPORTED_URL_SCHEMES is not None and proxy_key not in (*self._SUPPORTED_URL_SCHEMES, 'all'):
                continue

            if self._SUPPORTED_PROXY_SCHEMES is None:
                # Skip proxy scheme checks
                continue

            try:
                if urllib.request._parse_proxy(proxy_url)[0] is None:
                    # Scheme-less proxies are not supported
                    raise UnsupportedRequest(f'Proxy "{proxy_url}" missing scheme')
            except ValueError as e:
                # parse_proxy may raise on some invalid proxy urls such as "/a/b/c"
                raise UnsupportedRequest(f'Invalid proxy url "{proxy_url}": {e}') from e

            scheme = urllib.parse.urlparse(proxy_url).scheme.lower()
            if scheme not in self._SUPPORTED_PROXY_SCHEMES:
                raise UnsupportedRequest(f'Unsupported proxy type: "{scheme}"')

    def _check_extensions(self, extensions):
        """Check extensions for unsupported extensions. Subclasses should extend this."""
        assert isinstance(extensions.get('cookiejar'), (YoutubeDLCookieJar, NoneType))
        assert isinstance(extensions.get('timeout'), (float, int, NoneType))
        assert isinstance(extensions.get('legacy_ssl'), (bool, NoneType))

    def _validate(self, request):
        """Run the url scheme, proxy and extension checks for `request`."""
        self._check_url_scheme(request)
        self._check_proxies(request.proxies or self.proxies)
        # Work on a copy: _check_extensions pops what it supports, the request stays untouched
        extensions = request.extensions.copy()
        self._check_extensions(extensions)
        if extensions:
            # TODO: add support for optional extensions
            raise UnsupportedRequest(f'Unsupported extensions: {", ".join(extensions.keys())}')

    @wrap_request_errors
    def validate(self, request: Request):
        """Check whether this handler supports `request`; see _validate()."""
        if not isinstance(request, Request):
            raise TypeError('Expected an instance of Request')
        self._validate(request)

    @wrap_request_errors
    def send(self, request: Request) -> Response:
        """Send `request` via _send() and return its result."""
        if not isinstance(request, Request):
            raise TypeError('Expected an instance of Request')
        return self._send(request)

    @abc.abstractmethod
    def _send(self, request: Request):
        """Handle a request from start to finish. Redefine in subclasses."""
        pass

    def close(self):  # noqa: B027
        pass

    @classproperty
    def RH_NAME(cls):
        # Class name without its two-character "RH" suffix
        return cls.__name__[:-2]

    @classproperty
    def RH_KEY(cls):
        assert cls.__name__.endswith('RH'), 'RequestHandler class names must end with "RH"'
        return cls.__name__[:-2]

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
class Request:
    """
    Represents a request to be made.
    Partially backwards-compatible with urllib.request.Request.

    @param url: url to send. Will be sanitized.
    @param data: payload data to send. Must be bytes, iterable of bytes, a file-like object or None
    @param headers: headers to send.
    @param proxies: proxy dict mapping of proto:proxy to use for the request and any redirects.
    @param query: URL query parameters to update the url with.
    @param method: HTTP method to use. If no method specified, will use POST if payload data is present else GET
    @param extensions: Dictionary of Request extensions to add, as supported by handlers.
    """

    def __init__(
            self,
            url: str,
            data: RequestData = None,
            headers: typing.Mapping | None = None,
            proxies: dict | None = None,
            query: dict | None = None,
            method: str | None = None,
            extensions: dict | None = None,
    ):

        self._headers = HTTPHeaderDict()
        self._data = None

        if query:
            url = update_url_query(url, query)

        self.url = url
        self.method = method
        if headers:
            self.headers = headers
        self.data = data  # note: must be done after setting headers
        self.proxies = proxies or {}
        self.extensions = extensions or {}

    @property
    def url(self):
        return self._url

    @url.setter
    def url(self, url):
        """Set the url. Protocol-relative urls ("//host/...") are given the http scheme."""
        if not isinstance(url, str):
            raise TypeError('url must be a string')
        elif url.startswith('//'):
            url = 'http:' + url
        self._url = normalize_url(url)

    @property
    def method(self):
        """The explicit method if one was set, otherwise POST when data is present, else GET."""
        return self._method or ('POST' if self.data is not None else 'GET')

    @method.setter
    def method(self, method):
        if method is None:
            self._method = None
        elif isinstance(method, str):
            self._method = method.upper()
        else:
            raise TypeError('method must be a string')

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data: RequestData):
        """Set the payload and keep the Content-Length / Content-Type headers consistent with it."""
        # Try catch some common mistakes
        if data is not None and (
            not isinstance(data, (bytes, io.IOBase, Iterable)) or isinstance(data, (str, Mapping))
        ):
            raise TypeError('data must be bytes, iterable of bytes, or a file-like object')

        if data == self._data and self._data is None:
            self.headers.pop('Content-Length', None)

        # https://docs.python.org/3/library/urllib.request.html#urllib.request.Request.data
        if data != self._data:
            if self._data is not None:
                self.headers.pop('Content-Length', None)
            self._data = data

        if self._data is None:
            self.headers.pop('Content-Type', None)

        if 'Content-Type' not in self.headers and self._data is not None:
            self.headers['Content-Type'] = 'application/x-www-form-urlencoded'

    @property
    def headers(self) -> HTTPHeaderDict:
        return self._headers

    @headers.setter
    def headers(self, new_headers: Mapping):
        """Replaces headers of the request. If not a HTTPHeaderDict, it will be converted to one."""
        if isinstance(new_headers, HTTPHeaderDict):
            self._headers = new_headers
        elif isinstance(new_headers, Mapping):
            self._headers = HTTPHeaderDict(new_headers)
        else:
            raise TypeError('headers must be a mapping')

    def update(self, url=None, data=None, headers=None, query=None, extensions=None):
        """Update the request in place.

        `data` replaces the payload only when it is not None; `headers` and
        `extensions` are merged into the existing ones; `url` (if truthy)
        replaces the current url, after which `query` is applied to it.
        """
        self.data = data if data is not None else self.data
        self.headers.update(headers or {})
        self.extensions.update(extensions or {})
        self.url = update_url_query(url or self.url, query or {})

    def copy(self):
        """Return a new request of the same class.

        Headers and proxies are deep-copied, extensions are shallow-copied
        and the data object itself is shared with the copy.
        """
        return self.__class__(
            url=self.url,
            headers=copy.deepcopy(self.headers),
            proxies=copy.deepcopy(self.proxies),
            data=self._data,
            extensions=copy.copy(self.extensions),
            method=self._method,
        )
# Convenience constructors: Request with the HTTP method pre-set
HEADRequest = functools.partial(Request, method='HEAD')
PUTRequest = functools.partial(Request, method='PUT')
class Response(io.IOBase):
    """
    Base class for HTTP response adapters.

    By default, it provides a basic wrapper for a file-like response object.

    Interface partially backwards-compatible with addinfourl and http.client.HTTPResponse.

    @param fp: Original, file-like, response.
    @param url: URL that this is a response of.
    @param headers: response headers.
    @param status: Response HTTP status code. Default is 200 OK.
    @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
    @param extensions: Dictionary of handler-specific response extensions.
    """

    def __init__(
            self,
            fp: io.IOBase,
            url: str,
            headers: Mapping[str, str],
            status: int = 200,
            reason: str | None = None,
            extensions: dict | None = None,
    ):

        self.fp = fp
        self.headers = Message()
        for name, value in headers.items():
            self.headers.add_header(name, value)
        self.status = status
        self.url = url
        try:
            self.reason = reason or HTTPStatus(status).phrase
        except ValueError:
            # Status code unknown to HTTPStatus: there is no built-in reason phrase
            self.reason = None
        self.extensions = extensions or {}

    def readable(self):
        return self.fp.readable()

    def read(self, amt: int | None = None) -> bytes:
        """Read up to `amt` bytes from the wrapped response (everything if None).

        @raises TransportError: wrapping any exception raised by the underlying read.
        """
        # Expected errors raised here should be of type RequestError or subclasses.
        # Subclasses should redefine this method with more precise error handling.
        try:
            return self.fp.read(amt)
        except Exception as e:
            raise TransportError(cause=e) from e

    def close(self):
        """Close the wrapped response, then this object."""
        self.fp.close()
        return super().close()

    def get_header(self, name, default=None):
        """Get header for name.
        If there are multiple matching headers, return all separated by comma."""
        headers = self.headers.get_all(name)
        if not headers:
            return default
        if name.title() == 'Set-Cookie':
            # Special case, only get the first one
            # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.3-4.1
            return headers[0]
        return ', '.join(headers)

    # The following methods are for compatibility reasons and are deprecated
    @property
    def code(self):
        deprecation_warning('Response.code is deprecated, use Response.status', stacklevel=2)
        return self.status

    def getcode(self):
        deprecation_warning('Response.getcode() is deprecated, use Response.status', stacklevel=2)
        return self.status

    def geturl(self):
        deprecation_warning('Response.geturl() is deprecated, use Response.url', stacklevel=2)
        return self.url

    def info(self):
        deprecation_warning('Response.info() is deprecated, use Response.headers', stacklevel=2)
        return self.headers

    def getheader(self, name, default=None):
        deprecation_warning('Response.getheader() is deprecated, use Response.get_header', stacklevel=2)
        return self.get_header(name, default)
# Aliases used only inside annotations; they are never defined at runtime
if typing.TYPE_CHECKING:
    RequestData = bytes | Iterable[bytes] | typing.IO | None
    Preference = typing.Callable[[RequestHandler, Request], int]

# Preference functions registered through register_preference()
_RH_PREFERENCES: set[Preference] = set()