yt_dlp/utils/networking.py

   1 import collections
   2 import random
   3 import urllib.parse
   4 import urllib.request
   5
   6 from ._utils import remove_start
   7
   8
   9 def random_user_agent():
  10     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  11     _CHROME_VERSIONS = (
  12         '90.0.4430.212',
  13         '90.0.4430.24',
  14         '90.0.4430.70',
  15         '90.0.4430.72',
  16         '90.0.4430.85',
  17         '90.0.4430.93',
  18         '91.0.4472.101',
  19         '91.0.4472.106',
  20         '91.0.4472.114',
  21         '91.0.4472.124',
  22         '91.0.4472.164',
  23         '91.0.4472.19',
  24         '91.0.4472.77',
  25         '92.0.4515.107',
  26         '92.0.4515.115',
  27         '92.0.4515.131',
  28         '92.0.4515.159',
  29         '92.0.4515.43',
  30         '93.0.4556.0',
  31         '93.0.4577.15',
  32         '93.0.4577.63',
  33         '93.0.4577.82',
  34         '94.0.4606.41',
  35         '94.0.4606.54',
  36         '94.0.4606.61',
  37         '94.0.4606.71',
  38         '94.0.4606.81',
  39         '94.0.4606.85',
  40         '95.0.4638.17',
  41         '95.0.4638.50',
  42         '95.0.4638.54',
  43         '95.0.4638.69',
  44         '95.0.4638.74',
  45         '96.0.4664.18',
  46         '96.0.4664.45',
  47         '96.0.4664.55',
  48         '96.0.4664.93',
  49         '97.0.4692.20',
  50     )
  51     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
  52
  53
  54 class HTTPHeaderDict(collections.UserDict, dict):
  55     """
  56     Store and access keys case-insensitively.
  57     The constructor can take multiple dicts, in which keys in the latter are prioritised.
  58     """
  59
  60     def __init__(self, *args, **kwargs):
  61         super().__init__()
  62         for dct in args:
  63             if dct is not None:
  64                 self.update(dct)
  65         self.update(kwargs)
  66
  67     def __setitem__(self, key, value):
  68         if isinstance(value, bytes):
  69             value = value.decode('latin-1')
  70         super().__setitem__(key.title(), str(value).strip())
  71
  72     def __getitem__(self, key):
  73         return super().__getitem__(key.title())
  74
  75     def __delitem__(self, key):
  76         super().__delitem__(key.title())
  77
  78     def __contains__(self, key):
  79         return super().__contains__(key.title() if isinstance(key, str) else key)
  80
  81
  82 std_headers = HTTPHeaderDict({
  83     'User-Agent': random_user_agent(),
  84     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  85     'Accept-Language': 'en-us,en;q=0.5',
  86     'Sec-Fetch-Mode': 'navigate',
  87 })
  88
  89
  90 def clean_proxies(proxies: dict, headers: HTTPHeaderDict):
  91     req_proxy = headers.pop('Ytdl-Request-Proxy', None)
  92     if req_proxy:
  93         proxies.clear()  # XXX: compat: Ytdl-Request-Proxy takes preference over everything, including NO_PROXY
  94         proxies['all'] = req_proxy
  95     for proxy_key, proxy_url in proxies.items():
  96         if proxy_url == '__noproxy__':
  97             proxies[proxy_key] = None
  98             continue
  99         if proxy_key == 'no':  # special case
 100             continue
 101         if proxy_url is not None:
 102             # Ensure proxies without a scheme are http.
 103             try:
 104                 proxy_scheme = urllib.request._parse_proxy(proxy_url)[0]
 105             except ValueError:
 106                 # Ignore invalid proxy URLs. Sometimes these may be introduced through environment
 107                 # variables unrelated to proxy settings - e.g. Colab `COLAB_LANGUAGE_SERVER_PROXY`.
 108                 # If the proxy is going to be used, the Request Handler proxy validation will handle it.
 109                 continue
 110             if proxy_scheme is None:
 111                 proxies[proxy_key] = 'http://' + remove_start(proxy_url, '//')
 112
 113             replace_scheme = {
 114                 'socks5': 'socks5h',  # compat: socks5 was treated as socks5h
 115                 'socks': 'socks4',  # compat: non-standard
 116             }
 117             if proxy_scheme in replace_scheme:
 118                 proxies[proxy_key] = urllib.parse.urlunparse(
 119                     urllib.parse.urlparse(proxy_url)._replace(scheme=replace_scheme[proxy_scheme]))
 120
 121
 122 def clean_headers(headers: HTTPHeaderDict):
 123     if 'Youtubedl-No-Compression' in headers:  # compat
 124         del headers['Youtubedl-No-Compression']
 125         headers['Accept-Encoding'] = 'identity'
 126     headers.pop('Ytdl-socks-proxy', None)
 127
 128
 129 def remove_dot_segments(path):
 130     # Implements RFC3986 5.2.4 remote_dot_segments
 131     # Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
 132     # https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
 133     output = []
 134     segments = path.split('/')
 135     for s in segments:
 136         if s == '.':
 137             continue
 138         elif s == '..':
 139             if output:
 140                 output.pop()
 141         else:
 142             output.append(s)
 143     if not segments[0] and (not output or output[0]):
 144         output.insert(0, '')
 145     if segments[-1] in ('.', '..'):
 146         output.append('')
 147     return '/'.join(output)
 148
 149
 150 def escape_rfc3986(s):
 151     """Escape non-ASCII characters as suggested by RFC 3986"""
 152     return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
 153
 154
 155 def normalize_url(url):
 156     """Normalize URL as suggested by RFC 3986"""
 157     url_parsed = urllib.parse.urlparse(url)
 158     return url_parsed._replace(
 159         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
 160         path=escape_rfc3986(remove_dot_segments(url_parsed.path)),
 161         params=escape_rfc3986(url_parsed.params),
 162         query=escape_rfc3986(url_parsed.query),
 163         fragment=escape_rfc3986(url_parsed.fragment),
 164     ).geturl()