yt_dlp/utils/_utils.py

   1 import base64
   2 import binascii
   3 import calendar
   4 import codecs
   5 import collections
   6 import collections.abc
   7 import contextlib
   8 import datetime as dt
   9 import email.header
  10 import email.utils
  11 import errno
  12 import hashlib
  13 import hmac
  14 import html.entities
  15 import html.parser
  16 import inspect
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import mimetypes
  23 import netrc
  24 import operator
  25 import os
  26 import platform
  27 import random
  28 import re
  29 import shlex
  30 import socket
  31 import ssl
  32 import struct
  33 import subprocess
  34 import sys
  35 import tempfile
  36 import time
  37 import traceback
  38 import types
  39 import unicodedata
  40 import urllib.error
  41 import urllib.parse
  42 import urllib.request
  43 import xml.etree.ElementTree
  44
  45 from . import traversal
  46
  47 from ..compat import functools  # isort: split
  48 from ..compat import (
  49     compat_etree_fromstring,
  50     compat_expanduser,
  51     compat_HTMLParseError,
  52     compat_os_name,
  53 )
  54 from ..dependencies import xattr
  55
  56 __name__ = __name__.rsplit('.', 1)[0]  # noqa: A001: Pretend to be the parent module
  57
  58 # This is not clearly defined otherwise
  59 compiled_regex_type = type(re.compile(''))
  60
  61
  62 class NO_DEFAULT:
  63     pass
  64
  65
  66 def IDENTITY(x):
  67     return x
  68
  69
  70 ENGLISH_MONTH_NAMES = [
  71     'January', 'February', 'March', 'April', 'May', 'June',
  72     'July', 'August', 'September', 'October', 'November', 'December']
  73
  74 MONTH_NAMES = {
  75     'en': ENGLISH_MONTH_NAMES,
  76     'fr': [
  77         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
  78         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
  79     # these follow the genitive grammatical case (dopełniacz)
  80     # some websites might be using nominative, which will require another month list
  81     # https://en.wikibooks.org/wiki/Polish/Noun_cases
  82     'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
  83            'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
  84 }
  85
  86 # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
  87 TIMEZONE_NAMES = {
  88     'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
  89     'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
  90     'EST': -5, 'EDT': -4,  # Eastern
  91     'CST': -6, 'CDT': -5,  # Central
  92     'MST': -7, 'MDT': -6,  # Mountain
  93     'PST': -8, 'PDT': -7,   # Pacific
  94 }
  95
  96 # needed for sanitizing filenames in restricted mode
  97 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
  98                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
  99                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 100
 101 DATE_FORMATS = (
 102     '%d %B %Y',
 103     '%d %b %Y',
 104     '%B %d %Y',
 105     '%B %dst %Y',
 106     '%B %dnd %Y',
 107     '%B %drd %Y',
 108     '%B %dth %Y',
 109     '%b %d %Y',
 110     '%b %dst %Y',
 111     '%b %dnd %Y',
 112     '%b %drd %Y',
 113     '%b %dth %Y',
 114     '%b %dst %Y %I:%M',
 115     '%b %dnd %Y %I:%M',
 116     '%b %drd %Y %I:%M',
 117     '%b %dth %Y %I:%M',
 118     '%Y %m %d',
 119     '%Y-%m-%d',
 120     '%Y.%m.%d.',
 121     '%Y/%m/%d',
 122     '%Y/%m/%d %H:%M',
 123     '%Y/%m/%d %H:%M:%S',
 124     '%Y%m%d%H%M',
 125     '%Y%m%d%H%M%S',
 126     '%Y%m%d',
 127     '%Y-%m-%d %H:%M',
 128     '%Y-%m-%d %H:%M:%S',
 129     '%Y-%m-%d %H:%M:%S.%f',
 130     '%Y-%m-%d %H:%M:%S:%f',
 131     '%d.%m.%Y %H:%M',
 132     '%d.%m.%Y %H.%M',
 133     '%Y-%m-%dT%H:%M:%SZ',
 134     '%Y-%m-%dT%H:%M:%S.%fZ',
 135     '%Y-%m-%dT%H:%M:%S.%f0Z',
 136     '%Y-%m-%dT%H:%M:%S',
 137     '%Y-%m-%dT%H:%M:%S.%f',
 138     '%Y-%m-%dT%H:%M',
 139     '%b %d %Y at %H:%M',
 140     '%b %d %Y at %H:%M:%S',
 141     '%B %d %Y at %H:%M',
 142     '%B %d %Y at %H:%M:%S',
 143     '%H:%M %d-%b-%Y',
 144 )
 145
 146 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 147 DATE_FORMATS_DAY_FIRST.extend([
 148     '%d-%m-%Y',
 149     '%d.%m.%Y',
 150     '%d.%m.%y',
 151     '%d/%m/%Y',
 152     '%d/%m/%y',
 153     '%d/%m/%Y %H:%M:%S',
 154     '%d-%m-%Y %H:%M',
 155     '%H:%M %d/%m/%Y',
 156 ])
 157
 158 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 159 DATE_FORMATS_MONTH_FIRST.extend([
 160     '%m-%d-%Y',
 161     '%m.%d.%Y',
 162     '%m/%d/%Y',
 163     '%m/%d/%y',
 164     '%m/%d/%Y %H:%M:%S',
 165 ])
 166
 167 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 168 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
 169
 170 NUMBER_RE = r'\d+(?:\.\d+)?'
 171
 172
 173 @functools.cache
 174 def preferredencoding():
 175     """Get preferred encoding.
 176
 177     Returns the best encoding scheme for the system, based on
 178     locale.getpreferredencoding() and some further tweaks.
 179     """
 180     try:
 181         pref = locale.getpreferredencoding()
 182         'TEST'.encode(pref)
 183     except Exception:
 184         pref = 'UTF-8'
 185
 186     return pref
 187
 188
 189 def write_json_file(obj, fn):
 190     """ Encode obj as JSON and write it to fn, atomically if possible """
 191
 192     tf = tempfile.NamedTemporaryFile(
 193         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 194         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 195
 196     try:
 197         with tf:
 198             json.dump(obj, tf, ensure_ascii=False)
 199         if sys.platform == 'win32':
 200             # Need to remove existing file on Windows, else os.rename raises
 201             # WindowsError or FileExistsError.
 202             with contextlib.suppress(OSError):
 203                 os.unlink(fn)
 204         with contextlib.suppress(OSError):
 205             mask = os.umask(0)
 206             os.umask(mask)
 207             os.chmod(tf.name, 0o666 & ~mask)
 208         os.rename(tf.name, fn)
 209     except Exception:
 210         with contextlib.suppress(OSError):
 211             os.remove(tf.name)
 212         raise
 213
 214
 215 def find_xpath_attr(node, xpath, key, val=None):
 216     """ Find the xpath xpath[@key=val] """
 217     assert re.match(r'^[a-zA-Z_-]+$', key)
 218     expr = xpath + (f'[@{key}]' if val is None else f"[@{key}='{val}']")
 219     return node.find(expr)
 220
 221 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 222 # the namespace parameter
 223
 224
 225 def xpath_with_ns(path, ns_map):
 226     components = [c.split(':') for c in path.split('/')]
 227     replaced = []
 228     for c in components:
 229         if len(c) == 1:
 230             replaced.append(c[0])
 231         else:
 232             ns, tag = c
 233             replaced.append(f'{{{ns_map[ns]}}}{tag}')
 234     return '/'.join(replaced)
 235
 236
 237 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 238     def _find_xpath(xpath):
 239         return node.find(xpath)
 240
 241     if isinstance(xpath, str):
 242         n = _find_xpath(xpath)
 243     else:
 244         for xp in xpath:
 245             n = _find_xpath(xp)
 246             if n is not None:
 247                 break
 248
 249     if n is None:
 250         if default is not NO_DEFAULT:
 251             return default
 252         elif fatal:
 253             name = xpath if name is None else name
 254             raise ExtractorError(f'Could not find XML element {name}')
 255         else:
 256             return None
 257     return n
 258
 259
 260 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 261     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 262     if n is None or n == default:
 263         return n
 264     if n.text is None:
 265         if default is not NO_DEFAULT:
 266             return default
 267         elif fatal:
 268             name = xpath if name is None else name
 269             raise ExtractorError(f'Could not find XML element\'s text {name}')
 270         else:
 271             return None
 272     return n.text
 273
 274
 275 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 276     n = find_xpath_attr(node, xpath, key)
 277     if n is None:
 278         if default is not NO_DEFAULT:
 279             return default
 280         elif fatal:
 281             name = f'{xpath}[@{key}]' if name is None else name
 282             raise ExtractorError(f'Could not find XML attribute {name}')
 283         else:
 284             return None
 285     return n.attrib[key]
 286
 287
 288 def get_element_by_id(id, html, **kwargs):
 289     """Return the content of the tag with the specified ID in the passed HTML document"""
 290     return get_element_by_attribute('id', id, html, **kwargs)
 291
 292
 293 def get_element_html_by_id(id, html, **kwargs):
 294     """Return the html of the tag with the specified ID in the passed HTML document"""
 295     return get_element_html_by_attribute('id', id, html, **kwargs)
 296
 297
 298 def get_element_by_class(class_name, html):
 299     """Return the content of the first tag with the specified class in the passed HTML document"""
 300     retval = get_elements_by_class(class_name, html)
 301     return retval[0] if retval else None
 302
 303
 304 def get_element_html_by_class(class_name, html):
 305     """Return the html of the first tag with the specified class in the passed HTML document"""
 306     retval = get_elements_html_by_class(class_name, html)
 307     return retval[0] if retval else None
 308
 309
 310 def get_element_by_attribute(attribute, value, html, **kwargs):
 311     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 312     return retval[0] if retval else None
 313
 314
 315 def get_element_html_by_attribute(attribute, value, html, **kargs):
 316     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 317     return retval[0] if retval else None
 318
 319
 320 def get_elements_by_class(class_name, html, **kargs):
 321     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 322     return get_elements_by_attribute(
 323         'class', rf'[^\'"]*(?<=[\'"\s]){re.escape(class_name)}(?=[\'"\s])[^\'"]*',
 324         html, escape_value=False)
 325
 326
 327 def get_elements_html_by_class(class_name, html):
 328     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 329     return get_elements_html_by_attribute(
 330         'class', rf'[^\'"]*(?<=[\'"\s]){re.escape(class_name)}(?=[\'"\s])[^\'"]*',
 331         html, escape_value=False)
 332
 333
 334 def get_elements_by_attribute(*args, **kwargs):
 335     """Return the content of the tag with the specified attribute in the passed HTML document"""
 336     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 337
 338
 339 def get_elements_html_by_attribute(*args, **kwargs):
 340     """Return the html of the tag with the specified attribute in the passed HTML document"""
 341     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 342
 343
 344 def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
 345     """
 346     Return the text (content) and the html (whole) of the tag with the specified
 347     attribute in the passed HTML document
 348     """
 349     if not value:
 350         return
 351
 352     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 353
 354     value = re.escape(value) if escape_value else value
 355
 356     partial_element_re = rf'''(?x)
 357         <(?P<tag>{tag})
 358          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 359          \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
 360         '''
 361
 362     for m in re.finditer(partial_element_re, html):
 363         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 364
 365         yield (
 366             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 367             whole,
 368         )
 369
 370
 371 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 372     """
 373     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 374     closing tag for the first opening tag it has encountered, and can be used
 375     as a context manager
 376     """
 377
 378     class HTMLBreakOnClosingTagException(Exception):
 379         pass
 380
 381     def __init__(self):
 382         self.tagstack = collections.deque()
 383         html.parser.HTMLParser.__init__(self)
 384
 385     def __enter__(self):
 386         return self
 387
 388     def __exit__(self, *_):
 389         self.close()
 390
 391     def close(self):
 392         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 393         # so data remains buffered; we no longer have any interest in it, thus
 394         # override this method to discard it
 395         pass
 396
 397     def handle_starttag(self, tag, _):
 398         self.tagstack.append(tag)
 399
 400     def handle_endtag(self, tag):
 401         if not self.tagstack:
 402             raise compat_HTMLParseError('no tags in the stack')
 403         while self.tagstack:
 404             inner_tag = self.tagstack.pop()
 405             if inner_tag == tag:
 406                 break
 407         else:
 408             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 409         if not self.tagstack:
 410             raise self.HTMLBreakOnClosingTagException
 411
 412
 413 # XXX: This should be far less strict
 414 def get_element_text_and_html_by_tag(tag, html):
 415     """
 416     For the first element with the specified tag in the passed HTML document
 417     return its' content (text) and the whole element (html)
 418     """
 419     def find_or_raise(haystack, needle, exc):
 420         try:
 421             return haystack.index(needle)
 422         except ValueError:
 423             raise exc
 424     closing_tag = f'</{tag}>'
 425     whole_start = find_or_raise(
 426         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 427     content_start = find_or_raise(
 428         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 429     content_start += whole_start + 1
 430     with HTMLBreakOnClosingTagParser() as parser:
 431         parser.feed(html[whole_start:content_start])
 432         if not parser.tagstack or parser.tagstack[0] != tag:
 433             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 434         offset = content_start
 435         while offset < len(html):
 436             next_closing_tag_start = find_or_raise(
 437                 html[offset:], closing_tag,
 438                 compat_HTMLParseError(f'closing {tag} tag not found'))
 439             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 440             try:
 441                 parser.feed(html[offset:offset + next_closing_tag_end])
 442                 offset += next_closing_tag_end
 443             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 444                 return html[content_start:offset + next_closing_tag_start], \
 445                     html[whole_start:offset + next_closing_tag_end]
 446         raise compat_HTMLParseError('unexpected end of html')
 447
 448
 449 class HTMLAttributeParser(html.parser.HTMLParser):
 450     """Trivial HTML parser to gather the attributes for a single element"""
 451
 452     def __init__(self):
 453         self.attrs = {}
 454         html.parser.HTMLParser.__init__(self)
 455
 456     def handle_starttag(self, tag, attrs):
 457         self.attrs = dict(attrs)
 458         raise compat_HTMLParseError('done')
 459
 460
 461 class HTMLListAttrsParser(html.parser.HTMLParser):
 462     """HTML parser to gather the attributes for the elements of a list"""
 463
 464     def __init__(self):
 465         html.parser.HTMLParser.__init__(self)
 466         self.items = []
 467         self._level = 0
 468
 469     def handle_starttag(self, tag, attrs):
 470         if tag == 'li' and self._level == 0:
 471             self.items.append(dict(attrs))
 472         self._level += 1
 473
 474     def handle_endtag(self, tag):
 475         self._level -= 1
 476
 477
 478 def extract_attributes(html_element):
 479     """Given a string for an HTML element such as
 480     <el
 481          a="foo" B="bar" c="&98;az" d=boz
 482          empty= noval entity="&amp;"
 483          sq='"' dq="'"
 484     >
 485     Decode and return a dictionary of attributes.
 486     {
 487         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 488         'empty': '', 'noval': None, 'entity': '&',
 489         'sq': '"', 'dq': '\''
 490     }.
 491     """
 492     parser = HTMLAttributeParser()
 493     with contextlib.suppress(compat_HTMLParseError):
 494         parser.feed(html_element)
 495         parser.close()
 496     return parser.attrs
 497
 498
 499 def parse_list(webpage):
 500     """Given a string for an series of HTML <li> elements,
 501     return a dictionary of their attributes"""
 502     parser = HTMLListAttrsParser()
 503     parser.feed(webpage)
 504     parser.close()
 505     return parser.items
 506
 507
 508 def clean_html(html):
 509     """Clean an HTML snippet into a readable string"""
 510
 511     if html is None:  # Convenience for sanitizing descriptions etc.
 512         return html
 513
 514     html = re.sub(r'\s+', ' ', html)
 515     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 516     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 517     # Strip html tags
 518     html = re.sub('<.*?>', '', html)
 519     # Replace html entities
 520     html = unescapeHTML(html)
 521     return html.strip()
 522
 523
 524 class LenientJSONDecoder(json.JSONDecoder):
 525     # TODO: Write tests
 526     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 527         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 528         self._close_attempts = 2 * close_objects
 529         super().__init__(*args, **kwargs)
 530
 531     @staticmethod
 532     def _close_object(err):
 533         doc = err.doc[:err.pos]
 534         # We need to add comma first to get the correct error message
 535         if err.msg.startswith('Expecting \',\''):
 536             return doc + ','
 537         elif not doc.endswith(','):
 538             return
 539
 540         if err.msg.startswith('Expecting property name'):
 541             return doc[:-1] + '}'
 542         elif err.msg.startswith('Expecting value'):
 543             return doc[:-1] + ']'
 544
 545     def decode(self, s):
 546         if self.transform_source:
 547             s = self.transform_source(s)
 548         for attempt in range(self._close_attempts + 1):
 549             try:
 550                 if self.ignore_extra:
 551                     return self.raw_decode(s.lstrip())[0]
 552                 return super().decode(s)
 553             except json.JSONDecodeError as e:
 554                 if e.pos is None:
 555                     raise
 556                 elif attempt < self._close_attempts:
 557                     s = self._close_object(e)
 558                     if s is not None:
 559                         continue
 560                 raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
 561         assert False, 'Too many attempts to decode JSON'
 562
 563
 564 def sanitize_open(filename, open_mode):
 565     """Try to open the given filename, and slightly tweak it if this fails.
 566
 567     Attempts to open the given filename. If this fails, it tries to change
 568     the filename slightly, step by step, until it's either able to open it
 569     or it fails and raises a final exception, like the standard open()
 570     function.
 571
 572     It returns the tuple (stream, definitive_file_name).
 573     """
 574     if filename == '-':
 575         if sys.platform == 'win32':
 576             import msvcrt
 577
 578             # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
 579             with contextlib.suppress(io.UnsupportedOperation):
 580                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 581         return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 582
 583     for attempt in range(2):
 584         try:
 585             try:
 586                 if sys.platform == 'win32':
 587                     # FIXME: An exclusive lock also locks the file from being read.
 588                     # Since windows locks are mandatory, don't lock the file on windows (for now).
 589                     # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
 590                     raise LockingUnsupportedError
 591                 stream = locked_file(filename, open_mode, block=False).__enter__()
 592             except OSError:
 593                 stream = open(filename, open_mode)
 594             return stream, filename
 595         except OSError as err:
 596             if attempt or err.errno in (errno.EACCES,):
 597                 raise
 598             old_filename, filename = filename, sanitize_path(filename)
 599             if old_filename == filename:
 600                 raise
 601
 602
 603 def timeconvert(timestr):
 604     """Convert RFC 2822 defined time string into system timestamp"""
 605     timestamp = None
 606     timetuple = email.utils.parsedate_tz(timestr)
 607     if timetuple is not None:
 608         timestamp = email.utils.mktime_tz(timetuple)
 609     return timestamp
 610
 611
 612 def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
 613     """Sanitizes a string so it could be used as part of a filename.
 614     @param restricted   Use a stricter subset of allowed characters
 615     @param is_id        Whether this is an ID that should be kept unchanged if possible.
 616                         If unset, yt-dlp's new sanitization rules are in effect
 617     """
 618     if s == '':
 619         return ''
 620
 621     def replace_insane(char):
 622         if restricted and char in ACCENT_CHARS:
 623             return ACCENT_CHARS[char]
 624         elif not restricted and char == '\n':
 625             return '\0 '
 626         elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
 627             # Replace with their full-width unicode counterparts
 628             return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
 629         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 630             return ''
 631         elif char == '"':
 632             return '' if restricted else '\''
 633         elif char == ':':
 634             return '\0_\0-' if restricted else '\0 \0-'
 635         elif char in '\\/|*<>':
 636             return '\0_'
 637         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
 638             return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
 639         return char
 640
 641     # Replace look-alike Unicode glyphs
 642     if restricted and (is_id is NO_DEFAULT or not is_id):
 643         s = unicodedata.normalize('NFKC', s)
 644     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
 645     result = ''.join(map(replace_insane, s))
 646     if is_id is NO_DEFAULT:
 647         result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
 648         STRIP_RE = r'(?:\0.|[ _-])*'
 649         result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
 650     result = result.replace('\0', '') or '_'
 651
 652     if not is_id:
 653         while '__' in result:
 654             result = result.replace('__', '_')
 655         result = result.strip('_')
 656         # Common case of "Foreign band name - English song title"
 657         if restricted and result.startswith('-_'):
 658             result = result[2:]
 659         if result.startswith('-'):
 660             result = '_' + result[len('-'):]
 661         result = result.lstrip('.')
 662         if not result:
 663             result = '_'
 664     return result
 665
 666
 667 def sanitize_path(s, force=False):
 668     """Sanitizes and normalizes path on Windows"""
 669     # XXX: this handles drive relative paths (c:sth) incorrectly
 670     if sys.platform == 'win32':
 671         force = False
 672         drive_or_unc, _ = os.path.splitdrive(s)
 673     elif force:
 674         drive_or_unc = ''
 675     else:
 676         return s
 677
 678     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 679     if drive_or_unc:
 680         norm_path.pop(0)
 681     sanitized_path = [
 682         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 683         for path_part in norm_path]
 684     if drive_or_unc:
 685         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 686     elif force and s and s[0] == os.path.sep:
 687         sanitized_path.insert(0, os.path.sep)
 688     # TODO: Fix behavioral differences <3.12
 689     # The workaround using `normpath` only superficially passes tests
 690     # Ref: https://github.com/python/cpython/pull/100351
 691     return os.path.normpath(os.path.join(*sanitized_path))
 692
 693
 694 def sanitize_url(url, *, scheme='http'):
 695     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 696     # the number of unwanted failures due to missing protocol
 697     if url is None:
 698         return
 699     elif url.startswith('//'):
 700         return f'{scheme}:{url}'
 701     # Fix some common typos seen so far
 702     COMMON_TYPOS = (
 703         # https://github.com/ytdl-org/youtube-dl/issues/15649
 704         (r'^httpss://', r'https://'),
 705         # https://bx1.be/lives/direct-tv/
 706         (r'^rmtp([es]?)://', r'rtmp\1://'),
 707     )
 708     for mistake, fixup in COMMON_TYPOS:
 709         if re.match(mistake, url):
 710             return re.sub(mistake, fixup, url)
 711     return url
 712
 713
 714 def extract_basic_auth(url):
 715     parts = urllib.parse.urlsplit(url)
 716     if parts.username is None:
 717         return url, None
 718     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 719         parts.hostname if parts.port is None
 720         else f'{parts.hostname}:{parts.port}')))
 721     auth_payload = base64.b64encode(
 722         ('{}:{}'.format(parts.username, parts.password or '')).encode())
 723     return url, f'Basic {auth_payload.decode()}'
 724
 725
 726 def expand_path(s):
 727     """Expand shell variables and ~"""
 728     return os.path.expandvars(compat_expanduser(s))
 729
 730
 731 def orderedSet(iterable, *, lazy=False):
 732     """Remove all duplicates from the input iterable"""
 733     def _iter():
 734         seen = []  # Do not use set since the items can be unhashable
 735         for x in iterable:
 736             if x not in seen:
 737                 seen.append(x)
 738                 yield x
 739
 740     return _iter() if lazy else list(_iter())
 741
 742
 743 def _htmlentity_transform(entity_with_semicolon):
 744     """Transforms an HTML entity to a character."""
 745     entity = entity_with_semicolon[:-1]
 746
 747     # Known non-numeric HTML entity
 748     if entity in html.entities.name2codepoint:
 749         return chr(html.entities.name2codepoint[entity])
 750
 751     # TODO: HTML5 allows entities without a semicolon.
 752     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 753     if entity_with_semicolon in html.entities.html5:
 754         return html.entities.html5[entity_with_semicolon]
 755
 756     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 757     if mobj is not None:
 758         numstr = mobj.group(1)
 759         if numstr.startswith('x'):
 760             base = 16
 761             numstr = f'0{numstr}'
 762         else:
 763             base = 10
 764         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 765         with contextlib.suppress(ValueError):
 766             return chr(int(numstr, base))
 767
 768     # Unknown entity in name, return its literal representation
 769     return f'&{entity};'
 770
 771
 772 def unescapeHTML(s):
 773     if s is None:
 774         return None
 775     assert isinstance(s, str)
 776
 777     return re.sub(
 778         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 779
 780
 781 def escapeHTML(text):
 782     return (
 783         text
 784         .replace('&', '&amp;')
 785         .replace('<', '&lt;')
 786         .replace('>', '&gt;')
 787         .replace('"', '&quot;')
 788         .replace("'", '&#39;')
 789     )
 790
 791
 792 class netrc_from_content(netrc.netrc):
 793     def __init__(self, content):
 794         self.hosts, self.macros = {}, {}
 795         with io.StringIO(content) as stream:
 796             self._parse('-', stream, False)
 797
 798
 799 class Popen(subprocess.Popen):
 800     if sys.platform == 'win32':
 801         _startupinfo = subprocess.STARTUPINFO()
 802         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 803     else:
 804         _startupinfo = None
 805
 806     @staticmethod
 807     def _fix_pyinstaller_ld_path(env):
 808         """Restore LD_LIBRARY_PATH when using PyInstaller
 809             Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
 810                  https://github.com/yt-dlp/yt-dlp/issues/4573
 811         """
 812         if not hasattr(sys, '_MEIPASS'):
 813             return
 814
 815         def _fix(key):
 816             orig = env.get(f'{key}_ORIG')
 817             if orig is None:
 818                 env.pop(key, None)
 819             else:
 820                 env[key] = orig
 821
 822         _fix('LD_LIBRARY_PATH')  # Linux
 823         _fix('DYLD_LIBRARY_PATH')  # macOS
 824
 825     def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
 826         if env is None:
 827             env = os.environ.copy()
 828         self._fix_pyinstaller_ld_path(env)
 829
 830         self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
 831         if text is True:
 832             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 833             kwargs.setdefault('encoding', 'utf-8')
 834             kwargs.setdefault('errors', 'replace')
 835
 836         if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
 837             if not isinstance(args, str):
 838                 args = shell_quote(args, shell=True)
 839             shell = False
 840             # Set variable for `cmd.exe` newline escaping (see `utils.shell_quote`)
 841             env['='] = '"^\n\n"'
 842             args = f'{self.__comspec()} /Q /S /D /V:OFF /E:ON /C "{args}"'
 843
 844         super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)
 845
 846     def __comspec(self):
 847         comspec = os.environ.get('ComSpec') or os.path.join(
 848             os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
 849         if os.path.isabs(comspec):
 850             return comspec
 851         raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')
 852
 853     def communicate_or_kill(self, *args, **kwargs):
 854         try:
 855             return self.communicate(*args, **kwargs)
 856         except BaseException:  # Including KeyboardInterrupt
 857             self.kill(timeout=None)
 858             raise
 859
 860     def kill(self, *, timeout=0):
 861         super().kill()
 862         if timeout != 0:
 863             self.wait(timeout=timeout)
 864
 865     @classmethod
 866     def run(cls, *args, timeout=None, **kwargs):
 867         with cls(*args, **kwargs) as proc:
 868             default = '' if proc.__text_mode else b''
 869             stdout, stderr = proc.communicate_or_kill(timeout=timeout)
 870             return stdout or default, stderr or default, proc.returncode
 871
 872
 873 def encodeArgument(s):
 874     # Legacy code that uses byte strings
 875     # Uncomment the following line after fixing all post processors
 876     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 877     return s if isinstance(s, str) else s.decode('ascii')
 878
 879
 880 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 881
 882
 883 def timetuple_from_msec(msec):
 884     secs, msec = divmod(msec, 1000)
 885     mins, secs = divmod(secs, 60)
 886     hrs, mins = divmod(mins, 60)
 887     return _timetuple(hrs, mins, secs, msec)
 888
 889
 890 def formatSeconds(secs, delim=':', msec=False):
 891     time = timetuple_from_msec(secs * 1000)
 892     if time.hours:
 893         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 894     elif time.minutes:
 895         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 896     else:
 897         ret = '%d' % time.seconds
 898     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 899
 900
 901 def bug_reports_message(before=';'):
 902     from ..update import REPOSITORY
 903
 904     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
 905            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
 906
 907     before = before.rstrip()
 908     if not before or before.endswith(('.', '!', '?')):
 909         msg = msg[0].title() + msg[1:]
 910
 911     return (before + ' ' if before else '') + msg
 912
 913
 914 class YoutubeDLError(Exception):
 915     """Base exception for YoutubeDL errors."""
 916     msg = None
 917
 918     def __init__(self, msg=None):
 919         if msg is not None:
 920             self.msg = msg
 921         elif self.msg is None:
 922             self.msg = type(self).__name__
 923         super().__init__(self.msg)
 924
 925
 926 class ExtractorError(YoutubeDLError):
 927     """Error during info extraction."""
 928
 929     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
 930         """ tb, if given, is the original traceback (so that it can be printed out).
 931         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
 932         """
 933         from ..networking.exceptions import network_exceptions
 934         if sys.exc_info()[0] in network_exceptions:
 935             expected = True
 936
 937         self.orig_msg = str(msg)
 938         self.traceback = tb
 939         self.expected = expected
 940         self.cause = cause
 941         self.video_id = video_id
 942         self.ie = ie
 943         self.exc_info = sys.exc_info()  # preserve original exception
 944         if isinstance(self.exc_info[1], ExtractorError):
 945             self.exc_info = self.exc_info[1].exc_info
 946         super().__init__(self.__msg)
 947
 948     @property
 949     def __msg(self):
 950         return ''.join((
 951             format_field(self.ie, None, '[%s] '),
 952             format_field(self.video_id, None, '%s: '),
 953             self.orig_msg,
 954             format_field(self.cause, None, ' (caused by %r)'),
 955             '' if self.expected else bug_reports_message()))
 956
 957     def format_traceback(self):
 958         return join_nonempty(
 959             self.traceback and ''.join(traceback.format_tb(self.traceback)),
 960             self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
 961             delim='\n') or None
 962
 963     def __setattr__(self, name, value):
 964         super().__setattr__(name, value)
 965         if getattr(self, 'msg', None) and name not in ('msg', 'args'):
 966             self.msg = self.__msg or type(self).__name__
 967             self.args = (self.msg, )  # Cannot be property
 968
 969
 970 class UnsupportedError(ExtractorError):
 971     def __init__(self, url):
 972         super().__init__(
 973             f'Unsupported URL: {url}', expected=True)
 974         self.url = url
 975
 976
 977 class RegexNotFoundError(ExtractorError):
 978     """Error when a regex didn't match"""
 979     pass
 980
 981
 982 class GeoRestrictedError(ExtractorError):
 983     """Geographic restriction Error exception.
 984
 985     This exception may be thrown when a video is not available from your
 986     geographic location due to geographic restrictions imposed by a website.
 987     """
 988
 989     def __init__(self, msg, countries=None, **kwargs):
 990         kwargs['expected'] = True
 991         super().__init__(msg, **kwargs)
 992         self.countries = countries
 993
 994
 995 class UserNotLive(ExtractorError):
 996     """Error when a channel/user is not live"""
 997
 998     def __init__(self, msg=None, **kwargs):
 999         kwargs['expected'] = True
1000         super().__init__(msg or 'The channel is not currently live', **kwargs)
1001
1002
1003 class DownloadError(YoutubeDLError):
1004     """Download Error exception.
1005
1006     This exception may be thrown by FileDownloader objects if they are not
1007     configured to continue on errors. They will contain the appropriate
1008     error message.
1009     """
1010
1011     def __init__(self, msg, exc_info=None):
1012         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1013         super().__init__(msg)
1014         self.exc_info = exc_info
1015
1016
1017 class EntryNotInPlaylist(YoutubeDLError):
1018     """Entry not in playlist exception.
1019
1020     This exception will be thrown by YoutubeDL when a requested entry
1021     is not found in the playlist info_dict
1022     """
1023     msg = 'Entry not found in info'
1024
1025
1026 class SameFileError(YoutubeDLError):
1027     """Same File exception.
1028
1029     This exception will be thrown by FileDownloader objects if they detect
1030     multiple files would have to be downloaded to the same file on disk.
1031     """
1032     msg = 'Fixed output name but more than one file to download'
1033
1034     def __init__(self, filename=None):
1035         if filename is not None:
1036             self.msg += f': {filename}'
1037         super().__init__(self.msg)
1038
1039
1040 class PostProcessingError(YoutubeDLError):
1041     """Post Processing exception.
1042
1043     This exception may be raised by PostProcessor's .run() method to
1044     indicate an error in the postprocessing task.
1045     """
1046
1047
1048 class DownloadCancelled(YoutubeDLError):
1049     """ Exception raised when the download queue should be interrupted """
1050     msg = 'The download was cancelled'
1051
1052
1053 class ExistingVideoReached(DownloadCancelled):
1054     """ --break-on-existing triggered """
1055     msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1056
1057
1058 class RejectedVideoReached(DownloadCancelled):
1059     """ --break-match-filter triggered """
1060     msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1061
1062
1063 class MaxDownloadsReached(DownloadCancelled):
1064     """ --max-downloads limit has been reached. """
1065     msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1066
1067
1068 class ReExtractInfo(YoutubeDLError):
1069     """ Video info needs to be re-extracted. """
1070
1071     def __init__(self, msg, expected=False):
1072         super().__init__(msg)
1073         self.expected = expected
1074
1075
1076 class ThrottledDownload(ReExtractInfo):
1077     """ Download speed below --throttled-rate. """
1078     msg = 'The download speed is below throttle limit'
1079
1080     def __init__(self):
1081         super().__init__(self.msg, expected=False)
1082
1083
1084 class UnavailableVideoError(YoutubeDLError):
1085     """Unavailable Format exception.
1086
1087     This exception will be thrown when a video is requested
1088     in a format that is not available for that video.
1089     """
1090     msg = 'Unable to download video'
1091
1092     def __init__(self, err=None):
1093         if err is not None:
1094             self.msg += f': {err}'
1095         super().__init__(self.msg)
1096
1097
1098 class ContentTooShortError(YoutubeDLError):
1099     """Content Too Short exception.
1100
1101     This exception may be raised by FileDownloader objects when a file they
1102     download is too small for what the server announced first, indicating
1103     the connection was probably interrupted.
1104     """
1105
1106     def __init__(self, downloaded, expected):
1107         super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1108         # Both in bytes
1109         self.downloaded = downloaded
1110         self.expected = expected
1111
1112
1113 class XAttrMetadataError(YoutubeDLError):
1114     def __init__(self, code=None, msg='Unknown error'):
1115         super().__init__(msg)
1116         self.code = code
1117         self.msg = msg
1118
1119         # Parsing code and msg
1120         if (self.code in (errno.ENOSPC, errno.EDQUOT)
1121                 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1122             self.reason = 'NO_SPACE'
1123         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1124             self.reason = 'VALUE_TOO_LONG'
1125         else:
1126             self.reason = 'NOT_SUPPORTED'
1127
1128
1129 class XAttrUnavailableError(YoutubeDLError):
1130     pass
1131
1132
1133 def is_path_like(f):
1134     return isinstance(f, (str, bytes, os.PathLike))
1135
1136
1137 def extract_timezone(date_str, default=None):
1138     m = re.search(
1139         r'''(?x)
1140             ^.{8,}?                                              # >=8 char non-TZ prefix, if present
1141             (?P<tz>Z|                                            # just the UTC Z, or
1142                 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
1143                    (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1144                    [ ]?                                          # optional space
1145                 (?P<sign>\+|-)                                   # +/-
1146                 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
1147             $)
1148         ''', date_str)
1149     timezone = None
1150
1151     if not m:
1152         m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1153         timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1154         if timezone is not None:
1155             date_str = date_str[:-len(m.group('tz'))]
1156             timezone = dt.timedelta(hours=timezone)
1157     else:
1158         date_str = date_str[:-len(m.group('tz'))]
1159         if m.group('sign'):
1160             sign = 1 if m.group('sign') == '+' else -1
1161             timezone = dt.timedelta(
1162                 hours=sign * int(m.group('hours')),
1163                 minutes=sign * int(m.group('minutes')))
1164
1165     if timezone is None and default is not NO_DEFAULT:
1166         timezone = default or dt.timedelta()
1167
1168     return timezone, date_str
1169
1170
1171 def parse_iso8601(date_str, delimiter='T', timezone=None):
1172     """ Return a UNIX timestamp from the given date """
1173
1174     if date_str is None:
1175         return None
1176
1177     date_str = re.sub(r'\.[0-9]+', '', date_str)
1178
1179     timezone, date_str = extract_timezone(date_str, timezone)
1180
1181     with contextlib.suppress(ValueError, TypeError):
1182         date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1183         dt_ = dt.datetime.strptime(date_str, date_format) - timezone
1184         return calendar.timegm(dt_.timetuple())
1185
1186
1187 def date_formats(day_first=True):
1188     return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1189
1190
1191 def unified_strdate(date_str, day_first=True):
1192     """Return a string with the date in the format YYYYMMDD"""
1193
1194     if date_str is None:
1195         return None
1196     upload_date = None
1197     # Replace commas
1198     date_str = date_str.replace(',', ' ')
1199     # Remove AM/PM + timezone
1200     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1201     _, date_str = extract_timezone(date_str)
1202
1203     for expression in date_formats(day_first):
1204         with contextlib.suppress(ValueError):
1205             upload_date = dt.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1206     if upload_date is None:
1207         timetuple = email.utils.parsedate_tz(date_str)
1208         if timetuple:
1209             with contextlib.suppress(ValueError):
1210                 upload_date = dt.datetime(*timetuple[:6]).strftime('%Y%m%d')
1211     if upload_date is not None:
1212         return str(upload_date)
1213
1214
1215 def unified_timestamp(date_str, day_first=True):
1216     if not isinstance(date_str, str):
1217         return None
1218
1219     date_str = re.sub(r'\s+', ' ', re.sub(
1220         r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str))
1221
1222     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1223     timezone, date_str = extract_timezone(date_str)
1224
1225     # Remove AM/PM + timezone
1226     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1227
1228     # Remove unrecognized timezones from ISO 8601 alike timestamps
1229     m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1230     if m:
1231         date_str = date_str[:-len(m.group('tz'))]
1232
1233     # Python only supports microseconds, so remove nanoseconds
1234     m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1235     if m:
1236         date_str = m.group(1)
1237
1238     for expression in date_formats(day_first):
1239         with contextlib.suppress(ValueError):
1240             dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
1241             return calendar.timegm(dt_.timetuple())
1242
1243     timetuple = email.utils.parsedate_tz(date_str)
1244     if timetuple:
1245         return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1246
1247
1248 def determine_ext(url, default_ext='unknown_video'):
1249     if url is None or '.' not in url:
1250         return default_ext
1251     guess = url.partition('?')[0].rpartition('.')[2]
1252     if re.match(r'^[A-Za-z0-9]+$', guess):
1253         return guess
1254     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1255     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1256         return guess.rstrip('/')
1257     else:
1258         return default_ext
1259
1260
1261 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1262     return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1263
1264
1265 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1266     R"""
1267     Return a datetime object from a string.
1268     Supported format:
1269         (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1270
1271     @param format       strftime format of DATE
1272     @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
1273                         auto: round to the unit provided in date_str (if applicable).
1274     """
1275     auto_precision = False
1276     if precision == 'auto':
1277         auto_precision = True
1278         precision = 'microsecond'
1279     today = datetime_round(dt.datetime.now(dt.timezone.utc), precision)
1280     if date_str in ('now', 'today'):
1281         return today
1282     if date_str == 'yesterday':
1283         return today - dt.timedelta(days=1)
1284     match = re.match(
1285         r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1286         date_str)
1287     if match is not None:
1288         start_time = datetime_from_str(match.group('start'), precision, format)
1289         time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1290         unit = match.group('unit')
1291         if unit == 'month' or unit == 'year':
1292             new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1293             unit = 'day'
1294         else:
1295             if unit == 'week':
1296                 unit = 'day'
1297                 time *= 7
1298             delta = dt.timedelta(**{unit + 's': time})
1299             new_date = start_time + delta
1300         if auto_precision:
1301             return datetime_round(new_date, unit)
1302         return new_date
1303
1304     return datetime_round(dt.datetime.strptime(date_str, format), precision)
1305
1306
1307 def date_from_str(date_str, format='%Y%m%d', strict=False):
1308     R"""
1309     Return a date object from a string using datetime_from_str
1310
1311     @param strict  Restrict allowed patterns to "YYYYMMDD" and
1312                    (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1313     """
1314     if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1315         raise ValueError(f'Invalid date format "{date_str}"')
1316     return datetime_from_str(date_str, precision='microsecond', format=format).date()
1317
1318
1319 def datetime_add_months(dt_, months):
1320     """Increment/Decrement a datetime object by months."""
1321     month = dt_.month + months - 1
1322     year = dt_.year + month // 12
1323     month = month % 12 + 1
1324     day = min(dt_.day, calendar.monthrange(year, month)[1])
1325     return dt_.replace(year, month, day)
1326
1327
1328 def datetime_round(dt_, precision='day'):
1329     """
1330     Round a datetime object's time to a specific precision
1331     """
1332     if precision == 'microsecond':
1333         return dt_
1334
1335     unit_seconds = {
1336         'day': 86400,
1337         'hour': 3600,
1338         'minute': 60,
1339         'second': 1,
1340     }
1341     roundto = lambda x, n: ((x + n / 2) // n) * n
1342     timestamp = roundto(calendar.timegm(dt_.timetuple()), unit_seconds[precision])
1343     return dt.datetime.fromtimestamp(timestamp, dt.timezone.utc)
1344
1345
1346 def hyphenate_date(date_str):
1347     """
1348     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1349     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1350     if match is not None:
1351         return '-'.join(match.groups())
1352     else:
1353         return date_str
1354
1355
1356 class DateRange:
1357     """Represents a time interval between two dates"""
1358
1359     def __init__(self, start=None, end=None):
1360         """start and end must be strings in the format accepted by date"""
1361         if start is not None:
1362             self.start = date_from_str(start, strict=True)
1363         else:
1364             self.start = dt.datetime.min.date()
1365         if end is not None:
1366             self.end = date_from_str(end, strict=True)
1367         else:
1368             self.end = dt.datetime.max.date()
1369         if self.start > self.end:
1370             raise ValueError(f'Date range: "{self}" , the start date must be before the end date')
1371
1372     @classmethod
1373     def day(cls, day):
1374         """Returns a range that only contains the given day"""
1375         return cls(day, day)
1376
1377     def __contains__(self, date):
1378         """Check if the date is in the range"""
1379         if not isinstance(date, dt.date):
1380             date = date_from_str(date)
1381         return self.start <= date <= self.end
1382
1383     def __repr__(self):
1384         return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
1385
1386     def __str__(self):
1387         return f'{self.start} to {self.end}'
1388
1389     def __eq__(self, other):
1390         return (isinstance(other, DateRange)
1391                 and self.start == other.start and self.end == other.end)
1392
1393
1394 @functools.cache
1395 def system_identifier():
1396     python_implementation = platform.python_implementation()
1397     if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1398         python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1399     libc_ver = []
1400     with contextlib.suppress(OSError):  # We may not have access to the executable
1401         libc_ver = platform.libc_ver()
1402
1403     return 'Python {} ({} {} {}) - {} ({}{})'.format(
1404         platform.python_version(),
1405         python_implementation,
1406         platform.machine(),
1407         platform.architecture()[0],
1408         platform.platform(),
1409         ssl.OPENSSL_VERSION,
1410         format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
1411     )
1412
1413
1414 @functools.cache
1415 def get_windows_version():
1416     """ Get Windows version. returns () if it's not running on Windows """
1417     if compat_os_name == 'nt':
1418         return version_tuple(platform.win32_ver()[1])
1419     else:
1420         return ()
1421
1422
1423 def write_string(s, out=None, encoding=None):
1424     assert isinstance(s, str)
1425     out = out or sys.stderr
1426     # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
1427     if not out:
1428         return
1429
1430     if compat_os_name == 'nt' and supports_terminal_sequences(out):
1431         s = re.sub(r'([\r\n]+)', r' \1', s)
1432
1433     enc, buffer = None, out
1434     # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
1435     if 'b' in (getattr(out, 'mode', None) or ''):
1436         enc = encoding or preferredencoding()
1437     elif hasattr(out, 'buffer'):
1438         buffer = out.buffer
1439         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1440
1441     buffer.write(s.encode(enc, 'ignore') if enc else s)
1442     out.flush()
1443
1444
1445 # TODO: Use global logger
1446 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
1447     from .. import _IN_CLI
1448     if _IN_CLI:
1449         if msg in deprecation_warning._cache:
1450             return
1451         deprecation_warning._cache.add(msg)
1452         if printer:
1453             return printer(f'{msg}{bug_reports_message()}', **kwargs)
1454         return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
1455     else:
1456         import warnings
1457         warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
1458
1459
1460 deprecation_warning._cache = set()
1461
1462
1463 def bytes_to_intlist(bs):
1464     if not bs:
1465         return []
1466     if isinstance(bs[0], int):  # Python 3
1467         return list(bs)
1468     else:
1469         return [ord(c) for c in bs]
1470
1471
1472 def intlist_to_bytes(xs):
1473     if not xs:
1474         return b''
1475     return struct.pack('%dB' % len(xs), *xs)
1476
1477
1478 class LockingUnsupportedError(OSError):
1479     msg = 'File locking is not supported'
1480
1481     def __init__(self):
1482         super().__init__(self.msg)
1483
1484
1485 # Cross-platform file locking
1486 if sys.platform == 'win32':
1487     import ctypes
1488     import ctypes.wintypes
1489     import msvcrt
1490
1491     class OVERLAPPED(ctypes.Structure):
1492         _fields_ = [
1493             ('Internal', ctypes.wintypes.LPVOID),
1494             ('InternalHigh', ctypes.wintypes.LPVOID),
1495             ('Offset', ctypes.wintypes.DWORD),
1496             ('OffsetHigh', ctypes.wintypes.DWORD),
1497             ('hEvent', ctypes.wintypes.HANDLE),
1498         ]
1499
1500     kernel32 = ctypes.WinDLL('kernel32')
1501     LockFileEx = kernel32.LockFileEx
1502     LockFileEx.argtypes = [
1503         ctypes.wintypes.HANDLE,     # hFile
1504         ctypes.wintypes.DWORD,      # dwFlags
1505         ctypes.wintypes.DWORD,      # dwReserved
1506         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1507         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1508         ctypes.POINTER(OVERLAPPED),  # Overlapped
1509     ]
1510     LockFileEx.restype = ctypes.wintypes.BOOL
1511     UnlockFileEx = kernel32.UnlockFileEx
1512     UnlockFileEx.argtypes = [
1513         ctypes.wintypes.HANDLE,     # hFile
1514         ctypes.wintypes.DWORD,      # dwReserved
1515         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1516         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1517         ctypes.POINTER(OVERLAPPED),  # Overlapped
1518     ]
1519     UnlockFileEx.restype = ctypes.wintypes.BOOL
1520     whole_low = 0xffffffff
1521     whole_high = 0x7fffffff
1522
1523     def _lock_file(f, exclusive, block):
1524         overlapped = OVERLAPPED()
1525         overlapped.Offset = 0
1526         overlapped.OffsetHigh = 0
1527         overlapped.hEvent = 0
1528         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1529
1530         if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1531                           (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1532                           0, whole_low, whole_high, f._lock_file_overlapped_p):
1533             # NB: No argument form of "ctypes.FormatError" does not work on PyPy
1534             raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
1535
1536     def _unlock_file(f):
1537         assert f._lock_file_overlapped_p
1538         handle = msvcrt.get_osfhandle(f.fileno())
1539         if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
1540             raise OSError(f'Unlocking file failed: {ctypes.FormatError()!r}')
1541
1542 else:
1543     try:
1544         import fcntl
1545
1546         def _lock_file(f, exclusive, block):
1547             flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1548             if not block:
1549                 flags |= fcntl.LOCK_NB
1550             try:
1551                 fcntl.flock(f, flags)
1552             except BlockingIOError:
1553                 raise
1554             except OSError:  # AOSP does not have flock()
1555                 fcntl.lockf(f, flags)
1556
1557         def _unlock_file(f):
1558             with contextlib.suppress(OSError):
1559                 return fcntl.flock(f, fcntl.LOCK_UN)
1560             with contextlib.suppress(OSError):
1561                 return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
1562             return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking
1563
1564     except ImportError:
1565
1566         def _lock_file(f, exclusive, block):
1567             raise LockingUnsupportedError
1568
1569         def _unlock_file(f):
1570             raise LockingUnsupportedError
1571
1572
1573 class locked_file:
1574     locked = False
1575
1576     def __init__(self, filename, mode, block=True, encoding=None):
1577         if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
1578             raise NotImplementedError(mode)
1579         self.mode, self.block = mode, block
1580
1581         writable = any(f in mode for f in 'wax+')
1582         readable = any(f in mode for f in 'r+')
1583         flags = functools.reduce(operator.ior, (
1584             getattr(os, 'O_CLOEXEC', 0),  # UNIX only
1585             getattr(os, 'O_BINARY', 0),  # Windows only
1586             getattr(os, 'O_NOINHERIT', 0),  # Windows only
1587             os.O_CREAT if writable else 0,  # O_TRUNC only after locking
1588             os.O_APPEND if 'a' in mode else 0,
1589             os.O_EXCL if 'x' in mode else 0,
1590             os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
1591         ))
1592
1593         self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
1594
1595     def __enter__(self):
1596         exclusive = 'r' not in self.mode
1597         try:
1598             _lock_file(self.f, exclusive, self.block)
1599             self.locked = True
1600         except OSError:
1601             self.f.close()
1602             raise
1603         if 'w' in self.mode:
1604             try:
1605                 self.f.truncate()
1606             except OSError as e:
1607                 if e.errno not in (
1608                     errno.ESPIPE,  # Illegal seek - expected for FIFO
1609                     errno.EINVAL,  # Invalid argument - expected for /dev/null
1610                 ):
1611                     raise
1612         return self
1613
1614     def unlock(self):
1615         if not self.locked:
1616             return
1617         try:
1618             _unlock_file(self.f)
1619         finally:
1620             self.locked = False
1621
1622     def __exit__(self, *_):
1623         try:
1624             self.unlock()
1625         finally:
1626             self.f.close()
1627
1628     open = __enter__
1629     close = __exit__
1630
1631     def __getattr__(self, attr):
1632         return getattr(self.f, attr)
1633
1634     def __iter__(self):
1635         return iter(self.f)
1636
1637
1638 @functools.cache
1639 def get_filesystem_encoding():
1640     encoding = sys.getfilesystemencoding()
1641     return encoding if encoding is not None else 'utf-8'
1642
1643
1644 _WINDOWS_QUOTE_TRANS = str.maketrans({'"': R'\"'})
1645 _CMD_QUOTE_TRANS = str.maketrans({
1646     # Keep quotes balanced by replacing them with `""` instead of `\\"`
1647     '"': '""',
1648     # These require an env-variable `=` containing `"^\n\n"` (set in `utils.Popen`)
1649     # `=` should be unique since variables containing `=` cannot be set using cmd
1650     '\n': '%=%',
1651     '\r': '%=%',
1652     # Use zero length variable replacement so `%` doesn't get expanded
1653     # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`)
1654     '%': '%%cd:~,%',
1655 })
1656
1657
1658 def shell_quote(args, *, shell=False):
1659     args = list(variadic(args))
1660
1661     if compat_os_name != 'nt':
1662         return shlex.join(args)
1663
1664     trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS
1665     return ' '.join(
1666         s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII)
1667         else re.sub(r'(\\+)("|$)', r'\1\1\2', s).translate(trans).join('""')
1668         for s in args)
1669
1670
1671 def smuggle_url(url, data):
1672     """ Pass additional data in a URL for internal use. """
1673
1674     url, idata = unsmuggle_url(url, {})
1675     data.update(idata)
1676     sdata = urllib.parse.urlencode(
1677         {'__youtubedl_smuggle': json.dumps(data)})
1678     return url + '#' + sdata
1679
1680
1681 def unsmuggle_url(smug_url, default=None):
1682     if '#__youtubedl_smuggle' not in smug_url:
1683         return smug_url, default
1684     url, _, sdata = smug_url.rpartition('#')
1685     jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
1686     data = json.loads(jsond)
1687     return url, data
1688
1689
1690 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
1691     """ Formats numbers with decimal sufixes like K, M, etc """
1692     num, factor = float_or_none(num), float(factor)
1693     if num is None or num < 0:
1694         return None
1695     POSSIBLE_SUFFIXES = 'kMGTPEZY'
1696     exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
1697     suffix = ['', *POSSIBLE_SUFFIXES][exponent]
1698     if factor == 1024:
1699         suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
1700     converted = num / (factor ** exponent)
1701     return fmt % (converted, suffix)
1702
1703
1704 def format_bytes(bytes):
1705     return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
1706
1707
1708 def lookup_unit_table(unit_table, s, strict=False):
1709     num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
1710     units_re = '|'.join(re.escape(u) for u in unit_table)
1711     m = (re.fullmatch if strict else re.match)(
1712         rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
1713     if not m:
1714         return None
1715
1716     num = float(m.group('num').replace(',', '.'))
1717     mult = unit_table[m.group('unit')]
1718     return round(num * mult)
1719
1720
1721 def parse_bytes(s):
1722     """Parse a string indicating a byte quantity into an integer"""
1723     return lookup_unit_table(
1724         {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
1725         s.upper(), strict=True)
1726
1727
1728 def parse_filesize(s):
1729     if s is None:
1730         return None
1731
1732     # The lower-case forms are of course incorrect and unofficial,
1733     # but we support those too
1734     _UNIT_TABLE = {
1735         'B': 1,
1736         'b': 1,
1737         'bytes': 1,
1738         'KiB': 1024,
1739         'KB': 1000,
1740         'kB': 1024,
1741         'Kb': 1000,
1742         'kb': 1000,
1743         'kilobytes': 1000,
1744         'kibibytes': 1024,
1745         'MiB': 1024 ** 2,
1746         'MB': 1000 ** 2,
1747         'mB': 1024 ** 2,
1748         'Mb': 1000 ** 2,
1749         'mb': 1000 ** 2,
1750         'megabytes': 1000 ** 2,
1751         'mebibytes': 1024 ** 2,
1752         'GiB': 1024 ** 3,
1753         'GB': 1000 ** 3,
1754         'gB': 1024 ** 3,
1755         'Gb': 1000 ** 3,
1756         'gb': 1000 ** 3,
1757         'gigabytes': 1000 ** 3,
1758         'gibibytes': 1024 ** 3,
1759         'TiB': 1024 ** 4,
1760         'TB': 1000 ** 4,
1761         'tB': 1024 ** 4,
1762         'Tb': 1000 ** 4,
1763         'tb': 1000 ** 4,
1764         'terabytes': 1000 ** 4,
1765         'tebibytes': 1024 ** 4,
1766         'PiB': 1024 ** 5,
1767         'PB': 1000 ** 5,
1768         'pB': 1024 ** 5,
1769         'Pb': 1000 ** 5,
1770         'pb': 1000 ** 5,
1771         'petabytes': 1000 ** 5,
1772         'pebibytes': 1024 ** 5,
1773         'EiB': 1024 ** 6,
1774         'EB': 1000 ** 6,
1775         'eB': 1024 ** 6,
1776         'Eb': 1000 ** 6,
1777         'eb': 1000 ** 6,
1778         'exabytes': 1000 ** 6,
1779         'exbibytes': 1024 ** 6,
1780         'ZiB': 1024 ** 7,
1781         'ZB': 1000 ** 7,
1782         'zB': 1024 ** 7,
1783         'Zb': 1000 ** 7,
1784         'zb': 1000 ** 7,
1785         'zettabytes': 1000 ** 7,
1786         'zebibytes': 1024 ** 7,
1787         'YiB': 1024 ** 8,
1788         'YB': 1000 ** 8,
1789         'yB': 1024 ** 8,
1790         'Yb': 1000 ** 8,
1791         'yb': 1000 ** 8,
1792         'yottabytes': 1000 ** 8,
1793         'yobibytes': 1024 ** 8,
1794     }
1795
1796     return lookup_unit_table(_UNIT_TABLE, s)
1797
1798
1799 def parse_count(s):
1800     if s is None:
1801         return None
1802
1803     s = re.sub(r'^[^\d]+\s', '', s).strip()
1804
1805     if re.match(r'^[\d,.]+$', s):
1806         return str_to_int(s)
1807
1808     _UNIT_TABLE = {
1809         'k': 1000,
1810         'K': 1000,
1811         'm': 1000 ** 2,
1812         'M': 1000 ** 2,
1813         'kk': 1000 ** 2,
1814         'KK': 1000 ** 2,
1815         'b': 1000 ** 3,
1816         'B': 1000 ** 3,
1817     }
1818
1819     ret = lookup_unit_table(_UNIT_TABLE, s)
1820     if ret is not None:
1821         return ret
1822
1823     mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
1824     if mobj:
1825         return str_to_int(mobj.group(1))
1826
1827
1828 def parse_resolution(s, *, lenient=False):
1829     if s is None:
1830         return {}
1831
1832     if lenient:
1833         mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
1834     else:
1835         mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
1836     if mobj:
1837         return {
1838             'width': int(mobj.group('w')),
1839             'height': int(mobj.group('h')),
1840         }
1841
1842     mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
1843     if mobj:
1844         return {'height': int(mobj.group(1))}
1845
1846     mobj = re.search(r'\b([48])[kK]\b', s)
1847     if mobj:
1848         return {'height': int(mobj.group(1)) * 540}
1849
1850     return {}
1851
1852
1853 def parse_bitrate(s):
1854     if not isinstance(s, str):
1855         return
1856     mobj = re.search(r'\b(\d+)\s*kbps', s)
1857     if mobj:
1858         return int(mobj.group(1))
1859
1860
1861 def month_by_name(name, lang='en'):
1862     """ Return the number of a month by (locale-independently) English name """
1863
1864     month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
1865
1866     try:
1867         return month_names.index(name) + 1
1868     except ValueError:
1869         return None
1870
1871
1872 def month_by_abbreviation(abbrev):
1873     """ Return the number of a month by (locale-independently) English
1874         abbreviations """
1875
1876     try:
1877         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1878     except ValueError:
1879         return None
1880
1881
1882 def fix_xml_ampersands(xml_str):
1883     """Replace all the '&' by '&amp;' in XML"""
1884     return re.sub(
1885         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1886         '&amp;',
1887         xml_str)
1888
1889
1890 def setproctitle(title):
1891     assert isinstance(title, str)
1892
1893     # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
1894     try:
1895         import ctypes
1896     except ImportError:
1897         return
1898
1899     try:
1900         libc = ctypes.cdll.LoadLibrary('libc.so.6')
1901     except OSError:
1902         return
1903     except TypeError:
1904         # LoadLibrary in Windows Python 2.7.13 only expects
1905         # a bytestring, but since unicode_literals turns
1906         # every string into a unicode string, it fails.
1907         return
1908     title_bytes = title.encode()
1909     buf = ctypes.create_string_buffer(len(title_bytes))
1910     buf.value = title_bytes
1911     try:
1912         # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
1913         libc.prctl(15, buf, 0, 0, 0)
1914     except AttributeError:
1915         return  # Strange libc, just skip this
1916
1917
1918 def remove_start(s, start):
1919     return s[len(start):] if s is not None and s.startswith(start) else s
1920
1921
1922 def remove_end(s, end):
1923     return s[:-len(end)] if s is not None and s.endswith(end) else s
1924
1925
1926 def remove_quotes(s):
1927     if s is None or len(s) < 2:
1928         return s
1929     for quote in ('"', "'"):
1930         if s[0] == quote and s[-1] == quote:
1931             return s[1:-1]
1932     return s
1933
1934
1935 def get_domain(url):
1936     """
1937     This implementation is inconsistent, but is kept for compatibility.
1938     Use this only for "webpage_url_domain"
1939     """
1940     return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
1941
1942
1943 def url_basename(url):
1944     path = urllib.parse.urlparse(url).path
1945     return path.strip('/').split('/')[-1]
1946
1947
1948 def base_url(url):
1949     return re.match(r'https?://[^?#]+/', url).group()
1950
1951
1952 def urljoin(base, path):
1953     if isinstance(path, bytes):
1954         path = path.decode()
1955     if not isinstance(path, str) or not path:
1956         return None
1957     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
1958         return path
1959     if isinstance(base, bytes):
1960         base = base.decode()
1961     if not isinstance(base, str) or not re.match(
1962             r'^(?:https?:)?//', base):
1963         return None
1964     return urllib.parse.urljoin(base, path)
1965
1966
1967 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1968     if get_attr and v is not None:
1969         v = getattr(v, get_attr, None)
1970     try:
1971         return int(v) * invscale // scale
1972     except (ValueError, TypeError, OverflowError):
1973         return default
1974
1975
1976 def str_or_none(v, default=None):
1977     return default if v is None else str(v)
1978
1979
1980 def str_to_int(int_str):
1981     """ A more relaxed version of int_or_none """
1982     if isinstance(int_str, int):
1983         return int_str
1984     elif isinstance(int_str, str):
1985         int_str = re.sub(r'[,\.\+]', '', int_str)
1986         return int_or_none(int_str)
1987
1988
1989 def float_or_none(v, scale=1, invscale=1, default=None):
1990     if v is None:
1991         return default
1992     try:
1993         return float(v) * invscale / scale
1994     except (ValueError, TypeError):
1995         return default
1996
1997
1998 def bool_or_none(v, default=None):
1999     return v if isinstance(v, bool) else default
2000
2001
2002 def strip_or_none(v, default=None):
2003     return v.strip() if isinstance(v, str) else default
2004
2005
2006 def url_or_none(url):
2007     if not url or not isinstance(url, str):
2008         return None
2009     url = url.strip()
2010     return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2011
2012
2013 def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
2014     datetime_object = None
2015     try:
2016         if isinstance(timestamp, (int, float)):  # unix timestamp
2017             # Using naive datetime here can break timestamp() in Windows
2018             # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2019             # Also, dt.datetime.fromtimestamp breaks for negative timestamps
2020             # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
2021             datetime_object = (dt.datetime.fromtimestamp(0, dt.timezone.utc)
2022                                + dt.timedelta(seconds=timestamp))
2023         elif isinstance(timestamp, str):  # assume YYYYMMDD
2024             datetime_object = dt.datetime.strptime(timestamp, '%Y%m%d')
2025         date_format = re.sub(  # Support %s on windows
2026             r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2027         return datetime_object.strftime(date_format)
2028     except (ValueError, TypeError, AttributeError):
2029         return default
2030
2031
2032 def parse_duration(s):
2033     if not isinstance(s, str):
2034         return None
2035     s = s.strip()
2036     if not s:
2037         return None
2038
2039     days, hours, mins, secs, ms = [None] * 5
2040     m = re.match(r'''(?x)
2041             (?P<before_secs>
2042                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2043             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2044             (?P<ms>[.:][0-9]+)?Z?$
2045         ''', s)
2046     if m:
2047         days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2048     else:
2049         m = re.match(
2050             r'''(?ix)(?:P?
2051                 (?:
2052                     [0-9]+\s*y(?:ears?)?,?\s*
2053                 )?
2054                 (?:
2055                     [0-9]+\s*m(?:onths?)?,?\s*
2056                 )?
2057                 (?:
2058                     [0-9]+\s*w(?:eeks?)?,?\s*
2059                 )?
2060                 (?:
2061                     (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2062                 )?
2063                 T)?
2064                 (?:
2065                     (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
2066                 )?
2067                 (?:
2068                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2069                 )?
2070                 (?:
2071                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2072                 )?Z?$''', s)
2073         if m:
2074             days, hours, mins, secs, ms = m.groups()
2075         else:
2076             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2077             if m:
2078                 hours, mins = m.groups()
2079             else:
2080                 return None
2081
2082     if ms:
2083         ms = ms.replace(':', '.')
2084     return sum(float(part or 0) * mult for part, mult in (
2085         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2086
2087
2088 def _change_extension(prepend, filename, ext, expected_real_ext=None):
2089     name, real_ext = os.path.splitext(filename)
2090
2091     if not expected_real_ext or real_ext[1:] == expected_real_ext:
2092         filename = name
2093         if prepend and real_ext:
2094             _UnsafeExtensionError.sanitize_extension(ext, prepend=True)
2095             return f'{filename}.{ext}{real_ext}'
2096
2097     return f'{filename}.{_UnsafeExtensionError.sanitize_extension(ext)}'
2098
2099
2100 prepend_extension = functools.partial(_change_extension, True)
2101 replace_extension = functools.partial(_change_extension, False)
2102
2103
2104 def check_executable(exe, args=[]):
2105     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2106     args can be a list of arguments for a short output (like -version) """
2107     try:
2108         Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2109     except OSError:
2110         return False
2111     return exe
2112
2113
2114 def _get_exe_version_output(exe, args):
2115     try:
2116         # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2117         # SIGTTOU if yt-dlp is run in the background.
2118         # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2119         stdout, _, ret = Popen.run([encodeArgument(exe), *args], text=True,
2120                                    stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2121         if ret:
2122             return None
2123     except OSError:
2124         return False
2125     return stdout
2126
2127
2128 def detect_exe_version(output, version_re=None, unrecognized='present'):
2129     assert isinstance(output, str)
2130     if version_re is None:
2131         version_re = r'version\s+([-0-9._a-zA-Z]+)'
2132     m = re.search(version_re, output)
2133     if m:
2134         return m.group(1)
2135     else:
2136         return unrecognized
2137
2138
2139 def get_exe_version(exe, args=['--version'],
2140                     version_re=None, unrecognized=('present', 'broken')):
2141     """ Returns the version of the specified executable,
2142     or False if the executable is not present """
2143     unrecognized = variadic(unrecognized)
2144     assert len(unrecognized) in (1, 2)
2145     out = _get_exe_version_output(exe, args)
2146     if out is None:
2147         return unrecognized[-1]
2148     return out and detect_exe_version(out, version_re, unrecognized[0])
2149
2150
2151 def frange(start=0, stop=None, step=1):
2152     """Float range"""
2153     if stop is None:
2154         start, stop = 0, start
2155     sign = [-1, 1][step > 0] if step else 0
2156     while sign * start < sign * stop:
2157         yield start
2158         start += step
2159
2160
2161 class LazyList(collections.abc.Sequence):
2162     """Lazy immutable list from an iterable
2163     Note that slices of a LazyList are lists and not LazyList"""
2164
2165     class IndexError(IndexError):  # noqa: A001
2166         pass
2167
2168     def __init__(self, iterable, *, reverse=False, _cache=None):
2169         self._iterable = iter(iterable)
2170         self._cache = [] if _cache is None else _cache
2171         self._reversed = reverse
2172
2173     def __iter__(self):
2174         if self._reversed:
2175             # We need to consume the entire iterable to iterate in reverse
2176             yield from self.exhaust()
2177             return
2178         yield from self._cache
2179         for item in self._iterable:
2180             self._cache.append(item)
2181             yield item
2182
2183     def _exhaust(self):
2184         self._cache.extend(self._iterable)
2185         self._iterable = []  # Discard the emptied iterable to make it pickle-able
2186         return self._cache
2187
2188     def exhaust(self):
2189         """Evaluate the entire iterable"""
2190         return self._exhaust()[::-1 if self._reversed else 1]
2191
2192     @staticmethod
2193     def _reverse_index(x):
2194         return None if x is None else ~x
2195
2196     def __getitem__(self, idx):
2197         if isinstance(idx, slice):
2198             if self._reversed:
2199                 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2200             start, stop, step = idx.start, idx.stop, idx.step or 1
2201         elif isinstance(idx, int):
2202             if self._reversed:
2203                 idx = self._reverse_index(idx)
2204             start, stop, step = idx, idx, 0
2205         else:
2206             raise TypeError('indices must be integers or slices')
2207         if ((start or 0) < 0 or (stop or 0) < 0
2208                 or (start is None and step < 0)
2209                 or (stop is None and step > 0)):
2210             # We need to consume the entire iterable to be able to slice from the end
2211             # Obviously, never use this with infinite iterables
2212             self._exhaust()
2213             try:
2214                 return self._cache[idx]
2215             except IndexError as e:
2216                 raise self.IndexError(e) from e
2217         n = max(start or 0, stop or 0) - len(self._cache) + 1
2218         if n > 0:
2219             self._cache.extend(itertools.islice(self._iterable, n))
2220         try:
2221             return self._cache[idx]
2222         except IndexError as e:
2223             raise self.IndexError(e) from e
2224
2225     def __bool__(self):
2226         try:
2227             self[-1] if self._reversed else self[0]
2228         except self.IndexError:
2229             return False
2230         return True
2231
2232     def __len__(self):
2233         self._exhaust()
2234         return len(self._cache)
2235
2236     def __reversed__(self):
2237         return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2238
2239     def __copy__(self):
2240         return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2241
2242     def __repr__(self):
2243         # repr and str should mimic a list. So we exhaust the iterable
2244         return repr(self.exhaust())
2245
2246     def __str__(self):
2247         return repr(self.exhaust())
2248
2249
2250 class PagedList:
2251
2252     class IndexError(IndexError):  # noqa: A001
2253         pass
2254
2255     def __len__(self):
2256         # This is only useful for tests
2257         return len(self.getslice())
2258
2259     def __init__(self, pagefunc, pagesize, use_cache=True):
2260         self._pagefunc = pagefunc
2261         self._pagesize = pagesize
2262         self._pagecount = float('inf')
2263         self._use_cache = use_cache
2264         self._cache = {}
2265
2266     def getpage(self, pagenum):
2267         page_results = self._cache.get(pagenum)
2268         if page_results is None:
2269             page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2270         if self._use_cache:
2271             self._cache[pagenum] = page_results
2272         return page_results
2273
2274     def getslice(self, start=0, end=None):
2275         return list(self._getslice(start, end))
2276
2277     def _getslice(self, start, end):
2278         raise NotImplementedError('This method must be implemented by subclasses')
2279
2280     def __getitem__(self, idx):
2281         assert self._use_cache, 'Indexing PagedList requires cache'
2282         if not isinstance(idx, int) or idx < 0:
2283             raise TypeError('indices must be non-negative integers')
2284         entries = self.getslice(idx, idx + 1)
2285         if not entries:
2286             raise self.IndexError
2287         return entries[0]
2288
2289     def __bool__(self):
2290         return bool(self.getslice(0, 1))
2291
2292
2293 class OnDemandPagedList(PagedList):
2294     """Download pages until a page with less than maximum results"""
2295
2296     def _getslice(self, start, end):
2297         for pagenum in itertools.count(start // self._pagesize):
2298             firstid = pagenum * self._pagesize
2299             nextfirstid = pagenum * self._pagesize + self._pagesize
2300             if start >= nextfirstid:
2301                 continue
2302
2303             startv = (
2304                 start % self._pagesize
2305                 if firstid <= start < nextfirstid
2306                 else 0)
2307             endv = (
2308                 ((end - 1) % self._pagesize) + 1
2309                 if (end is not None and firstid <= end <= nextfirstid)
2310                 else None)
2311
2312             try:
2313                 page_results = self.getpage(pagenum)
2314             except Exception:
2315                 self._pagecount = pagenum - 1
2316                 raise
2317             if startv != 0 or endv is not None:
2318                 page_results = page_results[startv:endv]
2319             yield from page_results
2320
2321             # A little optimization - if current page is not "full", ie. does
2322             # not contain page_size videos then we can assume that this page
2323             # is the last one - there are no more ids on further pages -
2324             # i.e. no need to query again.
2325             if len(page_results) + startv < self._pagesize:
2326                 break
2327
2328             # If we got the whole page, but the next page is not interesting,
2329             # break out early as well
2330             if end == nextfirstid:
2331                 break
2332
2333
2334 class InAdvancePagedList(PagedList):
2335     """PagedList with total number of pages known in advance"""
2336
2337     def __init__(self, pagefunc, pagecount, pagesize):
2338         PagedList.__init__(self, pagefunc, pagesize, True)
2339         self._pagecount = pagecount
2340
2341     def _getslice(self, start, end):
2342         start_page = start // self._pagesize
2343         end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2344         skip_elems = start - start_page * self._pagesize
2345         only_more = None if end is None else end - start
2346         for pagenum in range(start_page, end_page):
2347             page_results = self.getpage(pagenum)
2348             if skip_elems:
2349                 page_results = page_results[skip_elems:]
2350                 skip_elems = None
2351             if only_more is not None:
2352                 if len(page_results) < only_more:
2353                     only_more -= len(page_results)
2354                 else:
2355                     yield from page_results[:only_more]
2356                     break
2357             yield from page_results
2358
2359
2360 class PlaylistEntries:
2361     MissingEntry = object()
2362     is_exhausted = False
2363
2364     def __init__(self, ydl, info_dict):
2365         self.ydl = ydl
2366
2367         # _entries must be assigned now since infodict can change during iteration
2368         entries = info_dict.get('entries')
2369         if entries is None:
2370             raise EntryNotInPlaylist('There are no entries')
2371         elif isinstance(entries, list):
2372             self.is_exhausted = True
2373
2374         requested_entries = info_dict.get('requested_entries')
2375         self.is_incomplete = requested_entries is not None
2376         if self.is_incomplete:
2377             assert self.is_exhausted
2378             self._entries = [self.MissingEntry] * max(requested_entries or [0])
2379             for i, entry in zip(requested_entries, entries):
2380                 self._entries[i - 1] = entry
2381         elif isinstance(entries, (list, PagedList, LazyList)):
2382             self._entries = entries
2383         else:
2384             self._entries = LazyList(entries)
2385
2386     PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2387         (?P<start>[+-]?\d+)?
2388         (?P<range>[:-]
2389             (?P<end>[+-]?\d+|inf(?:inite)?)?
2390             (?::(?P<step>[+-]?\d+))?
2391         )?''')
2392
2393     @classmethod
2394     def parse_playlist_items(cls, string):
2395         for segment in string.split(','):
2396             if not segment:
2397                 raise ValueError('There is two or more consecutive commas')
2398             mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2399             if not mobj:
2400                 raise ValueError(f'{segment!r} is not a valid specification')
2401             start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2402             if int_or_none(step) == 0:
2403                 raise ValueError(f'Step in {segment!r} cannot be zero')
2404             yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2405
2406     def get_requested_items(self):
2407         playlist_items = self.ydl.params.get('playlist_items')
2408         playlist_start = self.ydl.params.get('playliststart', 1)
2409         playlist_end = self.ydl.params.get('playlistend')
2410         # For backwards compatibility, interpret -1 as whole list
2411         if playlist_end in (-1, None):
2412             playlist_end = ''
2413         if not playlist_items:
2414             playlist_items = f'{playlist_start}:{playlist_end}'
2415         elif playlist_start != 1 or playlist_end:
2416             self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2417
2418         for index in self.parse_playlist_items(playlist_items):
2419             for i, entry in self[index]:
2420                 yield i, entry
2421                 if not entry:
2422                     continue
2423                 try:
2424                     # The item may have just been added to archive. Don't break due to it
2425                     if not self.ydl.params.get('lazy_playlist'):
2426                         # TODO: Add auto-generated fields
2427                         self.ydl._match_entry(entry, incomplete=True, silent=True)
2428                 except (ExistingVideoReached, RejectedVideoReached):
2429                     return
2430
2431     def get_full_count(self):
2432         if self.is_exhausted and not self.is_incomplete:
2433             return len(self)
2434         elif isinstance(self._entries, InAdvancePagedList):
2435             if self._entries._pagesize == 1:
2436                 return self._entries._pagecount
2437
2438     @functools.cached_property
2439     def _getter(self):
2440         if isinstance(self._entries, list):
2441             def get_entry(i):
2442                 try:
2443                     entry = self._entries[i]
2444                 except IndexError:
2445                     entry = self.MissingEntry
2446                     if not self.is_incomplete:
2447                         raise self.IndexError
2448                 if entry is self.MissingEntry:
2449                     raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
2450                 return entry
2451         else:
2452             def get_entry(i):
2453                 try:
2454                     return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2455                 except (LazyList.IndexError, PagedList.IndexError):
2456                     raise self.IndexError
2457         return get_entry
2458
2459     def __getitem__(self, idx):
2460         if isinstance(idx, int):
2461             idx = slice(idx, idx)
2462
2463         # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2464         step = 1 if idx.step is None else idx.step
2465         if idx.start is None:
2466             start = 0 if step > 0 else len(self) - 1
2467         else:
2468             start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2469
2470         # NB: Do not call len(self) when idx == [:]
2471         if idx.stop is None:
2472             stop = 0 if step < 0 else float('inf')
2473         else:
2474             stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2475         stop += [-1, 1][step > 0]
2476
2477         for i in frange(start, stop, step):
2478             if i < 0:
2479                 continue
2480             try:
2481                 entry = self._getter(i)
2482             except self.IndexError:
2483                 self.is_exhausted = True
2484                 if step > 0:
2485                     break
2486                 continue
2487             yield i + 1, entry
2488
2489     def __len__(self):
2490         return len(tuple(self[:]))
2491
2492     class IndexError(IndexError):  # noqa: A001
2493         pass
2494
2495
2496 def uppercase_escape(s):
2497     unicode_escape = codecs.getdecoder('unicode_escape')
2498     return re.sub(
2499         r'\\U[0-9a-fA-F]{8}',
2500         lambda m: unicode_escape(m.group(0))[0],
2501         s)
2502
2503
2504 def lowercase_escape(s):
2505     unicode_escape = codecs.getdecoder('unicode_escape')
2506     return re.sub(
2507         r'\\u[0-9a-fA-F]{4}',
2508         lambda m: unicode_escape(m.group(0))[0],
2509         s)
2510
2511
2512 def parse_qs(url, **kwargs):
2513     return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
2514
2515
2516 def read_batch_urls(batch_fd):
2517     def fixup(url):
2518         if not isinstance(url, str):
2519             url = url.decode('utf-8', 'replace')
2520         BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2521         for bom in BOM_UTF8:
2522             if url.startswith(bom):
2523                 url = url[len(bom):]
2524         url = url.lstrip()
2525         if not url or url.startswith(('#', ';', ']')):
2526             return False
2527         # "#" cannot be stripped out since it is part of the URI
2528         # However, it can be safely stripped out if following a whitespace
2529         return re.split(r'\s#', url, maxsplit=1)[0].rstrip()
2530
2531     with contextlib.closing(batch_fd) as fd:
2532         return [url for url in map(fixup, fd) if url]
2533
2534
2535 def urlencode_postdata(*args, **kargs):
2536     return urllib.parse.urlencode(*args, **kargs).encode('ascii')
2537
2538
2539 def update_url(url, *, query_update=None, **kwargs):
2540     """Replace URL components specified by kwargs
2541        @param url           str or parse url tuple
2542        @param query_update  update query
2543        @returns             str
2544     """
2545     if isinstance(url, str):
2546         if not kwargs and not query_update:
2547             return url
2548         else:
2549             url = urllib.parse.urlparse(url)
2550     if query_update:
2551         assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
2552         kwargs['query'] = urllib.parse.urlencode({
2553             **urllib.parse.parse_qs(url.query),
2554             **query_update,
2555         }, True)
2556     return urllib.parse.urlunparse(url._replace(**kwargs))
2557
2558
2559 def update_url_query(url, query):
2560     return update_url(url, query_update=query)
2561
2562
2563 def _multipart_encode_impl(data, boundary):
2564     content_type = f'multipart/form-data; boundary={boundary}'
2565
2566     out = b''
2567     for k, v in data.items():
2568         out += b'--' + boundary.encode('ascii') + b'\r\n'
2569         if isinstance(k, str):
2570             k = k.encode()
2571         if isinstance(v, str):
2572             v = v.encode()
2573         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2574         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2575         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2576         if boundary.encode('ascii') in content:
2577             raise ValueError('Boundary overlaps with data')
2578         out += content
2579
2580     out += b'--' + boundary.encode('ascii') + b'--\r\n'
2581
2582     return out, content_type
2583
2584
2585 def multipart_encode(data, boundary=None):
2586     """
2587     Encode a dict to RFC 7578-compliant form-data
2588
2589     data:
2590         A dict where keys and values can be either Unicode or bytes-like
2591         objects.
2592     boundary:
2593         If specified a Unicode object, it's used as the boundary. Otherwise
2594         a random boundary is generated.
2595
2596     Reference: https://tools.ietf.org/html/rfc7578
2597     """
2598     has_specified_boundary = boundary is not None
2599
2600     while True:
2601         if boundary is None:
2602             boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2603
2604         try:
2605             out, content_type = _multipart_encode_impl(data, boundary)
2606             break
2607         except ValueError:
2608             if has_specified_boundary:
2609                 raise
2610             boundary = None
2611
2612     return out, content_type
2613
2614
2615 def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
2616     if blocked_types is NO_DEFAULT:
2617         blocked_types = (str, bytes, collections.abc.Mapping)
2618     return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
2619
2620
2621 def variadic(x, allowed_types=NO_DEFAULT):
2622     if not isinstance(allowed_types, (tuple, type)):
2623         deprecation_warning('allowed_types should be a tuple or a type')
2624         allowed_types = tuple(allowed_types)
2625     return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
2626
2627
2628 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2629     for f in funcs:
2630         try:
2631             val = f(*args, **kwargs)
2632         except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
2633             pass
2634         else:
2635             if expected_type is None or isinstance(val, expected_type):
2636                 return val
2637
2638
2639 def try_get(src, getter, expected_type=None):
2640     return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2641
2642
2643 def filter_dict(dct, cndn=lambda _, v: v is not None):
2644     return {k: v for k, v in dct.items() if cndn(k, v)}
2645
2646
2647 def merge_dicts(*dicts):
2648     merged = {}
2649     for a_dict in dicts:
2650         for k, v in a_dict.items():
2651             if (v is not None and k not in merged
2652                     or isinstance(v, str) and merged[k] == ''):
2653                 merged[k] = v
2654     return merged
2655
2656
2657 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2658     return string if isinstance(string, str) else str(string, encoding, errors)
2659
2660
2661 US_RATINGS = {
2662     'G': 0,
2663     'PG': 10,
2664     'PG-13': 13,
2665     'R': 16,
2666     'NC': 18,
2667 }
2668
2669
2670 TV_PARENTAL_GUIDELINES = {
2671     'TV-Y': 0,
2672     'TV-Y7': 7,
2673     'TV-G': 0,
2674     'TV-PG': 0,
2675     'TV-14': 14,
2676     'TV-MA': 17,
2677 }
2678
2679
2680 def parse_age_limit(s):
2681     # isinstance(False, int) is True. So type() must be used instead
2682     if type(s) is int:  # noqa: E721
2683         return s if 0 <= s <= 21 else None
2684     elif not isinstance(s, str):
2685         return None
2686     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2687     if m:
2688         return int(m.group('age'))
2689     s = s.upper()
2690     if s in US_RATINGS:
2691         return US_RATINGS[s]
2692     m = re.match(r'^TV[_-]?({})$'.format('|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)), s)
2693     if m:
2694         return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
2695     return None
2696
2697
2698 def strip_jsonp(code):
2699     return re.sub(
2700         r'''(?sx)^
2701             (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2702             (?:\s*&&\s*(?P=func_name))?
2703             \s*\(\s*(?P<callback_data>.*)\);?
2704             \s*?(?://[^\n]*)*$''',
2705         r'\g<callback_data>', code)
2706
2707
2708 def js_to_json(code, vars={}, *, strict=False):
2709     # vars is a dict of var, val pairs to substitute
2710     STRING_QUOTES = '\'"`'
2711     STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
2712     COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
2713     SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
2714     INTEGER_TABLE = (
2715         (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
2716         (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
2717     )
2718
2719     def process_escape(match):
2720         JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
2721         escape = match.group(1) or match.group(2)
2722
2723         return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
2724                 else R'\u00' if escape == 'x'
2725                 else '' if escape == '\n'
2726                 else escape)
2727
2728     def template_substitute(match):
2729         evaluated = js_to_json(match.group(1), vars, strict=strict)
2730         if evaluated[0] == '"':
2731             return json.loads(evaluated)
2732         return evaluated
2733
2734     def fix_kv(m):
2735         v = m.group(0)
2736         if v in ('true', 'false', 'null'):
2737             return v
2738         elif v in ('undefined', 'void 0'):
2739             return 'null'
2740         elif v.startswith(('/*', '//', '!')) or v == ',':
2741             return ''
2742
2743         if v[0] in STRING_QUOTES:
2744             v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
2745             escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
2746             return f'"{escaped}"'
2747
2748         for regex, base in INTEGER_TABLE:
2749             im = re.match(regex, v)
2750             if im:
2751                 i = int(im.group(1), base)
2752                 return f'"{i}":' if v.endswith(':') else str(i)
2753
2754         if v in vars:
2755             try:
2756                 if not strict:
2757                     json.loads(vars[v])
2758             except json.JSONDecodeError:
2759                 return json.dumps(vars[v])
2760             else:
2761                 return vars[v]
2762
2763         if not strict:
2764             return f'"{v}"'
2765
2766         raise ValueError(f'Unknown value: {v}')
2767
2768     def create_map(mobj):
2769         return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
2770
2771     code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
2772     code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
2773     if not strict:
2774         code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
2775         code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
2776         code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
2777         code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
2778
2779     return re.sub(rf'''(?sx)
2780         {STRING_RE}|
2781         {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
2782         void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
2783         \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
2784         [0-9]+(?={SKIP_RE}:)|
2785         !+
2786         ''', fix_kv, code)
2787
2788
2789 def qualities(quality_ids):
2790     """ Get a numeric quality value out of a list of possible values """
2791     def q(qid):
2792         try:
2793             return quality_ids.index(qid)
2794         except ValueError:
2795             return -1
2796     return q
2797
2798
2799 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
2800
2801
2802 DEFAULT_OUTTMPL = {
2803     'default': '%(title)s [%(id)s].%(ext)s',
2804     'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
2805 }
2806 OUTTMPL_TYPES = {
2807     'chapter': None,
2808     'subtitle': None,
2809     'thumbnail': None,
2810     'description': 'description',
2811     'annotation': 'annotations.xml',
2812     'infojson': 'info.json',
2813     'link': None,
2814     'pl_video': None,
2815     'pl_thumbnail': None,
2816     'pl_description': 'description',
2817     'pl_infojson': 'info.json',
2818 }
2819
2820 # As of [1] format syntax is:
2821 #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
2822 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
2823 STR_FORMAT_RE_TMPL = r'''(?x)
2824     (?<!%)(?P<prefix>(?:%%)*)
2825     %
2826     (?P<has_key>\((?P<key>{0})\))?
2827     (?P<format>
2828         (?P<conversion>[#0\-+ ]+)?
2829         (?P<min_width>\d+)?
2830         (?P<precision>\.\d+)?
2831         (?P<len_mod>[hlL])?  # unused in python
2832         {1}  # conversion type
2833     )
2834 '''
2835
2836
2837 STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2838
2839
2840 def limit_length(s, length):
2841     """ Add ellipses to overly long strings """
2842     if s is None:
2843         return None
2844     ELLIPSES = '...'
2845     if len(s) > length:
2846         return s[:length - len(ELLIPSES)] + ELLIPSES
2847     return s
2848
2849
2850 def version_tuple(v):
2851     return tuple(int(e) for e in re.split(r'[-.]', v))
2852
2853
2854 def is_outdated_version(version, limit, assume_new=True):
2855     if not version:
2856         return not assume_new
2857     try:
2858         return version_tuple(version) < version_tuple(limit)
2859     except ValueError:
2860         return not assume_new
2861
2862
2863 def ytdl_is_updateable():
2864     """ Returns if yt-dlp can be updated with -U """
2865
2866     from ..update import is_non_updateable
2867
2868     return not is_non_updateable()
2869
2870
2871 def args_to_str(args):
2872     # Get a short string representation for a subprocess command
2873     return shell_quote(args)
2874
2875
2876 def error_to_str(err):
2877     return f'{type(err).__name__}: {err}'
2878
2879
2880 def mimetype2ext(mt, default=NO_DEFAULT):
2881     if not isinstance(mt, str):
2882         if default is not NO_DEFAULT:
2883             return default
2884         return None
2885
2886     MAP = {
2887         # video
2888         '3gpp': '3gp',
2889         'mp2t': 'ts',
2890         'mp4': 'mp4',
2891         'mpeg': 'mpeg',
2892         'mpegurl': 'm3u8',
2893         'quicktime': 'mov',
2894         'webm': 'webm',
2895         'vp9': 'vp9',
2896         'video/ogg': 'ogv',
2897         'x-flv': 'flv',
2898         'x-m4v': 'm4v',
2899         'x-matroska': 'mkv',
2900         'x-mng': 'mng',
2901         'x-mp4-fragmented': 'mp4',
2902         'x-ms-asf': 'asf',
2903         'x-ms-wmv': 'wmv',
2904         'x-msvideo': 'avi',
2905
2906         # application (streaming playlists)
2907         'dash+xml': 'mpd',
2908         'f4m+xml': 'f4m',
2909         'hds+xml': 'f4m',
2910         'vnd.apple.mpegurl': 'm3u8',
2911         'vnd.ms-sstr+xml': 'ism',
2912         'x-mpegurl': 'm3u8',
2913
2914         # audio
2915         'audio/mp4': 'm4a',
2916         # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
2917         # Using .mp3 as it's the most popular one
2918         'audio/mpeg': 'mp3',
2919         'audio/webm': 'webm',
2920         'audio/x-matroska': 'mka',
2921         'audio/x-mpegurl': 'm3u',
2922         'midi': 'mid',
2923         'ogg': 'ogg',
2924         'wav': 'wav',
2925         'wave': 'wav',
2926         'x-aac': 'aac',
2927         'x-flac': 'flac',
2928         'x-m4a': 'm4a',
2929         'x-realaudio': 'ra',
2930         'x-wav': 'wav',
2931
2932         # image
2933         'avif': 'avif',
2934         'bmp': 'bmp',
2935         'gif': 'gif',
2936         'jpeg': 'jpg',
2937         'png': 'png',
2938         'svg+xml': 'svg',
2939         'tiff': 'tif',
2940         'vnd.wap.wbmp': 'wbmp',
2941         'webp': 'webp',
2942         'x-icon': 'ico',
2943         'x-jng': 'jng',
2944         'x-ms-bmp': 'bmp',
2945
2946         # caption
2947         'filmstrip+json': 'fs',
2948         'smptett+xml': 'tt',
2949         'ttaf+xml': 'dfxp',
2950         'ttml+xml': 'ttml',
2951         'x-ms-sami': 'sami',
2952
2953         # misc
2954         'gzip': 'gz',
2955         'json': 'json',
2956         'xml': 'xml',
2957         'zip': 'zip',
2958     }
2959
2960     mimetype = mt.partition(';')[0].strip().lower()
2961     _, _, subtype = mimetype.rpartition('/')
2962
2963     ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
2964     if ext:
2965         return ext
2966     elif default is not NO_DEFAULT:
2967         return default
2968     return subtype.replace('+', '.')
2969
2970
2971 def ext2mimetype(ext_or_url):
2972     if not ext_or_url:
2973         return None
2974     if '.' not in ext_or_url:
2975         ext_or_url = f'file.{ext_or_url}'
2976     return mimetypes.guess_type(ext_or_url)[0]
2977
2978
2979 def parse_codecs(codecs_str):
2980     # http://tools.ietf.org/html/rfc6381
2981     if not codecs_str:
2982         return {}
2983     split_codecs = list(filter(None, map(
2984         str.strip, codecs_str.strip().strip(',').split(','))))
2985     vcodec, acodec, scodec, hdr = None, None, None, None
2986     for full_codec in split_codecs:
2987         full_codec = re.sub(r'^([^.]+)', lambda m: m.group(1).lower(), full_codec)
2988         parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
2989         if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
2990                         'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
2991             if vcodec:
2992                 continue
2993             vcodec = full_codec
2994             if parts[0] in ('dvh1', 'dvhe'):
2995                 hdr = 'DV'
2996             elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
2997                 hdr = 'HDR10'
2998             elif parts[:2] == ['vp9', '2']:
2999                 hdr = 'HDR10'
3000         elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
3001                           'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3002             acodec = acodec or full_codec
3003         elif parts[0] in ('stpp', 'wvtt'):
3004             scodec = scodec or full_codec
3005         else:
3006             write_string(f'WARNING: Unknown codec {full_codec}\n')
3007     if vcodec or acodec or scodec:
3008         return {
3009             'vcodec': vcodec or 'none',
3010             'acodec': acodec or 'none',
3011             'dynamic_range': hdr,
3012             **({'scodec': scodec} if scodec is not None else {}),
3013         }
3014     elif len(split_codecs) == 2:
3015         return {
3016             'vcodec': split_codecs[0],
3017             'acodec': split_codecs[1],
3018         }
3019     return {}
3020
3021
3022 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3023     assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3024
3025     allow_mkv = not preferences or 'mkv' in preferences
3026
3027     if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3028         return 'mkv'  # TODO: any other format allows this?
3029
3030     # TODO: All codecs supported by parse_codecs isn't handled here
3031     COMPATIBLE_CODECS = {
3032         'mp4': {
3033             'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
3034             'h264', 'aacl', 'ec-3',  # Set in ISM
3035         },
3036         'webm': {
3037             'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3038             'vp9x', 'vp8x',  # in the webm spec
3039         },
3040     }
3041
3042     sanitize_codec = functools.partial(
3043         try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
3044     vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3045
3046     for ext in preferences or COMPATIBLE_CODECS.keys():
3047         codec_set = COMPATIBLE_CODECS.get(ext, set())
3048         if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3049             return ext
3050
3051     COMPATIBLE_EXTS = (
3052         {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3053         {'webm', 'weba'},
3054     )
3055     for ext in preferences or vexts:
3056         current_exts = {ext, *vexts, *aexts}
3057         if ext == 'mkv' or current_exts == {ext} or any(
3058                 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3059             return ext
3060     return 'mkv' if allow_mkv else preferences[-1]
3061
3062
3063 def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
3064     getheader = url_handle.headers.get
3065
3066     cd = getheader('Content-Disposition')
3067     if cd:
3068         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3069         if m:
3070             e = determine_ext(m.group('filename'), default_ext=None)
3071             if e:
3072                 return e
3073
3074     meta_ext = getheader('x-amz-meta-name')
3075     if meta_ext:
3076         e = meta_ext.rpartition('.')[2]
3077         if e:
3078             return e
3079
3080     return mimetype2ext(getheader('Content-Type'), default=default)
3081
3082
3083 def encode_data_uri(data, mime_type):
3084     return 'data:{};base64,{}'.format(mime_type, base64.b64encode(data).decode('ascii'))
3085
3086
3087 def age_restricted(content_limit, age_limit):
3088     """ Returns True iff the content should be blocked """
3089
3090     if age_limit is None:  # No limit set
3091         return False
3092     if content_limit is None:
3093         return False  # Content available for everyone
3094     return age_limit < content_limit
3095
3096
3097 # List of known byte-order-marks (BOM)
3098 BOMS = [
3099     (b'\xef\xbb\xbf', 'utf-8'),
3100     (b'\x00\x00\xfe\xff', 'utf-32-be'),
3101     (b'\xff\xfe\x00\x00', 'utf-32-le'),
3102     (b'\xff\xfe', 'utf-16-le'),
3103     (b'\xfe\xff', 'utf-16-be'),
3104 ]
3105
3106
3107 def is_html(first_bytes):
3108     """ Detect whether a file contains HTML by examining its first bytes. """
3109
3110     encoding = 'utf-8'
3111     for bom, enc in BOMS:
3112         while first_bytes.startswith(bom):
3113             encoding, first_bytes = enc, first_bytes[len(bom):]
3114
3115     return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3116
3117
3118 def determine_protocol(info_dict):
3119     protocol = info_dict.get('protocol')
3120     if protocol is not None:
3121         return protocol
3122
3123     url = sanitize_url(info_dict['url'])
3124     if url.startswith('rtmp'):
3125         return 'rtmp'
3126     elif url.startswith('mms'):
3127         return 'mms'
3128     elif url.startswith('rtsp'):
3129         return 'rtsp'
3130
3131     ext = determine_ext(url)
3132     if ext == 'm3u8':
3133         return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3134     elif ext == 'f4m':
3135         return 'f4m'
3136
3137     return urllib.parse.urlparse(url).scheme
3138
3139
3140 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3141     """ Render a list of rows, each as a list of values.
3142     Text after a \t will be right aligned """
3143     def width(string):
3144         return len(remove_terminal_sequences(string).replace('\t', ''))
3145
3146     def get_max_lens(table):
3147         return [max(width(str(v)) for v in col) for col in zip(*table)]
3148
3149     def filter_using_list(row, filter_array):
3150         return [col for take, col in itertools.zip_longest(filter_array, row, fillvalue=True) if take]
3151
3152     max_lens = get_max_lens(data) if hide_empty else []
3153     header_row = filter_using_list(header_row, max_lens)
3154     data = [filter_using_list(row, max_lens) for row in data]
3155
3156     table = [header_row, *data]
3157     max_lens = get_max_lens(table)
3158     extra_gap += 1
3159     if delim:
3160         table = [header_row, [delim * (ml + extra_gap) for ml in max_lens], *data]
3161         table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
3162     for row in table:
3163         for pos, text in enumerate(map(str, row)):
3164             if '\t' in text:
3165                 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3166             else:
3167                 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3168     return '\n'.join(''.join(row).rstrip() for row in table)
3169
3170
3171 def _match_one(filter_part, dct, incomplete):
3172     # TODO: Generalize code with YoutubeDL._build_format_filter
3173     STRING_OPERATORS = {
3174         '*=': operator.contains,
3175         '^=': lambda attr, value: attr.startswith(value),
3176         '$=': lambda attr, value: attr.endswith(value),
3177         '~=': lambda attr, value: re.search(value, attr),
3178     }
3179     COMPARISON_OPERATORS = {
3180         **STRING_OPERATORS,
3181         '<=': operator.le,  # "<=" must be defined above "<"
3182         '<': operator.lt,
3183         '>=': operator.ge,
3184         '>': operator.gt,
3185         '=': operator.eq,
3186     }
3187
3188     if isinstance(incomplete, bool):
3189         is_incomplete = lambda _: incomplete
3190     else:
3191         is_incomplete = lambda k: k in incomplete
3192
3193     operator_rex = re.compile(r'''(?x)
3194         (?P<key>[a-z_]+)
3195         \s*(?P<negation>!\s*)?(?P<op>{})(?P<none_inclusive>\s*\?)?\s*
3196         (?:
3197             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3198             (?P<strval>.+?)
3199         )
3200         '''.format('|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))))
3201     m = operator_rex.fullmatch(filter_part.strip())
3202     if m:
3203         m = m.groupdict()
3204         unnegated_op = COMPARISON_OPERATORS[m['op']]
3205         if m['negation']:
3206             op = lambda attr, value: not unnegated_op(attr, value)
3207         else:
3208             op = unnegated_op
3209         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3210         if m['quote']:
3211             comparison_value = comparison_value.replace(r'\{}'.format(m['quote']), m['quote'])
3212         actual_value = dct.get(m['key'])
3213         numeric_comparison = None
3214         if isinstance(actual_value, (int, float)):
3215             # If the original field is a string and matching comparisonvalue is
3216             # a number we should respect the origin of the original field
3217             # and process comparison value as a string (see
3218             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3219             try:
3220                 numeric_comparison = int(comparison_value)
3221             except ValueError:
3222                 numeric_comparison = parse_filesize(comparison_value)
3223                 if numeric_comparison is None:
3224                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3225                 if numeric_comparison is None:
3226                     numeric_comparison = parse_duration(comparison_value)
3227         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3228             raise ValueError('Operator {} only supports string values!'.format(m['op']))
3229         if actual_value is None:
3230             return is_incomplete(m['key']) or m['none_inclusive']
3231         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3232
3233     UNARY_OPERATORS = {
3234         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3235         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3236     }
3237     operator_rex = re.compile(r'''(?x)
3238         (?P<op>{})\s*(?P<key>[a-z_]+)
3239         '''.format('|'.join(map(re.escape, UNARY_OPERATORS.keys()))))
3240     m = operator_rex.fullmatch(filter_part.strip())
3241     if m:
3242         op = UNARY_OPERATORS[m.group('op')]
3243         actual_value = dct.get(m.group('key'))
3244         if is_incomplete(m.group('key')) and actual_value is None:
3245             return True
3246         return op(actual_value)
3247
3248     raise ValueError(f'Invalid filter part {filter_part!r}')
3249
3250
3251 def match_str(filter_str, dct, incomplete=False):
3252     """ Filter a dictionary with a simple string syntax.
3253     @returns           Whether the filter passes
3254     @param incomplete  Set of keys that is expected to be missing from dct.
3255                        Can be True/False to indicate all/none of the keys may be missing.
3256                        All conditions on incomplete keys pass if the key is missing
3257     """
3258     return all(
3259         _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3260         for filter_part in re.split(r'(?<!\\)&', filter_str))
3261
3262
3263 def match_filter_func(filters, breaking_filters=None):
3264     if not filters and not breaking_filters:
3265         return None
3266     repr_ = f'{match_filter_func.__module__}.{match_filter_func.__qualname__}({filters}, {breaking_filters})'
3267
3268     breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3269     filters = set(variadic(filters or []))
3270
3271     interactive = '-' in filters
3272     if interactive:
3273         filters.remove('-')
3274
3275     @function_with_repr.set_repr(repr_)
3276     def _match_func(info_dict, incomplete=False):
3277         ret = breaking_filters(info_dict, incomplete)
3278         if ret is not None:
3279             raise RejectedVideoReached(ret)
3280
3281         if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3282             return NO_DEFAULT if interactive and not incomplete else None
3283         else:
3284             video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3285             filter_str = ') | ('.join(map(str.strip, filters))
3286             return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3287     return _match_func
3288
3289
3290 class download_range_func:
3291     def __init__(self, chapters, ranges, from_info=False):
3292         self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
3293
3294     def __call__(self, info_dict, ydl):
3295
3296         warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3297                    else 'Cannot match chapters since chapter information is unavailable')
3298         for regex in self.chapters or []:
3299             for i, chapter in enumerate(info_dict.get('chapters') or []):
3300                 if re.search(regex, chapter['title']):
3301                     warning = None
3302                     yield {**chapter, 'index': i}
3303         if self.chapters and warning:
3304             ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3305
3306         for start, end in self.ranges or []:
3307             yield {
3308                 'start_time': self._handle_negative_timestamp(start, info_dict),
3309                 'end_time': self._handle_negative_timestamp(end, info_dict),
3310             }
3311
3312         if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
3313             yield {
3314                 'start_time': info_dict.get('start_time') or 0,
3315                 'end_time': info_dict.get('end_time') or float('inf'),
3316             }
3317         elif not self.ranges and not self.chapters:
3318             yield {}
3319
3320     @staticmethod
3321     def _handle_negative_timestamp(time, info):
3322         return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time
3323
3324     def __eq__(self, other):
3325         return (isinstance(other, download_range_func)
3326                 and self.chapters == other.chapters and self.ranges == other.ranges)
3327
3328     def __repr__(self):
3329         return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3330
3331
3332 def parse_dfxp_time_expr(time_expr):
3333     if not time_expr:
3334         return
3335
3336     mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3337     if mobj:
3338         return float(mobj.group('time_offset'))
3339
3340     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3341     if mobj:
3342         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3343
3344
3345 def srt_subtitles_timecode(seconds):
3346     return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3347
3348
3349 def ass_subtitles_timecode(seconds):
3350     time = timetuple_from_msec(seconds * 1000)
3351     return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3352
3353
3354 def dfxp2srt(dfxp_data):
3355     """
3356     @param dfxp_data A bytes-like object containing DFXP data
3357     @returns A unicode object containing converted SRT data
3358     """
3359     LEGACY_NAMESPACES = (
3360         (b'http://www.w3.org/ns/ttml', [
3361             b'http://www.w3.org/2004/11/ttaf1',
3362             b'http://www.w3.org/2006/04/ttaf1',
3363             b'http://www.w3.org/2006/10/ttaf1',
3364         ]),
3365         (b'http://www.w3.org/ns/ttml#styling', [
3366             b'http://www.w3.org/ns/ttml#style',
3367         ]),
3368     )
3369
3370     SUPPORTED_STYLING = [
3371         'color',
3372         'fontFamily',
3373         'fontSize',
3374         'fontStyle',
3375         'fontWeight',
3376         'textDecoration',
3377     ]
3378
3379     _x = functools.partial(xpath_with_ns, ns_map={
3380         'xml': 'http://www.w3.org/XML/1998/namespace',
3381         'ttml': 'http://www.w3.org/ns/ttml',
3382         'tts': 'http://www.w3.org/ns/ttml#styling',
3383     })
3384
3385     styles = {}
3386     default_style = {}
3387
3388     class TTMLPElementParser:
3389         _out = ''
3390         _unclosed_elements = []
3391         _applied_styles = []
3392
3393         def start(self, tag, attrib):
3394             if tag in (_x('ttml:br'), 'br'):
3395                 self._out += '\n'
3396             else:
3397                 unclosed_elements = []
3398                 style = {}
3399                 element_style_id = attrib.get('style')
3400                 if default_style:
3401                     style.update(default_style)
3402                 if element_style_id:
3403                     style.update(styles.get(element_style_id, {}))
3404                 for prop in SUPPORTED_STYLING:
3405                     prop_val = attrib.get(_x('tts:' + prop))
3406                     if prop_val:
3407                         style[prop] = prop_val
3408                 if style:
3409                     font = ''
3410                     for k, v in sorted(style.items()):
3411                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3412                             continue
3413                         if k == 'color':
3414                             font += f' color="{v}"'
3415                         elif k == 'fontSize':
3416                             font += f' size="{v}"'
3417                         elif k == 'fontFamily':
3418                             font += f' face="{v}"'
3419                         elif k == 'fontWeight' and v == 'bold':
3420                             self._out += '<b>'
3421                             unclosed_elements.append('b')
3422                         elif k == 'fontStyle' and v == 'italic':
3423                             self._out += '<i>'
3424                             unclosed_elements.append('i')
3425                         elif k == 'textDecoration' and v == 'underline':
3426                             self._out += '<u>'
3427                             unclosed_elements.append('u')
3428                     if font:
3429                         self._out += '<font' + font + '>'
3430                         unclosed_elements.append('font')
3431                     applied_style = {}
3432                     if self._applied_styles:
3433                         applied_style.update(self._applied_styles[-1])
3434                     applied_style.update(style)
3435                     self._applied_styles.append(applied_style)
3436                 self._unclosed_elements.append(unclosed_elements)
3437
3438         def end(self, tag):
3439             if tag not in (_x('ttml:br'), 'br'):
3440                 unclosed_elements = self._unclosed_elements.pop()
3441                 for element in reversed(unclosed_elements):
3442                     self._out += f'</{element}>'
3443                 if unclosed_elements and self._applied_styles:
3444                     self._applied_styles.pop()
3445
3446         def data(self, data):
3447             self._out += data
3448
3449         def close(self):
3450             return self._out.strip()
3451
3452     # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3453     # This will not trigger false positives since only UTF-8 text is being replaced
3454     dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3455
3456     def parse_node(node):
3457         target = TTMLPElementParser()
3458         parser = xml.etree.ElementTree.XMLParser(target=target)
3459         parser.feed(xml.etree.ElementTree.tostring(node))
3460         return parser.close()
3461
3462     for k, v in LEGACY_NAMESPACES:
3463         for ns in v:
3464             dfxp_data = dfxp_data.replace(ns, k)
3465
3466     dfxp = compat_etree_fromstring(dfxp_data)
3467     out = []
3468     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3469
3470     if not paras:
3471         raise ValueError('Invalid dfxp/TTML subtitle')
3472
3473     repeat = False
3474     while True:
3475         for style in dfxp.findall(_x('.//ttml:style')):
3476             style_id = style.get('id') or style.get(_x('xml:id'))
3477             if not style_id:
3478                 continue
3479             parent_style_id = style.get('style')
3480             if parent_style_id:
3481                 if parent_style_id not in styles:
3482                     repeat = True
3483                     continue
3484                 styles[style_id] = styles[parent_style_id].copy()
3485             for prop in SUPPORTED_STYLING:
3486                 prop_val = style.get(_x('tts:' + prop))
3487                 if prop_val:
3488                     styles.setdefault(style_id, {})[prop] = prop_val
3489         if repeat:
3490             repeat = False
3491         else:
3492             break
3493
3494     for p in ('body', 'div'):
3495         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3496         if ele is None:
3497             continue
3498         style = styles.get(ele.get('style'))
3499         if not style:
3500             continue
3501         default_style.update(style)
3502
3503     for para, index in zip(paras, itertools.count(1)):
3504         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3505         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3506         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3507         if begin_time is None:
3508             continue
3509         if not end_time:
3510             if not dur:
3511                 continue
3512             end_time = begin_time + dur
3513         out.append('%d\n%s --> %s\n%s\n\n' % (
3514             index,
3515             srt_subtitles_timecode(begin_time),
3516             srt_subtitles_timecode(end_time),
3517             parse_node(para)))
3518
3519     return ''.join(out)
3520
3521
3522 def cli_option(params, command_option, param, separator=None):
3523     param = params.get(param)
3524     return ([] if param is None
3525             else [command_option, str(param)] if separator is None
3526             else [f'{command_option}{separator}{param}'])
3527
3528
3529 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3530     param = params.get(param)
3531     assert param in (True, False, None)
3532     return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3533
3534
3535 def cli_valueless_option(params, command_option, param, expected_value=True):
3536     return [command_option] if params.get(param) == expected_value else []
3537
3538
3539 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3540     if isinstance(argdict, (list, tuple)):  # for backward compatibility
3541         if use_compat:
3542             return argdict
3543         else:
3544             argdict = None
3545     if argdict is None:
3546         return default
3547     assert isinstance(argdict, dict)
3548
3549     assert isinstance(keys, (list, tuple))
3550     for key_list in keys:
3551         arg_list = list(filter(
3552             lambda x: x is not None,
3553             [argdict.get(key.lower()) for key in variadic(key_list)]))
3554         if arg_list:
3555             return [arg for args in arg_list for arg in args]
3556     return default
3557
3558
3559 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3560     main_key, exe = main_key.lower(), exe.lower()
3561     root_key = exe if main_key == exe else f'{main_key}+{exe}'
3562     keys = [f'{root_key}{k}' for k in (keys or [''])]
3563     if root_key in keys:
3564         if main_key != exe:
3565             keys.append((main_key, exe))
3566         keys.append('default')
3567     else:
3568         use_compat = False
3569     return cli_configuration_args(argdict, keys, default, use_compat)
3570
3571
3572 class ISO639Utils:
3573     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3574     _lang_map = {
3575         'aa': 'aar',
3576         'ab': 'abk',
3577         'ae': 'ave',
3578         'af': 'afr',
3579         'ak': 'aka',
3580         'am': 'amh',
3581         'an': 'arg',
3582         'ar': 'ara',
3583         'as': 'asm',
3584         'av': 'ava',
3585         'ay': 'aym',
3586         'az': 'aze',
3587         'ba': 'bak',
3588         'be': 'bel',
3589         'bg': 'bul',
3590         'bh': 'bih',
3591         'bi': 'bis',
3592         'bm': 'bam',
3593         'bn': 'ben',
3594         'bo': 'bod',
3595         'br': 'bre',
3596         'bs': 'bos',
3597         'ca': 'cat',
3598         'ce': 'che',
3599         'ch': 'cha',
3600         'co': 'cos',
3601         'cr': 'cre',
3602         'cs': 'ces',
3603         'cu': 'chu',
3604         'cv': 'chv',
3605         'cy': 'cym',
3606         'da': 'dan',
3607         'de': 'deu',
3608         'dv': 'div',
3609         'dz': 'dzo',
3610         'ee': 'ewe',
3611         'el': 'ell',
3612         'en': 'eng',
3613         'eo': 'epo',
3614         'es': 'spa',
3615         'et': 'est',
3616         'eu': 'eus',
3617         'fa': 'fas',
3618         'ff': 'ful',
3619         'fi': 'fin',
3620         'fj': 'fij',
3621         'fo': 'fao',
3622         'fr': 'fra',
3623         'fy': 'fry',
3624         'ga': 'gle',
3625         'gd': 'gla',
3626         'gl': 'glg',
3627         'gn': 'grn',
3628         'gu': 'guj',
3629         'gv': 'glv',
3630         'ha': 'hau',
3631         'he': 'heb',
3632         'iw': 'heb',  # Replaced by he in 1989 revision
3633         'hi': 'hin',
3634         'ho': 'hmo',
3635         'hr': 'hrv',
3636         'ht': 'hat',
3637         'hu': 'hun',
3638         'hy': 'hye',
3639         'hz': 'her',
3640         'ia': 'ina',
3641         'id': 'ind',
3642         'in': 'ind',  # Replaced by id in 1989 revision
3643         'ie': 'ile',
3644         'ig': 'ibo',
3645         'ii': 'iii',
3646         'ik': 'ipk',
3647         'io': 'ido',
3648         'is': 'isl',
3649         'it': 'ita',
3650         'iu': 'iku',
3651         'ja': 'jpn',
3652         'jv': 'jav',
3653         'ka': 'kat',
3654         'kg': 'kon',
3655         'ki': 'kik',
3656         'kj': 'kua',
3657         'kk': 'kaz',
3658         'kl': 'kal',
3659         'km': 'khm',
3660         'kn': 'kan',
3661         'ko': 'kor',
3662         'kr': 'kau',
3663         'ks': 'kas',
3664         'ku': 'kur',
3665         'kv': 'kom',
3666         'kw': 'cor',
3667         'ky': 'kir',
3668         'la': 'lat',
3669         'lb': 'ltz',
3670         'lg': 'lug',
3671         'li': 'lim',
3672         'ln': 'lin',
3673         'lo': 'lao',
3674         'lt': 'lit',
3675         'lu': 'lub',
3676         'lv': 'lav',
3677         'mg': 'mlg',
3678         'mh': 'mah',
3679         'mi': 'mri',
3680         'mk': 'mkd',
3681         'ml': 'mal',
3682         'mn': 'mon',
3683         'mr': 'mar',
3684         'ms': 'msa',
3685         'mt': 'mlt',
3686         'my': 'mya',
3687         'na': 'nau',
3688         'nb': 'nob',
3689         'nd': 'nde',
3690         'ne': 'nep',
3691         'ng': 'ndo',
3692         'nl': 'nld',
3693         'nn': 'nno',
3694         'no': 'nor',
3695         'nr': 'nbl',
3696         'nv': 'nav',
3697         'ny': 'nya',
3698         'oc': 'oci',
3699         'oj': 'oji',
3700         'om': 'orm',
3701         'or': 'ori',
3702         'os': 'oss',
3703         'pa': 'pan',
3704         'pe': 'per',
3705         'pi': 'pli',
3706         'pl': 'pol',
3707         'ps': 'pus',
3708         'pt': 'por',
3709         'qu': 'que',
3710         'rm': 'roh',
3711         'rn': 'run',
3712         'ro': 'ron',
3713         'ru': 'rus',
3714         'rw': 'kin',
3715         'sa': 'san',
3716         'sc': 'srd',
3717         'sd': 'snd',
3718         'se': 'sme',
3719         'sg': 'sag',
3720         'si': 'sin',
3721         'sk': 'slk',
3722         'sl': 'slv',
3723         'sm': 'smo',
3724         'sn': 'sna',
3725         'so': 'som',
3726         'sq': 'sqi',
3727         'sr': 'srp',
3728         'ss': 'ssw',
3729         'st': 'sot',
3730         'su': 'sun',
3731         'sv': 'swe',
3732         'sw': 'swa',
3733         'ta': 'tam',
3734         'te': 'tel',
3735         'tg': 'tgk',
3736         'th': 'tha',
3737         'ti': 'tir',
3738         'tk': 'tuk',
3739         'tl': 'tgl',
3740         'tn': 'tsn',
3741         'to': 'ton',
3742         'tr': 'tur',
3743         'ts': 'tso',
3744         'tt': 'tat',
3745         'tw': 'twi',
3746         'ty': 'tah',
3747         'ug': 'uig',
3748         'uk': 'ukr',
3749         'ur': 'urd',
3750         'uz': 'uzb',
3751         've': 'ven',
3752         'vi': 'vie',
3753         'vo': 'vol',
3754         'wa': 'wln',
3755         'wo': 'wol',
3756         'xh': 'xho',
3757         'yi': 'yid',
3758         'ji': 'yid',  # Replaced by yi in 1989 revision
3759         'yo': 'yor',
3760         'za': 'zha',
3761         'zh': 'zho',
3762         'zu': 'zul',
3763     }
3764
3765     @classmethod
3766     def short2long(cls, code):
3767         """Convert language code from ISO 639-1 to ISO 639-2/T"""
3768         return cls._lang_map.get(code[:2])
3769
3770     @classmethod
3771     def long2short(cls, code):
3772         """Convert language code from ISO 639-2/T to ISO 639-1"""
3773         for short_name, long_name in cls._lang_map.items():
3774             if long_name == code:
3775                 return short_name
3776
3777
3778 class ISO3166Utils:
3779     # From http://data.okfn.org/data/core/country-list
3780     _country_map = {
3781         'AF': 'Afghanistan',
3782         'AX': 'Åland Islands',
3783         'AL': 'Albania',
3784         'DZ': 'Algeria',
3785         'AS': 'American Samoa',
3786         'AD': 'Andorra',
3787         'AO': 'Angola',
3788         'AI': 'Anguilla',
3789         'AQ': 'Antarctica',
3790         'AG': 'Antigua and Barbuda',
3791         'AR': 'Argentina',
3792         'AM': 'Armenia',
3793         'AW': 'Aruba',
3794         'AU': 'Australia',
3795         'AT': 'Austria',
3796         'AZ': 'Azerbaijan',
3797         'BS': 'Bahamas',
3798         'BH': 'Bahrain',
3799         'BD': 'Bangladesh',
3800         'BB': 'Barbados',
3801         'BY': 'Belarus',
3802         'BE': 'Belgium',
3803         'BZ': 'Belize',
3804         'BJ': 'Benin',
3805         'BM': 'Bermuda',
3806         'BT': 'Bhutan',
3807         'BO': 'Bolivia, Plurinational State of',
3808         'BQ': 'Bonaire, Sint Eustatius and Saba',
3809         'BA': 'Bosnia and Herzegovina',
3810         'BW': 'Botswana',
3811         'BV': 'Bouvet Island',
3812         'BR': 'Brazil',
3813         'IO': 'British Indian Ocean Territory',
3814         'BN': 'Brunei Darussalam',
3815         'BG': 'Bulgaria',
3816         'BF': 'Burkina Faso',
3817         'BI': 'Burundi',
3818         'KH': 'Cambodia',
3819         'CM': 'Cameroon',
3820         'CA': 'Canada',
3821         'CV': 'Cape Verde',
3822         'KY': 'Cayman Islands',
3823         'CF': 'Central African Republic',
3824         'TD': 'Chad',
3825         'CL': 'Chile',
3826         'CN': 'China',
3827         'CX': 'Christmas Island',
3828         'CC': 'Cocos (Keeling) Islands',
3829         'CO': 'Colombia',
3830         'KM': 'Comoros',
3831         'CG': 'Congo',
3832         'CD': 'Congo, the Democratic Republic of the',
3833         'CK': 'Cook Islands',
3834         'CR': 'Costa Rica',
3835         'CI': 'Côte d\'Ivoire',
3836         'HR': 'Croatia',
3837         'CU': 'Cuba',
3838         'CW': 'Curaçao',
3839         'CY': 'Cyprus',
3840         'CZ': 'Czech Republic',
3841         'DK': 'Denmark',
3842         'DJ': 'Djibouti',
3843         'DM': 'Dominica',
3844         'DO': 'Dominican Republic',
3845         'EC': 'Ecuador',
3846         'EG': 'Egypt',
3847         'SV': 'El Salvador',
3848         'GQ': 'Equatorial Guinea',
3849         'ER': 'Eritrea',
3850         'EE': 'Estonia',
3851         'ET': 'Ethiopia',
3852         'FK': 'Falkland Islands (Malvinas)',
3853         'FO': 'Faroe Islands',
3854         'FJ': 'Fiji',
3855         'FI': 'Finland',
3856         'FR': 'France',
3857         'GF': 'French Guiana',
3858         'PF': 'French Polynesia',
3859         'TF': 'French Southern Territories',
3860         'GA': 'Gabon',
3861         'GM': 'Gambia',
3862         'GE': 'Georgia',
3863         'DE': 'Germany',
3864         'GH': 'Ghana',
3865         'GI': 'Gibraltar',
3866         'GR': 'Greece',
3867         'GL': 'Greenland',
3868         'GD': 'Grenada',
3869         'GP': 'Guadeloupe',
3870         'GU': 'Guam',
3871         'GT': 'Guatemala',
3872         'GG': 'Guernsey',
3873         'GN': 'Guinea',
3874         'GW': 'Guinea-Bissau',
3875         'GY': 'Guyana',
3876         'HT': 'Haiti',
3877         'HM': 'Heard Island and McDonald Islands',
3878         'VA': 'Holy See (Vatican City State)',
3879         'HN': 'Honduras',
3880         'HK': 'Hong Kong',
3881         'HU': 'Hungary',
3882         'IS': 'Iceland',
3883         'IN': 'India',
3884         'ID': 'Indonesia',
3885         'IR': 'Iran, Islamic Republic of',
3886         'IQ': 'Iraq',
3887         'IE': 'Ireland',
3888         'IM': 'Isle of Man',
3889         'IL': 'Israel',
3890         'IT': 'Italy',
3891         'JM': 'Jamaica',
3892         'JP': 'Japan',
3893         'JE': 'Jersey',
3894         'JO': 'Jordan',
3895         'KZ': 'Kazakhstan',
3896         'KE': 'Kenya',
3897         'KI': 'Kiribati',
3898         'KP': 'Korea, Democratic People\'s Republic of',
3899         'KR': 'Korea, Republic of',
3900         'KW': 'Kuwait',
3901         'KG': 'Kyrgyzstan',
3902         'LA': 'Lao People\'s Democratic Republic',
3903         'LV': 'Latvia',
3904         'LB': 'Lebanon',
3905         'LS': 'Lesotho',
3906         'LR': 'Liberia',
3907         'LY': 'Libya',
3908         'LI': 'Liechtenstein',
3909         'LT': 'Lithuania',
3910         'LU': 'Luxembourg',
3911         'MO': 'Macao',
3912         'MK': 'Macedonia, the Former Yugoslav Republic of',
3913         'MG': 'Madagascar',
3914         'MW': 'Malawi',
3915         'MY': 'Malaysia',
3916         'MV': 'Maldives',
3917         'ML': 'Mali',
3918         'MT': 'Malta',
3919         'MH': 'Marshall Islands',
3920         'MQ': 'Martinique',
3921         'MR': 'Mauritania',
3922         'MU': 'Mauritius',
3923         'YT': 'Mayotte',
3924         'MX': 'Mexico',
3925         'FM': 'Micronesia, Federated States of',
3926         'MD': 'Moldova, Republic of',
3927         'MC': 'Monaco',
3928         'MN': 'Mongolia',
3929         'ME': 'Montenegro',
3930         'MS': 'Montserrat',
3931         'MA': 'Morocco',
3932         'MZ': 'Mozambique',
3933         'MM': 'Myanmar',
3934         'NA': 'Namibia',
3935         'NR': 'Nauru',
3936         'NP': 'Nepal',
3937         'NL': 'Netherlands',
3938         'NC': 'New Caledonia',
3939         'NZ': 'New Zealand',
3940         'NI': 'Nicaragua',
3941         'NE': 'Niger',
3942         'NG': 'Nigeria',
3943         'NU': 'Niue',
3944         'NF': 'Norfolk Island',
3945         'MP': 'Northern Mariana Islands',
3946         'NO': 'Norway',
3947         'OM': 'Oman',
3948         'PK': 'Pakistan',
3949         'PW': 'Palau',
3950         'PS': 'Palestine, State of',
3951         'PA': 'Panama',
3952         'PG': 'Papua New Guinea',
3953         'PY': 'Paraguay',
3954         'PE': 'Peru',
3955         'PH': 'Philippines',
3956         'PN': 'Pitcairn',
3957         'PL': 'Poland',
3958         'PT': 'Portugal',
3959         'PR': 'Puerto Rico',
3960         'QA': 'Qatar',
3961         'RE': 'Réunion',
3962         'RO': 'Romania',
3963         'RU': 'Russian Federation',
3964         'RW': 'Rwanda',
3965         'BL': 'Saint Barthélemy',
3966         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3967         'KN': 'Saint Kitts and Nevis',
3968         'LC': 'Saint Lucia',
3969         'MF': 'Saint Martin (French part)',
3970         'PM': 'Saint Pierre and Miquelon',
3971         'VC': 'Saint Vincent and the Grenadines',
3972         'WS': 'Samoa',
3973         'SM': 'San Marino',
3974         'ST': 'Sao Tome and Principe',
3975         'SA': 'Saudi Arabia',
3976         'SN': 'Senegal',
3977         'RS': 'Serbia',
3978         'SC': 'Seychelles',
3979         'SL': 'Sierra Leone',
3980         'SG': 'Singapore',
3981         'SX': 'Sint Maarten (Dutch part)',
3982         'SK': 'Slovakia',
3983         'SI': 'Slovenia',
3984         'SB': 'Solomon Islands',
3985         'SO': 'Somalia',
3986         'ZA': 'South Africa',
3987         'GS': 'South Georgia and the South Sandwich Islands',
3988         'SS': 'South Sudan',
3989         'ES': 'Spain',
3990         'LK': 'Sri Lanka',
3991         'SD': 'Sudan',
3992         'SR': 'Suriname',
3993         'SJ': 'Svalbard and Jan Mayen',
3994         'SZ': 'Swaziland',
3995         'SE': 'Sweden',
3996         'CH': 'Switzerland',
3997         'SY': 'Syrian Arab Republic',
3998         'TW': 'Taiwan, Province of China',
3999         'TJ': 'Tajikistan',
4000         'TZ': 'Tanzania, United Republic of',
4001         'TH': 'Thailand',
4002         'TL': 'Timor-Leste',
4003         'TG': 'Togo',
4004         'TK': 'Tokelau',
4005         'TO': 'Tonga',
4006         'TT': 'Trinidad and Tobago',
4007         'TN': 'Tunisia',
4008         'TR': 'Turkey',
4009         'TM': 'Turkmenistan',
4010         'TC': 'Turks and Caicos Islands',
4011         'TV': 'Tuvalu',
4012         'UG': 'Uganda',
4013         'UA': 'Ukraine',
4014         'AE': 'United Arab Emirates',
4015         'GB': 'United Kingdom',
4016         'US': 'United States',
4017         'UM': 'United States Minor Outlying Islands',
4018         'UY': 'Uruguay',
4019         'UZ': 'Uzbekistan',
4020         'VU': 'Vanuatu',
4021         'VE': 'Venezuela, Bolivarian Republic of',
4022         'VN': 'Viet Nam',
4023         'VG': 'Virgin Islands, British',
4024         'VI': 'Virgin Islands, U.S.',
4025         'WF': 'Wallis and Futuna',
4026         'EH': 'Western Sahara',
4027         'YE': 'Yemen',
4028         'ZM': 'Zambia',
4029         'ZW': 'Zimbabwe',
4030         # Not ISO 3166 codes, but used for IP blocks
4031         'AP': 'Asia/Pacific Region',
4032         'EU': 'Europe',
4033     }
4034
4035     @classmethod
4036     def short2full(cls, code):
4037         """Convert an ISO 3166-2 country code to the corresponding full name"""
4038         return cls._country_map.get(code.upper())
4039
4040
4041 class GeoUtils:
4042     # Major IPv4 address blocks per country
4043     _country_ip_map = {
4044         'AD': '46.172.224.0/19',
4045         'AE': '94.200.0.0/13',
4046         'AF': '149.54.0.0/17',
4047         'AG': '209.59.64.0/18',
4048         'AI': '204.14.248.0/21',
4049         'AL': '46.99.0.0/16',
4050         'AM': '46.70.0.0/15',
4051         'AO': '105.168.0.0/13',
4052         'AP': '182.50.184.0/21',
4053         'AQ': '23.154.160.0/24',
4054         'AR': '181.0.0.0/12',
4055         'AS': '202.70.112.0/20',
4056         'AT': '77.116.0.0/14',
4057         'AU': '1.128.0.0/11',
4058         'AW': '181.41.0.0/18',
4059         'AX': '185.217.4.0/22',
4060         'AZ': '5.197.0.0/16',
4061         'BA': '31.176.128.0/17',
4062         'BB': '65.48.128.0/17',
4063         'BD': '114.130.0.0/16',
4064         'BE': '57.0.0.0/8',
4065         'BF': '102.178.0.0/15',
4066         'BG': '95.42.0.0/15',
4067         'BH': '37.131.0.0/17',
4068         'BI': '154.117.192.0/18',
4069         'BJ': '137.255.0.0/16',
4070         'BL': '185.212.72.0/23',
4071         'BM': '196.12.64.0/18',
4072         'BN': '156.31.0.0/16',
4073         'BO': '161.56.0.0/16',
4074         'BQ': '161.0.80.0/20',
4075         'BR': '191.128.0.0/12',
4076         'BS': '24.51.64.0/18',
4077         'BT': '119.2.96.0/19',
4078         'BW': '168.167.0.0/16',
4079         'BY': '178.120.0.0/13',
4080         'BZ': '179.42.192.0/18',
4081         'CA': '99.224.0.0/11',
4082         'CD': '41.243.0.0/16',
4083         'CF': '197.242.176.0/21',
4084         'CG': '160.113.0.0/16',
4085         'CH': '85.0.0.0/13',
4086         'CI': '102.136.0.0/14',
4087         'CK': '202.65.32.0/19',
4088         'CL': '152.172.0.0/14',
4089         'CM': '102.244.0.0/14',
4090         'CN': '36.128.0.0/10',
4091         'CO': '181.240.0.0/12',
4092         'CR': '201.192.0.0/12',
4093         'CU': '152.206.0.0/15',
4094         'CV': '165.90.96.0/19',
4095         'CW': '190.88.128.0/17',
4096         'CY': '31.153.0.0/16',
4097         'CZ': '88.100.0.0/14',
4098         'DE': '53.0.0.0/8',
4099         'DJ': '197.241.0.0/17',
4100         'DK': '87.48.0.0/12',
4101         'DM': '192.243.48.0/20',
4102         'DO': '152.166.0.0/15',
4103         'DZ': '41.96.0.0/12',
4104         'EC': '186.68.0.0/15',
4105         'EE': '90.190.0.0/15',
4106         'EG': '156.160.0.0/11',
4107         'ER': '196.200.96.0/20',
4108         'ES': '88.0.0.0/11',
4109         'ET': '196.188.0.0/14',
4110         'EU': '2.16.0.0/13',
4111         'FI': '91.152.0.0/13',
4112         'FJ': '144.120.0.0/16',
4113         'FK': '80.73.208.0/21',
4114         'FM': '119.252.112.0/20',
4115         'FO': '88.85.32.0/19',
4116         'FR': '90.0.0.0/9',
4117         'GA': '41.158.0.0/15',
4118         'GB': '25.0.0.0/8',
4119         'GD': '74.122.88.0/21',
4120         'GE': '31.146.0.0/16',
4121         'GF': '161.22.64.0/18',
4122         'GG': '62.68.160.0/19',
4123         'GH': '154.160.0.0/12',
4124         'GI': '95.164.0.0/16',
4125         'GL': '88.83.0.0/19',
4126         'GM': '160.182.0.0/15',
4127         'GN': '197.149.192.0/18',
4128         'GP': '104.250.0.0/19',
4129         'GQ': '105.235.224.0/20',
4130         'GR': '94.64.0.0/13',
4131         'GT': '168.234.0.0/16',
4132         'GU': '168.123.0.0/16',
4133         'GW': '197.214.80.0/20',
4134         'GY': '181.41.64.0/18',
4135         'HK': '113.252.0.0/14',
4136         'HN': '181.210.0.0/16',
4137         'HR': '93.136.0.0/13',
4138         'HT': '148.102.128.0/17',
4139         'HU': '84.0.0.0/14',
4140         'ID': '39.192.0.0/10',
4141         'IE': '87.32.0.0/12',
4142         'IL': '79.176.0.0/13',
4143         'IM': '5.62.80.0/20',
4144         'IN': '117.192.0.0/10',
4145         'IO': '203.83.48.0/21',
4146         'IQ': '37.236.0.0/14',
4147         'IR': '2.176.0.0/12',
4148         'IS': '82.221.0.0/16',
4149         'IT': '79.0.0.0/10',
4150         'JE': '87.244.64.0/18',
4151         'JM': '72.27.0.0/17',
4152         'JO': '176.29.0.0/16',
4153         'JP': '133.0.0.0/8',
4154         'KE': '105.48.0.0/12',
4155         'KG': '158.181.128.0/17',
4156         'KH': '36.37.128.0/17',
4157         'KI': '103.25.140.0/22',
4158         'KM': '197.255.224.0/20',
4159         'KN': '198.167.192.0/19',
4160         'KP': '175.45.176.0/22',
4161         'KR': '175.192.0.0/10',
4162         'KW': '37.36.0.0/14',
4163         'KY': '64.96.0.0/15',
4164         'KZ': '2.72.0.0/13',
4165         'LA': '115.84.64.0/18',
4166         'LB': '178.135.0.0/16',
4167         'LC': '24.92.144.0/20',
4168         'LI': '82.117.0.0/19',
4169         'LK': '112.134.0.0/15',
4170         'LR': '102.183.0.0/16',
4171         'LS': '129.232.0.0/17',
4172         'LT': '78.56.0.0/13',
4173         'LU': '188.42.0.0/16',
4174         'LV': '46.109.0.0/16',
4175         'LY': '41.252.0.0/14',
4176         'MA': '105.128.0.0/11',
4177         'MC': '88.209.64.0/18',
4178         'MD': '37.246.0.0/16',
4179         'ME': '178.175.0.0/17',
4180         'MF': '74.112.232.0/21',
4181         'MG': '154.126.0.0/17',
4182         'MH': '117.103.88.0/21',
4183         'MK': '77.28.0.0/15',
4184         'ML': '154.118.128.0/18',
4185         'MM': '37.111.0.0/17',
4186         'MN': '49.0.128.0/17',
4187         'MO': '60.246.0.0/16',
4188         'MP': '202.88.64.0/20',
4189         'MQ': '109.203.224.0/19',
4190         'MR': '41.188.64.0/18',
4191         'MS': '208.90.112.0/22',
4192         'MT': '46.11.0.0/16',
4193         'MU': '105.16.0.0/12',
4194         'MV': '27.114.128.0/18',
4195         'MW': '102.70.0.0/15',
4196         'MX': '187.192.0.0/11',
4197         'MY': '175.136.0.0/13',
4198         'MZ': '197.218.0.0/15',
4199         'NA': '41.182.0.0/16',
4200         'NC': '101.101.0.0/18',
4201         'NE': '197.214.0.0/18',
4202         'NF': '203.17.240.0/22',
4203         'NG': '105.112.0.0/12',
4204         'NI': '186.76.0.0/15',
4205         'NL': '145.96.0.0/11',
4206         'NO': '84.208.0.0/13',
4207         'NP': '36.252.0.0/15',
4208         'NR': '203.98.224.0/19',
4209         'NU': '49.156.48.0/22',
4210         'NZ': '49.224.0.0/14',
4211         'OM': '5.36.0.0/15',
4212         'PA': '186.72.0.0/15',
4213         'PE': '186.160.0.0/14',
4214         'PF': '123.50.64.0/18',
4215         'PG': '124.240.192.0/19',
4216         'PH': '49.144.0.0/13',
4217         'PK': '39.32.0.0/11',
4218         'PL': '83.0.0.0/11',
4219         'PM': '70.36.0.0/20',
4220         'PR': '66.50.0.0/16',
4221         'PS': '188.161.0.0/16',
4222         'PT': '85.240.0.0/13',
4223         'PW': '202.124.224.0/20',
4224         'PY': '181.120.0.0/14',
4225         'QA': '37.210.0.0/15',
4226         'RE': '102.35.0.0/16',
4227         'RO': '79.112.0.0/13',
4228         'RS': '93.86.0.0/15',
4229         'RU': '5.136.0.0/13',
4230         'RW': '41.186.0.0/16',
4231         'SA': '188.48.0.0/13',
4232         'SB': '202.1.160.0/19',
4233         'SC': '154.192.0.0/11',
4234         'SD': '102.120.0.0/13',
4235         'SE': '78.64.0.0/12',
4236         'SG': '8.128.0.0/10',
4237         'SI': '188.196.0.0/14',
4238         'SK': '78.98.0.0/15',
4239         'SL': '102.143.0.0/17',
4240         'SM': '89.186.32.0/19',
4241         'SN': '41.82.0.0/15',
4242         'SO': '154.115.192.0/18',
4243         'SR': '186.179.128.0/17',
4244         'SS': '105.235.208.0/21',
4245         'ST': '197.159.160.0/19',
4246         'SV': '168.243.0.0/16',
4247         'SX': '190.102.0.0/20',
4248         'SY': '5.0.0.0/16',
4249         'SZ': '41.84.224.0/19',
4250         'TC': '65.255.48.0/20',
4251         'TD': '154.68.128.0/19',
4252         'TG': '196.168.0.0/14',
4253         'TH': '171.96.0.0/13',
4254         'TJ': '85.9.128.0/18',
4255         'TK': '27.96.24.0/21',
4256         'TL': '180.189.160.0/20',
4257         'TM': '95.85.96.0/19',
4258         'TN': '197.0.0.0/11',
4259         'TO': '175.176.144.0/21',
4260         'TR': '78.160.0.0/11',
4261         'TT': '186.44.0.0/15',
4262         'TV': '202.2.96.0/19',
4263         'TW': '120.96.0.0/11',
4264         'TZ': '156.156.0.0/14',
4265         'UA': '37.52.0.0/14',
4266         'UG': '102.80.0.0/13',
4267         'US': '6.0.0.0/8',
4268         'UY': '167.56.0.0/13',
4269         'UZ': '84.54.64.0/18',
4270         'VA': '212.77.0.0/19',
4271         'VC': '207.191.240.0/21',
4272         'VE': '186.88.0.0/13',
4273         'VG': '66.81.192.0/20',
4274         'VI': '146.226.0.0/16',
4275         'VN': '14.160.0.0/11',
4276         'VU': '202.80.32.0/20',
4277         'WF': '117.20.32.0/21',
4278         'WS': '202.4.32.0/19',
4279         'YE': '134.35.0.0/16',
4280         'YT': '41.242.116.0/22',
4281         'ZA': '41.0.0.0/11',
4282         'ZM': '102.144.0.0/13',
4283         'ZW': '102.177.192.0/18',
4284     }
4285
4286     @classmethod
4287     def random_ipv4(cls, code_or_block):
4288         if len(code_or_block) == 2:
4289             block = cls._country_ip_map.get(code_or_block.upper())
4290             if not block:
4291                 return None
4292         else:
4293             block = code_or_block
4294         addr, preflen = block.split('/')
4295         addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4296         addr_max = addr_min | (0xffffffff >> int(preflen))
4297         return str(socket.inet_ntoa(
4298             struct.pack('!L', random.randint(addr_min, addr_max))))
4299
4300
4301 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4302 # released into Public Domain
4303 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4304
4305 def long_to_bytes(n, blocksize=0):
4306     """long_to_bytes(n:long, blocksize:int) : string
4307     Convert a long integer to a byte string.
4308
4309     If optional blocksize is given and greater than zero, pad the front of the
4310     byte string with binary zeros so that the length is a multiple of
4311     blocksize.
4312     """
4313     # after much testing, this algorithm was deemed to be the fastest
4314     s = b''
4315     n = int(n)
4316     while n > 0:
4317         s = struct.pack('>I', n & 0xffffffff) + s
4318         n = n >> 32
4319     # strip off leading zeros
4320     for i in range(len(s)):
4321         if s[i] != b'\000'[0]:
4322             break
4323     else:
4324         # only happens when n == 0
4325         s = b'\000'
4326         i = 0
4327     s = s[i:]
4328     # add back some pad bytes.  this could be done more efficiently w.r.t. the
4329     # de-padding being done above, but sigh...
4330     if blocksize > 0 and len(s) % blocksize:
4331         s = (blocksize - len(s) % blocksize) * b'\000' + s
4332     return s
4333
4334
4335 def bytes_to_long(s):
4336     """bytes_to_long(string) : long
4337     Convert a byte string to a long integer.
4338
4339     This is (essentially) the inverse of long_to_bytes().
4340     """
4341     acc = 0
4342     length = len(s)
4343     if length % 4:
4344         extra = (4 - length % 4)
4345         s = b'\000' * extra + s
4346         length = length + extra
4347     for i in range(0, length, 4):
4348         acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4349     return acc
4350
4351
4352 def ohdave_rsa_encrypt(data, exponent, modulus):
4353     """
4354     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4355
4356     Input:
4357         data: data to encrypt, bytes-like object
4358         exponent, modulus: parameter e and N of RSA algorithm, both integer
4359     Output: hex string of encrypted data
4360
4361     Limitation: supports one block encryption only
4362     """
4363
4364     payload = int(binascii.hexlify(data[::-1]), 16)
4365     encrypted = pow(payload, exponent, modulus)
4366     return f'{encrypted:x}'
4367
4368
4369 def pkcs1pad(data, length):
4370     """
4371     Padding input data with PKCS#1 scheme
4372
4373     @param {int[]} data        input data
4374     @param {int}   length      target length
4375     @returns {int[]}           padded data
4376     """
4377     if len(data) > length - 11:
4378         raise ValueError('Input data too long for PKCS#1 padding')
4379
4380     pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4381     return [0, 2, *pseudo_random, 0, *data]
4382
4383
4384 def _base_n_table(n, table):
4385     if not table and not n:
4386         raise ValueError('Either table or n must be specified')
4387     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4388
4389     if n and n != len(table):
4390         raise ValueError(f'base {n} exceeds table length {len(table)}')
4391     return table
4392
4393
4394 def encode_base_n(num, n=None, table=None):
4395     """Convert given int to a base-n string"""
4396     table = _base_n_table(n, table)
4397     if not num:
4398         return table[0]
4399
4400     result, base = '', len(table)
4401     while num:
4402         result = table[num % base] + result
4403         num = num // base
4404     return result
4405
4406
4407 def decode_base_n(string, n=None, table=None):
4408     """Convert given base-n string to int"""
4409     table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4410     result, base = 0, len(table)
4411     for char in string:
4412         result = result * base + table[char]
4413     return result
4414
4415
4416 def decode_packed_codes(code):
4417     mobj = re.search(PACKED_CODES_RE, code)
4418     obfuscated_code, base, count, symbols = mobj.groups()
4419     base = int(base)
4420     count = int(count)
4421     symbols = symbols.split('|')
4422     symbol_table = {}
4423
4424     while count:
4425         count -= 1
4426         base_n_count = encode_base_n(count, base)
4427         symbol_table[base_n_count] = symbols[count] or base_n_count
4428
4429     return re.sub(
4430         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4431         obfuscated_code)
4432
4433
4434 def caesar(s, alphabet, shift):
4435     if shift == 0:
4436         return s
4437     l = len(alphabet)
4438     return ''.join(
4439         alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4440         for c in s)
4441
4442
4443 def rot47(s):
4444     return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4445
4446
4447 def parse_m3u8_attributes(attrib):
4448     info = {}
4449     for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4450         if val.startswith('"'):
4451             val = val[1:-1]
4452         info[key] = val
4453     return info
4454
4455
4456 def urshift(val, n):
4457     return val >> n if val >= 0 else (val + 0x100000000) >> n
4458
4459
4460 def write_xattr(path, key, value):
4461     # Windows: Write xattrs to NTFS Alternate Data Streams:
4462     # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4463     if compat_os_name == 'nt':
4464         assert ':' not in key
4465         assert os.path.exists(path)
4466
4467         try:
4468             with open(f'{path}:{key}', 'wb') as f:
4469                 f.write(value)
4470         except OSError as e:
4471             raise XAttrMetadataError(e.errno, e.strerror)
4472         return
4473
4474     # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules
4475
4476     setxattr = None
4477     if callable(getattr(os, 'setxattr', None)):
4478         setxattr = os.setxattr
4479     elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4480         # Unicode arguments are not supported in pyxattr until version 0.5.0
4481         # See https://github.com/ytdl-org/youtube-dl/issues/5498
4482         if version_tuple(xattr.__version__) >= (0, 5, 0):
4483             setxattr = xattr.set
4484     elif xattr:
4485         setxattr = xattr.setxattr
4486
4487     if setxattr:
4488         try:
4489             setxattr(path, key, value)
4490         except OSError as e:
4491             raise XAttrMetadataError(e.errno, e.strerror)
4492         return
4493
4494     # UNIX Method 2. Use setfattr/xattr executables
4495     exe = ('setfattr' if check_executable('setfattr', ['--version'])
4496            else 'xattr' if check_executable('xattr', ['-h']) else None)
4497     if not exe:
4498         raise XAttrUnavailableError(
4499             'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
4500             + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4501
4502     value = value.decode()
4503     try:
4504         _, stderr, returncode = Popen.run(
4505             [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4506             text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4507     except OSError as e:
4508         raise XAttrMetadataError(e.errno, e.strerror)
4509     if returncode:
4510         raise XAttrMetadataError(returncode, stderr)
4511
4512
4513 def random_birthday(year_field, month_field, day_field):
4514     start_date = dt.date(1950, 1, 1)
4515     end_date = dt.date(1995, 12, 31)
4516     offset = random.randint(0, (end_date - start_date).days)
4517     random_date = start_date + dt.timedelta(offset)
4518     return {
4519         year_field: str(random_date.year),
4520         month_field: str(random_date.month),
4521         day_field: str(random_date.day),
4522     }
4523
4524
4525 def find_available_port(interface=''):
4526     try:
4527         with socket.socket() as sock:
4528             sock.bind((interface, 0))
4529             return sock.getsockname()[1]
4530     except OSError:
4531         return None
4532
4533
4534 # Templates for internet shortcut files, which are plain text files.
4535 DOT_URL_LINK_TEMPLATE = '''\
4536 [InternetShortcut]
4537 URL=%(url)s
4538 '''
4539
4540 DOT_WEBLOC_LINK_TEMPLATE = '''\
4541 <?xml version="1.0" encoding="UTF-8"?>
4542 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4543 <plist version="1.0">
4544 <dict>
4545 \t<key>URL</key>
4546 \t<string>%(url)s</string>
4547 </dict>
4548 </plist>
4549 '''
4550
4551 DOT_DESKTOP_LINK_TEMPLATE = '''\
4552 [Desktop Entry]
4553 Encoding=UTF-8
4554 Name=%(filename)s
4555 Type=Link
4556 URL=%(url)s
4557 Icon=text-html
4558 '''
4559
4560 LINK_TEMPLATES = {
4561     'url': DOT_URL_LINK_TEMPLATE,
4562     'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4563     'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4564 }
4565
4566
4567 def iri_to_uri(iri):
4568     """
4569     Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4570
4571     The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4572     """
4573
4574     iri_parts = urllib.parse.urlparse(iri)
4575
4576     if '[' in iri_parts.netloc:
4577         raise ValueError('IPv6 URIs are not, yet, supported.')
4578         # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4579
4580     # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4581
4582     net_location = ''
4583     if iri_parts.username:
4584         net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
4585         if iri_parts.password is not None:
4586             net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
4587         net_location += '@'
4588
4589     net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
4590     # The 'idna' encoding produces ASCII text.
4591     if iri_parts.port is not None and iri_parts.port != 80:
4592         net_location += ':' + str(iri_parts.port)
4593
4594     return urllib.parse.urlunparse(
4595         (iri_parts.scheme,
4596             net_location,
4597
4598             urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4599
4600             # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4601             urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4602
4603             # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4604             urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4605
4606             urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4607
4608     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4609
4610
4611 def to_high_limit_path(path):
4612     if sys.platform in ['win32', 'cygwin']:
4613         # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4614         return '\\\\?\\' + os.path.abspath(path)
4615
4616     return path
4617
4618
4619 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
4620     val = traversal.traverse_obj(obj, *variadic(field))
4621     if not val if ignore is NO_DEFAULT else val in variadic(ignore):
4622         return default
4623     return template % func(val)
4624
4625
4626 def clean_podcast_url(url):
4627     url = re.sub(r'''(?x)
4628         (?:
4629             (?:
4630                 chtbl\.com/track|
4631                 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4632                 play\.podtrac\.com|
4633                 chrt\.fm/track|
4634                 mgln\.ai/e
4635             )(?:/[^/.]+)?|
4636             (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4637             flex\.acast\.com|
4638             pd(?:
4639                 cn\.co| # https://podcorn.com/analytics-prefix/
4640                 st\.fm # https://podsights.com/docs/
4641             )/e|
4642             [0-9]\.gum\.fm|
4643             pscrb\.fm/rss/p
4644         )/''', '', url)
4645     return re.sub(r'^\w+://(\w+://)', r'\1', url)
4646
4647
4648 _HEX_TABLE = '0123456789abcdef'
4649
4650
4651 def random_uuidv4():
4652     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4653
4654
4655 def make_dir(path, to_screen=None):
4656     try:
4657         dn = os.path.dirname(path)
4658         if dn:
4659             os.makedirs(dn, exist_ok=True)
4660         return True
4661     except OSError as err:
4662         if callable(to_screen) is not None:
4663             to_screen(f'unable to create directory {err}')
4664         return False
4665
4666
4667 def get_executable_path():
4668     from ..update import _get_variant_and_executable_path
4669
4670     return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
4671
4672
4673 def get_user_config_dirs(package_name):
4674     # .config (e.g. ~/.config/package_name)
4675     xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
4676     yield os.path.join(xdg_config_home, package_name)
4677
4678     # appdata (%APPDATA%/package_name)
4679     appdata_dir = os.getenv('appdata')
4680     if appdata_dir:
4681         yield os.path.join(appdata_dir, package_name)
4682
4683     # home (~/.package_name)
4684     yield os.path.join(compat_expanduser('~'), f'.{package_name}')
4685
4686
4687 def get_system_config_dirs(package_name):
4688     # /etc/package_name
4689     yield os.path.join('/etc', package_name)
4690
4691
4692 def time_seconds(**kwargs):
4693     """
4694     Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
4695     """
4696     return time.time() + dt.timedelta(**kwargs).total_seconds()
4697
4698
4699 # create a JSON Web Signature (jws) with HS256 algorithm
4700 # the resulting format is in JWS Compact Serialization
4701 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4702 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
4703 def jwt_encode_hs256(payload_data, key, headers={}):
4704     header_data = {
4705         'alg': 'HS256',
4706         'typ': 'JWT',
4707     }
4708     if headers:
4709         header_data.update(headers)
4710     header_b64 = base64.b64encode(json.dumps(header_data).encode())
4711     payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
4712     h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
4713     signature_b64 = base64.b64encode(h.digest())
4714     return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4715
4716
4717 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
4718 def jwt_decode_hs256(jwt):
4719     header_b64, payload_b64, signature_b64 = jwt.split('.')
4720     # add trailing ='s that may have been stripped, superfluous ='s are ignored
4721     return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
4722
4723
4724 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4725
4726
4727 @functools.cache
4728 def supports_terminal_sequences(stream):
4729     if compat_os_name == 'nt':
4730         if not WINDOWS_VT_MODE:
4731             return False
4732     elif not os.getenv('TERM'):
4733         return False
4734     try:
4735         return stream.isatty()
4736     except BaseException:
4737         return False
4738
4739
4740 def windows_enable_vt_mode():
4741     """Ref: https://bugs.python.org/issue30075 """
4742     if get_windows_version() < (10, 0, 10586):
4743         return
4744
4745     import ctypes
4746     import ctypes.wintypes
4747     import msvcrt
4748
4749     ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
4750
4751     dll = ctypes.WinDLL('kernel32', use_last_error=False)
4752     handle = os.open('CONOUT$', os.O_RDWR)
4753     try:
4754         h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
4755         dw_original_mode = ctypes.wintypes.DWORD()
4756         success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
4757         if not success:
4758             raise Exception('GetConsoleMode failed')
4759
4760         success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
4761             dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
4762         if not success:
4763             raise Exception('SetConsoleMode failed')
4764     finally:
4765         os.close(handle)
4766
4767     global WINDOWS_VT_MODE
4768     WINDOWS_VT_MODE = True
4769     supports_terminal_sequences.cache_clear()
4770
4771
4772 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
4773
4774
4775 def remove_terminal_sequences(string):
4776     return _terminal_sequences_re.sub('', string)
4777
4778
4779 def number_of_digits(number):
4780     return len('%d' % number)
4781
4782
4783 def join_nonempty(*values, delim='-', from_dict=None):
4784     if from_dict is not None:
4785         values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
4786     return delim.join(map(str, filter(None, values)))
4787
4788
4789 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
4790     """
4791     Find the largest format dimensions in terms of video width and, for each thumbnail:
4792     * Modify the URL: Match the width with the provided regex and replace with the former width
4793     * Update dimensions
4794
4795     This function is useful with video services that scale the provided thumbnails on demand
4796     """
4797     _keys = ('width', 'height')
4798     max_dimensions = max(
4799         (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
4800         default=(0, 0))
4801     if not max_dimensions[0]:
4802         return thumbnails
4803     return [
4804         merge_dicts(
4805             {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
4806             dict(zip(_keys, max_dimensions)), thumbnail)
4807         for thumbnail in thumbnails
4808     ]
4809
4810
4811 def parse_http_range(range):
4812     """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
4813     if not range:
4814         return None, None, None
4815     crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
4816     if not crg:
4817         return None, None, None
4818     return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
4819
4820
4821 def read_stdin(what):
4822     if what:
4823         eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
4824         write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
4825     return sys.stdin
4826
4827
4828 def determine_file_encoding(data):
4829     """
4830     Detect the text encoding used
4831     @returns (encoding, bytes to skip)
4832     """
4833
4834     # BOM marks are given priority over declarations
4835     for bom, enc in BOMS:
4836         if data.startswith(bom):
4837             return enc, len(bom)
4838
4839     # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
4840     # We ignore the endianness to get a good enough match
4841     data = data.replace(b'\0', b'')
4842     mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
4843     return mobj.group(1).decode() if mobj else None, 0
4844
4845
4846 class Config:
4847     own_args = None
4848     parsed_args = None
4849     filename = None
4850     __initialized = False
4851
4852     def __init__(self, parser, label=None):
4853         self.parser, self.label = parser, label
4854         self._loaded_paths, self.configs = set(), []
4855
4856     def init(self, args=None, filename=None):
4857         assert not self.__initialized
4858         self.own_args, self.filename = args, filename
4859         return self.load_configs()
4860
4861     def load_configs(self):
4862         directory = ''
4863         if self.filename:
4864             location = os.path.realpath(self.filename)
4865             directory = os.path.dirname(location)
4866             if location in self._loaded_paths:
4867                 return False
4868             self._loaded_paths.add(location)
4869
4870         self.__initialized = True
4871         opts, _ = self.parser.parse_known_args(self.own_args)
4872         self.parsed_args = self.own_args
4873         for location in opts.config_locations or []:
4874             if location == '-':
4875                 if location in self._loaded_paths:
4876                     continue
4877                 self._loaded_paths.add(location)
4878                 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
4879                 continue
4880             location = os.path.join(directory, expand_path(location))
4881             if os.path.isdir(location):
4882                 location = os.path.join(location, 'yt-dlp.conf')
4883             if not os.path.exists(location):
4884                 self.parser.error(f'config location {location} does not exist')
4885             self.append_config(self.read_file(location), location)
4886         return True
4887
4888     def __str__(self):
4889         label = join_nonempty(
4890             self.label, 'config', f'"{self.filename}"' if self.filename else '',
4891             delim=' ')
4892         return join_nonempty(
4893             self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
4894             *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
4895             delim='\n')
4896
4897     @staticmethod
4898     def read_file(filename, default=[]):
4899         try:
4900             optionf = open(filename, 'rb')
4901         except OSError:
4902             return default  # silently skip if file is not present
4903         try:
4904             enc, skip = determine_file_encoding(optionf.read(512))
4905             optionf.seek(skip, io.SEEK_SET)
4906         except OSError:
4907             enc = None  # silently skip read errors
4908         try:
4909             # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
4910             contents = optionf.read().decode(enc or preferredencoding())
4911             res = shlex.split(contents, comments=True)
4912         except Exception as err:
4913             raise ValueError(f'Unable to parse "{filename}": {err}')
4914         finally:
4915             optionf.close()
4916         return res
4917
4918     @staticmethod
4919     def hide_login_info(opts):
4920         PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
4921         eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
4922
4923         def _scrub_eq(o):
4924             m = eqre.match(o)
4925             if m:
4926                 return m.group('key') + '=PRIVATE'
4927             else:
4928                 return o
4929
4930         opts = list(map(_scrub_eq, opts))
4931         for idx, opt in enumerate(opts):
4932             if opt in PRIVATE_OPTS and idx + 1 < len(opts):
4933                 opts[idx + 1] = 'PRIVATE'
4934         return opts
4935
4936     def append_config(self, *args, label=None):
4937         config = type(self)(self.parser, label)
4938         config._loaded_paths = self._loaded_paths
4939         if config.init(*args):
4940             self.configs.append(config)
4941
4942     @property
4943     def all_args(self):
4944         for config in reversed(self.configs):
4945             yield from config.all_args
4946         yield from self.parsed_args or []
4947
4948     def parse_known_args(self, **kwargs):
4949         return self.parser.parse_known_args(self.all_args, **kwargs)
4950
4951     def parse_args(self):
4952         return self.parser.parse_args(self.all_args)
4953
4954
4955 def merge_headers(*dicts):
4956     """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
4957     return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
4958
4959
4960 def cached_method(f):
4961     """Cache a method"""
4962     signature = inspect.signature(f)
4963
4964     @functools.wraps(f)
4965     def wrapper(self, *args, **kwargs):
4966         bound_args = signature.bind(self, *args, **kwargs)
4967         bound_args.apply_defaults()
4968         key = tuple(bound_args.arguments.values())[1:]
4969
4970         cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
4971         if key not in cache:
4972             cache[key] = f(self, *args, **kwargs)
4973         return cache[key]
4974     return wrapper
4975
4976
4977 class classproperty:
4978     """property access for class methods with optional caching"""
4979     def __new__(cls, func=None, *args, **kwargs):
4980         if not func:
4981             return functools.partial(cls, *args, **kwargs)
4982         return super().__new__(cls)
4983
4984     def __init__(self, func, *, cache=False):
4985         functools.update_wrapper(self, func)
4986         self.func = func
4987         self._cache = {} if cache else None
4988
4989     def __get__(self, _, cls):
4990         if self._cache is None:
4991             return self.func(cls)
4992         elif cls not in self._cache:
4993             self._cache[cls] = self.func(cls)
4994         return self._cache[cls]
4995
4996
4997 class function_with_repr:
4998     def __init__(self, func, repr_=None):
4999         functools.update_wrapper(self, func)
5000         self.func, self.__repr = func, repr_
5001
5002     def __call__(self, *args, **kwargs):
5003         return self.func(*args, **kwargs)
5004
5005     @classmethod
5006     def set_repr(cls, repr_):
5007         return functools.partial(cls, repr_=repr_)
5008
5009     def __repr__(self):
5010         if self.__repr:
5011             return self.__repr
5012         return f'{self.func.__module__}.{self.func.__qualname__}'
5013
5014
5015 class Namespace(types.SimpleNamespace):
5016     """Immutable namespace"""
5017
5018     def __iter__(self):
5019         return iter(self.__dict__.values())
5020
5021     @property
5022     def items_(self):
5023         return self.__dict__.items()
5024
5025
5026 MEDIA_EXTENSIONS = Namespace(
5027     common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5028     video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5029     common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5030     audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
5031     thumbnails=('jpg', 'png', 'webp'),
5032     storyboards=('mhtml', ),
5033     subtitles=('srt', 'vtt', 'ass', 'lrc'),
5034     manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5035 )
5036 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5037 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5038
5039 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5040
5041
5042 class _UnsafeExtensionError(Exception):
5043     """
5044     Mitigation exception for uncommon/malicious file extensions
5045     This should be caught in YoutubeDL.py alongside a warning
5046
5047     Ref: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j
5048     """
5049     ALLOWED_EXTENSIONS = frozenset([
5050         # internal
5051         'description',
5052         'json',
5053         'meta',
5054         'orig',
5055         'part',
5056         'temp',
5057         'uncut',
5058         'unknown_video',
5059         'ytdl',
5060
5061         # video
5062         *MEDIA_EXTENSIONS.video,
5063         'asx',
5064         'ismv',
5065         'm2t',
5066         'm2ts',
5067         'm2v',
5068         'm4s',
5069         'mng',
5070         'mp2v',
5071         'mp4v',
5072         'mpe',
5073         'mpeg',
5074         'mpeg1',
5075         'mpeg2',
5076         'mpeg4',
5077         'mxf',
5078         'ogm',
5079         'qt',
5080         'rm',
5081         'swf',
5082         'ts',
5083         'vob',
5084         'vp9',
5085
5086         # audio
5087         *MEDIA_EXTENSIONS.audio,
5088         '3ga',
5089         'ac3',
5090         'adts',
5091         'aif',
5092         'au',
5093         'dts',
5094         'isma',
5095         'it',
5096         'mid',
5097         'mod',
5098         'mpga',
5099         'mp1',
5100         'mp2',
5101         'mp4a',
5102         'mpa',
5103         'ra',
5104         'shn',
5105         'xm',
5106
5107         # image
5108         *MEDIA_EXTENSIONS.thumbnails,
5109         'avif',
5110         'bmp',
5111         'gif',
5112         'heic',
5113         'ico',
5114         'image',
5115         'jng',
5116         'jpeg',
5117         'jxl',
5118         'svg',
5119         'tif',
5120         'tiff',
5121         'wbmp',
5122
5123         # subtitle
5124         *MEDIA_EXTENSIONS.subtitles,
5125         'dfxp',
5126         'fs',
5127         'ismt',
5128         'json3',
5129         'sami',
5130         'scc',
5131         'srv1',
5132         'srv2',
5133         'srv3',
5134         'ssa',
5135         'tt',
5136         'ttml',
5137         'xml',
5138
5139         # others
5140         *MEDIA_EXTENSIONS.manifests,
5141         *MEDIA_EXTENSIONS.storyboards,
5142         'desktop',
5143         'ism',
5144         'm3u',
5145         'sbv',
5146         'url',
5147         'webloc',
5148     ])
5149
5150     def __init__(self, extension, /):
5151         super().__init__(f'unsafe file extension: {extension!r}')
5152         self.extension = extension
5153
5154     @classmethod
5155     def sanitize_extension(cls, extension, /, *, prepend=False):
5156         if extension is None:
5157             return None
5158
5159         if '/' in extension or '\\' in extension:
5160             raise cls(extension)
5161
5162         if not prepend:
5163             _, _, last = extension.rpartition('.')
5164             if last == 'bin':
5165                 extension = last = 'unknown_video'
5166             if last.lower() not in cls.ALLOWED_EXTENSIONS:
5167                 raise cls(extension)
5168
5169         return extension
5170
5171
5172 class RetryManager:
5173     """Usage:
5174         for retry in RetryManager(...):
5175             try:
5176                 ...
5177             except SomeException as err:
5178                 retry.error = err
5179                 continue
5180     """
5181     attempt, _error = 0, None
5182
5183     def __init__(self, _retries, _error_callback, **kwargs):
5184         self.retries = _retries or 0
5185         self.error_callback = functools.partial(_error_callback, **kwargs)
5186
5187     def _should_retry(self):
5188         return self._error is not NO_DEFAULT and self.attempt <= self.retries
5189
5190     @property
5191     def error(self):
5192         if self._error is NO_DEFAULT:
5193             return None
5194         return self._error
5195
5196     @error.setter
5197     def error(self, value):
5198         self._error = value
5199
5200     def __iter__(self):
5201         while self._should_retry():
5202             self.error = NO_DEFAULT
5203             self.attempt += 1
5204             yield self
5205             if self.error:
5206                 self.error_callback(self.error, self.attempt, self.retries)
5207
5208     @staticmethod
5209     def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5210         """Utility function for reporting retries"""
5211         if count > retries:
5212             if error:
5213                 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5214             raise e
5215
5216         if not count:
5217             return warn(e)
5218         elif isinstance(e, ExtractorError):
5219             e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5220         warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5221
5222         delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5223         if delay:
5224             info(f'Sleeping {delay:.2f} seconds ...')
5225             time.sleep(delay)
5226
5227
5228 def make_archive_id(ie, video_id):
5229     ie_key = ie if isinstance(ie, str) else ie.ie_key()
5230     return f'{ie_key.lower()} {video_id}'
5231
5232
5233 def truncate_string(s, left, right=0):
5234     assert left > 3 and right >= 0
5235     if s is None or len(s) <= left + right:
5236         return s
5237     return f'{s[:left - 3]}...{s[-right:] if right else ""}'
5238
5239
5240 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5241     assert 'all' in alias_dict, '"all" alias is required'
5242     requested = list(start or [])
5243     for val in options:
5244         discard = val.startswith('-')
5245         if discard:
5246             val = val[1:]
5247
5248         if val in alias_dict:
5249             val = alias_dict[val] if not discard else [
5250                 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5251             # NB: Do not allow regex in aliases for performance
5252             requested = orderedSet_from_options(val, alias_dict, start=requested)
5253             continue
5254
5255         current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5256                    else [val] if val in alias_dict['all'] else None)
5257         if current is None:
5258             raise ValueError(val)
5259
5260         if discard:
5261             for item in current:
5262                 while item in requested:
5263                     requested.remove(item)
5264         else:
5265             requested.extend(current)
5266
5267     return orderedSet(requested)
5268
5269
5270 # TODO: Rewrite
5271 class FormatSorter:
5272     regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5273
5274     default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5275                'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5276                'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
5277     ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5278                     'height', 'width', 'proto', 'vext', 'abr', 'aext',
5279                     'fps', 'fs_approx', 'source', 'id')
5280
5281     settings = {
5282         'vcodec': {'type': 'ordered', 'regex': True,
5283                    'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5284         'acodec': {'type': 'ordered', 'regex': True,
5285                    'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
5286         'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5287                 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5288         'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5289                   'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5290         'vext': {'type': 'ordered', 'field': 'video_ext',
5291                  'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5292                  'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
5293         'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5294                  'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5295                  'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
5296         'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5297         'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5298                        'field': ('vcodec', 'acodec'),
5299                        'function': lambda it: int(any(v != 'none' for v in it))},
5300         'ie_pref': {'priority': True, 'type': 'extractor'},
5301         'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5302         'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5303         'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5304         'quality': {'convert': 'float', 'default': -1},
5305         'filesize': {'convert': 'bytes'},
5306         'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5307         'id': {'convert': 'string', 'field': 'format_id'},
5308         'height': {'convert': 'float_none'},
5309         'width': {'convert': 'float_none'},
5310         'fps': {'convert': 'float_none'},
5311         'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5312         'tbr': {'convert': 'float_none'},
5313         'vbr': {'convert': 'float_none'},
5314         'abr': {'convert': 'float_none'},
5315         'asr': {'convert': 'float_none'},
5316         'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5317
5318         'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5319         'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
5320                'function': lambda it: next(filter(None, it), None)},
5321         'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
5322                  'function': lambda it: next(filter(None, it), None)},
5323         'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5324         'res': {'type': 'multiple', 'field': ('height', 'width'),
5325                 'function': lambda it: min(filter(None, it), default=0)},
5326
5327         # Actual field names
5328         'format_id': {'type': 'alias', 'field': 'id'},
5329         'preference': {'type': 'alias', 'field': 'ie_pref'},
5330         'language_preference': {'type': 'alias', 'field': 'lang'},
5331         'source_preference': {'type': 'alias', 'field': 'source'},
5332         'protocol': {'type': 'alias', 'field': 'proto'},
5333         'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5334         'audio_channels': {'type': 'alias', 'field': 'channels'},
5335
5336         # Deprecated
5337         'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5338         'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5339         'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5340         'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5341         'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5342         'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5343         'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5344         'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5345         'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5346         'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5347         'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5348         'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5349         'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5350         'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5351         'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5352         'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5353         'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5354         'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5355         'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5356         'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5357     }
5358
5359     def __init__(self, ydl, field_preference):
5360         self.ydl = ydl
5361         self._order = []
5362         self.evaluate_params(self.ydl.params, field_preference)
5363         if ydl.params.get('verbose'):
5364             self.print_verbose_info(self.ydl.write_debug)
5365
5366     def _get_field_setting(self, field, key):
5367         if field not in self.settings:
5368             if key in ('forced', 'priority'):
5369                 return False
5370             self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5371                                         'deprecated and may be removed in a future version')
5372             self.settings[field] = {}
5373         prop_obj = self.settings[field]
5374         if key not in prop_obj:
5375             type_ = prop_obj.get('type')
5376             if key == 'field':
5377                 default = 'preference' if type_ == 'extractor' else (field,) if type_ in ('combined', 'multiple') else field
5378             elif key == 'convert':
5379                 default = 'order' if type_ == 'ordered' else 'float_string' if field else 'ignore'
5380             else:
5381                 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key)
5382             prop_obj[key] = default
5383         return prop_obj[key]
5384
5385     def _resolve_field_value(self, field, value, convert_none=False):
5386         if value is None:
5387             if not convert_none:
5388                 return None
5389         else:
5390             value = value.lower()
5391         conversion = self._get_field_setting(field, 'convert')
5392         if conversion == 'ignore':
5393             return None
5394         if conversion == 'string':
5395             return value
5396         elif conversion == 'float_none':
5397             return float_or_none(value)
5398         elif conversion == 'bytes':
5399             return parse_bytes(value)
5400         elif conversion == 'order':
5401             order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5402             use_regex = self._get_field_setting(field, 'regex')
5403             list_length = len(order_list)
5404             empty_pos = order_list.index('') if '' in order_list else list_length + 1
5405             if use_regex and value is not None:
5406                 for i, regex in enumerate(order_list):
5407                     if regex and re.match(regex, value):
5408                         return list_length - i
5409                 return list_length - empty_pos  # not in list
5410             else:  # not regex or  value = None
5411                 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5412         else:
5413             if value.isnumeric():
5414                 return float(value)
5415             else:
5416                 self.settings[field]['convert'] = 'string'
5417                 return value
5418
5419     def evaluate_params(self, params, sort_extractor):
5420         self._use_free_order = params.get('prefer_free_formats', False)
5421         self._sort_user = params.get('format_sort', [])
5422         self._sort_extractor = sort_extractor
5423
5424         def add_item(field, reverse, closest, limit_text):
5425             field = field.lower()
5426             if field in self._order:
5427                 return
5428             self._order.append(field)
5429             limit = self._resolve_field_value(field, limit_text)
5430             data = {
5431                 'reverse': reverse,
5432                 'closest': False if limit is None else closest,
5433                 'limit_text': limit_text,
5434                 'limit': limit}
5435             if field in self.settings:
5436                 self.settings[field].update(data)
5437             else:
5438                 self.settings[field] = data
5439
5440         sort_list = (
5441             tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5442             + (tuple() if params.get('format_sort_force', False)
5443                 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5444             + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5445
5446         for item in sort_list:
5447             match = re.match(self.regex, item)
5448             if match is None:
5449                 raise ExtractorError(f'Invalid format sort string "{item}" given by extractor')
5450             field = match.group('field')
5451             if field is None:
5452                 continue
5453             if self._get_field_setting(field, 'type') == 'alias':
5454                 alias, field = field, self._get_field_setting(field, 'field')
5455                 if self._get_field_setting(alias, 'deprecated'):
5456                     self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5457                                                 f'be removed in a future version. Please use {field} instead')
5458             reverse = match.group('reverse') is not None
5459             closest = match.group('separator') == '~'
5460             limit_text = match.group('limit')
5461
5462             has_limit = limit_text is not None
5463             has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5464             has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5465
5466             fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5467             limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5468             limit_count = len(limits)
5469             for (i, f) in enumerate(fields):
5470                 add_item(f, reverse, closest,
5471                          limits[i] if i < limit_count
5472                          else limits[0] if has_limit and not has_multiple_limits
5473                          else None)
5474
5475     def print_verbose_info(self, write_debug):
5476         if self._sort_user:
5477             write_debug('Sort order given by user: {}'.format(', '.join(self._sort_user)))
5478         if self._sort_extractor:
5479             write_debug('Sort order given by extractor: {}'.format(', '.join(self._sort_extractor)))
5480         write_debug('Formats sorted by: {}'.format(', '.join(['{}{}{}'.format(
5481             '+' if self._get_field_setting(field, 'reverse') else '', field,
5482             '{}{}({})'.format('~' if self._get_field_setting(field, 'closest') else ':',
5483                               self._get_field_setting(field, 'limit_text'),
5484                               self._get_field_setting(field, 'limit'))
5485             if self._get_field_setting(field, 'limit_text') is not None else '')
5486             for field in self._order if self._get_field_setting(field, 'visible')])))
5487
5488     def _calculate_field_preference_from_value(self, format_, field, type_, value):
5489         reverse = self._get_field_setting(field, 'reverse')
5490         closest = self._get_field_setting(field, 'closest')
5491         limit = self._get_field_setting(field, 'limit')
5492
5493         if type_ == 'extractor':
5494             maximum = self._get_field_setting(field, 'max')
5495             if value is None or (maximum is not None and value >= maximum):
5496                 value = -1
5497         elif type_ == 'boolean':
5498             in_list = self._get_field_setting(field, 'in_list')
5499             not_in_list = self._get_field_setting(field, 'not_in_list')
5500             value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5501         elif type_ == 'ordered':
5502             value = self._resolve_field_value(field, value, True)
5503
5504         # try to convert to number
5505         val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5506         is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5507         if is_num:
5508             value = val_num
5509
5510         return ((-10, 0) if value is None
5511                 else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
5512                 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5513                 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5514                 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5515                 else (-1, value, 0))
5516
5517     def _calculate_field_preference(self, format_, field):
5518         type_ = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
5519         get_value = lambda f: format_.get(self._get_field_setting(f, 'field'))
5520         if type_ == 'multiple':
5521             type_ = 'field'  # Only 'field' is allowed in multiple for now
5522             actual_fields = self._get_field_setting(field, 'field')
5523
5524             value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5525         else:
5526             value = get_value(field)
5527         return self._calculate_field_preference_from_value(format_, field, type_, value)
5528
5529     def calculate_preference(self, format):
5530         # Determine missing protocol
5531         if not format.get('protocol'):
5532             format['protocol'] = determine_protocol(format)
5533
5534         # Determine missing ext
5535         if not format.get('ext') and 'url' in format:
5536             format['ext'] = determine_ext(format['url'])
5537         if format.get('vcodec') == 'none':
5538             format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5539             format['video_ext'] = 'none'
5540         else:
5541             format['video_ext'] = format['ext']
5542             format['audio_ext'] = 'none'
5543         # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
5544         #    format['preference'] = -1000
5545
5546         if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5547             # HEVC-over-FLV is out-of-spec by FLV's original spec
5548             # ref. https://trac.ffmpeg.org/ticket/6389
5549             # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5550             format['preference'] = -100
5551
5552         # Determine missing bitrates
5553         if format.get('vcodec') == 'none':
5554             format['vbr'] = 0
5555         if format.get('acodec') == 'none':
5556             format['abr'] = 0
5557         if not format.get('vbr') and format.get('vcodec') != 'none':
5558             format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5559         if not format.get('abr') and format.get('acodec') != 'none':
5560             format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5561         if not format.get('tbr'):
5562             format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
5563
5564         return tuple(self._calculate_field_preference(format, field) for field in self._order)
5565
5566
5567 def filesize_from_tbr(tbr, duration):
5568     """
5569     @param tbr:      Total bitrate in kbps (1000 bits/sec)
5570     @param duration: Duration in seconds
5571     @returns         Filesize in bytes
5572     """
5573     if tbr is None or duration is None:
5574         return None
5575     return int(duration * tbr * (1000 / 8))
5576
5577
5578 # XXX: Temporary
5579 class _YDLLogger:
5580     def __init__(self, ydl=None):
5581         self._ydl = ydl
5582
5583     def debug(self, message):
5584         if self._ydl:
5585             self._ydl.write_debug(message)
5586
5587     def info(self, message):
5588         if self._ydl:
5589             self._ydl.to_screen(message)
5590
5591     def warning(self, message, *, once=False):
5592         if self._ydl:
5593             self._ydl.report_warning(message, once)
5594
5595     def error(self, message, *, is_error=True):
5596         if self._ydl:
5597             self._ydl.report_error(message, is_error=is_error)
5598
5599     def stdout(self, message):
5600         if self._ydl:
5601             self._ydl.to_stdout(message)
5602
5603     def stderr(self, message):
5604         if self._ydl:
5605             self._ydl.to_stderr(message)