yt_dlp/extractor/common.py

   1 import base64
   2 import collections
   3 import functools
   4 import getpass
   5 import hashlib
   6 import http.client
   7 import http.cookiejar
   8 import http.cookies
   9 import inspect
  10 import itertools
  11 import json
  12 import math
  13 import netrc
  14 import os
  15 import random
  16 import re
  17 import subprocess
  18 import sys
  19 import time
  20 import types
  21 import urllib.parse
  22 import urllib.request
  23 import xml.etree.ElementTree
  24
  25 from ..compat import (
  26     compat_etree_fromstring,
  27     compat_expanduser,
  28     compat_os_name,
  29     urllib_req_to_req,
  30 )
  31 from ..cookies import LenientSimpleCookie
  32 from ..downloader.f4m import get_base_url, remove_encrypted_media
  33 from ..downloader.hls import HlsFD
  34 from ..networking import HEADRequest, Request
  35 from ..networking.exceptions import (
  36     HTTPError,
  37     IncompleteRead,
  38     TransportError,
  39     network_exceptions,
  40 )
  41 from ..networking.impersonate import ImpersonateTarget
  42 from ..utils import (
  43     IDENTITY,
  44     JSON_LD_RE,
  45     NO_DEFAULT,
  46     ExtractorError,
  47     FormatSorter,
  48     GeoRestrictedError,
  49     GeoUtils,
  50     LenientJSONDecoder,
  51     Popen,
  52     RegexNotFoundError,
  53     RetryManager,
  54     UnsupportedError,
  55     age_restricted,
  56     base_url,
  57     bug_reports_message,
  58     classproperty,
  59     clean_html,
  60     deprecation_warning,
  61     determine_ext,
  62     dict_get,
  63     encode_data_uri,
  64     extract_attributes,
  65     filter_dict,
  66     fix_xml_ampersands,
  67     float_or_none,
  68     format_field,
  69     int_or_none,
  70     join_nonempty,
  71     js_to_json,
  72     mimetype2ext,
  73     netrc_from_content,
  74     orderedSet,
  75     parse_bitrate,
  76     parse_codecs,
  77     parse_duration,
  78     parse_iso8601,
  79     parse_m3u8_attributes,
  80     parse_resolution,
  81     sanitize_filename,
  82     sanitize_url,
  83     smuggle_url,
  84     str_or_none,
  85     str_to_int,
  86     strip_or_none,
  87     traverse_obj,
  88     truncate_string,
  89     try_call,
  90     try_get,
  91     unescapeHTML,
  92     unified_strdate,
  93     unified_timestamp,
  94     url_basename,
  95     url_or_none,
  96     urlhandle_detect_ext,
  97     urljoin,
  98     variadic,
  99     xpath_element,
 100     xpath_text,
 101     xpath_with_ns,
 102 )
 103
 104
 105 class InfoExtractor:
 106     """Information Extractor class.
 107
 108     Information extractors are the classes that, given a URL, extract
 109     information about the video (or videos) the URL refers to. This
 110     information includes the real video URL, the video title, author and
 111     others. The information is stored in a dictionary which is then
 112     passed to the YoutubeDL. The YoutubeDL processes this
 113     information possibly downloading the video to the file system, among
 114     other possible outcomes.
 115
 116     The type field determines the type of the result.
 117     By far the most common value (and the default if _type is missing) is
 118     "video", which indicates a single video.
 119
 120     For a video, the dictionaries must include the following fields:
 121
 122     id:             Video identifier.
 123     title:          Video title, unescaped. Set to an empty string if video has
 124                     no title as opposed to "None" which signifies that the
 125                     extractor failed to obtain a title
 126
 127     Additionally, it must contain either a formats entry or a url one:
 128
 129     formats:        A list of dictionaries for each format available, ordered
 130                     from worst to best quality.
 131
 132                     Potential fields:
 133                     * url        The mandatory URL representing the media:
 134                                    for plain file media - HTTP URL of this file,
 135                                    for RTMP - RTMP URL,
 136                                    for HLS - URL of the M3U8 media playlist,
 137                                    for HDS - URL of the F4M manifest,
 138                                    for DASH
 139                                      - HTTP URL to plain file media (in case of
 140                                        unfragmented media)
 141                                      - URL of the MPD manifest or base URL
 142                                        representing the media if MPD manifest
 143                                        is parsed from a string (in case of
 144                                        fragmented media)
 145                                    for MSS - URL of the ISM manifest.
 146                     * request_data  Data to send in POST request to the URL
 147                     * manifest_url
 148                                  The URL of the manifest file in case of
 149                                  fragmented media:
 150                                    for HLS - URL of the M3U8 master playlist,
 151                                    for HDS - URL of the F4M manifest,
 152                                    for DASH - URL of the MPD manifest,
 153                                    for MSS - URL of the ISM manifest.
 154                     * manifest_stream_number  (For internal use only)
 155                                  The index of the stream in the manifest file
 156                     * ext        Will be calculated from URL if missing
 157                     * format     A human-readable description of the format
 158                                  ("mp4 container with h264/opus").
  159                                  Calculated from the format_id, width, height,
 160                                  and format_note fields if missing.
 161                     * format_id  A short description of the format
 162                                  ("mp4_h264_opus" or "19").
 163                                 Technically optional, but strongly recommended.
 164                     * format_note Additional info about the format
 165                                  ("3D" or "DASH video")
 166                     * width      Width of the video, if known
 167                     * height     Height of the video, if known
 168                     * aspect_ratio  Aspect ratio of the video, if known
 169                                  Automatically calculated from width and height
 170                     * resolution Textual description of width and height
 171                                  Automatically calculated from width and height
 172                     * dynamic_range The dynamic range of the video. One of:
  173                                  "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
 174                     * tbr        Average bitrate of audio and video in kbps (1000 bits/sec)
 175                     * abr        Average audio bitrate in kbps (1000 bits/sec)
 176                     * acodec     Name of the audio codec in use
 177                     * asr        Audio sampling rate in Hertz
 178                     * audio_channels  Number of audio channels
 179                     * vbr        Average video bitrate in kbps (1000 bits/sec)
 180                     * fps        Frame rate
 181                     * vcodec     Name of the video codec in use
 182                     * container  Name of the container format
 183                     * filesize   The number of bytes, if known in advance
 184                     * filesize_approx  An estimate for the number of bytes
 185                     * player_url SWF Player URL (used for rtmpdump).
 186                     * protocol   The protocol that will be used for the actual
 187                                  download, lower-case. One of "http", "https" or
 188                                  one of the protocols defined in downloader.PROTOCOL_MAP
 189                     * fragment_base_url
 190                                  Base URL for fragments. Each fragment's path
 191                                  value (if present) will be relative to
 192                                  this URL.
 193                     * fragments  A list of fragments of a fragmented media.
  194                                  Each fragment entry must contain either a url
  195                                  or a path. If a url is present it should be
 196                                  considered by a client. Otherwise both path and
 197                                  fragment_base_url must be present. Here is
 198                                  the list of all potential fields:
 199                                  * "url" - fragment's URL
 200                                  * "path" - fragment's path relative to
 201                                             fragment_base_url
 202                                  * "duration" (optional, int or float)
 203                                  * "filesize" (optional, int)
 204                     * is_from_start  Is a live format that can be downloaded
 205                                 from the start. Boolean
 206                     * preference Order number of this format. If this field is
 207                                  present and not None, the formats get sorted
 208                                  by this field, regardless of all other values.
 209                                  -1 for default (order by other properties),
 210                                  -2 or smaller for less than default.
 211                                  < -1000 to hide the format (if there is
 212                                     another one which is strictly better)
 213                     * language   Language code, e.g. "de" or "en-US".
 214                     * language_preference  Is this in the language mentioned in
 215                                  the URL?
 216                                  10 if it's what the URL is about,
 217                                  -1 for default (don't know),
 218                                  -10 otherwise, other values reserved for now.
 219                     * quality    Order number of the video quality of this
 220                                  format, irrespective of the file format.
 221                                  -1 for default (order by other properties),
 222                                  -2 or smaller for less than default.
 223                     * source_preference  Order number for this video source
 224                                   (quality takes higher priority)
 225                                  -1 for default (order by other properties),
 226                                  -2 or smaller for less than default.
 227                     * http_headers  A dictionary of additional HTTP headers
 228                                  to add to the request.
 229                     * stretched_ratio  If given and not 1, indicates that the
 230                                  video's pixels are not square.
 231                                  width : height ratio as float.
 232                     * no_resume  The server does not support resuming the
 233                                  (HTTP or RTMP) download. Boolean.
 234                     * has_drm    True if the format has DRM and cannot be downloaded.
 235                                  'maybe' if the format may have DRM and has to be tested before download.
 236                     * extra_param_to_segment_url  A query string to append to each
 237                                  fragment's URL, or to update each existing query string
 238                                  with. If it is an HLS stream with an AES-128 decryption key,
  239                                  the query parameters will be passed to the key URI as well,
 240                                  unless there is an `extra_param_to_key_url` given,
 241                                  or unless an external key URI is provided via `hls_aes`.
 242                                  Only applied by the native HLS/DASH downloaders.
 243                     * extra_param_to_key_url  A query string to append to the URL
 244                                  of the format's HLS AES-128 decryption key.
 245                                  Only applied by the native HLS downloader.
 246                     * hls_aes    A dictionary of HLS AES-128 decryption information
 247                                  used by the native HLS downloader to override the
 248                                  values in the media playlist when an '#EXT-X-KEY' tag
 249                                  is present in the playlist:
 250                                  * uri  The URI from which the key will be downloaded
 251                                  * key  The key (as hex) used to decrypt fragments.
 252                                         If `key` is given, any key URI will be ignored
 253                                  * iv   The IV (as hex) used to decrypt fragments
 254                     * downloader_options  A dictionary of downloader options
 255                                  (For internal use only)
 256                                  * http_chunk_size Chunk size for HTTP downloads
 257                                  * ffmpeg_args     Extra arguments for ffmpeg downloader (input)
 258                                  * ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
 259                     * is_dash_periods  Whether the format is a result of merging
 260                                  multiple DASH periods.
 261                     RTMP formats can also have the additional fields: page_url,
 262                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 263                     rtmp_protocol, rtmp_real_time
 264
 265     url:            Final video URL.
 266     ext:            Video filename extension.
 267     format:         The video format, defaults to ext (used for --get-format)
 268     player_url:     SWF Player URL (used for rtmpdump).
 269
 270     The following fields are optional:
 271
 272     direct:         True if a direct video file was given (must only be set by GenericIE)
 273     alt_title:      A secondary title of the video.
 274     display_id:     An alternative identifier for the video, not necessarily
 275                     unique, but available before title. Typically, id is
 276                     something like "4234987", title "Dancing naked mole rats",
 277                     and display_id "dancing-naked-mole-rats"
 278     thumbnails:     A list of dictionaries, with the following entries:
 279                         * "id" (optional, string) - Thumbnail format ID
 280                         * "url"
 281                         * "preference" (optional, int) - quality of the image
 282                         * "width" (optional, int)
 283                         * "height" (optional, int)
 284                         * "resolution" (optional, string "{width}x{height}",
 285                                         deprecated)
 286                         * "filesize" (optional, int)
 287                         * "http_headers" (dict) - HTTP headers for the request
 288     thumbnail:      Full URL to a video thumbnail image.
 289     description:    Full video description.
 290     uploader:       Full name of the video uploader.
 291     license:        License name the video is licensed under.
 292     creators:       List of creators of the video.
 293     timestamp:      UNIX timestamp of the moment the video was uploaded
 294     upload_date:    Video upload date in UTC (YYYYMMDD).
 295                     If not explicitly set, calculated from timestamp
 296     release_timestamp: UNIX timestamp of the moment the video was released.
 297                     If it is not clear whether to use timestamp or this, use the former
 298     release_date:   The date (YYYYMMDD) when the video was released in UTC.
 299                     If not explicitly set, calculated from release_timestamp
 300     release_year:   Year (YYYY) as integer when the video or album was released.
 301                     To be used if no exact release date is known.
 302                     If not explicitly set, calculated from release_date.
 303     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 304     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
 305                     If not explicitly set, calculated from modified_timestamp
 306     uploader_id:    Nickname or id of the video uploader.
 307     uploader_url:   Full URL to a personal webpage of the video uploader.
 308     channel:        Full name of the channel the video is uploaded on.
 309                     Note that channel fields may or may not repeat uploader
 310                     fields. This depends on a particular extractor.
 311     channel_id:     Id of the channel.
 312     channel_url:    Full URL to a channel webpage.
 313     channel_follower_count: Number of followers of the channel.
 314     channel_is_verified: Whether the channel is verified on the platform.
 315     location:       Physical location where the video was filmed.
 316     subtitles:      The available subtitles as a dictionary in the format
 317                     {tag: subformats}. "tag" is usually a language code, and
 318                     "subformats" is a list sorted from lower to higher
 319                     preference, each element is a dictionary with the "ext"
 320                     entry and one of:
 321                         * "data": The subtitles file contents
 322                         * "url": A URL pointing to the subtitles file
 323                     It can optionally also have:
 324                         * "name": Name or description of the subtitles
 325                         * "http_headers": A dictionary of additional HTTP headers
 326                                   to add to the request.
 327                     "ext" will be calculated from URL if missing
 328     automatic_captions: Like 'subtitles'; contains automatically generated
 329                     captions instead of normal subtitles
 330     duration:       Length of the video in seconds, as an integer or float.
 331     view_count:     How many users have watched the video on the platform.
 332     concurrent_view_count: How many users are currently watching the video on the platform.
 333     like_count:     Number of positive ratings of the video
 334     dislike_count:  Number of negative ratings of the video
 335     repost_count:   Number of reposts of the video
  336     average_rating: Average rating given by users, the scale used depends on the webpage
 337     comment_count:  Number of comments on the video
 338     comments:       A list of comments, each with one or more of the following
 339                     properties (all but one of text or html optional):
 340                         * "author" - human-readable name of the comment author
 341                         * "author_id" - user ID of the comment author
 342                         * "author_thumbnail" - The thumbnail of the comment author
 343                         * "author_url" - The url to the comment author's page
 344                         * "author_is_verified" - Whether the author is verified
 345                                                  on the platform
 346                         * "author_is_uploader" - Whether the comment is made by
 347                                                  the video uploader
 348                         * "id" - Comment ID
 349                         * "html" - Comment as HTML
 350                         * "text" - Plain text of the comment
 351                         * "timestamp" - UNIX timestamp of comment
 352                         * "parent" - ID of the comment this one is replying to.
 353                                      Set to "root" to indicate that this is a
 354                                      comment to the original video.
 355                         * "like_count" - Number of positive ratings of the comment
 356                         * "dislike_count" - Number of negative ratings of the comment
 357                         * "is_favorited" - Whether the comment is marked as
 358                                            favorite by the video uploader
 359                         * "is_pinned" - Whether the comment is pinned to
 360                                         the top of the comments
 361     age_limit:      Age restriction for the video, as an integer (years)
 362     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 363                     should allow to get the same result again. (It will be set
 364                     by YoutubeDL if it's missing)
 365     categories:     A list of categories that the video falls in, for example
 366                     ["Sports", "Berlin"]
 367     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 368     cast:           A list of the video cast
 369     is_live:        True, False, or None (=unknown). Whether this video is a
 370                     live stream that goes on instead of a fixed-length video.
 371     was_live:       True, False, or None (=unknown). Whether this video was
 372                     originally a live stream.
 373     live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
 374                     or 'post_live' (was live, but VOD is not yet processed)
 375                     If absent, automatically set from is_live, was_live
 376     start_time:     Time in seconds where the reproduction should start, as
 377                     specified in the URL.
 378     end_time:       Time in seconds where the reproduction should end, as
 379                     specified in the URL.
 380     chapters:       A list of dictionaries, with the following entries:
 381                         * "start_time" - The start time of the chapter in seconds
 382                         * "end_time" - The end time of the chapter in seconds
 383                         * "title" (optional, string)
 384     heatmap:        A list of dictionaries, with the following entries:
 385                         * "start_time" - The start time of the data point in seconds
 386                         * "end_time" - The end time of the data point in seconds
 387                         * "value" - The normalized value of the data point (float between 0 and 1)
 388     playable_in_embed: Whether this video is allowed to play in embedded
 389                     players on other sites. Can be True (=always allowed),
 390                     False (=never allowed), None (=unknown), or a string
 391                     specifying the criteria for embedability; e.g. 'whitelist'
 392     availability:   Under what condition the video is available. One of
 393                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 394                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 395                     to set it
 396     media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
 397     _old_archive_ids: A list of old archive ids needed for backward compatibility
 398     _format_sort_fields: A list of fields to use for sorting formats
 399     __post_extractor: A function to be called just before the metadata is
 400                     written to either disk, logger or console. The function
 401                     must return a dict which will be added to the info_dict.
  402                     This is useful for additional information that is
 403                     time-consuming to extract. Note that the fields thus
 404                     extracted will not be available to output template and
 405                     match_filter. So, only "comments" and "comment_count" are
 406                     currently allowed to be extracted via this method.
 407
 408     The following fields should only be used when the video belongs to some logical
 409     chapter or section:
 410
 411     chapter:        Name or title of the chapter the video belongs to.
 412     chapter_number: Number of the chapter the video belongs to, as an integer.
 413     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 414
 415     The following fields should only be used when the video is an episode of some
 416     series, programme or podcast:
 417
 418     series:         Title of the series or programme the video episode belongs to.
 419     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 420     season:         Title of the season the video episode belongs to.
 421     season_number:  Number of the season the video episode belongs to, as an integer.
 422     season_id:      Id of the season the video episode belongs to, as a unicode string.
 423     episode:        Title of the video episode. Unlike mandatory video title field,
 424                     this field should denote the exact title of the video episode
 425                     without any kind of decoration.
 426     episode_number: Number of the video episode within a season, as an integer.
 427     episode_id:     Id of the video episode, as a unicode string.
 428
 429     The following fields should only be used when the media is a track or a part of
 430     a music album:
 431
 432     track:          Title of the track.
 433     track_number:   Number of the track within an album or a disc, as an integer.
 434     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 435                     as a unicode string.
 436     artists:        List of artists of the track.
 437     composers:      List of composers of the piece.
 438     genres:         List of genres of the track.
 439     album:          Title of the album the track belongs to.
 440     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
  441     album_artists:  List of all artists that appeared on the album.
 442                     E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
 443                     Useful for splits and compilations.
 444     disc_number:    Number of the disc or other physical medium the track belongs to,
 445                     as an integer.
 446
 447     The following fields should only be set for clips that should be cut from the original video:
 448
 449     section_start:  Start time of the section in seconds
 450     section_end:    End time of the section in seconds
 451
 452     The following fields should only be set for storyboards:
 453     rows:           Number of rows in each storyboard fragment, as an integer
 454     columns:        Number of columns in each storyboard fragment, as an integer
 455
 456     The following fields are deprecated and should not be set by new code:
 457     composer:       Use "composers" instead.
 458                     Composer(s) of the piece, comma-separated.
 459     artist:         Use "artists" instead.
 460                     Artist(s) of the track, comma-separated.
 461     genre:          Use "genres" instead.
 462                     Genre(s) of the track, comma-separated.
 463     album_artist:   Use "album_artists" instead.
  464                     All artists that appeared on the album, comma-separated.
 465     creator:        Use "creators" instead.
 466                     The creator of the video.
 467
 468     Unless mentioned otherwise, the fields should be Unicode strings.
 469
 470     Unless mentioned otherwise, None is equivalent to absence of information.
 471
 472
 473     _type "playlist" indicates multiple videos.
 474     There must be a key "entries", which is a list, an iterable, or a PagedList
 475     object, each element of which is a valid dictionary by this specification.
 476
 477     Additionally, playlists can have "id", "title", and any other relevant
 478     attributes with the same semantics as videos (see above).
 479
 480     It can also have the following optional fields:
 481
 482     playlist_count: The total number of videos in a playlist. If not given,
 483                     YoutubeDL tries to calculate it from "entries"
 484
 485
 486     _type "multi_video" indicates that there are multiple videos that
  487     form a single show, for example, multiple acts of an opera or TV episode.
 488     It must have an entries key like a playlist and contain all the keys
 489     required for a video at the same time.
 490
 491
 492     _type "url" indicates that the video must be extracted from another
 493     location, possibly by a different extractor. Its only required key is:
 494     "url" - the next URL to extract.
 495     The key "ie_key" can be set to the class name (minus the trailing "IE",
 496     e.g. "Youtube") if the extractor class is known in advance.
 497     Additionally, the dictionary may have any properties of the resolved entity
 498     known in advance, for example "title" if the title of the referred video is
 499     known ahead of time.
 500
 501
 502     _type "url_transparent" entities have the same specification as "url", but
 503     indicate that the given additional information is more precise than the one
 504     associated with the resolved URL.
 505     This is useful when a site employs a video service that hosts the video and
 506     its technical metadata, but that video service does not embed a useful
 507     title, description etc.
 508
 509
 510     Subclasses of this should also be added to the list of extractors and
 511     should define _VALID_URL as a regexp or a Sequence of regexps, and
 512     re-define the _real_extract() and (optionally) _real_initialize() methods.
 513
 514     Subclasses may also override suitable() if necessary, but ensure the function
 515     signature is preserved and that this function imports everything it needs
 516     (except other extractors), so that lazy_extractors works correctly.
 517
 518     Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 519     the HTML of Generic webpages. It may also override _extract_embed_urls
 520     or _extract_from_webpage as necessary. While these are normally classmethods,
 521     _extract_from_webpage is allowed to be an instance method.
 522
 523     _extract_from_webpage may raise self.StopExtraction() to stop further
 524     processing of the webpage and obtain exclusive rights to it. This is useful
 525     when the extractor cannot reliably be matched using just the URL,
 526     e.g. invidious/peertube instances
 527
 528     Embed-only extractors can be defined by setting _VALID_URL = False.
 529
 530     To support username + password (or netrc) login, the extractor must define a
 531     _NETRC_MACHINE and re-define _perform_login(username, password) and
 532     (optionally) _initialize_pre_login() methods. The _perform_login method will
 533     be called between _initialize_pre_login and _real_initialize if credentials
 534     are passed by the user. In cases where it is necessary to have the login
 535     process as part of the extraction rather than initialization, _perform_login
 536     can be left undefined.
 537
 538     _GEO_BYPASS attribute may be set to False in order to disable
 539     geo restriction bypass mechanisms for a particular extractor.
 540     Though it won't disable explicit geo restriction bypass based on
 541     country code provided with geo_bypass_country.
 542
 543     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 544     countries for this extractor. One of these countries will be used by
 545     geo restriction bypass mechanism right away in order to bypass
 546     geo restriction, of course, if the mechanism is not disabled.
 547
 548     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 549     IP blocks in CIDR notation for this extractor. One of these IP blocks
 550     will be used by geo restriction bypass mechanism similarly
 551     to _GEO_COUNTRIES.
 552
 553     The _ENABLED attribute should be set to False for IEs that
 554     are disabled by default and must be explicitly enabled.
 555
 556     The _WORKING attribute should be set to False for broken IEs
 557     in order to warn the users and skip the tests.
 558     """
 559
    # Flipped to True by initialize() after the one-time setup has completed
    _ready = False
    # Downloader this extractor is bound to; assigned through set_downloader()
    _downloader = None
    # Fake IP sent as the X-Forwarded-For header once geo bypass has picked one
    _x_forwarded_for_ip = None
    # Set to False to disable the geo restriction bypass mechanisms of an extractor
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries
    _GEO_COUNTRIES = None
    # Optional list of presumably geo unrestricted IP blocks in CIDR notation
    _GEO_IP_BLOCKS = None
    # Set to False for broken extractors; exposed through working()
    _WORKING = True
    # Set to False for extractors that are disabled by default
    _ENABLED = True
    # A truthy value means username/password login is supported (see supports_login())
    _NETRC_MACHINE = None
    # NOTE(review): presumably a human-readable description of the extractor;
    # initialize() treats the value False specially - confirm against subclasses
    IE_DESC = None
    # NOTE(review): not used in this part of the file; presumably for search extractors
    SEARCH_KEY = None
    # Regexp or sequence of regexps matched against URLs; False marks an embed-only extractor
    _VALID_URL = None
    # Regexes searched for in the HTML of Generic webpages
    _EMBED_REGEX = []
 573
 574     def _login_hint(self, method=NO_DEFAULT, netrc=None):
 575         password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
 576         return {
 577             None: '',
 578             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 579             'password': f'Use {password_hint}',
 580             'cookies': (
 581                 'Use --cookies-from-browser or --cookies for the authentication. '
 582                 'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
 583         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 584
 585     def __init__(self, downloader=None):
 586         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 587         If a downloader is not passed during initialization,
 588         it must be set using "set_downloader()" before "extract()" is called"""
 589         self._ready = False
 590         self._x_forwarded_for_ip = None
 591         self._printed_messages = set()
 592         self.set_downloader(downloader)
 593
 594     @classmethod
 595     def _match_valid_url(cls, url):
 596         if cls._VALID_URL is False:
 597             return None
 598         # This does not use has/getattr intentionally - we want to know whether
 599         # we have cached the regexp for *this* class, whereas getattr would also
 600         # match the superclass
 601         if '_VALID_URL_RE' not in cls.__dict__:
 602             cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
 603         return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
 604
 605     @classmethod
 606     def suitable(cls, url):
 607         """Receives a URL and returns True if suitable for this IE."""
 608         # This function must import everything it needs (except other extractors),
 609         # so that lazy_extractors works correctly
 610         return cls._match_valid_url(url) is not None
 611
 612     @classmethod
 613     def _match_id(cls, url):
 614         return cls._match_valid_url(url).group('id')
 615
 616     @classmethod
 617     def get_temp_id(cls, url):
 618         try:
 619             return cls._match_id(url)
 620         except (IndexError, AttributeError):
 621             return None
 622
 623     @classmethod
 624     def working(cls):
 625         """Getter method for _WORKING."""
 626         return cls._WORKING
 627
 628     @classmethod
 629     def supports_login(cls):
 630         return bool(cls._NETRC_MACHINE)
 631
 632     def initialize(self):
 633         """Initializes an instance (authentication, etc)."""
 634         self._printed_messages = set()
 635         self._initialize_geo_bypass({
 636             'countries': self._GEO_COUNTRIES,
 637             'ip_blocks': self._GEO_IP_BLOCKS,
 638         })
 639         if not self._ready:
 640             self._initialize_pre_login()
 641             if self.supports_login():
 642                 username, password = self._get_login_info()
 643                 if username:
 644                     self._perform_login(username, password)
 645             elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
 646                 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
 647             self._real_initialize()
 648             self._ready = True
 649
 650     def _initialize_geo_bypass(self, geo_bypass_context):
 651         """
 652         Initialize geo restriction bypass mechanism.
 653
 654         This method is used to initialize geo bypass mechanism based on faking
 655         X-Forwarded-For HTTP header. A random country from provided country list
 656         is selected and a random IP belonging to this country is generated. This
 657         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 658         HTTP requests.
 659
 660         This method will be used for initial geo bypass mechanism initialization
 661         during the instance initialization with _GEO_COUNTRIES and
 662         _GEO_IP_BLOCKS.
 663
 664         You may also manually call it from extractor's code if geo bypass
 665         information is not available beforehand (e.g. obtained during
 666         extraction) or due to some other reason. In this case you should pass
 667         this information in geo bypass context passed as first argument. It may
 668         contain following fields:
 669
 670         countries:  List of geo unrestricted countries (similar
 671                     to _GEO_COUNTRIES)
 672         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 673                     (similar to _GEO_IP_BLOCKS)
 674
 675         """
 676         if not self._x_forwarded_for_ip:
 677
 678             # Geo bypass mechanism is explicitly disabled by user
 679             if not self.get_param('geo_bypass', True):
 680                 return
 681
 682             if not geo_bypass_context:
 683                 geo_bypass_context = {}
 684
 685             # Backward compatibility: previously _initialize_geo_bypass
 686             # expected a list of countries, some 3rd party code may still use
 687             # it this way
 688             if isinstance(geo_bypass_context, (list, tuple)):
 689                 geo_bypass_context = {
 690                     'countries': geo_bypass_context,
 691                 }
 692
 693             # The whole point of geo bypass mechanism is to fake IP
 694             # as X-Forwarded-For HTTP header based on some IP block or
 695             # country code.
 696
 697             # Path 1: bypassing based on IP block in CIDR notation
 698
 699             # Explicit IP block specified by user, use it right away
 700             # regardless of whether extractor is geo bypassable or not
 701             ip_block = self.get_param('geo_bypass_ip_block', None)
 702
 703             # Otherwise use random IP block from geo bypass context but only
 704             # if extractor is known as geo bypassable
 705             if not ip_block:
 706                 ip_blocks = geo_bypass_context.get('ip_blocks')
 707                 if self._GEO_BYPASS and ip_blocks:
 708                     ip_block = random.choice(ip_blocks)
 709
 710             if ip_block:
 711                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 712                 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
 713                 return
 714
 715             # Path 2: bypassing based on country code
 716
 717             # Explicit country code specified by user, use it right away
 718             # regardless of whether extractor is geo bypassable or not
 719             country = self.get_param('geo_bypass_country', None)
 720
 721             # Otherwise use random country code from geo bypass context but
 722             # only if extractor is known as geo bypassable
 723             if not country:
 724                 countries = geo_bypass_context.get('countries')
 725                 if self._GEO_BYPASS and countries:
 726                     country = random.choice(countries)
 727
 728             if country:
 729                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 730                 self._downloader.write_debug(
 731                     f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
 732
 733     def extract(self, url):
 734         """Extracts URL information and returns it in list of dicts."""
 735         try:
 736             for _ in range(2):
 737                 try:
 738                     self.initialize()
 739                     self.to_screen('Extracting URL: %s' % (
 740                         url if self.get_param('verbose') else truncate_string(url, 100, 20)))
 741                     ie_result = self._real_extract(url)
 742                     if ie_result is None:
 743                         return None
 744                     if self._x_forwarded_for_ip:
 745                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 746                     subtitles = ie_result.get('subtitles') or {}
 747                     if 'no-live-chat' in self.get_param('compat_opts'):
 748                         for lang in ('live_chat', 'comments', 'danmaku'):
 749                             subtitles.pop(lang, None)
 750                     return ie_result
 751                 except GeoRestrictedError as e:
 752                     if self.__maybe_fake_ip_and_retry(e.countries):
 753                         continue
 754                     raise
 755         except UnsupportedError:
 756             raise
 757         except ExtractorError as e:
 758             e.video_id = e.video_id or self.get_temp_id(url)
 759             e.ie = e.ie or self.IE_NAME
 760             e.traceback = e.traceback or sys.exc_info()[2]
 761             raise
 762         except IncompleteRead as e:
 763             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 764         except (KeyError, StopIteration) as e:
 765             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 766
 767     def __maybe_fake_ip_and_retry(self, countries):
 768         if (not self.get_param('geo_bypass_country', None)
 769                 and self._GEO_BYPASS
 770                 and self.get_param('geo_bypass', True)
 771                 and not self._x_forwarded_for_ip
 772                 and countries):
 773             country_code = random.choice(countries)
 774             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 775             if self._x_forwarded_for_ip:
 776                 self.report_warning(
 777                     'Video is geo restricted. Retrying extraction with fake IP '
 778                     f'{self._x_forwarded_for_ip} ({country_code.upper()}) as X-Forwarded-For.')
 779                 return True
 780         return False
 781
 782     def set_downloader(self, downloader):
 783         """Sets a YoutubeDL instance as the downloader for this IE."""
 784         self._downloader = downloader
 785
 786     @property
 787     def cache(self):
 788         return self._downloader.cache
 789
 790     @property
 791     def cookiejar(self):
 792         return self._downloader.cookiejar
 793
 794     def _initialize_pre_login(self):
 795         """ Initialization before login. Redefine in subclasses."""
 796         pass
 797
 798     def _perform_login(self, username, password):
 799         """ Login with username and password. Redefine in subclasses."""
 800         pass
 801
 802     def _real_initialize(self):
 803         """Real initialization process. Redefine in subclasses."""
 804         pass
 805
 806     def _real_extract(self, url):
 807         """Real extraction process. Redefine in subclasses."""
 808         raise NotImplementedError('This method must be implemented by subclasses')
 809
 810     @classmethod
 811     def ie_key(cls):
 812         """A string for getting the InfoExtractor with get_info_extractor"""
 813         return cls.__name__[:-2]
 814
 815     @classproperty
 816     def IE_NAME(cls):
 817         return cls.__name__[:-2]
 818
 819     @staticmethod
 820     def __can_accept_status_code(err, expected_status):
 821         assert isinstance(err, HTTPError)
 822         if expected_status is None:
 823             return False
 824         elif callable(expected_status):
 825             return expected_status(err.status) is True
 826         else:
 827             return err.status in variadic(expected_status)
 828
 829     def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None):
 830         if isinstance(url_or_request, urllib.request.Request):
 831             self._downloader.deprecation_warning(
 832                 'Passing a urllib.request.Request to _create_request() is deprecated. '
 833                 'Use yt_dlp.networking.common.Request instead.')
 834             url_or_request = urllib_req_to_req(url_or_request)
 835         elif not isinstance(url_or_request, Request):
 836             url_or_request = Request(url_or_request)
 837
 838         url_or_request.update(data=data, headers=headers, query=query, extensions=extensions)
 839         return url_or_request
 840
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None,
                         headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False):
        """
        Return the response handle.

        On a network error, the error's response is returned instead if its
        HTTP status is accepted by expected_status. Otherwise False is returned
        when errnote is False or when fatal is false (after a warning), and
        ExtractorError is raised when fatal is true.

        See _download_webpage_handle docstring for arguments specification.
        """
        # Honour the sleep_interval_requests parameter before every request
        # except the first one made through this downloader
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen(f'Sleeping {sleep_interval} seconds ...')
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        # note=None prints the default message, note=False prints nothing
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            # Work on a copy so that the caller's dict is not modified;
            # a header explicitly passed by the caller takes precedence
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        extensions = {}

        # True or '' request "any target", expressed as an empty ImpersonateTarget
        if impersonate in (True, ''):
            impersonate = ImpersonateTarget()
        requested_targets = [
            t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
            for t in variadic(impersonate)
        ] if impersonate else []

        # Use the first requested target that the downloader reports as available
        available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
        if available_target:
            extensions['impersonate'] = available_target
        elif requested_targets:
            message = 'The extractor is attempting impersonation, but '
            message += (
                'no impersonate target is available' if not str(impersonate)
                else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
            info_msg = ('see  https://github.com/yt-dlp/yt-dlp#impersonation  '
                        'for information on installing the required dependencies')
            if require_impersonation:
                raise ExtractorError(f'{message}; {info_msg}', expected=True)
            # Impersonation is optional here: warn (only once) and continue without it
            self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
        except network_exceptions as err:
            # A failed HTTP status that the caller declared as expected is
            # not an error: hand back the response attached to the exception
            if isinstance(err, HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    return err.response

            # errnote=False silences the error completely, even if fatal is true
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {err}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False
 914
 915     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
 916                                  encoding=None, data=None, headers={}, query={}, expected_status=None,
 917                                  impersonate=None, require_impersonation=False):
 918         """
 919         Return a tuple (page content as string, URL handle).
 920
 921         Arguments:
 922         url_or_request -- plain text URL as a string or
 923             a yt_dlp.networking.Request object
 924         video_id -- Video/playlist/item identifier (string)
 925
 926         Keyword arguments:
 927         note -- note printed before downloading (string)
 928         errnote -- note printed in case of an error (string)
 929         fatal -- flag denoting whether error should be considered fatal,
 930             i.e. whether it should cause ExtractionError to be raised,
 931             otherwise a warning will be reported and extraction continued
 932         encoding -- encoding for a page content decoding, guessed automatically
 933             when not explicitly specified
 934         data -- POST data (bytes)
 935         headers -- HTTP headers (dict)
 936         query -- URL query (dict)
 937         expected_status -- allows to accept failed HTTP requests (non 2xx
 938             status code) by explicitly specifying a set of accepted status
 939             codes. Can be any of the following entities:
 940                 - an integer type specifying an exact failed status code to
 941                   accept
 942                 - a list or a tuple of integer types specifying a list of
 943                   failed status codes to accept
 944                 - a callable accepting an actual failed status code and
 945                   returning True if it should be accepted
 946             Note that this argument does not affect success status codes (2xx)
 947             which are always accepted.
 948         impersonate -- the impersonate target. Can be any of the following entities:
 949                 - an instance of yt_dlp.networking.impersonate.ImpersonateTarget
 950                 - a string in the format of CLIENT[:OS]
 951                 - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
 952                 - a boolean value; True means any impersonate target is sufficient
 953         require_impersonation -- flag to toggle whether the request should raise an error
 954             if impersonation is not possible (bool, default: False)
 955         """
 956
 957         # Strip hashes from the URL (#1038)
 958         if isinstance(url_or_request, str):
 959             url_or_request = url_or_request.partition('#')[0]
 960
 961         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data,
 962                                      headers=headers, query=query, expected_status=expected_status,
 963                                      impersonate=impersonate, require_impersonation=require_impersonation)
 964         if urlh is False:
 965             assert not fatal
 966             return False
 967         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
 968                                              encoding=encoding, data=data)
 969         if content is False:
 970             assert not fatal
 971             return False
 972         return (content, urlh)
 973
 974     @staticmethod
 975     def _guess_encoding_from_content(content_type, webpage_bytes):
 976         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 977         if m:
 978             encoding = m.group(1)
 979         else:
 980             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 981                           webpage_bytes[:1024])
 982             if m:
 983                 encoding = m.group(1).decode('ascii')
 984             elif webpage_bytes.startswith(b'\xff\xfe'):
 985                 encoding = 'utf-16'
 986             else:
 987                 encoding = 'utf-8'
 988
 989         return encoding
 990
 991     def __check_blocked(self, content):
 992         first_block = content[:512]
 993         if ('<title>Access to this site is blocked</title>' in content
 994                 and 'Websense' in first_block):
 995             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 996             blocked_iframe = self._html_search_regex(
 997                 r'<iframe src="([^"]+)"', content,
 998                 'Websense information URL', default=None)
 999             if blocked_iframe:
1000                 msg += f' Visit {blocked_iframe} for more details'
1001             raise ExtractorError(msg, expected=True)
1002         if '<title>The URL you requested has been blocked</title>' in first_block:
1003             msg = (
1004                 'Access to this webpage has been blocked by Indian censorship. '
1005                 'Use a VPN or proxy server (with --proxy) to route around it.')
1006             block_msg = self._html_search_regex(
1007                 r'</h1><p>(.*?)</p>',
1008                 content, 'block message', default=None)
1009             if block_msg:
1010                 msg += ' (Message: "{}")'.format(block_msg.replace('\n', ' '))
1011             raise ExtractorError(msg, expected=True)
1012         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
1013                 and 'blocklist.rkn.gov.ru' in content):
1014             raise ExtractorError(
1015                 'Access to this webpage has been blocked by decision of the Russian government. '
1016                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
1017                 expected=True)
1018
1019     def _request_dump_filename(self, url, video_id, data=None):
1020         if data is not None:
1021             data = hashlib.md5(data).hexdigest()
1022         basen = join_nonempty(video_id, data, url, delim='_')
1023         trim_length = self.get_param('trim_file_name') or 240
1024         if len(basen) > trim_length:
1025             h = '___' + hashlib.md5(basen.encode()).hexdigest()
1026             basen = basen[:trim_length - len(h)] + h
1027         filename = sanitize_filename(f'{basen}.dump', restricted=True)
1028         # Working around MAX_PATH limitation on Windows (see
1029         # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
1030         if compat_os_name == 'nt':
1031             absfilepath = os.path.abspath(filename)
1032             if len(absfilepath) > 259:
1033                 filename = fR'\\?\{absfilepath}'
1034         return filename
1035
1036     def __decode_webpage(self, webpage_bytes, encoding, headers):
1037         if not encoding:
1038             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
1039         try:
1040             return webpage_bytes.decode(encoding, 'replace')
1041         except LookupError:
1042             return webpage_bytes.decode('utf-8', 'replace')
1043
1044     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
1045                               prefix=None, encoding=None, data=None):
1046         try:
1047             webpage_bytes = urlh.read()
1048         except TransportError as err:
1049             errmsg = f'{video_id}: Error reading response: {err.msg}'
1050             if fatal:
1051                 raise ExtractorError(errmsg, cause=err)
1052             self.report_warning(errmsg)
1053             return False
1054
1055         if prefix is not None:
1056             webpage_bytes = prefix + webpage_bytes
1057         if self.get_param('dump_intermediate_pages', False):
1058             self.to_screen('Dumping request to ' + urlh.url)
1059             dump = base64.b64encode(webpage_bytes).decode('ascii')
1060             self._downloader.to_screen(dump)
1061         if self.get_param('write_pages'):
1062             if isinstance(url_or_request, Request):
1063                 data = self._create_request(url_or_request, data).data
1064             filename = self._request_dump_filename(urlh.url, video_id, data)
1065             self.to_screen(f'Saving request to {filename}')
1066             with open(filename, 'wb') as outf:
1067                 outf.write(webpage_bytes)
1068
1069         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
1070         self.__check_blocked(content)
1071
1072         return content
1073
1074     def __print_error(self, errnote, fatal, video_id, err):
1075         if fatal:
1076             raise ExtractorError(f'{video_id}: {errnote}', cause=err)
1077         elif errnote:
1078             self.report_warning(f'{video_id}: {errnote}: {err}')
1079
1080     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
1081         if transform_source:
1082             xml_string = transform_source(xml_string)
1083         try:
1084             return compat_etree_fromstring(xml_string.encode())
1085         except xml.etree.ElementTree.ParseError as ve:
1086             self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
1087
1088     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
1089         try:
1090             return json.loads(
1091                 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
1092         except ValueError as ve:
1093             self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1094
1095     def _parse_socket_response_as_json(self, data, *args, **kwargs):
1096         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1097
    def __create_download_methods(name, parser, note, errnote, return_value):
        """Build the pair of methods (_download_<name>_handle, _download_<name>).

        name -- used to form the names of the two generated methods
        parser -- name of the method used to parse the downloaded content,
            or None to return the content unparsed
        note, errnote -- default values of the note and errnote parameters
            of the generated methods
        return_value -- description of the returned data, used in the
            generated docstrings
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            # Only errnote=False is forwarded to the parser; any other value
            # is left out, so that the parser uses its own default
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Becomes _download_<name>_handle; returns (parsed content, URL handle),
        # or False if the download failed
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
                            impersonate=None, require_impersonation=False):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status,
                impersonate=impersonate, require_impersonation=require_impersonation)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Becomes _download_<name>; returns only the parsed content,
        # or False if the download failed
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
                             impersonate=None, require_impersonation=False):
            # With the load_pages parameter set, first try to use a dump of
            # this request stored on disk instead of downloading
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    # The dump could not be read: warn and fall through to a real download
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
                'impersonate': impersonate,
                'require_impersonation': require_impersonation,
            }
            # Without a parser, transform_source is not passed on to the handle method
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            # (by the time this runs, download_handle.__name__ has been set to
            # '_download_<name>_handle' by impersonate() below)
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        # Give a generated function its public name, qualified name and docstring
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
1166
    # Generated download methods: each call yields a (..._handle, content-only) pair.
    # XML, parsed with _parse_xml
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    # JSON, parsed with _parse_json
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    # JSON embedded in a socket response, parsed with _parse_socket_response_as_json
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # Unparsed page content (no parser); only the content-returning method of
    # the pair is kept, as a private helper used by _download_webpage()
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1174
1175     def _download_webpage(
1176             self, url_or_request, video_id, note=None, errnote=None,
1177             fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1178         """
1179         Return the data of the page as a string.
1180
1181         Keyword arguments:
1182         tries -- number of tries
1183         timeout -- sleep interval between tries
1184
1185         See _download_webpage_handle docstring for other arguments specification.
1186         """
1187
1188         R''' # NB: These are unused; should they be deprecated?
1189         if tries != 1:
1190             self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1191         if timeout is NO_DEFAULT:
1192             timeout = 5
1193         else:
1194             self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1195         '''
1196
1197         try_count = 0
1198         while True:
1199             try:
1200                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1201             except IncompleteRead as e:
1202                 try_count += 1
1203                 if try_count >= tries:
1204                     raise e
1205                 self._sleep(timeout, video_id)
1206
1207     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1208         idstr = format_field(video_id, None, '%s: ')
1209         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1210         if only_once:
1211             if f'WARNING: {msg}' in self._printed_messages:
1212                 return
1213             self._printed_messages.add(f'WARNING: {msg}')
1214         self._downloader.report_warning(msg, *args, **kwargs)
1215
1216     def to_screen(self, msg, *args, **kwargs):
1217         """Print msg to screen, prefixing it with '[ie_name]'"""
1218         self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1219
1220     def write_debug(self, msg, *args, **kwargs):
1221         self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1222
1223     def get_param(self, name, default=None, *args, **kwargs):
1224         if self._downloader:
1225             return self._downloader.params.get(name, default, *args, **kwargs)
1226         return default
1227
1228     def report_drm(self, video_id, partial=NO_DEFAULT):
1229         if partial is not NO_DEFAULT:
1230             self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1231         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1232
1233     def report_extraction(self, id_or_name):
1234         """Report information extraction."""
1235         self.to_screen(f'{id_or_name}: Extracting information')
1236
1237     def report_download_webpage(self, video_id):
1238         """Report webpage download."""
1239         self.to_screen(f'{video_id}: Downloading webpage')
1240
1241     def report_age_confirmation(self):
1242         """Report attempt to confirm age."""
1243         self.to_screen('Confirming age')
1244
1245     def report_login(self):
1246         """Report attempt to log in."""
1247         self.to_screen('Logging in')
1248
1249     def raise_login_required(
1250             self, msg='This video is only available for registered users',
1251             metadata_available=False, method=NO_DEFAULT):
1252         if metadata_available and (
1253                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1254             self.report_warning(msg)
1255             return
1256         msg += format_field(self._login_hint(method), None, '. %s')
1257         raise ExtractorError(msg, expected=True)
1258
1259     def raise_geo_restricted(
1260             self, msg='This video is not available from your location due to geo restriction',
1261             countries=None, metadata_available=False):
1262         if metadata_available and (
1263                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1264             self.report_warning(msg)
1265         else:
1266             raise GeoRestrictedError(msg, countries=countries)
1267
1268     def raise_no_formats(self, msg, expected=False, video_id=None):
1269         if expected and (
1270                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1271             self.report_warning(msg, video_id)
1272         elif isinstance(msg, ExtractorError):
1273             raise msg
1274         else:
1275             raise ExtractorError(msg, expected=expected, video_id=video_id)
1276
1277     # Methods for following #608
1278     @staticmethod
1279     def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1280         """Returns a URL that points to a page that should be processed"""
1281         if ie is not None:
1282             kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1283         if video_id is not None:
1284             kwargs['id'] = video_id
1285         if video_title is not None:
1286             kwargs['title'] = video_title
1287         return {
1288             **kwargs,
1289             '_type': 'url_transparent' if url_transparent else 'url',
1290             'url': url,
1291         }
1292
1293     @classmethod
1294     def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1295                               getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1296         return cls.playlist_result(
1297             (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1298             playlist_id, playlist_title, **kwargs)
1299
1300     @staticmethod
1301     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1302         """Returns a playlist"""
1303         if playlist_id:
1304             kwargs['id'] = playlist_id
1305         if playlist_title:
1306             kwargs['title'] = playlist_title
1307         if playlist_description is not None:
1308             kwargs['description'] = playlist_description
1309         return {
1310             **kwargs,
1311             '_type': 'multi_video' if multi_video else 'playlist',
1312             'entries': entries,
1313         }
1314
1315     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1316         """
1317         Perform a regex search on the given string, using a single or a list of
1318         patterns returning the first matching group.
1319         In case of failure return a default value or raise a WARNING or a
1320         RegexNotFoundError, depending on fatal, specifying the field name.
1321         """
1322         if string is None:
1323             mobj = None
1324         elif isinstance(pattern, (str, re.Pattern)):
1325             mobj = re.search(pattern, string, flags)
1326         else:
1327             for p in pattern:
1328                 mobj = re.search(p, string, flags)
1329                 if mobj:
1330                     break
1331
1332         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1333
1334         if mobj:
1335             if group is None:
1336                 # return the first matching group
1337                 return next(g for g in mobj.groups() if g is not None)
1338             elif isinstance(group, (list, tuple)):
1339                 return tuple(mobj.group(g) for g in group)
1340             else:
1341                 return mobj.group(group)
1342         elif default is not NO_DEFAULT:
1343             return default
1344         elif fatal:
1345             raise RegexNotFoundError(f'Unable to extract {_name}')
1346         else:
1347             self.report_warning(f'unable to extract {_name}' + bug_reports_message())
1348             return None
1349
1350     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1351                      contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1352         """Searches string for the JSON object specified by start_pattern"""
1353         # NB: end_pattern is only used to reduce the size of the initial match
1354         if default is NO_DEFAULT:
1355             default, has_default = {}, False
1356         else:
1357             fatal, has_default = False, True
1358
1359         json_string = self._search_regex(
1360             rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1361             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1362         if not json_string:
1363             return default
1364
1365         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1366         try:
1367             return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1368         except ExtractorError as e:
1369             if fatal:
1370                 raise ExtractorError(
1371                     f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1372             elif not has_default:
1373                 self.report_warning(
1374                     f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1375         return default
1376
1377     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1378         """
1379         Like _search_regex, but strips HTML tags and unescapes entities.
1380         """
1381         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1382         if isinstance(res, tuple):
1383             return tuple(map(clean_html, res))
1384         return clean_html(res)
1385
1386     def _get_netrc_login_info(self, netrc_machine=None):
1387         netrc_machine = netrc_machine or self._NETRC_MACHINE
1388
1389         cmd = self.get_param('netrc_cmd')
1390         if cmd:
1391             cmd = cmd.replace('{}', netrc_machine)
1392             self.to_screen(f'Executing command: {cmd}')
1393             stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1394             if ret != 0:
1395                 raise OSError(f'Command returned error code {ret}')
1396             info = netrc_from_content(stdout).authenticators(netrc_machine)
1397
1398         elif self.get_param('usenetrc', False):
1399             netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1400             if os.path.isdir(netrc_file):
1401                 netrc_file = os.path.join(netrc_file, '.netrc')
1402             info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1403
1404         else:
1405             return None, None
1406         if not info:
1407             self.to_screen(f'No authenticators for {netrc_machine}')
1408             return None, None
1409
1410         self.write_debug(f'Using netrc for {netrc_machine} authentication')
1411         return info[0], info[2]
1412
1413     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1414         """
1415         Get the login info as (username, password)
1416         First look for the manually specified credentials using username_option
1417         and password_option as keys in params dictionary. If no such credentials
1418         are available try the netrc_cmd if it is defined or look in the
1419         netrc file using the netrc_machine or _NETRC_MACHINE value.
1420         If there's no info available, return (None, None)
1421         """
1422
1423         username = self.get_param(username_option)
1424         if username is not None:
1425             password = self.get_param(password_option)
1426         else:
1427             try:
1428                 username, password = self._get_netrc_login_info(netrc_machine)
1429             except (OSError, netrc.NetrcParseError) as err:
1430                 self.report_warning(f'Failed to parse .netrc: {err}')
1431                 return None, None
1432         return username, password
1433
1434     def _get_tfa_info(self, note='two-factor verification code'):
1435         """
1436         Get the two-factor authentication info
1437         TODO - asking the user will be required for sms/phone verify
1438         currently just uses the command line option
1439         If there's no info available, return None
1440         """
1441
1442         tfa = self.get_param('twofactor')
1443         if tfa is not None:
1444             return tfa
1445
1446         return getpass.getpass(f'Type {note} and press [Return]: ')
1447
1448     # Helper functions for extracting OpenGraph info
1449     @staticmethod
1450     def _og_regexes(prop):
1451         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1452         property_re = r'(?:name|property)=(?:\'og{sep}{prop}\'|"og{sep}{prop}"|\s*og{sep}{prop}\b)'.format(
1453             prop=re.escape(prop), sep='(?:&#x3A;|[:-])')
1454         template = r'<meta[^>]+?%s[^>]+?%s'
1455         return [
1456             template % (property_re, content_re),
1457             template % (content_re, property_re),
1458         ]
1459
1460     @staticmethod
1461     def _meta_regex(prop):
1462         return rf'''(?isx)<meta
1463                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?){re.escape(prop)}\1)
1464                     [^>]+?content=(["\'])(?P<content>.*?)\2'''
1465
1466     def _og_search_property(self, prop, html, name=None, **kargs):
1467         prop = variadic(prop)
1468         if name is None:
1469             name = f'OpenGraph {prop[0]}'
1470         og_regexes = []
1471         for p in prop:
1472             og_regexes.extend(self._og_regexes(p))
1473         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1474         if escaped is None:
1475             return None
1476         return unescapeHTML(escaped)
1477
1478     def _og_search_thumbnail(self, html, **kargs):
1479         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1480
1481     def _og_search_description(self, html, **kargs):
1482         return self._og_search_property('description', html, fatal=False, **kargs)
1483
1484     def _og_search_title(self, html, *, fatal=False, **kargs):
1485         return self._og_search_property('title', html, fatal=fatal, **kargs)
1486
1487     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1488         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1489         if secure:
1490             regexes = self._og_regexes('video:secure_url') + regexes
1491         return self._html_search_regex(regexes, html, name, **kargs)
1492
1493     def _og_search_url(self, html, **kargs):
1494         return self._og_search_property('url', html, **kargs)
1495
1496     def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1497         return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1498
1499     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1500         name = variadic(name)
1501         if display_name is None:
1502             display_name = name[0]
1503         return self._html_search_regex(
1504             [self._meta_regex(n) for n in name],
1505             html, display_name, fatal=fatal, group='content', **kwargs)
1506
1507     def _dc_search_uploader(self, html):
1508         return self._html_search_meta('dc.creator', html, 'uploader')
1509
1510     @staticmethod
1511     def _rta_search(html):
1512         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1513         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1514                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1515                      html):
1516             return 18
1517
1518         # And then there are the jokers who advertise that they use RTA, but actually don't.
1519         AGE_LIMIT_MARKERS = [
1520             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1521             r'>[^<]*you acknowledge you are at least (\d+) years old',
1522             r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1523         ]
1524
1525         age_limit = 0
1526         for marker in AGE_LIMIT_MARKERS:
1527             mobj = re.search(marker, html)
1528             if mobj:
1529                 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1530         return age_limit
1531
1532     def _media_rating_search(self, html):
1533         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1534         rating = self._html_search_meta('rating', html)
1535
1536         if not rating:
1537             return None
1538
1539         RATING_TABLE = {
1540             'safe for kids': 0,
1541             'general': 8,
1542             '14 years': 14,
1543             'mature': 17,
1544             'restricted': 19,
1545         }
1546         return RATING_TABLE.get(rating.lower())
1547
1548     def _family_friendly_search(self, html):
1549         # See http://schema.org/VideoObject
1550         family_friendly = self._html_search_meta(
1551             'isFamilyFriendly', html, default=None)
1552
1553         if not family_friendly:
1554             return None
1555
1556         RATING_TABLE = {
1557             '1': 0,
1558             'true': 0,
1559             '0': 18,
1560             'false': 18,
1561         }
1562         return RATING_TABLE.get(family_friendly.lower())
1563
1564     def _twitter_search_player(self, html):
1565         return self._html_search_meta('twitter:player', html,
1566                                       'twitter card player')
1567
1568     def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1569         """Yield all json ld objects in the html"""
1570         if default is not NO_DEFAULT:
1571             fatal = False
1572         for mobj in re.finditer(JSON_LD_RE, html):
1573             json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1574             for json_ld in variadic(json_ld_item):
1575                 if isinstance(json_ld, dict):
1576                     yield json_ld
1577
1578     def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1579         """Search for a video in any json ld in the html"""
1580         if default is not NO_DEFAULT:
1581             fatal = False
1582         info = self._json_ld(
1583             list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1584             video_id, fatal=fatal, expected_type=expected_type)
1585         if info:
1586             return info
1587         if default is not NO_DEFAULT:
1588             return default
1589         elif fatal:
1590             raise RegexNotFoundError('Unable to extract JSON-LD')
1591         else:
1592             self.report_warning(f'unable to extract JSON-LD {bug_reports_message()}')
1593             return {}
1594
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Extract video metadata from JSON-LD data

        @param json_ld          JSON-LD as a string (parsed with _parse_json) or as already
                                parsed data: a single object or a list of objects
        @param video_id         Used for parsing and reporting
        @param fatal            Passed to _parse_json when json_ld is a string
        @param expected_type    Only consider objects whose '@type' contains this value;
                                traversal of a list stops after the first such object
        @returns                The collected info passed through filter_dict,
                                or {} when json_ld is empty
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator: all the nested helpers below write into this dict
        info = {}

        # Last path component of the interaction type -> prefix of the '<kind>_count' field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            # '@type' may be a single value or several; true if any expected type is among them
            type_ = variadic(traverse_obj(e, '@type'))
            return any(x in type_ for x in expected_types)

        def extract_interaction_type(e):
            # 'interactionType' is either the type itself or an object carrying it in '@type'
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fill the '<kind>_count' fields of info from 'interactionStatistic',
            # which may be a single counter object or a list of them
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # The type may be given as a URL; only its last path component is looked up
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = f'{count_kind}_count'
                # The first count found for a kind wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Build info['chapters'] from the 'Clip' parts listed in 'hasPart'
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Walk (previous, current, next) triples; zip stops at the shortest input,
            # so the last chapter is not visited here and is completed after the loop
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}, *chapters], chapters, chapters[1:])):
                # A missing end is taken from the next start and a missing start from the previous end
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter ends with the video unless it has its own end time
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            # Copy the fields of a VideoObject/AudioObject into info,
            # overwriting what was stored under the same keys before
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            extract_interaction_statistic(e)
            # Must run after info['duration'] has been set above
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            # Visit every object of json_ld and collect its metadata into info
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                # Top-level objects are only considered when they declare a '@context'
                if at_top_level and '@context' not in e:
                    continue
                # An object holding nothing but a '@graph' is a container: descend into it
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name is used as the title only if none was found so far
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first entry of 'video' (or else of 'subjectOf') is considered
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # With an expected type, stop at the first matching object
                    if expected_type is None:
                        continue
                    else:
                        break
                # Objects of any other (or an episode/movie/article) type may embed a video object
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
1764
1765     def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
1766         if default == '{}':
1767             self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
1768             default = {}
1769         if default is not NO_DEFAULT:
1770             fatal = False
1771
1772         return self._search_json(
1773             r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
1774             video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
1775
1776     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1777         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1778         rectx = re.escape(context_name)
1779         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1780         js, arg_keys, arg_vals = self._search_regex(
1781             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1782             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1783             default=NO_DEFAULT if fatal else (None, None, None))
1784         if js is None:
1785             return {}
1786
1787         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1788             f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1789
1790         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1791         return traverse_obj(ret, traverse) or {}
1792
1793     @staticmethod
1794     def _hidden_inputs(html):
1795         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1796         hidden_inputs = {}
1797         for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
1798             attrs = extract_attributes(input_el)
1799             if not input_el:
1800                 continue
1801             if attrs.get('type') not in ('hidden', 'submit'):
1802                 continue
1803             name = attrs.get('name') or attrs.get('id')
1804             value = attrs.get('value')
1805             if name and value is not None:
1806                 hidden_inputs[name] = value
1807         return hidden_inputs
1808
1809     def _form_hidden_inputs(self, form_id, html):
1810         form = self._search_regex(
1811             rf'(?is)<form[^>]+?id=(["\']){form_id}\1[^>]*>(?P<form>.+?)</form>',
1812             html, f'{form_id} form', group='form')
1813         return self._hidden_inputs(form)
1814
1815     @classproperty(cache=True)
1816     def FormatSort(cls):
1817         class FormatSort(FormatSorter):
1818             def __init__(ie, *args, **kwargs):
1819                 super().__init__(ie._downloader, *args, **kwargs)
1820
1821         deprecation_warning(
1822             'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1823             'Use yt_dlp.utils.FormatSorter instead')
1824         return FormatSort
1825
1826     def _sort_formats(self, formats, field_preference=[]):
1827         if not field_preference:
1828             self._downloader.deprecation_warning(
1829                 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1830             return
1831         self._downloader.deprecation_warning(
1832             'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1833             'Return _format_sort_fields in the info_dict instead')
1834         if formats:
1835             formats[0]['__sort_fields'] = field_preference
1836
1837     def _check_formats(self, formats, video_id):
1838         if formats:
1839             formats[:] = filter(
1840                 lambda f: self._is_valid_url(
1841                     f['url'], video_id,
1842                     item='{} video format'.format(f.get('format_id')) if f.get('format_id') else 'video'),
1843                 formats)
1844
1845     @staticmethod
1846     def _remove_duplicate_formats(formats):
1847         format_urls = set()
1848         unique_formats = []
1849         for f in formats:
1850             if f['url'] not in format_urls:
1851                 format_urls.add(f['url'])
1852                 unique_formats.append(f)
1853         formats[:] = unique_formats
1854
1855     def _is_valid_url(self, url, video_id, item='video', headers={}):
1856         url = self._proto_relative_url(url, scheme='http:')
1857         # For now assume non HTTP(S) URLs always valid
1858         if not url.startswith(('http://', 'https://')):
1859             return True
1860         try:
1861             self._request_webpage(url, video_id, f'Checking {item} URL', headers=headers)
1862             return True
1863         except ExtractorError as e:
1864             self.to_screen(
1865                 f'{video_id}: {item} URL is invalid, skipping: {e.cause!s}')
1866             return False
1867
1868     def http_scheme(self):
1869         """ Either "http:" or "https:", depending on the user's preferences """
1870         return (
1871             'http:'
1872             if self.get_param('prefer_insecure', False)
1873             else 'https:')
1874
1875     def _proto_relative_url(self, url, scheme=None):
1876         scheme = scheme or self.http_scheme()
1877         assert scheme.endswith(':')
1878         return sanitize_url(url, scheme=scheme[:-1])
1879
1880     def _sleep(self, timeout, video_id, msg_template=None):
1881         if msg_template is None:
1882             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1883         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1884         self.to_screen(msg)
1885         time.sleep(timeout)
1886
1887     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1888                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1889                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1890         if self.get_param('ignore_no_formats_error'):
1891             fatal = False
1892
1893         res = self._download_xml_handle(
1894             manifest_url, video_id, 'Downloading f4m manifest',
1895             'Unable to download f4m manifest',
1896             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1897             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1898             transform_source=transform_source,
1899             fatal=fatal, data=data, headers=headers, query=query)
1900         if res is False:
1901             return []
1902
1903         manifest, urlh = res
1904         manifest_url = urlh.url
1905
1906         return self._parse_f4m_formats(
1907             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1908             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1909
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """
        Build the list of formats described by an already parsed f4m manifest

        manifest is the root element of the manifest and manifest_url the URL it
        was obtained from (used to resolve relative media URLs when the manifest
        declares no base URL). preference and quality are stored on every format
        generated here, f4m_id prefixes the generated format ids, and
        video_id, transform_source, fatal and m3u8_id are passed on when a media
        entry points to another f4m or to an m3u8 manifest.

        Returns a list of format dicts. The list is empty if
          * manifest is not an XML element and fatal is false,
          * the manifest carries a non-empty Akamai player verification challenge, or
          * no media entries remain after removing the encrypted ones
        """
        if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        # NOTE(review): assumes the pv-2.0 element has text; `.text` being None would raise here
        if akamai_pv is not None and ';' in akamai_pv.text:
            player_verification_challenge = akamai_pv.text.split(';')[0]
            if player_verification_challenge.strip() != '':
                return []

        formats = []
        # Media entries are looked up in the 1.0 namespace first, then in the 2.0 one
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # An audio/* mime type on the manifest marks all formats as audio-only
        vcodec = None
        # NOTE(review): the 'base URL' label below is passed for the mimeType lookup;
        # it looks like a leftover name - confirm before relying on it in messages
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # Falls back to the index of the media entry when there is no (non-zero) bitrate
            format_id = join_nonempty(f4m_id, tbr or i)
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                # NOTE(review): this rebinds the `manifest_url` parameter, so when the
                # manifest has no base URL, the next relative media URL is resolved
                # against the parent path of this media URL rather than of the manifest
                manifest_url = (
                    media_url if media_url.startswith(('http://', 'https://'))
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            # Reached for stream-level manifests (bootstrap info present) and for
            # media URLs that are neither f4m nor m3u8 manifests
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
                'quality': quality,
            })
        return formats
2011
2012     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2013         return {
2014             'format_id': join_nonempty(m3u8_id, 'meta'),
2015             'url': m3u8_url,
2016             'ext': ext,
2017             'protocol': 'm3u8',
2018             'preference': preference - 100 if preference else -100,
2019             'quality': quality,
2020             'resolution': 'multiple',
2021             'format_note': 'Quality selection URL',
2022         }
2023
2024     def _report_ignoring_subs(self, name):
2025         self.report_warning(bug_reports_message(
2026             f'Ignoring subtitle tracks found in the {name} manifest; '
2027             'if any subtitle tracks are missing,',
2028         ), only_once=True)
2029
2030     def _extract_m3u8_formats(self, *args, **kwargs):
2031         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2032         if subs:
2033             self._report_ignoring_subs('HLS')
2034         return fmts
2035
2036     def _extract_m3u8_formats_and_subtitles(
2037             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2038             preference=None, quality=None, m3u8_id=None, note=None,
2039             errnote=None, fatal=True, live=False, data=None, headers={},
2040             query={}):
2041
2042         if self.get_param('ignore_no_formats_error'):
2043             fatal = False
2044
2045         if not m3u8_url:
2046             if errnote is not False:
2047                 errnote = errnote or 'Failed to obtain m3u8 URL'
2048                 if fatal:
2049                     raise ExtractorError(errnote, video_id=video_id)
2050                 self.report_warning(f'{errnote}{bug_reports_message()}')
2051             return [], {}
2052
2053         res = self._download_webpage_handle(
2054             m3u8_url, video_id,
2055             note='Downloading m3u8 information' if note is None else note,
2056             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2057             fatal=fatal, data=data, headers=headers, query=query)
2058
2059         if res is False:
2060             return [], {}
2061
2062         m3u8_doc, urlh = res
2063         m3u8_url = urlh.url
2064
2065         return self._parse_m3u8_formats_and_subtitles(
2066             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2067             preference=preference, quality=quality, m3u8_id=m3u8_id,
2068             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2069             headers=headers, query=query, video_id=video_id)
2070
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """
        Parse the text of an HLS playlist into formats and subtitles

        A media playlist (a document containing #EXT-X-TARGETDURATION) is
        returned as the format(s) of the playlist itself, with no subtitles.
        Anything else is treated as a master playlist: the #EXT-X-MEDIA
        renditions of type VIDEO/AUDIO that have a URI, and every URI line
        (described by the preceding #EXT-X-STREAM-INF tag, if any), become
        formats; SUBTITLES renditions that have a URI become subtitles.

        If the 'hls_split_discontinuity' param is set, one format is generated
        per part delimited by #EXT-X-DISCONTINUITY (format_index 0, 1, ...),
        which requires downloading every referenced playlist using video_id,
        fatal, data and headers. Otherwise format_index is None.

        m3u8_url is used to resolve relative URLs and is stored as manifest_url;
        ext, entry_protocol, preference and quality are stored on the formats,
        m3u8_id prefixes the format ids, and live disables the naming of formats
        after their stream name/bitrate.
        note, errnote and query are not referenced in the body of this method.

        Returns a tuple (formats, subtitles), where subtitles maps a language
        ('und' if the rendition declares none) to a list of subtitle dicts.
        """
        formats, subtitles = [], {}
        has_drm = HlsFD._has_drm(m3u8_doc)

        def format_url(url):
            # Absolute http(s) URLs are kept as they are; anything else is resolved against m3u8_url
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        # Returns the format_index values to generate for a playlist, which is
        # given either by its URL (downloaded on demand) or by its text
        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                # Without a URL, the playlist text itself is embedded as a data URI
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode(), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # Rendition groups by GROUP-ID, filled by extract_media
        groups = {}
        # Attributes of the most recent #EXT-X-STREAM-INF tag not yet consumed by a URI line
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Records the rendition in `groups`, and appends to `subtitles`/`formats` as applicable
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'has_drm': has_drm,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-empty line is taken to be the URI of a variant stream
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                        'has_drm': has_drm,
                    }

                    # YouTube-specific
                    if yt_audio_content_id := last_stream_inf.get('YT-EXT-AUDIO-CONTENT-ID'):
                        f['language'] = yt_audio_content_id.split('.')[0]

                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    # (audio and video bitrates are read from the URL itself)
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    # (an additional plain http format is generated from PROGRESSIVE-URI)
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                # The tag's attributes apply only to the URI line that was just handled
                last_stream_inf = {}
        return formats, subtitles
2294
2295     def _extract_m3u8_vod_duration(
2296             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2297
2298         m3u8_vod = self._download_webpage(
2299             m3u8_vod_url, video_id,
2300             note='Downloading m3u8 VOD manifest' if note is None else note,
2301             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2302             fatal=False, data=data, headers=headers, query=query)
2303
2304         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2305
2306     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2307         if '#EXT-X-ENDLIST' not in m3u8_vod:
2308             return None
2309
2310         return int(sum(
2311             float(line[len('#EXTINF:'):].split(',')[0])
2312             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2313
2314     def _extract_mpd_vod_duration(
2315             self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2316
2317         mpd_doc = self._download_xml(
2318             mpd_url, video_id,
2319             note='Downloading MPD VOD manifest' if note is None else note,
2320             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2321             fatal=False, data=data, headers=headers, query=query)
2322         if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2323             return None
2324         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2325
2326     @staticmethod
2327     def _xpath_ns(path, namespace=None):
2328         if not namespace:
2329             return path
2330         out = []
2331         for c in path.split('/'):
2332             if not c or c == '.':
2333                 out.append(c)
2334             else:
2335                 out.append(f'{{{namespace}}}{c}')
2336         return '/'.join(out)
2337
2338     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2339         if self.get_param('ignore_no_formats_error'):
2340             fatal = False
2341
2342         res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2343         if res is False:
2344             assert not fatal
2345             return [], {}
2346         smil, urlh = res
2347
2348         return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2349                                                       namespace=self._parse_smil_namespace(smil))
2350
2351     def _extract_smil_formats(self, *args, **kwargs):
2352         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2353         if subs:
2354             self._report_ignoring_subs('SMIL')
2355         return fmts
2356
2357     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2358         res = self._download_smil(smil_url, video_id, fatal=fatal)
2359         if res is False:
2360             return {}
2361
2362         smil, urlh = res
2363         smil_url = urlh.url
2364
2365         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2366
2367     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2368         return self._download_xml_handle(
2369             smil_url, video_id, 'Downloading SMIL file',
2370             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2371
2372     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2373         namespace = self._parse_smil_namespace(smil)
2374
2375         formats, subtitles = self._parse_smil_formats_and_subtitles(
2376             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2377
2378         video_id = os.path.splitext(url_basename(smil_url))[0]
2379         title = None
2380         description = None
2381         upload_date = None
2382         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2383             name = meta.attrib.get('name')
2384             content = meta.attrib.get('content')
2385             if not name or not content:
2386                 continue
2387             if not title and name == 'title':
2388                 title = content
2389             elif not description and name in ('description', 'abstract'):
2390                 description = content
2391             elif not upload_date and name == 'date':
2392                 upload_date = unified_strdate(content)
2393
2394         thumbnails = [{
2395             'id': image.get('type'),
2396             'url': image.get('src'),
2397             'width': int_or_none(image.get('width')),
2398             'height': int_or_none(image.get('height')),
2399         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2400
2401         return {
2402             'id': video_id,
2403             'title': title or video_id,
2404             'description': description,
2405             'upload_date': upload_date,
2406             'thumbnails': thumbnails,
2407             'formats': formats,
2408             'subtitles': subtitles,
2409         }
2410
2411     def _parse_smil_namespace(self, smil):
2412         return self._search_regex(
2413             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2414
2415     def _parse_smil_formats(self, *args, **kwargs):
2416         fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2417         if subs:
2418             self._report_ignoring_subs('SMIL')
2419         return fmts
2420
2421     def _parse_smil_formats_and_subtitles(
2422             self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2423         base = smil_url
2424         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2425             b = meta.get('base') or meta.get('httpBase')
2426             if b:
2427                 base = b
2428                 break
2429
2430         formats, subtitles = [], {}
2431         rtmp_count = 0
2432         http_count = 0
2433         m3u8_count = 0
2434         imgs_count = 0
2435
2436         srcs = set()
2437         media = itertools.chain.from_iterable(
2438             smil.findall(self._xpath_ns(arg, namespace))
2439             for arg in ['.//video', './/audio', './/media'])
2440         for medium in media:
2441             src = medium.get('src')
2442             if not src or src in srcs:
2443                 continue
2444             srcs.add(src)
2445
2446             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2447             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2448             width = int_or_none(medium.get('width'))
2449             height = int_or_none(medium.get('height'))
2450             proto = medium.get('proto')
2451             ext = medium.get('ext')
2452             src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2453                 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2454             streamer = medium.get('streamer') or base
2455
2456             if proto == 'rtmp' or streamer.startswith('rtmp'):
2457                 rtmp_count += 1
2458                 formats.append({
2459                     'url': streamer,
2460                     'play_path': src,
2461                     'ext': 'flv',
2462                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2463                     'tbr': bitrate,
2464                     'filesize': filesize,
2465                     'width': width,
2466                     'height': height,
2467                 })
2468                 if transform_rtmp_url:
2469                     streamer, src = transform_rtmp_url(streamer, src)
2470                     formats[-1].update({
2471                         'url': streamer,
2472                         'play_path': src,
2473                     })
2474                 continue
2475
2476             src_url = src if src.startswith('http') else urllib.parse.urljoin(f'{base}/', src)
2477             src_url = src_url.strip()
2478
2479             if proto == 'm3u8' or src_ext == 'm3u8':
2480                 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2481                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2482                 self._merge_subtitles(m3u8_subs, target=subtitles)
2483                 if len(m3u8_formats) == 1:
2484                     m3u8_count += 1
2485                     m3u8_formats[0].update({
2486                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2487                         'tbr': bitrate,
2488                         'width': width,
2489                         'height': height,
2490                     })
2491                 formats.extend(m3u8_formats)
2492             elif src_ext == 'f4m':
2493                 f4m_url = src_url
2494                 if not f4m_params:
2495                     f4m_params = {
2496                         'hdcore': '3.2.0',
2497                         'plugin': 'flowplayer-3.2.0.1',
2498                     }
2499                 f4m_url += '&' if '?' in f4m_url else '?'
2500                 f4m_url += urllib.parse.urlencode(f4m_params)
2501                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2502             elif src_ext == 'mpd':
2503                 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2504                     src_url, video_id, mpd_id='dash', fatal=False)
2505                 formats.extend(mpd_formats)
2506                 self._merge_subtitles(mpd_subs, target=subtitles)
2507             elif re.search(r'\.ism/[Mm]anifest', src_url):
2508                 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2509                     src_url, video_id, ism_id='mss', fatal=False)
2510                 formats.extend(ism_formats)
2511                 self._merge_subtitles(ism_subs, target=subtitles)
2512             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2513                 http_count += 1
2514                 formats.append({
2515                     'url': src_url,
2516                     'ext': ext or src_ext or 'flv',
2517                     'format_id': 'http-%d' % (bitrate or http_count),
2518                     'tbr': bitrate,
2519                     'filesize': filesize,
2520                     'width': width,
2521                     'height': height,
2522                 })
2523
2524         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2525             src = medium.get('src')
2526             if not src or src in srcs:
2527                 continue
2528             srcs.add(src)
2529
2530             imgs_count += 1
2531             formats.append({
2532                 'format_id': f'imagestream-{imgs_count}',
2533                 'url': src,
2534                 'ext': mimetype2ext(medium.get('type')),
2535                 'acodec': 'none',
2536                 'vcodec': 'none',
2537                 'width': int_or_none(medium.get('width')),
2538                 'height': int_or_none(medium.get('height')),
2539                 'format_note': 'SMIL storyboards',
2540             })
2541
2542         smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2543         self._merge_subtitles(smil_subs, target=subtitles)
2544
2545         return formats, subtitles
2546
2547     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2548         urls = []
2549         subtitles = {}
2550         for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
2551             src = textstream.get('src')
2552             if not src or src in urls:
2553                 continue
2554             urls.append(src)
2555             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2556             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2557             subtitles.setdefault(lang, []).append({
2558                 'url': src,
2559                 'ext': ext,
2560             })
2561         return subtitles
2562
2563     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2564         res = self._download_xml_handle(
2565             xspf_url, playlist_id, 'Downloading xpsf playlist',
2566             'Unable to download xspf manifest', fatal=fatal)
2567         if res is False:
2568             return []
2569
2570         xspf, urlh = res
2571         xspf_url = urlh.url
2572
2573         return self._parse_xspf(
2574             xspf, playlist_id, xspf_url=xspf_url,
2575             xspf_base_url=base_url(xspf_url))
2576
2577     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2578         NS_MAP = {
2579             'xspf': 'http://xspf.org/ns/0/',
2580             's1': 'http://static.streamone.nl/player/ns/0',
2581         }
2582
2583         entries = []
2584         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2585             title = xpath_text(
2586                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2587             description = xpath_text(
2588                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2589             thumbnail = xpath_text(
2590                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2591             duration = float_or_none(
2592                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2593
2594             formats = []
2595             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2596                 format_url = urljoin(xspf_base_url, location.text)
2597                 if not format_url:
2598                     continue
2599                 formats.append({
2600                     'url': format_url,
2601                     'manifest_url': xspf_url,
2602                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2603                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2604                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2605                 })
2606
2607             entries.append({
2608                 'id': playlist_id,
2609                 'title': title,
2610                 'description': description,
2611                 'thumbnail': thumbnail,
2612                 'duration': duration,
2613                 'formats': formats,
2614             })
2615         return entries
2616
2617     def _extract_mpd_formats(self, *args, **kwargs):
2618         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2619         if subs:
2620             self._report_ignoring_subs('DASH')
2621         return fmts
2622
2623     def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
2624         periods = self._extract_mpd_periods(*args, **kwargs)
2625         return self._merge_mpd_periods(periods)
2626
2627     def _extract_mpd_periods(
2628             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2629             fatal=True, data=None, headers={}, query={}):
2630
2631         if self.get_param('ignore_no_formats_error'):
2632             fatal = False
2633
2634         res = self._download_xml_handle(
2635             mpd_url, video_id,
2636             note='Downloading MPD manifest' if note is None else note,
2637             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2638             fatal=fatal, data=data, headers=headers, query=query)
2639         if res is False:
2640             return []
2641         mpd_doc, urlh = res
2642         if mpd_doc is None:
2643             return []
2644
2645         # We could have been redirected to a new url when we retrieved our mpd file.
2646         mpd_url = urlh.url
2647         mpd_base_url = base_url(mpd_url)
2648
2649         return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
2650
2651     def _parse_mpd_formats(self, *args, **kwargs):
2652         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2653         if subs:
2654             self._report_ignoring_subs('DASH')
2655         return fmts
2656
2657     def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
2658         periods = self._parse_mpd_periods(*args, **kwargs)
2659         return self._merge_mpd_periods(periods)
2660
2661     def _merge_mpd_periods(self, periods):
2662         """
2663         Combine all formats and subtitles from an MPD manifest into a single list,
2664         by concatenate streams with similar formats.
2665         """
2666         formats, subtitles = {}, {}
2667         for period in periods:
2668             for f in period['formats']:
2669                 assert 'is_dash_periods' not in f, 'format already processed'
2670                 f['is_dash_periods'] = True
2671                 format_key = tuple(v for k, v in f.items() if k not in (
2672                     ('format_id', 'fragments', 'manifest_stream_number')))
2673                 if format_key not in formats:
2674                     formats[format_key] = f
2675                 elif 'fragments' in f:
2676                     formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2677
2678             if subtitles and period['subtitles']:
2679                 self.report_warning(bug_reports_message(
2680                     'Found subtitles in multiple periods in the DASH manifest; '
2681                     'if part of the subtitles are missing,',
2682                 ), only_once=True)
2683
2684             for sub_lang, sub_info in period['subtitles'].items():
2685                 subtitles.setdefault(sub_lang, []).extend(sub_info)
2686
2687         return list(formats.values()), subtitles
2688
2689     def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2690         """
2691         Parse formats from MPD manifest.
2692         References:
2693          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2694             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2695          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2696         """
2697         if not self.get_param('dynamic_mpd', True):
2698             if mpd_doc.get('type') == 'dynamic':
2699                 return [], {}
2700
2701         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2702
2703         def _add_ns(path):
2704             return self._xpath_ns(path, namespace)
2705
2706         def is_drm_protected(element):
2707             return element.find(_add_ns('ContentProtection')) is not None
2708
2709         def extract_multisegment_info(element, ms_parent_info):
2710             ms_info = ms_parent_info.copy()
2711
2712             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2713             # common attributes and elements.  We will only extract relevant
2714             # for us.
2715             def extract_common(source):
2716                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2717                 if segment_timeline is not None:
2718                     s_e = segment_timeline.findall(_add_ns('S'))
2719                     if s_e:
2720                         ms_info['total_number'] = 0
2721                         ms_info['s'] = []
2722                         for s in s_e:
2723                             r = int(s.get('r', 0))
2724                             ms_info['total_number'] += 1 + r
2725                             ms_info['s'].append({
2726                                 't': int(s.get('t', 0)),
2727                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2728                                 'd': int(s.attrib['d']),
2729                                 'r': r,
2730                             })
2731                 start_number = source.get('startNumber')
2732                 if start_number:
2733                     ms_info['start_number'] = int(start_number)
2734                 timescale = source.get('timescale')
2735                 if timescale:
2736                     ms_info['timescale'] = int(timescale)
2737                 segment_duration = source.get('duration')
2738                 if segment_duration:
2739                     ms_info['segment_duration'] = float(segment_duration)
2740
2741             def extract_Initialization(source):
2742                 initialization = source.find(_add_ns('Initialization'))
2743                 if initialization is not None:
2744                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2745
2746             segment_list = element.find(_add_ns('SegmentList'))
2747             if segment_list is not None:
2748                 extract_common(segment_list)
2749                 extract_Initialization(segment_list)
2750                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2751                 if segment_urls_e:
2752                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2753             else:
2754                 segment_template = element.find(_add_ns('SegmentTemplate'))
2755                 if segment_template is not None:
2756                     extract_common(segment_template)
2757                     media = segment_template.get('media')
2758                     if media:
2759                         ms_info['media'] = media
2760                     initialization = segment_template.get('initialization')
2761                     if initialization:
2762                         ms_info['initialization'] = initialization
2763                     else:
2764                         extract_Initialization(segment_template)
2765             return ms_info
2766
2767         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2768         stream_numbers = collections.defaultdict(int)
2769         for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2770             period_entry = {
2771                 'id': period.get('id', f'period-{period_idx}'),
2772                 'formats': [],
2773                 'subtitles': collections.defaultdict(list),
2774             }
2775             period_duration = parse_duration(period.get('duration')) or mpd_duration
2776             period_ms_info = extract_multisegment_info(period, {
2777                 'start_number': 1,
2778                 'timescale': 1,
2779             })
2780             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2781                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2782                 for representation in adaptation_set.findall(_add_ns('Representation')):
2783                     representation_attrib = adaptation_set.attrib.copy()
2784                     representation_attrib.update(representation.attrib)
2785                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2786                     mime_type = representation_attrib['mimeType']
2787                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2788
2789                     codec_str = representation_attrib.get('codecs', '')
2790                     # Some kind of binary subtitle found in some youtube livestreams
2791                     if mime_type == 'application/x-rawcc':
2792                         codecs = {'scodec': codec_str}
2793                     else:
2794                         codecs = parse_codecs(codec_str)
2795                     if content_type not in ('video', 'audio', 'text'):
2796                         if mime_type == 'image/jpeg':
2797                             content_type = mime_type
2798                         elif codecs.get('vcodec', 'none') != 'none':
2799                             content_type = 'video'
2800                         elif codecs.get('acodec', 'none') != 'none':
2801                             content_type = 'audio'
2802                         elif codecs.get('scodec', 'none') != 'none':
2803                             content_type = 'text'
2804                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2805                             content_type = 'text'
2806                         else:
2807                             self.report_warning(f'Unknown MIME type {mime_type} in DASH manifest')
2808                             continue
2809
2810                     base_url = ''
2811                     for element in (representation, adaptation_set, period, mpd_doc):
2812                         base_url_e = element.find(_add_ns('BaseURL'))
2813                         if try_call(lambda: base_url_e.text) is not None:
2814                             base_url = base_url_e.text + base_url
2815                             if re.match(r'^https?://', base_url):
2816                                 break
2817                     if mpd_base_url and base_url.startswith('/'):
2818                         base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2819                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2820                         if not mpd_base_url.endswith('/'):
2821                             mpd_base_url += '/'
2822                         base_url = mpd_base_url + base_url
2823                     representation_id = representation_attrib.get('id')
2824                     lang = representation_attrib.get('lang')
2825                     url_el = representation.find(_add_ns('BaseURL'))
2826                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2827                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2828                     if representation_id is not None:
2829                         format_id = representation_id
2830                     else:
2831                         format_id = content_type
2832                     if mpd_id:
2833                         format_id = mpd_id + '-' + format_id
2834                     if content_type in ('video', 'audio'):
2835                         f = {
2836                             'format_id': format_id,
2837                             'manifest_url': mpd_url,
2838                             'ext': mimetype2ext(mime_type),
2839                             'width': int_or_none(representation_attrib.get('width')),
2840                             'height': int_or_none(representation_attrib.get('height')),
2841                             'tbr': float_or_none(bandwidth, 1000),
2842                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2843                             'fps': int_or_none(representation_attrib.get('frameRate')),
2844                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2845                             'format_note': f'DASH {content_type}',
2846                             'filesize': filesize,
2847                             'container': mimetype2ext(mime_type) + '_dash',
2848                             **codecs,
2849                         }
2850                     elif content_type == 'text':
2851                         f = {
2852                             'ext': mimetype2ext(mime_type),
2853                             'manifest_url': mpd_url,
2854                             'filesize': filesize,
2855                         }
2856                     elif content_type == 'image/jpeg':
2857                         # See test case in VikiIE
2858                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2859                         f = {
2860                             'format_id': format_id,
2861                             'ext': 'mhtml',
2862                             'manifest_url': mpd_url,
2863                             'format_note': 'DASH storyboards (jpeg)',
2864                             'acodec': 'none',
2865                             'vcodec': 'none',
2866                         }
2867                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2868                         f['has_drm'] = True
2869                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2870
2871                     def prepare_template(template_name, identifiers):
2872                         tmpl = representation_ms_info[template_name]
2873                         if representation_id is not None:
2874                             tmpl = tmpl.replace('$RepresentationID$', representation_id)
2875                         # First of, % characters outside $...$ templates
2876                         # must be escaped by doubling for proper processing
2877                         # by % operator string formatting used further (see
2878                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2879                         t = ''
2880                         in_template = False
2881                         for c in tmpl:
2882                             t += c
2883                             if c == '$':
2884                                 in_template = not in_template
2885                             elif c == '%' and not in_template:
2886                                 t += c
2887                         # Next, $...$ templates are translated to their
2888                         # %(...) counterparts to be used with % operator
2889                         t = re.sub(r'\$({})\$'.format('|'.join(identifiers)), r'%(\1)d', t)
2890                         t = re.sub(r'\$({})%([^$]+)\$'.format('|'.join(identifiers)), r'%(\1)\2', t)
2891                         t.replace('$$', '$')
2892                         return t
2893
2894                     # @initialization is a regular template like @media one
2895                     # so it should be handled just the same way (see
2896                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2897                     if 'initialization' in representation_ms_info:
2898                         initialization_template = prepare_template(
2899                             'initialization',
2900                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2901                             # $Time$ shall not be included for @initialization thus
2902                             # only $Bandwidth$ remains
2903                             ('Bandwidth', ))
2904                         representation_ms_info['initialization_url'] = initialization_template % {
2905                             'Bandwidth': bandwidth,
2906                         }
2907
2908                     def location_key(location):
2909                         return 'url' if re.match(r'^https?://', location) else 'path'
2910
2911                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2912
2913                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2914                         media_location_key = location_key(media_template)
2915
2916                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2917                         # can't be used at the same time
2918                         if '%(Number' in media_template and 's' not in representation_ms_info:
2919                             segment_duration = None
2920                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2921                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2922                                 representation_ms_info['total_number'] = int(math.ceil(
2923                                     float_or_none(period_duration, segment_duration, default=0)))
2924                             representation_ms_info['fragments'] = [{
2925                                 media_location_key: media_template % {
2926                                     'Number': segment_number,
2927                                     'Bandwidth': bandwidth,
2928                                 },
2929                                 'duration': segment_duration,
2930                             } for segment_number in range(
2931                                 representation_ms_info['start_number'],
2932                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2933                         else:
2934                             # $Number*$ or $Time$ in media template with S list available
2935                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2936                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2937                             representation_ms_info['fragments'] = []
2938                             segment_time = 0
2939                             segment_d = None
2940                             segment_number = representation_ms_info['start_number']
2941
2942                             def add_segment_url():
2943                                 segment_url = media_template % {
2944                                     'Time': segment_time,
2945                                     'Bandwidth': bandwidth,
2946                                     'Number': segment_number,
2947                                 }
2948                                 representation_ms_info['fragments'].append({
2949                                     media_location_key: segment_url,
2950                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2951                                 })
2952
2953                             for s in representation_ms_info['s']:
2954                                 segment_time = s.get('t') or segment_time
2955                                 segment_d = s['d']
2956                                 add_segment_url()
2957                                 segment_number += 1
2958                                 for _ in range(s.get('r', 0)):
2959                                     segment_time += segment_d
2960                                     add_segment_url()
2961                                     segment_number += 1
2962                                 segment_time += segment_d
2963                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2964                         # No media template,
2965                         # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2966                         # or any YouTube dashsegments video
2967                         fragments = []
2968                         segment_index = 0
2969                         timescale = representation_ms_info['timescale']
2970                         for s in representation_ms_info['s']:
2971                             duration = float_or_none(s['d'], timescale)
2972                             for _ in range(s.get('r', 0) + 1):
2973                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2974                                 fragments.append({
2975                                     location_key(segment_uri): segment_uri,
2976                                     'duration': duration,
2977                                 })
2978                                 segment_index += 1
2979                         representation_ms_info['fragments'] = fragments
2980                     elif 'segment_urls' in representation_ms_info:
2981                         # Segment URLs with no SegmentTimeline
2982                         # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2983                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2984                         fragments = []
2985                         segment_duration = float_or_none(
2986                             representation_ms_info['segment_duration'],
2987                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2988                         for segment_url in representation_ms_info['segment_urls']:
2989                             fragment = {
2990                                 location_key(segment_url): segment_url,
2991                             }
2992                             if segment_duration:
2993                                 fragment['duration'] = segment_duration
2994                             fragments.append(fragment)
2995                         representation_ms_info['fragments'] = fragments
2996                     # If there is a fragments key available then we correctly recognized fragmented media.
2997                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2998                     # assumption is not necessarily correct since we may simply have no support for
2999                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3000                     if 'fragments' in representation_ms_info:
3001                         f.update({
3002                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
3003                             'url': mpd_url or base_url,
3004                             'fragment_base_url': base_url,
3005                             'fragments': [],
3006                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3007                         })
3008                         if 'initialization_url' in representation_ms_info:
3009                             initialization_url = representation_ms_info['initialization_url']
3010                             if not f.get('url'):
3011                                 f['url'] = initialization_url
3012                             f['fragments'].append({location_key(initialization_url): initialization_url})
3013                         f['fragments'].extend(representation_ms_info['fragments'])
3014                         if not period_duration:
3015                             period_duration = try_get(
3016                                 representation_ms_info,
3017                                 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
3018                     else:
3019                         # Assuming direct URL to unfragmented media.
3020                         f['url'] = base_url
3021                     if content_type in ('video', 'audio', 'image/jpeg'):
3022                         f['manifest_stream_number'] = stream_numbers[f['url']]
3023                         stream_numbers[f['url']] += 1
3024                         period_entry['formats'].append(f)
3025                     elif content_type == 'text':
3026                         period_entry['subtitles'][lang or 'und'].append(f)
3027             yield period_entry
3028
3029     def _extract_ism_formats(self, *args, **kwargs):
3030         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3031         if subs:
3032             self._report_ignoring_subs('ISM')
3033         return fmts
3034
3035     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3036         if self.get_param('ignore_no_formats_error'):
3037             fatal = False
3038
3039         res = self._download_xml_handle(
3040             ism_url, video_id,
3041             note='Downloading ISM manifest' if note is None else note,
3042             errnote='Failed to download ISM manifest' if errnote is None else errnote,
3043             fatal=fatal, data=data, headers=headers, query=query)
3044         if res is False:
3045             return [], {}
3046         ism_doc, urlh = res
3047         if ism_doc is None:
3048             return [], {}
3049
3050         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
3051
3052     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3053         """
3054         Parse formats from ISM manifest.
3055         References:
3056          1. [MS-SSTR]: Smooth Streaming Protocol,
3057             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3058         """
3059         if ism_doc.get('IsLive') == 'TRUE':
3060             return [], {}
3061
3062         duration = int(ism_doc.attrib['Duration'])
3063         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3064
3065         formats = []
3066         subtitles = {}
3067         for stream in ism_doc.findall('StreamIndex'):
3068             stream_type = stream.get('Type')
3069             if stream_type not in ('video', 'audio', 'text'):
3070                 continue
3071             url_pattern = stream.attrib['Url']
3072             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3073             stream_name = stream.get('Name')
3074             stream_language = stream.get('Language', 'und')
3075             for track in stream.findall('QualityLevel'):
3076                 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3077                 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3078                 # TODO: add support for WVC1 and WMAP
3079                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3080                     self.report_warning(f'{fourcc} is not a supported codec')
3081                     continue
3082                 tbr = int(track.attrib['Bitrate']) // 1000
3083                 # [1] does not mention Width and Height attributes. However,
3084                 # they're often present while MaxWidth and MaxHeight are
3085                 # missing, so should be used as fallbacks
3086                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3087                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3088                 sampling_rate = int_or_none(track.get('SamplingRate'))
3089
3090                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3091                 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3092
3093                 fragments = []
3094                 fragment_ctx = {
3095                     'time': 0,
3096                 }
3097                 stream_fragments = stream.findall('c')
3098                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3099                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3100                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3101                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3102                     if not fragment_ctx['duration']:
3103                         try:
3104                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3105                         except IndexError:
3106                             next_fragment_time = duration
3107                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3108                     for _ in range(fragment_repeat):
3109                         fragments.append({
3110                             'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3111                             'duration': fragment_ctx['duration'] / stream_timescale,
3112                         })
3113                         fragment_ctx['time'] += fragment_ctx['duration']
3114
3115                 if stream_type == 'text':
3116                     subtitles.setdefault(stream_language, []).append({
3117                         'ext': 'ismt',
3118                         'protocol': 'ism',
3119                         'url': ism_url,
3120                         'manifest_url': ism_url,
3121                         'fragments': fragments,
3122                         '_download_params': {
3123                             'stream_type': stream_type,
3124                             'duration': duration,
3125                             'timescale': stream_timescale,
3126                             'fourcc': fourcc,
3127                             'language': stream_language,
3128                             'codec_private_data': track.get('CodecPrivateData'),
3129                         },
3130                     })
3131                 elif stream_type in ('video', 'audio'):
3132                     formats.append({
3133                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3134                         'url': ism_url,
3135                         'manifest_url': ism_url,
3136                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3137                         'width': width,
3138                         'height': height,
3139                         'tbr': tbr,
3140                         'asr': sampling_rate,
3141                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3142                         'acodec': 'none' if stream_type == 'video' else fourcc,
3143                         'protocol': 'ism',
3144                         'fragments': fragments,
3145                         'has_drm': ism_doc.find('Protection') is not None,
3146                         'language': stream_language,
3147                         'audio_channels': int_or_none(track.get('Channels')),
3148                         '_download_params': {
3149                             'stream_type': stream_type,
3150                             'duration': duration,
3151                             'timescale': stream_timescale,
3152                             'width': width or 0,
3153                             'height': height or 0,
3154                             'fourcc': fourcc,
3155                             'language': stream_language,
3156                             'codec_private_data': track.get('CodecPrivateData'),
3157                             'sampling_rate': sampling_rate,
3158                             'channels': int_or_none(track.get('Channels', 2)),
3159                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3160                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3161                         },
3162                     })
3163         return formats, subtitles
3164
3165     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None, _headers=None):
3166         def absolute_url(item_url):
3167             return urljoin(base_url, item_url)
3168
3169         def parse_content_type(content_type):
3170             if not content_type:
3171                 return {}
3172             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3173             if ctr:
3174                 mimetype, codecs = ctr.groups()
3175                 f = parse_codecs(codecs)
3176                 f['ext'] = mimetype2ext(mimetype)
3177                 return f
3178             return {}
3179
3180         def _media_formats(src, cur_media_type, type_info=None):
3181             type_info = type_info or {}
3182             full_url = absolute_url(src)
3183             ext = type_info.get('ext') or determine_ext(full_url)
3184             if ext == 'm3u8':
3185                 is_plain_url = False
3186                 formats = self._extract_m3u8_formats(
3187                     full_url, video_id, ext='mp4',
3188                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3189                     preference=preference, quality=quality, fatal=False, headers=_headers)
3190             elif ext == 'mpd':
3191                 is_plain_url = False
3192                 formats = self._extract_mpd_formats(
3193                     full_url, video_id, mpd_id=mpd_id, fatal=False, headers=_headers)
3194             else:
3195                 is_plain_url = True
3196                 formats = [{
3197                     'url': full_url,
3198                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3199                     'ext': ext,
3200                 }]
3201             return is_plain_url, formats
3202
3203         entries = []
3204         # amp-video and amp-audio are very similar to their HTML5 counterparts
3205         # so we will include them right here (see
3206         # https://www.ampproject.org/docs/reference/components/amp-video)
3207         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3208         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3209         media_tags = [(media_tag, media_tag_name, media_type, '')
3210                       for media_tag, media_tag_name, media_type
3211                       in re.findall(rf'(?s)(<({_MEDIA_TAG_NAME_RE})[^>]*/>)', webpage)]
3212         media_tags.extend(re.findall(
3213             # We only allow video|audio followed by a whitespace or '>'.
3214             # Allowing more characters may end up in significant slow down (see
3215             # https://github.com/ytdl-org/youtube-dl/issues/11979,
3216             # e.g. http://www.porntrex.com/maps/videositemap.xml).
3217             rf'(?s)(<(?P<tag>{_MEDIA_TAG_NAME_RE})(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
3218         for media_tag, _, media_type, media_content in media_tags:
3219             media_info = {
3220                 'formats': [],
3221                 'subtitles': {},
3222             }
3223             media_attributes = extract_attributes(media_tag)
3224             src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3225             if src:
3226                 f = parse_content_type(media_attributes.get('type'))
3227                 _, formats = _media_formats(src, media_type, f)
3228                 media_info['formats'].extend(formats)
3229             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3230             if media_content:
3231                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3232                     s_attr = extract_attributes(source_tag)
3233                     # data-video-src and data-src are non standard but seen
3234                     # several times in the wild
3235                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3236                     if not src:
3237                         continue
3238                     f = parse_content_type(s_attr.get('type'))
3239                     is_plain_url, formats = _media_formats(src, media_type, f)
3240                     if is_plain_url:
3241                         # width, height, res, label and title attributes are
3242                         # all not standard but seen several times in the wild
3243                         labels = [
3244                             s_attr.get(lbl)
3245                             for lbl in ('label', 'title')
3246                             if str_or_none(s_attr.get(lbl))
3247                         ]
3248                         width = int_or_none(s_attr.get('width'))
3249                         height = (int_or_none(s_attr.get('height'))
3250                                   or int_or_none(s_attr.get('res')))
3251                         if not width or not height:
3252                             for lbl in labels:
3253                                 resolution = parse_resolution(lbl)
3254                                 if not resolution:
3255                                     continue
3256                                 width = width or resolution.get('width')
3257                                 height = height or resolution.get('height')
3258                         for lbl in labels:
3259                             tbr = parse_bitrate(lbl)
3260                             if tbr:
3261                                 break
3262                         else:
3263                             tbr = None
3264                         f.update({
3265                             'width': width,
3266                             'height': height,
3267                             'tbr': tbr,
3268                             'format_id': s_attr.get('label') or s_attr.get('title'),
3269                         })
3270                         f.update(formats[0])
3271                         media_info['formats'].append(f)
3272                     else:
3273                         media_info['formats'].extend(formats)
3274                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3275                     track_attributes = extract_attributes(track_tag)
3276                     kind = track_attributes.get('kind')
3277                     if not kind or kind in ('subtitles', 'captions'):
3278                         src = strip_or_none(track_attributes.get('src'))
3279                         if not src:
3280                             continue
3281                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3282                         media_info['subtitles'].setdefault(lang, []).append({
3283                             'url': absolute_url(src),
3284                         })
3285             for f in media_info['formats']:
3286                 f.setdefault('http_headers', {})['Referer'] = base_url
3287                 if _headers:
3288                     f['http_headers'].update(_headers)
3289             if media_info['formats'] or media_info['subtitles']:
3290                 entries.append(media_info)
3291         return entries
3292
3293     def _extract_akamai_formats(self, *args, **kwargs):
3294         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3295         if subs:
3296             self._report_ignoring_subs('akamai')
3297         return fmts
3298
3299     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3300         signed = 'hdnea=' in manifest_url
3301         if not signed:
3302             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3303             manifest_url = re.sub(
3304                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3305                 '', manifest_url).strip('?')
3306
3307         formats = []
3308         subtitles = {}
3309
3310         hdcore_sign = 'hdcore=3.7.0'
3311         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3312         hds_host = hosts.get('hds')
3313         if hds_host:
3314             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3315         if 'hdcore=' not in f4m_url:
3316             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3317         f4m_formats = self._extract_f4m_formats(
3318             f4m_url, video_id, f4m_id='hds', fatal=False)
3319         for entry in f4m_formats:
3320             entry.update({'extra_param_to_segment_url': hdcore_sign})
3321         formats.extend(f4m_formats)
3322
3323         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3324         hls_host = hosts.get('hls')
3325         if hls_host:
3326             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3327         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3328             m3u8_url, video_id, 'mp4', 'm3u8_native',
3329             m3u8_id='hls', fatal=False)
3330         formats.extend(m3u8_formats)
3331         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3332
3333         http_host = hosts.get('http')
3334         if http_host and m3u8_formats and not signed:
3335             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3336             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3337             qualities_length = len(qualities)
3338             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3339                 i = 0
3340                 for f in m3u8_formats:
3341                     if f['vcodec'] != 'none':
3342                         for protocol in ('http', 'https'):
3343                             http_f = f.copy()
3344                             del http_f['manifest_url']
3345                             http_url = re.sub(
3346                                 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3347                             http_f.update({
3348                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3349                                 'url': http_url,
3350                                 'protocol': protocol,
3351                             })
3352                             formats.append(http_f)
3353                         i += 1
3354
3355         return formats, subtitles
3356
3357     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3358         query = urllib.parse.urlparse(url).query
3359         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3360         mobj = re.search(
3361             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3362         url_base = mobj.group('url')
3363         http_base_url = '{}{}:{}'.format('http', mobj.group('s') or '', url_base)
3364         formats = []
3365
3366         def manifest_url(manifest):
3367             m_url = f'{http_base_url}/{manifest}'
3368             if query:
3369                 m_url += f'?{query}'
3370             return m_url
3371
3372         if 'm3u8' not in skip_protocols:
3373             formats.extend(self._extract_m3u8_formats(
3374                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3375                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3376         if 'f4m' not in skip_protocols:
3377             formats.extend(self._extract_f4m_formats(
3378                 manifest_url('manifest.f4m'),
3379                 video_id, f4m_id='hds', fatal=False))
3380         if 'dash' not in skip_protocols:
3381             formats.extend(self._extract_mpd_formats(
3382                 manifest_url('manifest.mpd'),
3383                 video_id, mpd_id='dash', fatal=False))
3384         if re.search(r'(?:/smil:|\.smil)', url_base):
3385             if 'smil' not in skip_protocols:
3386                 rtmp_formats = self._extract_smil_formats(
3387                     manifest_url('jwplayer.smil'),
3388                     video_id, fatal=False)
3389                 for rtmp_format in rtmp_formats:
3390                     rtsp_format = rtmp_format.copy()
3391                     rtsp_format['url'] = '{}/{}'.format(rtmp_format['url'], rtmp_format['play_path'])
3392                     del rtsp_format['play_path']
3393                     del rtsp_format['ext']
3394                     rtsp_format.update({
3395                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3396                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3397                         'protocol': 'rtsp',
3398                     })
3399                     formats.extend([rtmp_format, rtsp_format])
3400         else:
3401             for protocol in ('rtmp', 'rtsp'):
3402                 if protocol not in skip_protocols:
3403                     formats.append({
3404                         'url': f'{protocol}:{url_base}',
3405                         'format_id': protocol,
3406                         'protocol': protocol,
3407                     })
3408         return formats
3409
3410     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3411         return self._search_json(
3412             r'''(?<!-)\bjwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!</script>).)*?\.\s*(?:setup\s*\(|(?P<load>load)\s*\(\s*\[)''',
3413             webpage, 'JWPlayer data', video_id,
3414             # must be a {...} or sequence, ending
3415             contains_pattern=r'\{(?s:.*)}(?(load)(?:\s*,\s*\{(?s:.*)})*)', end_pattern=r'(?(load)\]|\))',
3416             transform_source=transform_source, default=None)
3417
3418     def _extract_jwplayer_data(self, webpage, video_id, *args, transform_source=js_to_json, **kwargs):
3419         jwplayer_data = self._find_jwplayer_data(
3420             webpage, video_id, transform_source=transform_source)
3421         return self._parse_jwplayer_data(
3422             jwplayer_data, video_id, *args, **kwargs)
3423
3424     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3425                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3426         entries = []
3427         if not isinstance(jwplayer_data, dict):
3428             return entries
3429
3430         playlist_items = jwplayer_data.get('playlist')
3431         # JWPlayer backward compatibility: single playlist item/flattened playlists
3432         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3433         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3434         if not isinstance(playlist_items, list):
3435             playlist_items = (playlist_items or jwplayer_data, )
3436
3437         for video_data in playlist_items:
3438             if not isinstance(video_data, dict):
3439                 continue
3440             # JWPlayer backward compatibility: flattened sources
3441             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3442             if 'sources' not in video_data:
3443                 video_data['sources'] = [video_data]
3444
3445             this_video_id = video_id or video_data['mediaid']
3446
3447             formats = self._parse_jwplayer_formats(
3448                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3449                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3450
3451             subtitles = {}
3452             for track in traverse_obj(video_data, (
3453                     'tracks', lambda _, v: v['kind'].lower() in ('captions', 'subtitles'))):
3454                 track_url = urljoin(base_url, track.get('file'))
3455                 if not track_url:
3456                     continue
3457                 subtitles.setdefault(track.get('label') or 'en', []).append({
3458                     'url': self._proto_relative_url(track_url),
3459                 })
3460
3461             entry = {
3462                 'id': this_video_id,
3463                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3464                 'description': clean_html(video_data.get('description')),
3465                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3466                 'timestamp': int_or_none(video_data.get('pubdate')),
3467                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3468                 'subtitles': subtitles,
3469                 'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
3470                 'genre': clean_html(video_data.get('genre')),
3471                 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3472                 'season_number': int_or_none(video_data.get('season')),
3473                 'episode_number': int_or_none(video_data.get('episode')),
3474                 'release_year': int_or_none(video_data.get('releasedate')),
3475                 'age_limit': int_or_none(video_data.get('age_restriction')),
3476             }
3477             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3478             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3479                 entry.update({
3480                     '_type': 'url_transparent',
3481                     'url': formats[0]['url'],
3482                 })
3483             else:
3484                 entry['formats'] = formats
3485             entries.append(entry)
3486         if len(entries) == 1:
3487             return entries[0]
3488         else:
3489             return self.playlist_result(entries)
3490
3491     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3492                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3493         urls = set()
3494         formats = []
3495         for source in jwplayer_sources_data:
3496             if not isinstance(source, dict):
3497                 continue
3498             source_url = urljoin(
3499                 base_url, self._proto_relative_url(source.get('file')))
3500             if not source_url or source_url in urls:
3501                 continue
3502             urls.add(source_url)
3503             source_type = source.get('type') or ''
3504             ext = determine_ext(source_url, default_ext=mimetype2ext(source_type))
3505             if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3506                 formats.extend(self._extract_m3u8_formats(
3507                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3508                     m3u8_id=m3u8_id, fatal=False))
3509             elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3510                 formats.extend(self._extract_mpd_formats(
3511                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3512             elif ext == 'smil':
3513                 formats.extend(self._extract_smil_formats(
3514                     source_url, video_id, fatal=False))
3515             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3516             elif source_type.startswith('audio') or ext in (
3517                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3518                 formats.append({
3519                     'url': source_url,
3520                     'vcodec': 'none',
3521                     'ext': ext,
3522                 })
3523             else:
3524                 format_id = str_or_none(source.get('label'))
3525                 height = int_or_none(source.get('height'))
3526                 if height is None and format_id:
3527                     # Often no height is provided but there is a label in
3528                     # format like "1080p", "720p SD", or 1080.
3529                     height = parse_resolution(format_id).get('height')
3530                 a_format = {
3531                     'url': source_url,
3532                     'width': int_or_none(source.get('width')),
3533                     'height': height,
3534                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
3535                     'filesize': int_or_none(source.get('filesize')),
3536                     'ext': ext,
3537                     'format_id': format_id,
3538                 }
3539                 if source_url.startswith('rtmp'):
3540                     a_format['ext'] = 'flv'
3541                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3542                     # of jwplayer.flash.swf
3543                     rtmp_url_parts = re.split(
3544                         r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
3545                     if len(rtmp_url_parts) == 3:
3546                         rtmp_url, prefix, play_path = rtmp_url_parts
3547                         a_format.update({
3548                             'url': rtmp_url,
3549                             'play_path': prefix + play_path,
3550                         })
3551                     if rtmp_params:
3552                         a_format.update(rtmp_params)
3553                 formats.append(a_format)
3554         return formats
3555
3556     def _live_title(self, name):
3557         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3558         return name
3559
3560     def _int(self, v, name, fatal=False, **kwargs):
3561         res = int_or_none(v, **kwargs)
3562         if res is None:
3563             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3564             if fatal:
3565                 raise ExtractorError(msg)
3566             else:
3567                 self.report_warning(msg)
3568         return res
3569
3570     def _float(self, v, name, fatal=False, **kwargs):
3571         res = float_or_none(v, **kwargs)
3572         if res is None:
3573             msg = f'Failed to extract {name}: Could not parse value {v!r}'
3574             if fatal:
3575                 raise ExtractorError(msg)
3576             else:
3577                 self.report_warning(msg)
3578         return res
3579
3580     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3581                     path='/', secure=False, discard=False, rest={}, **kwargs):
3582         cookie = http.cookiejar.Cookie(
3583             0, name, value, port, port is not None, domain, True,
3584             domain.startswith('.'), path, True, secure, expire_time,
3585             discard, None, None, rest)
3586         self.cookiejar.set_cookie(cookie)
3587
3588     def _get_cookies(self, url):
3589         """ Return a http.cookies.SimpleCookie with the cookies for the url """
3590         return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3591
3592     def _apply_first_set_cookie_header(self, url_handle, cookie):
3593         """
3594         Apply first Set-Cookie header instead of the last. Experimental.
3595
3596         Some sites (e.g. [1-3]) may serve two cookies under the same name
3597         in Set-Cookie header and expect the first (old) one to be set rather
3598         than second (new). However, as of RFC6265 the newer one cookie
3599         should be set into cookie store what actually happens.
3600         We will workaround this issue by resetting the cookie to
3601         the first one manually.
3602         1. https://new.vk.com/
3603         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3604         3. https://learning.oreilly.com/
3605         """
3606         for header, cookies in url_handle.headers.items():
3607             if header.lower() != 'set-cookie':
3608                 continue
3609             cookies = cookies.encode('iso-8859-1').decode('utf-8')
3610             cookie_value = re.search(
3611                 rf'{cookie}=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)', cookies)
3612             if cookie_value:
3613                 value, domain = cookie_value.groups()
3614                 self._set_cookie(domain, cookie, value)
3615                 break
3616
3617     @classmethod
3618     def get_testcases(cls, include_onlymatching=False):
3619         # Do not look in super classes
3620         t = vars(cls).get('_TEST')
3621         if t:
3622             assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3623             tests = [t]
3624         else:
3625             tests = vars(cls).get('_TESTS', [])
3626         for t in tests:
3627             if not include_onlymatching and t.get('only_matching', False):
3628                 continue
3629             t['name'] = cls.ie_key()
3630             yield t
3631         if getattr(cls, '__wrapped__', None):
3632             yield from cls.__wrapped__.get_testcases(include_onlymatching)
3633
3634     @classmethod
3635     def get_webpage_testcases(cls):
3636         tests = vars(cls).get('_WEBPAGE_TESTS', [])
3637         for t in tests:
3638             t['name'] = cls.ie_key()
3639             yield t
3640         if getattr(cls, '__wrapped__', None):
3641             yield from cls.__wrapped__.get_webpage_testcases()
3642
3643     @classproperty(cache=True)
3644     def age_limit(cls):
3645         """Get age limit from the testcases"""
3646         return max(traverse_obj(
3647             (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3648             (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3649
3650     @classproperty(cache=True)
3651     def _RETURN_TYPE(cls):
3652         """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3653         tests = tuple(cls.get_testcases(include_onlymatching=False))
3654         if not tests:
3655             return None
3656         elif not any(k.startswith('playlist') for test in tests for k in test):
3657             return 'video'
3658         elif all(any(k.startswith('playlist') for k in test) for test in tests):
3659             return 'playlist'
3660         return 'any'
3661
3662     @classmethod
3663     def is_single_video(cls, url):
3664         """Returns whether the URL is of a single video, None if unknown"""
3665         if cls.suitable(url):
3666             return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3667
3668     @classmethod
    def is_suitable(cls, age_limit):
        """
        Test whether the extractor is generally suitable for the given age limit

        @returns    False if age_restricted() reports cls.age_limit as restricted
                    for the given `age_limit`, True otherwise
        """
        return not age_restricted(cls.age_limit, age_limit)
3672
3673     @classmethod
3674     def description(cls, *, markdown=True, search_examples=None):
3675         """Description of the extractor"""
3676         desc = ''
3677         if cls._NETRC_MACHINE:
3678             if markdown:
3679                 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3680             else:
3681                 desc += f' [{cls._NETRC_MACHINE}]'
3682         if cls.IE_DESC is False:
3683             desc += ' [HIDDEN]'
3684         elif cls.IE_DESC:
3685             desc += f' {cls.IE_DESC}'
3686         if cls.SEARCH_KEY:
3687             desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3688             if search_examples:
3689                 _COUNTS = ('', '5', '10', 'all')
3690                 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3691         if not cls.working():
3692             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3693
3694         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3695         name = (' - **{}**'.format(re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME))) if markdown else cls.IE_NAME
3696         return f'{name}:{desc}' if desc else name
3697
3698     def extract_subtitles(self, *args, **kwargs):
3699         if (self.get_param('writesubtitles', False)
3700                 or self.get_param('listsubtitles')):
3701             return self._get_subtitles(*args, **kwargs)
3702         return {}
3703
    def _get_subtitles(self, *args, **kwargs):
        """Subclass hook called by extract_subtitles; this base version always raises NotImplementedError"""
        raise NotImplementedError('This method must be implemented by subclasses')
3706
    class CommentsDisabled(Exception):
        """
        Raise in _get_comments if comments are disabled for the video

        The function returned by extract_comments catches it and reports
        both "comments" and "comment_count" as None.
        """
3709
    def extract_comments(self, *args, **kwargs):
        """
        Return a function that collects the comments, or None if the
        "getcomments" param is not set

        self._get_comments(*args, **kwargs) is called right away, but the object
        it returns is only advanced (with next()) once the returned function is called.
        @returns    None, or a zero-argument function returning a dict with
                    the keys "comments" and "comment_count"
        """
        if not self.get_param('getcomments'):
            return None
        generator = self._get_comments(*args, **kwargs)

        def extractor():
            comments = []
            # Stays True unless the generator is exhausted normally
            interrupted = True
            try:
                while True:
                    comments.append(next(generator))
            except StopIteration:
                interrupted = False
            except KeyboardInterrupt:
                # Keep the comments collected so far
                self.to_screen('Interrupted by user')
            except self.CommentsDisabled:
                return {'comments': None, 'comment_count': None}
            except Exception as e:
                # Errors are only downgraded to a report when "ignoreerrors" is exactly True
                if self.get_param('ignoreerrors') is not True:
                    raise
                self._downloader.report_error(e)
            comment_count = len(comments)
            self.to_screen(f'Extracted {comment_count} comments')
            return {
                'comments': comments,
                # The count is not reported when extraction did not run to completion
                'comment_count': None if interrupted else comment_count,
            }
        return extractor
3738
    def _get_comments(self, *args, **kwargs):
        """
        Subclass hook called by extract_comments; this base version always raises NotImplementedError

        extract_comments advances the returned object with next(), so
        implementations are expected to return an iterator (e.g. be a generator).
        """
        raise NotImplementedError('This method must be implemented by subclasses')
3741
3742     @staticmethod
3743     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3744         """ Merge subtitle items for one language. Items with duplicated URLs/data
3745         will be dropped. """
3746         list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3747         ret = list(subtitle_list1)
3748         ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3749         return ret
3750
3751     @classmethod
3752     def _merge_subtitles(cls, *dicts, target=None):
3753         """ Merge subtitle dictionaries, language by language. """
3754         if target is None:
3755             target = {}
3756         for d in dicts:
3757             for lang, subs in d.items():
3758                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3759         return target
3760
3761     def extract_automatic_captions(self, *args, **kwargs):
3762         if (self.get_param('writeautomaticsub', False)
3763                 or self.get_param('listsubtitles')):
3764             return self._get_automatic_captions(*args, **kwargs)
3765         return {}
3766
    def _get_automatic_captions(self, *args, **kwargs):
        """Subclass hook called by extract_automatic_captions; this base version always raises NotImplementedError"""
        raise NotImplementedError('This method must be implemented by subclasses')
3769
3770     @functools.cached_property
3771     def _cookies_passed(self):
3772         """Whether cookies have been passed to YoutubeDL"""
3773         return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3774
3775     def mark_watched(self, *args, **kwargs):
3776         if not self.get_param('mark_watched', False):
3777             return
3778         if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3779             self._mark_watched(*args, **kwargs)
3780
    def _mark_watched(self, *args, **kwargs):
        """Subclass hook called by mark_watched; this base version always raises NotImplementedError"""
        raise NotImplementedError('This method must be implemented by subclasses')
3783
3784     def geo_verification_headers(self):
3785         headers = {}
3786         geo_verification_proxy = self.get_param('geo_verification_proxy')
3787         if geo_verification_proxy:
3788             headers['Ytdl-request-proxy'] = geo_verification_proxy
3789         return headers
3790
3791     @staticmethod
3792     def _generic_id(url):
3793         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3794
3795     def _generic_title(self, url='', webpage='', *, default=None):
3796         return (self._og_search_title(webpage, default=None)
3797                 or self._html_extract_title(webpage, default=None)
3798                 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3799                 or default)
3800
3801     def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3802         if not duration:
3803             return
3804         chapter_list = [{
3805             'start_time': start_function(chapter),
3806             'title': title_function(chapter),
3807         } for chapter in chapter_list or []]
3808         if strict:
3809             warn = self.report_warning
3810         else:
3811             warn = self.write_debug
3812             chapter_list.sort(key=lambda c: c['start_time'] or 0)
3813
3814         chapters = [{'start_time': 0}]
3815         for idx, chapter in enumerate(chapter_list):
3816             if chapter['start_time'] is None:
3817                 warn(f'Incomplete chapter {idx}')
3818             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3819                 chapters.append(chapter)
3820             elif chapter not in chapters:
3821                 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3822                          else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3823                 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3824         return chapters[1:]
3825
3826     def _extract_chapters_from_description(self, description, duration):
3827         duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3828         sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3829         return self._extract_chapters_helper(
3830             re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3831             start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3832             duration=duration, strict=False) or self._extract_chapters_helper(
3833             re.findall(sep_re % (r'.+?', duration_re), description or ''),
3834             start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3835             duration=duration, strict=False)
3836
3837     @staticmethod
3838     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3839         all_known = all(
3840             x is not None for x in
3841             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted))
3842         return (
3843             'private' if is_private
3844             else 'premium_only' if needs_premium
3845             else 'subscriber_only' if needs_subscription
3846             else 'needs_auth' if needs_auth
3847             else 'unlisted' if is_unlisted
3848             else 'public' if all_known
3849             else None)
3850
3851     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3852         '''
3853         @returns            A list of values for the extractor argument given by "key"
3854                             or "default" if no such key is present
3855         @param default      The default value to return when the key is not present (default: [])
3856         @param casesense    When false, the values are converted to lower case
3857         '''
3858         ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3859         val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3860         if val is None:
3861             return [] if default is NO_DEFAULT else default
3862         return list(val) if casesense else [x.lower() for x in val]
3863
3864     def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3865         if not playlist_id or not video_id:
3866             return not video_id
3867
3868         no_playlist = (smuggled_data or {}).get('force_noplaylist')
3869         if no_playlist is not None:
3870             return not no_playlist
3871
3872         video_id = '' if video_id is True else f' {video_id}'
3873         playlist_id = '' if playlist_id is True else f' {playlist_id}'
3874         if self.get_param('noplaylist'):
3875             self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3876             return False
3877         self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3878         return True
3879
3880     def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3881         RetryManager.report_retry(
3882             err, _count or int(fatal), _retries,
3883             info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3884             sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3885
3886     def RetryManager(self, **kwargs):
3887         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3888
3889     def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3890         display_id = traverse_obj(info_dict, 'display_id', 'id')
3891         self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3892         return self._downloader.get_info_extractor('Generic')._extract_embeds(
3893             smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3894
3895     @classmethod
3896     def extract_from_webpage(cls, ydl, url, webpage):
3897         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3898               else ydl.get_info_extractor(cls.ie_key()))
3899         for info in ie._extract_from_webpage(url, webpage) or []:
3900             # url = None since we do not want to set (webpage/original)_url
3901             ydl.add_default_extra_info(info, ie, None)
3902             yield info
3903
3904     @classmethod
3905     def _extract_from_webpage(cls, url, webpage):
3906         for embed_url in orderedSet(
3907                 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3908             yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3909
3910     @classmethod
3911     def _extract_embed_urls(cls, url, webpage):
3912         """@returns all the embed urls on the webpage"""
3913         if '_EMBED_URL_RE' not in cls.__dict__:
3914             assert isinstance(cls._EMBED_REGEX, (list, tuple))
3915             for idx, regex in enumerate(cls._EMBED_REGEX):
3916                 assert regex.count('(?P<url>') == 1, \
3917                     f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3918             cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3919
3920         for regex in cls._EMBED_URL_RE:
3921             for mobj in regex.finditer(webpage):
3922                 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3923                 if cls._VALID_URL is False or cls.suitable(embed_url):
3924                     yield embed_url
3925
    class StopExtraction(Exception):
        # NOTE(review): neither raised nor caught in this part of the file; presumably
        # a signal for stopping an extraction early - confirm against the users of it
        pass
3928
3929     @classmethod
3930     def _extract_url(cls, webpage):  # TODO: Remove
3931         """Only for compatibility with some older extractors"""
3932         return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3933
3934     @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        """
        Register `cls` as a plugin override of its parent class when `plugin_name` is given

        @param plugin_name  When falsy, no override logic runs and only
                            super().__init_subclass__(**kwargs) is called
        """
        if plugin_name:
            mro = inspect.getmro(cls)
            # The class that follows cls in its own MRO is the one being overridden
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
            cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
            # Follow the chain of __wrapped__ classes down to the innermost one
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            # Rebind the innermost class's name in its defining module to cls
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
            _PLUGIN_OVERRIDES[super_class].append(cls)

        return super().__init_subclass__(**kwargs)
3947
3948
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors of paged search queries.
    They accept "URLs" of the form {_SEARCH_KEY}{prefix}:{query}, where the prefix is
    empty (first result only), a positive integer (that many results)
    or "all" (up to _MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return rf'{cls._SEARCH_KEY}(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'

    def _real_extract(self, query):
        """Parse the prefix of the search "URL" and fetch the number of results it asks for"""
        mobj = self._match_valid_url(query)
        prefix, query = mobj.group('prefix'), mobj.group('query')
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        if prefix == '':
            return self._get_n_results(query, 1)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            # Cap the request at what the search backend supports
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # islice takes None (not infinity) to mean "no upper bound"
        limit = None if n == float('inf') else n
        entries = itertools.islice(self._search_results(query), 0, limit)
        return self.playlist_result(entries, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
3992
3993
class UnsupportedURLIE(InfoExtractor):
    """Catch-all extractor whose _real_extract always raises UnsupportedError for the url"""
    # Matches any URL
    _VALID_URL = '.*'
    # NOTE(review): presumably keeps this extractor out of the default set - confirm
    _ENABLED = False
    # IE_DESC = False is rendered as "[HIDDEN]" by InfoExtractor.description()
    IE_DESC = False

    def _real_extract(self, url):
        raise UnsupportedError(url)
4001
4002
# Filled by InfoExtractor.__init_subclass__: maps each overridden extractor class
# (the innermost one in a chain of overrides) to the list of plugin classes overriding it
_PLUGIN_OVERRIDES = collections.defaultdict(list)