import collections
import contextlib
import copy
import datetime as dt
import errno
import fileinput
import functools
import http.cookiejar
import io
import itertools
import json
import locale
import operator
import os
import random
import re
import shutil
import string
import subprocess
import sys
import tempfile
import time
import tokenize
import traceback
import unicodedata

from .cache import Cache
from .compat import urllib  # isort: split
from .compat import urllib_req_to_req
from .cookies import CookieLoadError, LenientSimpleCookie, load_cookies
from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
from .downloader.rtmp import rtmpdump_version
from .extractor import gen_extractor_classes, get_info_extractor
from .extractor.common import UnsupportedURLIE
from .extractor.openload import PhantomJSwrapper
from .minicurses import format_text
from .networking import HEADRequest, Request, RequestDirector
from .networking.common import _REQUEST_HANDLERS, _RH_PREFERENCES
from .networking.exceptions import (
    HTTPError,
    NoSupportingHandlers,
    RequestError,
    SSLError,
    network_exceptions,
)
from .networking.impersonate import ImpersonateRequestHandler
from .plugins import directories as plugin_directories
from .postprocessor import _PLUGIN_CLASSES as plugin_pps
from .postprocessor import (
    EmbedThumbnailPP,
    FFmpegFixupDuplicateMoovPP,
    FFmpegFixupDurationPP,
    FFmpegFixupM3u8PP,
    FFmpegFixupM4aPP,
    FFmpegFixupStretchedPP,
    FFmpegFixupTimestampPP,
    FFmpegMergerPP,
    FFmpegPostProcessor,
    FFmpegVideoConvertorPP,
    MoveFilesAfterDownloadPP,
    get_postprocessor,
)
from .postprocessor.ffmpeg import resolve_mapping as resolve_recode_mapping
from .update import (
    REPOSITORY,
    _get_system_deprecation,
    _make_label,
    current_git_head,
    detect_variant,
)
from .utils import (
    DEFAULT_OUTTMPL,
    IDENTITY,
    LINK_TEMPLATES,
    MEDIA_EXTENSIONS,
    NO_DEFAULT,
    NUMBER_RE,
    OUTTMPL_TYPES,
    POSTPROCESS_WHEN,
    STR_FORMAT_RE_TMPL,
    STR_FORMAT_TYPES,
    ContentTooShortError,
    DateRange,
    DownloadCancelled,
    DownloadError,
    EntryNotInPlaylist,
    ExistingVideoReached,
    ExtractorError,
    FormatSorter,
    GeoRestrictedError,
    ISO3166Utils,
    LazyList,
    MaxDownloadsReached,
    Namespace,
    PagedList,
    PlaylistEntries,
    Popen,
    PostProcessingError,
    ReExtractInfo,
    RejectedVideoReached,
    SameFileError,
    UnavailableVideoError,
    UserNotLive,
    YoutubeDLError,
    age_restricted,
    bug_reports_message,
    date_from_str,
    deprecation_warning,
    determine_ext,
    determine_protocol,
    encode_compat_str,
    escapeHTML,
    expand_path,
    extract_basic_auth,
    filesize_from_tbr,
    filter_dict,
    float_or_none,
    format_bytes,
    format_decimal_suffix,
    format_field,
    formatSeconds,
    get_compatible_ext,
    get_domain,
    int_or_none,
    iri_to_uri,
    is_path_like,
    join_nonempty,
    locked_file,
    make_archive_id,
    make_dir,
    number_of_digits,
    orderedSet,
    orderedSet_from_options,
    parse_filesize,
    preferredencoding,
    prepend_extension,
    remove_terminal_sequences,
    render_table,
    replace_extension,
    sanitize_filename,
    sanitize_path,
    sanitize_url,
    shell_quote,
    str_or_none,
    strftime_or_none,
    subtitles_filename,
    supports_terminal_sequences,
    system_identifier,
    timetuple_from_msec,
    to_high_limit_path,
    traverse_obj,
    try_call,
    try_get,
    url_basename,
    variadic,
    windows_enable_vt_mode,
    write_json_file,
    write_string,
)
from .utils._utils import _UnsafeExtensionError, _YDLLogger
from .utils.networking import (
    HTTPHeaderDict,
    clean_headers,
    clean_proxies,
    std_headers,
)
from .version import CHANNEL, ORIGIN, RELEASE_GIT_HEAD, VARIANT, __version__

if os.name == 'nt':
    import ctypes


def _catch_unsafe_extension_error(func):
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        try:
            return func(self, *args, **kwargs)
        except _UnsafeExtensionError as error:
            self.report_error(
                f'The extracted extension ({error.extension!r}) is unusual '
                'and will be skipped for safety reasons. '
                f'If you believe this is an error{bug_reports_message(",")}')

    return wrapper


class YoutubeDL:
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how to
    extract all the needed information (a task that InfoExtractors do),
    it has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".
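
    A minimal usage sketch (the URL and the option values here are
    illustrative, not defaults of this class):

        from yt_dlp import YoutubeDL

        with YoutubeDL({'format': 'best', 'outtmpl': '%(title)s.%(ext)s'}) as ydl:
            ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])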

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    ap_mso:            Adobe Pass multiple-system operator identifier.
    ap_username:       Multiple-system operator account username.
    ap_password:       Multiple-system operator account password.
    usenetrc:          Use netrc for authentication instead.
    netrc_location:    Location of the netrc file. Defaults to ~/.netrc.
    netrc_cmd:         Use a shell command to get credentials
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceprint:        A dict with keys WHEN mapped to a list of templates to
                       print to stdout. The allowed keys are video or any of the
                       items in utils.POSTPROCESS_WHEN.
                       For compatibility, a single list is also accepted
    print_to_file:     A dict with keys WHEN (same as forceprint) mapped to
                       a list of tuples with (template, filename)
    forcejson:         Force printing info_dict as JSON.
    dump_single_json:  Force printing the info_dict of the whole playlist
                       (or video) as a single JSON line.
    force_write_download_archive: Force writing download archive regardless
                       of 'skip_download' or 'simulate'.
    simulate:          Do not download the video files. If unset (or None),
                       simulate only if listsubtitles, listformats or list_thumbnails is used
    format:            Video format code. See "FORMAT SELECTION" for more details.
                       You can also pass a function. The function takes 'ctx' as
                       argument and returns the formats to download.
                       See "build_format_selector" for an implementation

    allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
    ignore_no_formats_error: Ignore "No video formats" error. Useful for
                       extracting metadata even if the video is not actually
                       available for download (experimental)
    format_sort:       A list of fields by which to sort the video formats.
                       See "Sorting Formats" for more details.
    format_sort_force: Force the given format_sort. See "Sorting Formats"
                       for more details.
    prefer_free_formats: Whether to prefer video formats with free containers
                       over non-free ones of the same quality.
    allow_multiple_video_streams: Allow multiple video streams to be merged
                       into a single file
    allow_multiple_audio_streams: Allow multiple audio streams to be merged
                       into a single file
    check_formats:     Whether to test if the formats are downloadable.
                       Can be True (check all), False (check none),
                       'selected' (check selected formats),
                       or None (check only if requested by extractor)
    paths:             Dictionary of output paths. The allowed keys are 'home',
                       'temp' and the keys of OUTTMPL_TYPES (in utils/_utils.py)
    outtmpl:           Dictionary of templates for output names. Allowed keys
                       are 'default' and the keys of OUTTMPL_TYPES (in utils/_utils.py).
                       For compatibility with youtube-dl, a single string can also be used
    outtmpl_na_placeholder: Placeholder for unavailable meta fields.
    restrictfilenames: Do not allow "&" and spaces in file names
    trim_file_name:    Limit length of filename (extension excluded)
    windowsfilenames:  Force the filenames to be Windows-compatible
    ignoreerrors:      Do not stop on download/postprocessing errors.
                       Can be 'only_download' to ignore only download errors.
                       Default is 'only_download' for CLI, but False for API
    skip_playlist_after_errors: Number of allowed failures until the rest of
                       the playlist is skipped
    allowed_extractors: List of regexes to match against extractor names that are allowed
    overwrites:        Overwrite all video and metadata files if True,
                       overwrite only non-video files if None
                       and don't overwrite any file if False
    playlist_items:    Specific indices of playlist to download.
    playlistrandom:    Download playlist items in random order.
    lazy_playlist:     Process playlist entries as they are received.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Print everything to stderr instead of stdout.
    consoletitle:      Display progress in the console window's titlebar.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video metadata to a .info.json file
    clean_infojson:    Remove internal metadata from the infojson
    getcomments:       Extract video comments. These will not be written to disk
                       unless writeinfojson is also given
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    allow_playlist_files: Whether to write playlists' description, infojson etc.
                       also to disk when using the 'write*' options
    write_all_thumbnails: Write all thumbnail formats to files
    writelink:         Write an internet shortcut file, depending on the
                       current platform (.url/.webloc/.desktop)
    writeurllink:      Write a Windows internet shortcut file (.url)
    writewebloclink:   Write a macOS internet shortcut file (.webloc)
    writedesktoplink:  Write a Linux internet shortcut file (.desktop)
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatically generated subtitles to a file
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download (can be regex).
                       The list may contain "all" to refer to all the available
                       subtitles. The language can be prefixed with a "-" to
                       exclude it from the requested languages, e.g. ['all', '-live_chat']
    keepvideo:         Keep the video file after post-processing
    daterange:         A utils.DateRange object; download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       False to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Videos unsuitable for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  A set, or the name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded again.
    break_on_existing: Stop the download process after attempting to download a
                       file that is in the archive.
    break_per_url:     Whether break_on_reject and break_on_existing
                       should act on each input URL as opposed to for the entire queue
    cookiefile:        File name or text stream from where cookies should be read and dumped to
    cookiesfrombrowser: A tuple containing the name of the browser, the profile
                       name/path from where cookies are loaded, the name of the keyring,
                       and the container name, e.g. ('chrome', ) or
                       ('vivaldi', 'default', 'BASICTEXT') or ('firefox', 'default', None, 'Meta')
    legacyserverconnect: Explicitly allow HTTPS connection to servers that do not
                       support RFC 5746 secure renegotiation
    nocheckcertificate: Do not verify SSL certificates
    client_certificate: Path to client certificate file in PEM format. May include the private key
    client_certificate_key: Path to private key file for client certificate
    client_certificate_password: Password for client certificate private key, if encrypted.
                       If not provided and the key is encrypted, yt-dlp will ask interactively
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       (Only supported by some extractors)
    enable_file_urls:  Enable file:// URLs. This is disabled by default for security reasons.
    http_headers:      A dictionary of custom headers to be used for all requests
    proxy:             URL of the proxy server to use
    geo_verification_proxy: URL of the proxy to use for IP address verification
                       on geo-restricted sites.
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic: Print out sent and received HTTP traffic
    default_search:    Prepend this string if an input URL is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified.
    extract_flat:      Whether to resolve and process url_results further
                       * False: Always process. Default for API
                       * True: Never process
                       * 'in_playlist': Do not process inside playlist/multi_video
                       * 'discard': Always process, but don't return the result
                         from inside playlist/multi_video
                       * 'discard_in_playlist': Same as "discard", but only for
                         playlists (not multi_video). Default for CLI
    wait_for_video:    If given, wait for scheduled streams to become available.
                       The value should be a tuple containing the range
                       (min_secs, max_secs) to wait between retries
    postprocessors:    A list of dictionaries, each with an entry
                       * key: The name of the postprocessor. See
                         yt_dlp/postprocessor/__init__.py for a list.
                       * when: When to run the postprocessor. Allowed values are
                         the entries of utils.POSTPROCESS_WHEN
                         Assumed to be 'post_process' if not given
    progress_hooks:    A list of functions that get called on download
                       progress, with a dictionary with the entries
                       * status: One of "downloading", "error", or "finished".
                         Check this first and ignore unknown values.
                       * info_dict: The extracted info_dict

                       If status is one of "downloading", or "finished", the
                       following properties may also be present:
                       * filename: The final filename (always present)
                       * tmpfilename: The filename we're currently writing to
                       * downloaded_bytes: Bytes on disk
                       * total_bytes: Size of the whole file, None if unknown
                       * total_bytes_estimate: Guess of the eventual file size,
                         None if unavailable.
                       * elapsed: The number of seconds since download started.
                       * eta: The estimated time in seconds, None if unknown
                       * speed: The download speed in bytes/second, None if
                         unknown
                       * fragment_index: The counter of the currently
                         downloaded video fragment.
                       * fragment_count: The number of fragments (= individual
                         files that will be merged)

                       Progress hooks are guaranteed to be called at least once
                       (with status "finished") if the download is successful.
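
                       For example, a minimal hook (illustrative):

                           def progress_hook(d):
                               if d['status'] == 'finished':
                                   print('Downloaded', d['filename'])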

    postprocessor_hooks: A list of functions that get called on postprocessing
                       progress, with a dictionary with the entries
                       * status: One of "started", "processing", or "finished".
                         Check this first and ignore unknown values.
                       * postprocessor: Name of the postprocessor
                       * info_dict: The extracted info_dict

                       Progress hooks are guaranteed to be called at least twice
                       (with status "started" and "finished") if the processing is successful.
    merge_output_format: "/" separated list of extensions to use when merging formats.
    final_ext:         Expected final extension; used to detect when the file was
                       already downloaded and converted
    fixup:             Automatically correct known faults of the file.
                       One of:
                       - "never": do nothing
                       - "warn": only emit a warning
                       - "detect_or_warn": check whether we can do anything
                         about it, warn otherwise (default)
    source_address:    Client-side IP address to bind to.
    impersonate:       Client to impersonate for requests.
                       An ImpersonateTarget (from yt_dlp.networking.impersonate)
    sleep_interval_requests: Number of seconds to sleep between requests
                       during extraction
    sleep_interval:    Number of seconds to sleep before each download when
                       used alone or a lower bound of a range for randomized
                       sleep before each download (minimum possible number
                       of seconds to sleep) when used along with
                       max_sleep_interval.
    max_sleep_interval: Upper bound of a range for randomized sleep before each
                       download (maximum possible number of seconds to sleep).
                       Must only be used along with sleep_interval.
                       Actual sleep time will be a random float from range
                       [sleep_interval; max_sleep_interval].
    sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
    match_filter:      A function that gets called for every video with the signature
                       (info_dict, *, incomplete: bool) -> Optional[str]
                       For backward compatibility with youtube-dl, the signature
                       (info_dict) -> Optional[str] is also allowed.
                       - If it returns a message, the video is ignored.
                       - If it returns None, the video is downloaded.
                       - If it returns utils.NO_DEFAULT, the user is interactively
                         asked whether to download the video.
                       - Raise utils.DownloadCancelled(msg) to abort remaining
                         downloads when a video is rejected.
                       match_filter_func in utils/_utils.py is one example for this.
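
                       A hedged sketch of such a filter (the duration threshold
                       is illustrative):

                           def match_filter(info_dict, *, incomplete):
                               if (info_dict.get('duration') or 0) > 3600:
                                   return 'Skipping videos longer than an hour'
                               return None  # proceed with download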

    color:             A dictionary with output stream names as keys
                       and their respective color policy as values.
                       Can also just be a single color policy,
                       in which case it applies to all outputs.
                       Valid stream names are 'stdout' and 'stderr'.
                       Valid color policies are one of 'always', 'auto',
                       'no_color', 'never', 'auto-tty' or 'no_color-tty'.
    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
                       HTTP header
    geo_bypass_country:
                       Two-letter ISO 3166-2 country code that will be used for
                       explicit geographic restriction bypassing via faking
                       X-Forwarded-For HTTP header
    geo_bypass_ip_block:
                       IP range in CIDR notation that will be used similarly to
                       geo_bypass_country
    external_downloader: A dictionary of protocol keys and the executable of the
                       external downloader to use for it. The allowed protocols
                       are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
                       Set the value to 'native' to use the native downloader
    compat_opts:       Compatibility options. See "Differences in default behavior".
                       The following options do not work when used through the API:
                       filename, abort-on-error, multistreams, no-live-chat,
                       format-sort, no-clean-infojson, no-playlist-metafiles,
                       no-keep-subs, no-attach-info-json, allow-unsafe-ext, prefer-vp9-sort.
                       Refer to __init__.py for their implementation
    progress_template: Dictionary of templates for progress outputs.
                       Allowed keys are 'download', 'postprocess',
                       'download-title' (console title) and 'postprocess-title'.
                       The template is mapped on a dictionary with keys 'progress' and 'info'
    retry_sleep_functions: Dictionary of functions that take the number of attempts
                       as argument and return the time to sleep in seconds.
                       Allowed keys are 'http', 'fragment', 'file_access'
    download_ranges:   A callback function that gets called for every video with
                       the signature (info_dict, ydl) -> Iterable[Section].
                       Only the returned sections will be downloaded.
                       Each Section is a dict with the following keys:
                       * start_time: Start time of the section in seconds
                       * end_time: End time of the section in seconds
                       * title: Section title (Optional)
                       * index: Section number (Optional)
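
                       A hedged sketch of such a callback (the time range is
                       illustrative):

                           def download_ranges(info_dict, ydl):
                               # download only the first 30 seconds of each video
                               return [{'start_time': 0, 'end_time': 30}]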

    force_keyframes_at_cuts: Re-encode the video when downloading ranges to get precise cuts
    noprogress:        Do not print the progress bar
    live_from_start:   Whether to download livestream videos from the start

    The following parameters are not used by YoutubeDL itself, they are used by
    the downloader (see yt_dlp/downloader/common.py):
    nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
    max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries,
    continuedl, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
    external_downloader_args, concurrent_fragment_downloads, progress_delta.

    The following options are used by the post processors:
    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
                       to the binary or its containing directory.
    postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
                       and a list of additional command-line arguments for the
                       postprocessor/executable. The dict can also have "PP+EXE" keys
                       which are used when the given exe is used by the given PP.
                       Use 'default' as the name for arguments to be passed to all PP
                       For compatibility with youtube-dl, a single list of args
                       can also be used

    The following options are used by the extractors:
    extractor_retries: Number of times to retry for known errors (default: 3)
    dynamic_mpd:       Whether to process dynamic DASH manifests (default: True)
    hls_split_discontinuity: Split HLS playlists into different formats at
                       discontinuities such as ad breaks (default: False)
    extractor_args:    A dictionary of arguments to be passed to the extractors.
                       See "EXTRACTOR ARGUMENTS" for details.
                       E.g. {'youtube': {'skip': ['dash', 'hls']}}
    mark_watched:      Mark videos watched (even with --simulate). Only for YouTube

    The following options are deprecated and may be removed in the future:

    break_on_reject:   Stop the download process when encountering a video that
                       has been filtered out.
                       - `raise DownloadCancelled(msg)` in match_filter instead
    force_generic_extractor: Force downloader to use the generic extractor
                       - Use allowed_extractors = ['generic', 'default']
    playliststart:     - Use playlist_items
                       Playlist item to start at.
    playlistend:       - Use playlist_items
                       Playlist item to end at.
    playlistreverse:   - Use playlist_items
                       Download playlist items in reverse order.
    forceurl:          - Use forceprint
                       Force printing final URL.
    forcetitle:        - Use forceprint
                       Force printing title.
    forceid:           - Use forceprint
                       Force printing ID.
    forcethumbnail:    - Use forceprint
                       Force printing thumbnail URL.
    forcedescription:  - Use forceprint
                       Force printing description.
    forcefilename:     - Use forceprint
                       Force printing final filename.
    forceduration:     - Use forceprint
                       Force printing duration.
    allsubtitles:      - Use subtitleslangs = ['all']
                       Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    include_ads:       - Doesn't work
                       Download ads as well
    call_home:         - Not implemented
                       Boolean, true if we are allowed to contact the
                       yt-dlp servers for debugging.
    post_hooks:        - Register a custom postprocessor
                       A list of functions that get called as the final step
                       for each video file, after all postprocessors have been
                       called. The filename will be passed as the only argument.
    hls_prefer_native: - Use external_downloader = {'m3u8': 'native'} or {'m3u8': 'ffmpeg'}.
                       Use the native HLS downloader instead of ffmpeg/avconv
                       if True, otherwise use ffmpeg/avconv if False, otherwise
                       use downloader suggested by extractor if None.
    prefer_ffmpeg:     - avconv support is deprecated
                       If False, use avconv instead of ffmpeg if both are available,
                       otherwise prefer ffmpeg.
    youtube_include_dash_manifest: - Use extractor_args
                       If True (default), DASH manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about DASH. (only for youtube)
    youtube_include_hls_manifest: - Use extractor_args
                       If True (default), HLS manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about HLS. (only for youtube)
    no_color:          Same as `color='no_color'`
    no_overwrites:     Same as `overwrites=False`
    """

    _NUMERIC_FIELDS = {
        'width', 'height', 'asr', 'audio_channels', 'fps',
        'tbr', 'abr', 'vbr', 'filesize', 'filesize_approx',
        'timestamp', 'release_timestamp',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
    }

    _format_fields = {
        # NB: Keep in sync with the docstring of extractor/common.py
        'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note',
        'width', 'height', 'aspect_ratio', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels',
        'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns',
        'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'is_dash_periods', 'request_data',
        'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies',
        'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'extra_param_to_key_url',
        'hls_aes', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version',
        'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time',
    }

    _deprecated_multivalue_fields = {
        'album_artist': 'album_artists',
        'artist': 'artists',
        'composer': 'composers',
        'creator': 'creators',
        'genre': 'genres',
    }

    _format_selection_exts = {
        'audio': set(MEDIA_EXTENSIONS.common_audio),
        'video': {*MEDIA_EXTENSIONS.common_video, '3gp'},
        'storyboards': set(MEDIA_EXTENSIONS.storyboards),
    }

    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        @param auto_init    Whether to load the default extractors and print header (if verbose).
                            Set to 'no_verbose_header' to not print the header
        """
        if params is None:
            params = {}
        self.params = params
        self._ies = {}
        self._ies_instances = {}
        self._pps = {k: [] for k in POSTPROCESS_WHEN}
        self._printed_messages = set()
        self._first_webpage_request = True
        self._post_hooks = []
        self._progress_hooks = []
        self._postprocessor_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        self._num_videos = 0
        self._playlist_level = 0
        self._playlist_urls = set()
        self.cache = Cache(self)
        self.__header_cookies = []

        stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout
        self._out_files = Namespace(
            out=stdout,
            error=sys.stderr,
            screen=sys.stderr if self.params.get('quiet') else stdout,
            console=None if os.name == 'nt' else next(
                filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None),
        )

        try:
            windows_enable_vt_mode()
        except Exception as e:
            self.write_debug(f'Failed to enable VT mode: {e}')

        if self.params.get('no_color'):
            if self.params.get('color') is not None:
                self.params.setdefault('_warnings', []).append(
                    'Overwriting params from "color" with "no_color"')
            self.params['color'] = 'no_color'

        term_allow_color = os.getenv('TERM', '').lower() != 'dumb'
        base_no_color = bool(os.getenv('NO_COLOR'))

        def process_color_policy(stream):
            stream_name = {sys.stdout: 'stdout', sys.stderr: 'stderr'}[stream]
            policy = traverse_obj(self.params, ('color', (stream_name, None), {str}, any)) or 'auto'
            if policy in ('auto', 'auto-tty', 'no_color-tty'):
                no_color = base_no_color
                if policy.endswith('tty'):
                    no_color = policy.startswith('no_color')
                if term_allow_color and supports_terminal_sequences(stream):
                    return 'no_color' if no_color else True
                return False
            assert policy in ('always', 'never', 'no_color'), policy
            return {'always': True, 'never': False}.get(policy, policy)

        self._allow_colors = Namespace(**{
            name: process_color_policy(stream)
            for name, stream in self._out_files.items_ if name != 'console'
        })

        system_deprecation = _get_system_deprecation()
        if system_deprecation:
            self.deprecated_feature(system_deprecation.replace('\n', '\n '))

        if self.params.get('allow_unplayable_formats'):
            self.report_warning(
                f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. '
                'This is a developer option intended for debugging. \n'
                ' If you experience any issues while using this option, '
                f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report')

        if self.params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = shutil.get_terminal_size().columns
                width_args = [] if width is None else ['-w', str(width)]
                sp_kwargs = {'stdin': subprocess.PIPE, 'stdout': slave, 'stderr': self._out_files.error}
                try:
                    self._output_process = Popen(['bidiv', *width_args], **sp_kwargs)
                except OSError:
                    self._output_process = Popen(['fribidi', '-c', 'UTF-8', *width_args], **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning(
                        'Could not find fribidi executable, ignoring --bidi-workaround. '
                        'Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        self.params['compat_opts'] = set(self.params.get('compat_opts', ()))
        self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers'))
        self._load_cookies(self.params['http_headers'].get('Cookie'))  # compat
        self.params['http_headers'].pop('Cookie', None)

        if auto_init and auto_init != 'no_verbose_header':
            self.print_debug_header()

        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning(f'{option} is deprecated. Use {suggestion} instead')
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
        check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')

        for msg in self.params.get('_warnings', []):
            self.report_warning(msg)
        for msg in self.params.get('_deprecation_warnings', []):
            self.deprecated_feature(msg)

        if impersonate_target := self.params.get('impersonate'):
            if not self._impersonate_target_available(impersonate_target):
                raise YoutubeDLError(
                    f'Impersonate target "{impersonate_target}" is not available. '
                    f'Use --list-impersonate-targets to see available targets. '
                    f'You may be missing dependencies required to support this target.')

        if 'list-formats' in self.params['compat_opts']:
            self.params['listformats_table'] = False

        if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None:
            # nooverwrites was unnecessarily changed to overwrites
            # in 0c3d0f51778b153f65c21906031c2e091fcfb641
            # This ensures compatibility with both keys
            self.params['overwrites'] = not self.params['nooverwrites']
        elif self.params.get('overwrites') is None:
            self.params.pop('overwrites', None)
        else:
            self.params['nooverwrites'] = not self.params['overwrites']

        if self.params.get('simulate') is None and any((
            self.params.get('list_thumbnails'),
            self.params.get('listformats'),
            self.params.get('listsubtitles'),
        )):
            self.params['simulate'] = 'list_only'

        self.params.setdefault('forceprint', {})
        self.params.setdefault('print_to_file', {})

        # Compatibility with older syntax
        if not isinstance(params['forceprint'], dict):
            self.params['forceprint'] = {'video': params['forceprint']}

        if auto_init:
            self.add_default_info_extractors()

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not self.params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        self._parse_outtmpl()

        # Creating format selector here allows us to catch syntax errors before the extraction
        self.format_selector = (
            self.params.get('format') if self.params.get('format') in (None, '-')
            else self.params['format'] if callable(self.params['format'])
            else self.build_format_selector(self.params['format']))

        hooks = {
            'post_hooks': self.add_post_hook,
            'progress_hooks': self.add_progress_hook,
            'postprocessor_hooks': self.add_postprocessor_hook,
        }
        for opt, fn in hooks.items():
            for ph in self.params.get(opt, []):
                fn(ph)

        for pp_def_raw in self.params.get('postprocessors', []):
            pp_def = dict(pp_def_raw)
            when = pp_def.pop('when', 'post_process')
            self.add_post_processor(
                get_postprocessor(pp_def.pop('key'))(self, **pp_def),
                when=when)

        def preload_download_archive(fn):
            """Preload the archive, if any is specified"""
            archive = set()
            if fn is None:
                return archive
            elif not is_path_like(fn):
                return fn

            self.write_debug(f'Loading archive file {fn!r}')
            try:
                with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                    for line in archive_file:
                        archive.add(line.strip())
            except OSError as ioe:
                if ioe.errno != errno.ENOENT:
                    raise
            return archive

        self.archive = preload_download_archive(self.params.get('download_archive'))

    def warn_if_short_id(self, argv):
        # short YouTube ID starting with dash?
        idxs = [
            i for i, a in enumerate(argv)
            if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
        if idxs:
            correct_argv = (
                ['yt-dlp']
                + [a for i, a in enumerate(argv) if i not in idxs]
                + ['--'] + [argv[i] for i in idxs]
            )
            self.report_warning(
                'Long argument string detected. '
                f'Use -- to separate parameters and URLs, like this:\n{shell_quote(correct_argv)}')

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        ie_key = ie.ie_key()
        self._ies[ie_key] = ie
        if not isinstance(ie, type):
            self._ies_instances[ie_key] = ie
            ie.set_downloader(self)

    def get_info_extractor(self, ie_key):
        """
        Get an instance of an IE with name ie_key; it will try to get one from
        the _ies list, and if there is no instance, it will create a new one
        and add it to the extractor list.
        """
        ie = self._ies_instances.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)()
            self.add_info_extractor(ie)
        return ie

    def add_default_info_extractors(self):
        """
        Add the InfoExtractors returned by gen_extractors to the end of the list
        """
        all_ies = {ie.IE_NAME.lower(): ie for ie in gen_extractor_classes()}
        all_ies['end'] = UnsupportedURLIE()
        try:
            ie_names = orderedSet_from_options(
                self.params.get('allowed_extractors', ['default']), {
                    'all': list(all_ies),
                    'default': [name for name, ie in all_ies.items() if ie._ENABLED],
                }, use_regex=True)
        except re.error as e:
            raise ValueError(f'Wrong regex for allowed_extractors: {e.pattern}')
        for name in ie_names:
            self.add_info_extractor(all_ies[name])
        self.write_debug(f'Loaded {len(ie_names)} extractors')

    def add_post_processor(self, pp, when='post_process'):
        """Add a PostProcessor object to the end of the chain."""
        assert when in POSTPROCESS_WHEN, f'Invalid when={when}'
        self._pps[when].append(pp)
        pp.set_downloader(self)

    def add_post_hook(self, ph):
        """Add the post hook"""
        self._post_hooks.append(ph)

    def add_progress_hook(self, ph):
        """Add the download progress hook"""
        self._progress_hooks.append(ph)

    def add_postprocessor_hook(self, ph):
        """Add the postprocessing progress hook"""
        self._postprocessor_hooks.append(ph)
        for pps in self._pps.values():
            for pp in pps:
                pp.add_progress_hook(ph)

    def _bidi_workaround(self, message):
        if not hasattr(self, '_output_channel'):
            return message

        assert hasattr(self, '_output_process')
        assert isinstance(message, str)
        line_count = message.count('\n') + 1
        self._output_process.stdin.write((message + '\n').encode())
        self._output_process.stdin.flush()
        res = ''.join(self._output_channel.readline().decode()
                      for _ in range(line_count))
        return res[:-len('\n')]

    def _write_string(self, message, out=None, only_once=False):
        if only_once:
            if message in self._printed_messages:
                return
            self._printed_messages.add(message)
        write_string(message, out=out, encoding=self.params.get('encoding'))

    def to_stdout(self, message, skip_eol=False, quiet=None):
        """Print message to stdout"""
        if quiet is not None:
            self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. '
                                     'Use "YoutubeDL.to_screen" instead')
        if skip_eol is not False:
            self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument skip_eol. '
                                     'Use "YoutubeDL.to_screen" instead')
        self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out)

    def to_screen(self, message, skip_eol=False, quiet=None, only_once=False):
        """Print message to screen if not in quiet mode"""
        if self.params.get('logger'):
            self.params['logger'].debug(message)
            return
        if (self.params.get('quiet') if quiet is None else quiet) and not self.params.get('verbose'):
            return
        self._write_string(
            '{}{}'.format(self._bidi_workaround(message), ('' if skip_eol else '\n')),
            self._out_files.screen, only_once=only_once)

    def to_stderr(self, message, only_once=False):
        """Print message to stderr"""
        assert isinstance(message, str)
        if self.params.get('logger'):
            self.params['logger'].error(message)
        else:
            self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once)

    def _send_console_code(self, code):
        if os.name == 'nt' or not self._out_files.console:
            return
        self._write_string(code, self._out_files.console)

    def to_console_title(self, message):
        if not self.params.get('consoletitle', False):
            return
        message = remove_terminal_sequences(message)
        if os.name == 'nt':
            if ctypes.windll.kernel32.GetConsoleWindow():
                # c_wchar_p() might not be necessary if `message` is
                # already of type unicode()
                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        else:
            self._send_console_code(f'\033]0;{message}\007')

    def save_console_title(self):
        if not self.params.get('consoletitle') or self.params.get('simulate'):
            return
        self._send_console_code('\033[22;0t')  # Save the title on stack

    def restore_console_title(self):
        if not self.params.get('consoletitle') or self.params.get('simulate'):
            return
        self._send_console_code('\033[23;0t')  # Restore the title from stack

    def __enter__(self):
        self.save_console_title()
        return self

    def save_cookies(self):
        if self.params.get('cookiefile') is not None:
            self.cookiejar.save()

    def __exit__(self, *args):
        self.restore_console_title()
        self.close()

    def close(self):
        self.save_cookies()
        if '_request_director' in self.__dict__:
            self._request_director.close()
            del self._request_director

    def trouble(self, message=None, tb=None, is_error=True):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        @param tb          If given, is additional traceback information
        @param is_error    Whether to raise error according to ignoreerrors
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            if tb:
                self.to_stderr(tb)
        if not is_error:
            return
        if not self.params.get('ignoreerrors'):
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1

    Styles = Namespace(
        HEADERS='yellow',
        EMPHASIS='light blue',
        FILENAME='green',
        ID='green',
        DELIM='blue',
        ERROR='red',
        BAD_FORMAT='light red',
        WARNING='yellow',
        SUPPRESS='light black',
    )

    def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False):
        text = str(text)
        if test_encoding:
            original_text = text
            # handle.encoding can be None. See https://github.com/yt-dlp/yt-dlp/issues/2711
            encoding = self.params.get('encoding') or getattr(handle, 'encoding', None) or 'ascii'
            text = text.encode(encoding, 'ignore').decode(encoding)
            if fallback is not None and text != original_text:
                text = fallback
        return format_text(text, f) if allow_colors is True else text if fallback is None else fallback

    def _format_out(self, *args, **kwargs):
        return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs)

    def _format_screen(self, *args, **kwargs):
        return self._format_text(self._out_files.screen, self._allow_colors.screen, *args, **kwargs)

    def _format_err(self, *args, **kwargs):
        return self._format_text(self._out_files.error, self._allow_colors.error, *args, **kwargs)

    def report_warning(self, message, only_once=False):
        """
        Print the message to stderr; it will be prefixed with 'WARNING:'.
        If stderr is a tty file, the 'WARNING:' will be colored.
        """
        if self.params.get('logger') is not None:
            self.params['logger'].warning(message)
        else:
            if self.params.get('no_warnings'):
                return
            self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once)

    def deprecation_warning(self, message, *, stacklevel=0):
        deprecation_warning(
            message, stacklevel=stacklevel + 1, printer=self.report_error, is_error=False)

    def deprecated_feature(self, message):
        if self.params.get('logger') is not None:
            self.params['logger'].warning(f'Deprecated Feature: {message}')
        self.to_stderr(f'{self._format_err("Deprecated Feature:", self.Styles.ERROR)} {message}', True)

    def report_error(self, message, *args, **kwargs):
        """
        Do the same as trouble, but prefixes the message with 'ERROR:', colored
        in red if stderr is a tty file.
        """
        self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', *args, **kwargs)

    def write_debug(self, message, only_once=False):
        """Log debug message to the logger, or print it to stderr"""
        if not self.params.get('verbose', False):
            return
        message = f'[debug] {message}'
        if self.params.get('logger'):
            self.params['logger'].debug(message)
        else:
            self.to_stderr(message, only_once)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(f'[download] {file_name} has already been downloaded')
        except UnicodeEncodeError:
            self.to_screen('[download] The file has already been downloaded')

    def report_file_delete(self, file_name):
        """Report that existing file will be deleted."""
        try:
            self.to_screen(f'Deleting existing file {file_name}')
        except UnicodeEncodeError:
            self.to_screen('Deleting existing file')

    def raise_no_formats(self, info, forced=False, *, msg=None):
        has_drm = info.get('_has_drm')
        ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg)
        msg = msg or (has_drm and 'This video is DRM protected') or 'No video formats found!'
        if forced or not ignored:
            raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
                                 expected=has_drm or ignored or expected)
        else:
            self.report_warning(msg)

    def parse_outtmpl(self):
        self.deprecation_warning('"YoutubeDL.parse_outtmpl" is deprecated and may be removed in a future version')
        self._parse_outtmpl()
        return self.params['outtmpl']

    def _parse_outtmpl(self):
        sanitize = IDENTITY
        if self.params.get('restrictfilenames'):  # Remove spaces in the default template
            sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-')

        outtmpl = self.params.setdefault('outtmpl', {})
        if not isinstance(outtmpl, dict):
            self.params['outtmpl'] = outtmpl = {'default': outtmpl}
        outtmpl.update({k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() if outtmpl.get(k) is None})

    def get_output_path(self, dir_type='', filename=None):
        paths = self.params.get('paths', {})
        assert isinstance(paths, dict), '"paths" parameter must be a dictionary'
        path = os.path.join(
            expand_path(paths.get('home', '').strip()),
            expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
            filename or '')
        return sanitize_path(path, force=self.params.get('windowsfilenames'))
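
    # Hedged usage sketch (hypothetical 'paths' values, for illustration only):
    # with params={'paths': {'home': '/videos', 'subtitle': 'subs'}},
    # get_output_path('subtitle', 'a.en.vtt') would join these into
    # '/videos/subs/a.en.vtt' before sanitization.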

    @staticmethod
    def _outtmpl_expandpath(outtmpl):
        # expand_path translates '%%' into '%' and '$$' into '$';
        # that is not what we want, since we need to keep '%%' intact for
        # the template dict substitution step. Work around this with a
        # boundary-alike separator hack.
        sep = ''.join(random.choices(string.ascii_letters, k=32))
        outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$')

        # outtmpl should be expand_path'ed before template dict substitution
        # because meta fields may contain env variables we don't want to
        # be expanded. E.g. for outtmpl "%(title)s.%(ext)s" and
        # title "Hello $PATH", we don't want `$PATH` to be expanded.
        return expand_path(outtmpl).replace(sep, '')

    @staticmethod
    def escape_outtmpl(outtmpl):
        """ Escape any remaining strings like %s, %abc% etc. """
        return re.sub(
            STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
            lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
            outtmpl)
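
    # Hedged example of the intended effect (illustrative input/output):
    # escape_outtmpl('100% of %(title)s') should yield '100%% of %(title)s',
    # doubling the stray '%' while leaving the real template field intact.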

    @classmethod
    def validate_outtmpl(cls, outtmpl):
        """ @return None or Exception object """
        outtmpl = re.sub(
            STR_FORMAT_RE_TMPL.format('[^)]*', '[ljhqBUDS]'),
            lambda mobj: f'{mobj.group(0)[:-1]}s',
            cls._outtmpl_expandpath(outtmpl))
        try:
            cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
            return None
        except ValueError as err:
            return err
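
    # Usage sketch (illustrative): a substitutable template validates to None,
    # e.g. YoutubeDL.validate_outtmpl('%(title)s.%(ext)s') is None, while a
    # broken template makes this return (not raise) the ValueError.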

    @staticmethod
    def _copy_infodict(info_dict):
        info_dict = dict(info_dict)
        info_dict.pop('__postprocessors', None)
        info_dict.pop('__pending_error', None)
        return info_dict

    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
        """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict
        @param sanitize    Whether to sanitize the output as a filename.
                           For backward compatibility, a function can also be passed
        """

        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

        info_dict = self._copy_infodict(info_dict)
        info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
            formatSeconds(info_dict['duration'], '-' if sanitize else ':')
            if info_dict.get('duration', None) is not None
            else None)
        info_dict['autonumber'] = int(self.params.get('autonumber_start', 1) - 1 + self._num_downloads)
        info_dict['video_autonumber'] = self._num_videos
        if info_dict.get('resolution') is None:
            info_dict['resolution'] = self.format_resolution(info_dict, default=None)

        # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
        # of %(field)s to %(field)0Nd for backward compatibility
        field_size_compat_map = {
            'playlist_index': number_of_digits(info_dict.get('__last_playlist_index') or 0),
            'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0),
            'autonumber': self.params.get('autonumber_size') or 5,
        }

        TMPL_DICT = {}
        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljhqBUDS]'))
        MATH_FUNCTIONS = {
            '+': float.__add__,
            '-': float.__sub__,
            '*': float.__mul__,
        }
        # Field is of the form key1.key2...
        # where keys (except first) can be string, int, slice or "{field, ...}"
        FIELD_INNER_RE = r'(?:\w+|%(num)s|%(num)s?(?::%(num)s?){1,2})' % {'num': r'(?:-?\d+)'}  # noqa: UP031
        FIELD_RE = r'\w*(?:\.(?:%(inner)s|{%(field)s(?:,%(field)s)*}))*' % {  # noqa: UP031
            'inner': FIELD_INNER_RE,
            'field': rf'\w*(?:\.{FIELD_INNER_RE})*',
        }
        MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})'
        MATH_OPERATORS_RE = r'(?:{})'.format('|'.join(map(re.escape, MATH_FUNCTIONS.keys())))
        INTERNAL_FORMAT_RE = re.compile(rf'''(?xs)
            (?P<negate>-)?
            (?P<fields>{FIELD_RE})
            (?P<maths>(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*)
            (?:>(?P<strf_format>.+?))?
            (?P<remaining>
                (?P<alternate>(?<!\\),[^|&)]+)?
                (?:&(?P<replacement>.*?))?
                (?:\|(?P<default>.*?))?
            )$''')
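
        # Illustrative keys this grammar accepts (cf. "OUTPUT TEMPLATE" in the
        # README, where these examples come from): 'duration>%H-%M-%S'
        # (date/time formatting), 'n_entries+1-playlist_index' (maths) and
        # 'release_date>%Y,upload_date>%Y|Unknown' (alternates with a default).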

        def _from_user_input(field):
            if field == ':':
                return ...
            elif ':' in field:
                return slice(*map(int_or_none, field.split(':')))
            elif int_or_none(field) is not None:
                return int(field)
            return field

        def _traverse_infodict(fields):
            fields = [f for x in re.split(r'\.({.+?})\.?', fields)
                      for f in ([x] if x.startswith('{') else x.split('.'))]
            for i in (0, -1):
                if fields and not fields[i]:
                    fields.pop(i)

            for i, f in enumerate(fields):
                if not f.startswith('{'):
                    fields[i] = _from_user_input(f)
                    continue
                assert f.endswith('}'), f'No closing brace for {f} in {fields}'
                fields[i] = {k: list(map(_from_user_input, k.split('.'))) for k in f[1:-1].split(',')}

            return traverse_obj(info_dict, fields, traverse_string=True)

        def get_value(mdict):
            # Object traversal
            value = _traverse_infodict(mdict['fields'])
            # Negative
            if mdict['negate']:
                value = float_or_none(value)
                if value is not None:
                    value *= -1
            # Do maths
            offset_key = mdict['maths']
            if offset_key:
                value = float_or_none(value)
                operator = None
                while offset_key:
                    item = re.match(
                        MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
                        offset_key).group(0)
                    offset_key = offset_key[len(item):]
                    if operator is None:
                        operator = MATH_FUNCTIONS[item]
                        continue
                    item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
                    offset = float_or_none(item)
                    if offset is None:
                        offset = float_or_none(_traverse_infodict(item))
                    try:
                        value = operator(value, multiplier * offset)
                    except (TypeError, ZeroDivisionError):
                        return None
                    operator = None
            # Datetime formatting
            if mdict['strf_format']:
                value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))

            # XXX: Workaround for https://github.com/yt-dlp/yt-dlp/issues/4485
            if sanitize and value == '':
                value = None
            return value

        na = self.params.get('outtmpl_na_placeholder', 'NA')

        def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
            return sanitize_filename(str(value), restricted=restricted, is_id=(
                bool(re.search(r'(^|[_.])id(\.|$)', key))
                if 'filename-sanitization' in self.params['compat_opts']
                else NO_DEFAULT))

        sanitizer = sanitize if callable(sanitize) else filename_sanitizer
        sanitize = bool(sanitize)

        def _dumpjson_default(obj):
            if isinstance(obj, (set, LazyList)):
                return list(obj)
            return repr(obj)

        class _ReplacementFormatter(string.Formatter):
            def get_field(self, field_name, args, kwargs):
                if field_name.isdigit():
                    return args[0], -1
                raise ValueError('Unsupported field')

        replacement_formatter = _ReplacementFormatter()

        def create_key(outer_mobj):
            if not outer_mobj.group('has_key'):
                return outer_mobj.group(0)
            key = outer_mobj.group('key')
            mobj = re.match(INTERNAL_FORMAT_RE, key)
            value, replacement, default, last_field = None, None, na, ''
            while mobj:
                mobj = mobj.groupdict()
                default = mobj['default'] if mobj['default'] is not None else default
                value = get_value(mobj)
                last_field, replacement = mobj['fields'], mobj['replacement']
                if value is None and mobj['alternate']:
                    mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:])
                else:
                    break

            if None not in (value, replacement):
                try:
                    value = replacement_formatter.format(replacement, value)
                except ValueError:
                    value, default = None, na

            fmt = outer_mobj.group('format')
            if fmt == 's' and last_field in field_size_compat_map and isinstance(value, int):
                fmt = f'0{field_size_compat_map[last_field]:d}d'

            flags = outer_mobj.group('conversion') or ''
            str_fmt = f'{fmt[:-1]}s'
            if value is None:
                value, fmt = default, 's'
            elif fmt[-1] == 'l':  # list
                delim = '\n' if '#' in flags else ', '
                value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt
            elif fmt[-1] == 'j':  # json
                value, fmt = json.dumps(
                    value, default=_dumpjson_default,
                    indent=4 if '#' in flags else None, ensure_ascii='+' not in flags), str_fmt
            elif fmt[-1] == 'h':  # html
                value, fmt = escapeHTML(str(value)), str_fmt
            elif fmt[-1] == 'q':  # quoted
                value = map(str, variadic(value) if '#' in flags else [value])
                value, fmt = shell_quote(value, shell=True), str_fmt
            elif fmt[-1] == 'B':  # bytes
                value = f'%{str_fmt}'.encode() % str(value).encode()
                value, fmt = value.decode('utf-8', 'ignore'), 's'
            elif fmt[-1] == 'U':  # unicode normalized
                value, fmt = unicodedata.normalize(
                    # "+" = compatibility equivalence, "#" = NFD
                    'NF{}{}'.format('K' if '+' in flags else '', 'D' if '#' in flags else 'C'),
                    value), str_fmt
            elif fmt[-1] == 'D':  # decimal suffix
                num_fmt, fmt = fmt[:-1].replace('#', ''), 's'
                value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s',
                                              factor=1024 if '#' in flags else 1000)
            elif fmt[-1] == 'S':  # filename sanitization
                value, fmt = filename_sanitizer(last_field, value, restricted='#' in flags), str_fmt
            elif fmt[-1] == 'c':
                if value:
                    value = str(value)[0]
                else:
                    fmt = str_fmt
            elif fmt[-1] not in 'rsa':  # numeric
                value = float_or_none(value)
                if value is None:
                    value, fmt = default, 's'

            if sanitize:
                # If value is an object, sanitize might convert it to a string
                # So we convert it to repr first
                if fmt[-1] == 'r':
                    value, fmt = repr(value), str_fmt
                elif fmt[-1] == 'a':
                    value, fmt = ascii(value), str_fmt
                if fmt[-1] in 'csra':
                    value = sanitizer(last_field, value)

            key = '{}\0{}'.format(key.replace('%', '%\0'), outer_mobj.group('format'))
            TMPL_DICT[key] = value
            return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))

        return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT
1416 def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs):
1417 outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
1418 return self.escape_outtmpl(outtmpl) % info_dict
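# An illustrative sketch of how the template machinery above behaves, assuming
# a YoutubeDL instance `ydl` and a hypothetical info dict; `|` supplies a
# default when a field is missing, as handled by create_key() above:
#
#   info = {'id': 'abc123', 'title': 'Example', 'ext': 'mp4'}
#   ydl.evaluate_outtmpl('%(title)s [%(id)s].%(ext)s', info)  # 'Example [abc123].mp4'
#   ydl.evaluate_outtmpl('%(uploader|Unknown)s', info)        # 'Unknown'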
1420 @_catch_unsafe_extension_error
1421 def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None):
1422 assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive'
1423 if outtmpl is None:
1424 outtmpl = self.params['outtmpl'].get(tmpl_type or 'default', self.params['outtmpl']['default'])
1425 try:
1426 outtmpl = self._outtmpl_expandpath(outtmpl)
1427 filename = self.evaluate_outtmpl(outtmpl, info_dict, True)
1428 if not filename:
1429 return None
1431 if tmpl_type in ('', 'temp'):
1432 final_ext, ext = self.params.get('final_ext'), info_dict.get('ext')
1433 if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'):
1434 filename = replace_extension(filename, ext, final_ext)
1435 elif tmpl_type:
1436 force_ext = OUTTMPL_TYPES[tmpl_type]
1437 if force_ext:
1438 filename = replace_extension(filename, force_ext, info_dict.get('ext'))
1440 # https://github.com/blackjack4494/youtube-dlc/issues/85
1441 trim_file_name = self.params.get('trim_file_name', False)
1442 if trim_file_name:
1443 no_ext, *ext = filename.rsplit('.', 2)
1444 filename = join_nonempty(no_ext[:trim_file_name], *ext, delim='.')
1446 return filename
1447 except ValueError as err:
1448 self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
1449 return None
1451 def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False):
1452 """Generate the output filename"""
1453 if outtmpl:
1454 assert not dir_type, 'outtmpl and dir_type are mutually exclusive'
1455 dir_type = None
1456 filename = self._prepare_filename(info_dict, tmpl_type=dir_type, outtmpl=outtmpl)
1457 if not filename and dir_type not in ('', 'temp'):
1458 return ''
1460 if warn:
1461 if not self.params.get('paths'):
1462 pass
1463 elif filename == '-':
1464 self.report_warning('--paths is ignored when outputting to stdout', only_once=True)
1465 elif os.path.isabs(filename):
1466 self.report_warning('--paths is ignored since an absolute path is given in the output template', only_once=True)
1467 if filename == '-' or not filename:
1468 return filename
1470 return self.get_output_path(dir_type, filename)
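# A rough sketch of prepare_filename() combined with 'paths' (hypothetical
# values); with the default dir_type, the filename is joined onto the 'home'
# path by get_output_path():
#
#   from yt_dlp import YoutubeDL
#   ydl = YoutubeDL({'outtmpl': {'default': '%(title)s [%(id)s].%(ext)s'},
#                    'paths': {'home': '/downloads'}})
#   ydl.prepare_filename({'id': 'abc123', 'title': 'Example', 'ext': 'mp4'})
#   # -> '/downloads/Example [abc123].mp4' (on POSIX)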
1472 def _match_entry(self, info_dict, incomplete=False, silent=False):
1473 """Returns None if the file should be downloaded"""
1474 _type = 'video' if 'playlist-match-filter' in self.params['compat_opts'] else info_dict.get('_type', 'video')
1475 assert incomplete or _type == 'video', 'Only video result can be considered complete'
1477 video_title = info_dict.get('title', info_dict.get('id', 'entry'))
1479 def check_filter():
1480 if _type in ('playlist', 'multi_video'):
1481 return
1482 elif _type in ('url', 'url_transparent') and not try_call(
1483 lambda: self.get_info_extractor(info_dict['ie_key']).is_single_video(info_dict['url'])):
1484 return
1486 if 'title' in info_dict:
1487 # This can happen when we're just evaluating the playlist
1488 title = info_dict['title']
1489 matchtitle = self.params.get('matchtitle', False)
1490 if matchtitle:
1491 if not re.search(matchtitle, title, re.IGNORECASE):
1492 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
1493 rejecttitle = self.params.get('rejecttitle', False)
1494 if rejecttitle:
1495 if re.search(rejecttitle, title, re.IGNORECASE):
1496 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
1498 date = info_dict.get('upload_date')
1499 if date is not None:
1500 date_range = self.params.get('daterange', DateRange())
1501 if date not in date_range:
1502 return f'{date_from_str(date).isoformat()} upload date is not in range {date_range}'
1503 view_count = info_dict.get('view_count')
1504 if view_count is not None:
1505 min_views = self.params.get('min_views')
1506 if min_views is not None and view_count < min_views:
1507 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
1508 max_views = self.params.get('max_views')
1509 if max_views is not None and view_count > max_views:
1510 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
1511 if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
1512 return f'Skipping "{video_title}" because it is age restricted'
1514 match_filter = self.params.get('match_filter')
1515 if match_filter is None:
1516 return None
1518 cancelled = None
1519 try:
1520 try:
1521 ret = match_filter(info_dict, incomplete=incomplete)
1522 except TypeError:
1523 # For backward compatibility
1524 ret = None if incomplete else match_filter(info_dict)
1525 except DownloadCancelled as err:
1526 if err.msg is not NO_DEFAULT:
1527 raise
1528 ret, cancelled = err.msg, err
1530 if ret is NO_DEFAULT:
1531 while True:
1532 filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME)
1533 reply = input(self._format_screen(
1534 f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip()
1535 if reply in {'y', ''}:
1536 return None
1537 elif reply == 'n':
1538 if cancelled:
1539 raise type(cancelled)(f'Skipping {video_title}')
1540 return f'Skipping {video_title}'
1541 return ret
1543 if self.in_download_archive(info_dict):
1544 reason = ''.join((
1545 format_field(info_dict, 'id', f'{self._format_screen("%s", self.Styles.ID)}: '),
1546 format_field(info_dict, 'title', f'{self._format_screen("%s", self.Styles.EMPHASIS)} '),
1547 'has already been recorded in the archive'))
1548 break_opt, break_err = 'break_on_existing', ExistingVideoReached
1549 else:
1550 try:
1551 reason = check_filter()
1552 except DownloadCancelled as e:
1553 reason, break_opt, break_err = e.msg, 'match_filter', type(e)
1554 else:
1555 break_opt, break_err = 'break_on_reject', RejectedVideoReached
1556 if reason is not None:
1557 if not silent:
1558 self.to_screen('[download] ' + reason)
1559 if self.params.get(break_opt, False):
1560 raise break_err()
1561 return reason
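# A minimal sketch of supplying `match_filter` programmatically:
# match_filter_func (the helper behind --match-filters) compiles the
# documented filter syntax into the callable consumed by check_filter() above:
#
#   from yt_dlp import YoutubeDL
#   from yt_dlp.utils import match_filter_func
#   ydl = YoutubeDL({'match_filter': match_filter_func('!is_live & duration > 60')})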
1563 @staticmethod
1564 def add_extra_info(info_dict, extra_info):
1565 """Set the keys from extra_info in info dict if they are missing"""
1566 for key, value in extra_info.items():
1567 info_dict.setdefault(key, value)
1569 def extract_info(self, url, download=True, ie_key=None, extra_info=None,
1570 process=True, force_generic_extractor=False):
1571 """
1572 Extract and return the information dictionary of the URL
1574 Arguments:
1575 @param url URL to extract
1577 Keyword arguments:
1578 @param download Whether to download videos
1579 @param process Whether to resolve all unresolved references (URLs, playlist items).
1580 Must be True for download to work
1581 @param ie_key Use only the extractor with this key
1583 @param extra_info Dictionary containing the extra values to add to the info (For internal use only)
1584 @param force_generic_extractor Force using the generic extractor (Deprecated; use ie_key='Generic')
1585 """
1587 if extra_info is None:
1588 extra_info = {}
1590 if not ie_key and force_generic_extractor:
1591 ie_key = 'Generic'
1593 if ie_key:
1594 ies = {ie_key: self._ies[ie_key]} if ie_key in self._ies else {}
1595 else:
1596 ies = self._ies
1598 for key, ie in ies.items():
1599 if not ie.suitable(url):
1600 continue
1602 if not ie.working():
1603 self.report_warning('The program functionality for this site has been marked as broken, '
1604 'and will probably not work.')
1606 temp_id = ie.get_temp_id(url)
1607 if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}):
1608 self.to_screen(f'[download] {self._format_screen(temp_id, self.Styles.ID)}: '
1609 'has already been recorded in the archive')
1610 if self.params.get('break_on_existing', False):
1611 raise ExistingVideoReached
1612 break
1613 return self.__extract_info(url, self.get_info_extractor(key), download, extra_info, process)
1614 else:
1615 extractors_restricted = self.params.get('allowed_extractors') not in (None, ['default'])
1616 self.report_error(f'No suitable extractor{format_field(ie_key, None, " (%s)")} found for URL {url}',
1617 tb=False if extractors_restricted else None)
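# A minimal sketch of the usual embedding entry point for the method above
# (hypothetical URL); download=False returns the info dict without downloading:
#
#   import yt_dlp
#   with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
#       info = ydl.extract_info('https://example.com/watch/123', download=False)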
1619 def _handle_extraction_exceptions(func):
1620 @functools.wraps(func)
1621 def wrapper(self, *args, **kwargs):
1622 while True:
1623 try:
1624 return func(self, *args, **kwargs)
1625 except (CookieLoadError, DownloadCancelled, LazyList.IndexError, PagedList.IndexError):
1626 raise
1627 except ReExtractInfo as e:
1628 if e.expected:
1629 self.to_screen(f'{e}; Re-extracting data')
1630 else:
1631 self.to_stderr('\r')
1632 self.report_warning(f'{e}; Re-extracting data')
1633 continue
1634 except GeoRestrictedError as e:
1635 msg = e.msg
1636 if e.countries:
1637 msg += '\nThis video is available in {}.'.format(', '.join(
1638 map(ISO3166Utils.short2full, e.countries)))
1639 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to work around this.'
1640 self.report_error(msg)
1641 except ExtractorError as e: # An error we somewhat expected
1642 self.report_error(str(e), e.format_traceback())
1643 except Exception as e:
1644 if self.params.get('ignoreerrors'):
1645 self.report_error(str(e), tb=encode_compat_str(traceback.format_exc()))
1646 else:
1647 raise
1648 break
1649 return wrapper
1651 def _wait_for_video(self, ie_result={}):
1652 if (not self.params.get('wait_for_video')
1653 or ie_result.get('_type', 'video') != 'video'
1654 or ie_result.get('formats') or ie_result.get('url')):
1655 return
1657 format_dur = lambda dur: '%02d:%02d:%02d' % timetuple_from_msec(dur * 1000)[:-1]
1658 last_msg = ''
1660 def progress(msg):
1661 nonlocal last_msg
1662 full_msg = f'{msg}\n'
1663 if not self.params.get('noprogress'):
1664 full_msg = msg + ' ' * (len(last_msg) - len(msg)) + '\r'
1665 elif last_msg:
1666 return
1667 self.to_screen(full_msg, skip_eol=True)
1668 last_msg = msg
1670 min_wait, max_wait = self.params.get('wait_for_video')
1671 diff = try_get(ie_result, lambda x: x['release_timestamp'] - time.time())
1672 if diff is None and ie_result.get('live_status') == 'is_upcoming':
1673 diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0)
1674 self.report_warning('Release time of video is not known')
1675 elif ie_result and (diff or 0) <= 0:
1676 self.report_warning('Video should already be available according to extracted info')
1677 diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf'))
1678 self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now')
1680 wait_till = time.time() + diff
1681 try:
1682 while True:
1683 diff = wait_till - time.time()
1684 if diff <= 0:
1685 progress('')
1686 raise ReExtractInfo('[wait] Wait period ended', expected=True)
1687 progress(f'[wait] Remaining time until next attempt: {self._format_screen(format_dur(diff), self.Styles.EMPHASIS)}')
1688 time.sleep(1)
1689 except KeyboardInterrupt:
1690 progress('')
1691 raise ReExtractInfo('[wait] Interrupted by user', expected=True)
1692 except BaseException as e:
1693 if not isinstance(e, ReExtractInfo):
1694 self.to_screen('')
1695 raise
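# A minimal sketch: `wait_for_video` is the parsed form of
# --wait-for-video MIN[-MAX], a (min_seconds, max_seconds) tuple used above to
# pick the retry delay when the release time is unknown:
#
#   ydl = yt_dlp.YoutubeDL({'wait_for_video': (60, 600)})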
1697 def _load_cookies(self, data, *, autoscope=True):
1698 """Loads cookies from a `Cookie` header
1700 This tries to work around the security vulnerability of passing cookies to every domain.
1701 See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj
1703 @param data The Cookie header as string to load the cookies from
1704 @param autoscope If `False`, scope cookies using Set-Cookie syntax and error for cookie without domains
1705 If `True`, save cookies for later to be stored in the jar with a limited scope
1706 If a URL, save cookies in the jar with the domain of the URL
1707 """
1708 for cookie in LenientSimpleCookie(data).values():
1709 if autoscope and any(cookie.values()):
1710 raise ValueError('Invalid syntax in Cookie header')
1712 domain = cookie.get('domain') or ''
1713 expiry = cookie.get('expires')
1714 if expiry == '': # 0 is valid
1715 expiry = None
1716 prepared_cookie = http.cookiejar.Cookie(
1717 cookie.get('version') or 0, cookie.key, cookie.value, None, False,
1718 domain, True, True, cookie.get('path') or '', bool(cookie.get('path')),
1719 cookie.get('secure') or False, expiry, False, None, None, {})
1721 if domain:
1722 self.cookiejar.set_cookie(prepared_cookie)
1723 elif autoscope is True:
1724 self.deprecated_feature(
1725 'Passing cookies as a header is a potential security risk; '
1726 'they will be scoped to the domain of the downloaded urls. '
1727 'Please consider loading cookies from a file or browser instead.')
1728 self.__header_cookies.append(prepared_cookie)
1729 elif autoscope:
1730 self.report_warning(
1731 'The extractor result contains an unscoped cookie as an HTTP header. '
1732 f'If you are using yt-dlp with an input URL{bug_reports_message(before=",")}',
1733 only_once=True)
1734 self._apply_header_cookies(autoscope, [prepared_cookie])
1735 else:
1736 self.report_error('Unscoped cookies are not allowed; please specify some sort of scoping',
1737 tb=False, is_error=False)
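# An illustrative sketch of the scoping modes handled by this private helper
# (hypothetical cookie values): with autoscope=False the cookie must carry
# Set-Cookie-style attributes so it is explicitly scoped, whereas a bare
# `Cookie` header takes the deprecated limited-scope path warned about above:
#
#   ydl._load_cookies('sid=abc123; Domain=.example.com; Path=/', autoscope=False)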
1739 def _apply_header_cookies(self, url, cookies=None):
1740 """Applies stray header cookies to the provided url
1742 This loads header cookies and scopes them to the domain provided in `url`.
1743 While this is not ideal, it helps reduce the risk of them being sent
1744 to an unintended destination while mostly maintaining compatibility.
1745 """
1746 parsed = urllib.parse.urlparse(url)
1747 if not parsed.hostname:
1748 return
1750 for cookie in map(copy.copy, cookies or self.__header_cookies):
1751 cookie.domain = f'.{parsed.hostname}'
1752 self.cookiejar.set_cookie(cookie)
1754 @_handle_extraction_exceptions
1755 def __extract_info(self, url, ie, download, extra_info, process):
1756 self._apply_header_cookies(url)
1758 try:
1759 ie_result = ie.extract(url)
1760 except UserNotLive as e:
1761 if process:
1762 if self.params.get('wait_for_video'):
1763 self.report_warning(e)
1764 self._wait_for_video()
1765 raise
1766 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
1767 self.report_warning(f'Extractor {ie.IE_NAME} returned nothing{bug_reports_message()}')
1768 return
1769 if isinstance(ie_result, list):
1770 # Backwards compatibility: old IE result format
1771 ie_result = {
1772 '_type': 'compat_list',
1773 'entries': ie_result,
1774 }
1775 if extra_info.get('original_url'):
1776 ie_result.setdefault('original_url', extra_info['original_url'])
1777 self.add_default_extra_info(ie_result, ie, url)
1778 if process:
1779 self._wait_for_video(ie_result)
1780 return self.process_ie_result(ie_result, download, extra_info)
1781 else:
1782 return ie_result
1784 def add_default_extra_info(self, ie_result, ie, url):
1785 if url is not None:
1786 self.add_extra_info(ie_result, {
1787 'webpage_url': url,
1788 'original_url': url,
1789 })
1790 webpage_url = ie_result.get('webpage_url')
1791 if webpage_url:
1792 self.add_extra_info(ie_result, {
1793 'webpage_url_basename': url_basename(webpage_url),
1794 'webpage_url_domain': get_domain(webpage_url),
1795 })
1796 if ie is not None:
1797 self.add_extra_info(ie_result, {
1798 'extractor': ie.IE_NAME,
1799 'extractor_key': ie.ie_key(),
1800 })
1802 def process_ie_result(self, ie_result, download=True, extra_info=None):
1803 """
1804 Take the result of the ie (it may be modified) and resolve all unresolved
1805 references (URLs, playlist items).
1807 It will also download the videos if 'download' is true.
1808 Returns the resolved ie_result.
1809 """
1810 if extra_info is None:
1811 extra_info = {}
1812 result_type = ie_result.get('_type', 'video')
1814 if result_type in ('url', 'url_transparent'):
1815 ie_result['url'] = sanitize_url(
1816 ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
1817 if ie_result.get('original_url') and not extra_info.get('original_url'):
1818 extra_info = {'original_url': ie_result['original_url'], **extra_info}
1820 extract_flat = self.params.get('extract_flat', False)
1821 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1822 or extract_flat is True):
1823 info_copy = ie_result.copy()
1824 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1825 if ie and not ie_result.get('id'):
1826 info_copy['id'] = ie.get_temp_id(ie_result['url'])
1827 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1828 self.add_extra_info(info_copy, extra_info)
1829 info_copy, _ = self.pre_process(info_copy)
1830 self._fill_common_fields(info_copy, False)
1831 self.__forced_printings(info_copy)
1832 self._raise_pending_errors(info_copy)
1833 if self.params.get('force_write_download_archive', False):
1834 self.record_download_archive(info_copy)
1835 return ie_result
1837 if result_type == 'video':
1838 self.add_extra_info(ie_result, extra_info)
1839 ie_result = self.process_video_result(ie_result, download=download)
1840 self._raise_pending_errors(ie_result)
1841 additional_urls = (ie_result or {}).get('additional_urls')
1842 if additional_urls:
1843 # TODO: Improve MetadataParserPP to allow setting a list
1844 if isinstance(additional_urls, str):
1845 additional_urls = [additional_urls]
1846 self.to_screen(
1847 '[info] {}: {} additional URL(s) requested'.format(ie_result['id'], len(additional_urls)))
1848 self.write_debug('Additional URLs: "{}"'.format('", "'.join(additional_urls)))
1849 ie_result['additional_entries'] = [
1850 self.extract_info(
1851 url, download, extra_info=extra_info,
1852 force_generic_extractor=self.params.get('force_generic_extractor'))
1853 for url in additional_urls
1854 ]
1855 return ie_result
1856 elif result_type == 'url':
1857 # We have to add extra_info to the results because it may be
1858 # contained in a playlist
1859 return self.extract_info(
1860 ie_result['url'], download,
1861 ie_key=ie_result.get('ie_key'),
1862 extra_info=extra_info)
1863 elif result_type == 'url_transparent':
1864 # Use the information from the embedding page
1865 info = self.extract_info(
1866 ie_result['url'], ie_key=ie_result.get('ie_key'),
1867 extra_info=extra_info, download=False, process=False)
1869 # extract_info may return None when ignoreerrors is enabled and
1870 # extraction failed with an error, don't crash and return early
1871 # in this case
1872 if not info:
1873 return info
1875 exempted_fields = {'_type', 'url', 'ie_key'}
1876 if not ie_result.get('section_end') and ie_result.get('section_start') is None:
1877 # For video clips, the id etc of the clip extractor should be used
1878 exempted_fields |= {'id', 'extractor', 'extractor_key'}
1880 new_result = info.copy()
1881 new_result.update(filter_dict(ie_result, lambda k, v: v is not None and k not in exempted_fields))
1883 # Extracted info may not be a video result (i.e.
1884 # info.get('_type', 'video') != video) but rather an url or
1885 # url_transparent. In such cases outer metadata (from ie_result)
1886 # should be propagated to inner one (info). For this to happen
1887 # _type of info should be overridden with url_transparent. This
1888 # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1889 if new_result.get('_type') == 'url':
1890 new_result['_type'] = 'url_transparent'
1892 return self.process_ie_result(
1893 new_result, download=download, extra_info=extra_info)
1894 elif result_type in ('playlist', 'multi_video'):
1895 # Protect from infinite recursion due to recursively nested playlists
1896 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1897 webpage_url = ie_result.get('webpage_url') # Playlists may not have a webpage_url
1898 if webpage_url and webpage_url in self._playlist_urls:
1899 self.to_screen(
1900 '[download] Skipping already downloaded playlist: {}'.format(
1901 ie_result.get('title') or ie_result.get('id')))
1902 return
1904 self._playlist_level += 1
1905 self._playlist_urls.add(webpage_url)
1906 self._fill_common_fields(ie_result, False)
1907 self._sanitize_thumbnails(ie_result)
1908 try:
1909 return self.__process_playlist(ie_result, download)
1910 finally:
1911 self._playlist_level -= 1
1912 if not self._playlist_level:
1913 self._playlist_urls.clear()
1914 elif result_type == 'compat_list':
1915 self.report_warning(
1916 'Extractor {} returned a compat_list result. '
1917 'It needs to be updated.'.format(ie_result.get('extractor')))
1919 def _fixup(r):
1920 self.add_extra_info(r, {
1921 'extractor': ie_result['extractor'],
1922 'webpage_url': ie_result['webpage_url'],
1923 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1924 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1925 'extractor_key': ie_result['extractor_key'],
1926 })
1927 return r
1928 ie_result['entries'] = [
1929 self.process_ie_result(_fixup(r), download, extra_info)
1930 for r in ie_result['entries']
1931 ]
1932 return ie_result
1933 else:
1934 raise Exception(f'Invalid result type: {result_type}')
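# An illustrative sketch (hypothetical values) of a 'url_transparent' result
# as an extractor might return it; per the merging logic above, its metadata
# overrides whatever the inner extraction yields:
#
#   {'_type': 'url_transparent', 'url': 'https://cdn.example.com/video/123',
#    'ie_key': 'Generic', 'title': 'Title from the embedding page'}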
1936 def _ensure_dir_exists(self, path):
1937 return make_dir(path, self.report_error)
1939 @staticmethod
1940 def _playlist_infodict(ie_result, strict=False, **kwargs):
1941 info = {
1942 'playlist_count': ie_result.get('playlist_count'),
1943 'playlist': ie_result.get('title') or ie_result.get('id'),
1944 'playlist_id': ie_result.get('id'),
1945 'playlist_title': ie_result.get('title'),
1946 'playlist_uploader': ie_result.get('uploader'),
1947 'playlist_uploader_id': ie_result.get('uploader_id'),
1948 'playlist_channel': ie_result.get('channel'),
1949 'playlist_channel_id': ie_result.get('channel_id'),
1950 'playlist_webpage_url': ie_result.get('webpage_url'),
1951 **kwargs,
1952 }
1953 if strict:
1954 return info
1955 if ie_result.get('webpage_url'):
1956 info.update({
1957 'webpage_url': ie_result['webpage_url'],
1958 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1959 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1960 })
1961 return {
1962 **info,
1963 'playlist_index': 0,
1964 '__last_playlist_index': max(ie_result.get('requested_entries') or (0, 0)),
1965 'extractor': ie_result['extractor'],
1966 'extractor_key': ie_result['extractor_key'],
1967 }
1969 def __process_playlist(self, ie_result, download):
1970 """Process each entry in the playlist"""
1971 assert ie_result['_type'] in ('playlist', 'multi_video')
1973 common_info = self._playlist_infodict(ie_result, strict=True)
1974 title = common_info.get('playlist') or '<Untitled>'
1975 if self._match_entry(common_info, incomplete=True) is not None:
1976 return
1977 self.to_screen(f'[download] Downloading {ie_result["_type"]}: {title}')
1979 all_entries = PlaylistEntries(self, ie_result)
1980 entries = orderedSet(all_entries.get_requested_items(), lazy=True)
1982 lazy = self.params.get('lazy_playlist')
1983 if lazy:
1984 resolved_entries, n_entries = [], 'N/A'
1985 ie_result['requested_entries'], ie_result['entries'] = None, None
1986 else:
1987 entries = resolved_entries = list(entries)
1988 n_entries = len(resolved_entries)
1989 ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], [])
1990 if not ie_result.get('playlist_count'):
1991 # Better to do this after potentially exhausting entries
1992 ie_result['playlist_count'] = all_entries.get_full_count()
1994 extra = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries))
1995 ie_copy = collections.ChainMap(ie_result, extra)
1997 _infojson_written = False
1998 write_playlist_files = self.params.get('allow_playlist_files', True)
1999 if write_playlist_files and self.params.get('list_thumbnails'):
2000 self.list_thumbnails(ie_result)
2001 if write_playlist_files and not self.params.get('simulate'):
2002 _infojson_written = self._write_info_json(
2003 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
2004 if _infojson_written is None:
2005 return
2006 if self._write_description('playlist', ie_result,
2007 self.prepare_filename(ie_copy, 'pl_description')) is None:
2008 return
2009 # TODO: This should be passed to ThumbnailsConvertor if necessary
2010 self._write_thumbnails('playlist', ie_result, self.prepare_filename(ie_copy, 'pl_thumbnail'))
2012 if lazy:
2013 if self.params.get('playlistreverse') or self.params.get('playlistrandom'):
2014 self.report_warning('playlistreverse and playlistrandom are not supported with lazy_playlist', only_once=True)
2015 elif self.params.get('playlistreverse'):
2016 entries.reverse()
2017 elif self.params.get('playlistrandom'):
2018 random.shuffle(entries)
2020 self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} items'
2021 f'{format_field(ie_result, "playlist_count", " of %s")}')
2023 keep_resolved_entries = self.params.get('extract_flat') != 'discard'
2024 if self.params.get('extract_flat') == 'discard_in_playlist':
2025 keep_resolved_entries = ie_result['_type'] != 'playlist'
2026 if keep_resolved_entries:
2027 self.write_debug('The information of all playlist entries will be held in memory')
2029 failures = 0
2030 max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
2031 for i, (playlist_index, entry) in enumerate(entries):
2032 if lazy:
2033 resolved_entries.append((playlist_index, entry))
2034 if not entry:
2035 continue
2037 entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip')
2038 if not lazy and 'playlist-index' in self.params['compat_opts']:
2039 playlist_index = ie_result['requested_entries'][i]
2041 entry_copy = collections.ChainMap(entry, {
2042 **common_info,
2043 'n_entries': int_or_none(n_entries),
2044 'playlist_index': playlist_index,
2045 'playlist_autonumber': i + 1,
2046 })
2048 if self._match_entry(entry_copy, incomplete=True) is not None:
2049 # For compatibility with youtube-dl. See https://github.com/yt-dlp/yt-dlp/issues/4369
2050 resolved_entries[i] = (playlist_index, NO_DEFAULT)
2051 continue
2053 self.to_screen(
2054 f'[download] Downloading item {self._format_screen(i + 1, self.Styles.ID)} '
2055 f'of {self._format_screen(n_entries, self.Styles.EMPHASIS)}')
2057 entry_result = self.__process_iterable_entry(entry, download, collections.ChainMap({
2058 'playlist_index': playlist_index,
2059 'playlist_autonumber': i + 1,
2060 }, extra))
2061 if not entry_result:
2062 failures += 1
2063 if failures >= max_failures:
2064 self.report_error(
2065 f'Skipping the remaining entries in playlist "{title}" since {failures} items failed extraction')
2066 break
2067 if keep_resolved_entries:
2068 resolved_entries[i] = (playlist_index, entry_result)
2070 # Update with processed data
2071 ie_result['entries'] = [e for _, e in resolved_entries if e is not NO_DEFAULT]
2072 ie_result['requested_entries'] = [i for i, e in resolved_entries if e is not NO_DEFAULT]
2073 if ie_result['requested_entries'] == try_call(lambda: list(range(1, ie_result['playlist_count'] + 1))):
2074 # Do not set for full playlist
2075 ie_result.pop('requested_entries')
2077 # Write the updated info to json
2078 if _infojson_written is True and self._write_info_json(
2079 'updated playlist', ie_result,
2080 self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
2081 return
2083 ie_result = self.run_all_pps('playlist', ie_result)
2084 self.to_screen(f'[download] Finished downloading playlist: {title}')
2085 return ie_result
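# A minimal sketch of parameters that shape the playlist processing above
# (the parsed equivalents of --playlist-items and --lazy-playlist):
#
#   ydl = yt_dlp.YoutubeDL({'playlist_items': '1:3,7', 'lazy_playlist': True})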
2087 @_handle_extraction_exceptions
2088 def __process_iterable_entry(self, entry, download, extra_info):
2089 return self.process_ie_result(
2090 entry, download=download, extra_info=extra_info)
2092 def _build_format_filter(self, filter_spec):
2093 " Returns a function to filter the formats according to the filter_spec "
2095 OPERATORS = {
2096 '<': operator.lt,
2097 '<=': operator.le,
2098 '>': operator.gt,
2099 '>=': operator.ge,
2100 '=': operator.eq,
2101 '!=': operator.ne,
2102 }
2103 operator_rex = re.compile(r'''(?x)\s*
2104 (?P<key>[\w.-]+)\s*
2105 (?P<op>{})(?P<none_inclusive>\s*\?)?\s*
2106 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
2107 '''.format('|'.join(map(re.escape, OPERATORS.keys()))))
2108 m = operator_rex.fullmatch(filter_spec)
2109 if m:
2110 try:
2111 comparison_value = int(m.group('value'))
2112 except ValueError:
2113 comparison_value = parse_filesize(m.group('value'))
2114 if comparison_value is None:
2115 comparison_value = parse_filesize(m.group('value') + 'B')
2116 if comparison_value is None:
2117 raise ValueError(
2118 'Invalid value {!r} in format specification {!r}'.format(
2119 m.group('value'), filter_spec))
2120 op = OPERATORS[m.group('op')]
2122 if not m:
2123 STR_OPERATORS = {
2124 '=': operator.eq,
2125 '^=': lambda attr, value: attr.startswith(value),
2126 '$=': lambda attr, value: attr.endswith(value),
2127 '*=': lambda attr, value: value in attr,
2128 '~=': lambda attr, value: value.search(attr) is not None,
2129 }
2130 str_operator_rex = re.compile(r'''(?x)\s*
2131 (?P<key>[a-zA-Z0-9._-]+)\s*
2132 (?P<negation>!\s*)?(?P<op>{})\s*(?P<none_inclusive>\?\s*)?
2133 (?P<quote>["'])?
2134 (?P<value>(?(quote)(?:(?!(?P=quote))[^\\]|\\.)+|[\w.-]+))
2135 (?(quote)(?P=quote))\s*
2136 '''.format('|'.join(map(re.escape, STR_OPERATORS.keys()))))
2137 m = str_operator_rex.fullmatch(filter_spec)
2138 if m:
2139 if m.group('op') == '~=':
2140 comparison_value = re.compile(m.group('value'))
2141 else:
2142 comparison_value = re.sub(r'''\\([\\"'])''', r'\1', m.group('value'))
2143 str_op = STR_OPERATORS[m.group('op')]
2144 if m.group('negation'):
2145 op = lambda attr, value: not str_op(attr, value)
2146 else:
2147 op = str_op
2149 if not m:
2150 raise SyntaxError(f'Invalid filter specification {filter_spec!r}')
2152 def _filter(f):
2153 actual_value = f.get(m.group('key'))
2154 if actual_value is None:
2155 return m.group('none_inclusive')
2156 return op(actual_value, comparison_value)
2157 return _filter
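# Illustrative filter specs accepted above (the bracketed part of a format
# selector such as "best[height<=480]"):
#
#   self._build_format_filter('height<=480')    # numeric comparison
#   self._build_format_filter('filesize>100M')  # suffixes parsed via parse_filesize
#   self._build_format_filter('ext=mp4')        # string equality
#   self._build_format_filter('fps<=?30')       # '?' also keeps formats where fps is unset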
2159 def _check_formats(self, formats):
2160 for f in formats:
2161 working = f.get('__working')
2162 if working is not None:
2163 if working:
2164 yield f
2165 continue
2166 self.to_screen('[info] Testing format {}'.format(f['format_id']))
2167 path = self.get_output_path('temp')
2168 if not self._ensure_dir_exists(f'{path}/'):
2169 continue
2170 temp_file = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False, dir=path or None)
2171 temp_file.close()
2172 try:
2173 success, _ = self.dl(temp_file.name, f, test=True)
2174 except (DownloadError, OSError, ValueError, *network_exceptions):
2175 success = False
2176 finally:
2177 if os.path.exists(temp_file.name):
2178 try:
2179 os.remove(temp_file.name)
2180 except OSError:
2181 self.report_warning(f'Unable to delete temporary file "{temp_file.name}"')
2182 f['__working'] = success
2183 if success:
2184 yield f
2185 else:
2186 self.to_screen('[info] Unable to download format {}. Skipping...'.format(f['format_id']))
2188 def _select_formats(self, formats, selector):
2189 return list(selector({
2190 'formats': formats,
2191 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
2192 'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video
2193 or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio
2194 }))
2196 def _default_format_spec(self, info_dict):
2197 prefer_best = (
2198 self.params['outtmpl']['default'] == '-'
2199 or (info_dict.get('is_live') and not self.params.get('live_from_start')))
2201 def can_merge():
2202 merger = FFmpegMergerPP(self)
2203 return merger.available and merger.can_merge()
2205 if not prefer_best and not can_merge():
2206 prefer_best = True
2207 formats = self._get_formats(info_dict)
2208 evaluate_formats = lambda spec: self._select_formats(formats, self.build_format_selector(spec))
2209 if evaluate_formats('b/bv+ba') != evaluate_formats('bv*+ba/b'):
2210 self.report_warning('ffmpeg not found. The downloaded format may not be the best available. '
2211 'Installing ffmpeg is strongly recommended: https://github.com/yt-dlp/yt-dlp#dependencies')
2213 compat = (self.params.get('allow_multiple_audio_streams')
2214 or 'format-spec' in self.params['compat_opts'])
2216 return ('best/bestvideo+bestaudio' if prefer_best
2217 else 'bestvideo+bestaudio/best' if compat
2218 else 'bestvideo*+bestaudio/best')
2220 def build_format_selector(self, format_spec):
2221 def syntax_error(note, start):
2222 message = (
2223 'Invalid format specification: '
2224 '{}\n\t{}\n\t{}^'.format(note, format_spec, ' ' * start[1]))
2225 return SyntaxError(message)
2227 PICKFIRST = 'PICKFIRST'
2228 MERGE = 'MERGE'
2229 SINGLE = 'SINGLE'
2230 GROUP = 'GROUP'
2231 FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
2233 allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
2234 'video': self.params.get('allow_multiple_video_streams', False)}
2236 def _parse_filter(tokens):
2237 filter_parts = []
2238 for type_, string_, _start, _, _ in tokens:
2239 if type_ == tokenize.OP and string_ == ']':
2240 return ''.join(filter_parts)
2241 else:
2242 filter_parts.append(string_)
2244 def _remove_unused_ops(tokens):
2245 # Remove operators that we don't use and join them with the surrounding strings.
2246 # E.g. 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
2247 ALLOWED_OPS = ('/', '+', ',', '(', ')')
2248 last_string, last_start, last_end, last_line = None, None, None, None
2249 for type_, string_, start, end, line in tokens:
2250 if type_ == tokenize.OP and string_ == '[':
2251 if last_string:
2252 yield tokenize.NAME, last_string, last_start, last_end, last_line
2253 last_string = None
2254 yield type_, string_, start, end, line
2255 # everything inside brackets will be handled by _parse_filter
2256 for type_, string_, start, end, line in tokens:
2257 yield type_, string_, start, end, line
2258 if type_ == tokenize.OP and string_ == ']':
2259 break
2260 elif type_ == tokenize.OP and string_ in ALLOWED_OPS:
2261 if last_string:
2262 yield tokenize.NAME, last_string, last_start, last_end, last_line
2263 last_string = None
2264 yield type_, string_, start, end, line
2265 elif type_ in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
2266 if not last_string:
2267 last_string = string_
2268 last_start = start
2269 last_end = end
2270 else:
2271 last_string += string_
2272 if last_string:
2273 yield tokenize.NAME, last_string, last_start, last_end, last_line
2275 def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
2276 selectors = []
2277 current_selector = None
2278 for type_, string_, start, _, _ in tokens:
2279 # ENCODING is only defined in Python 3.x
2280 if type_ == getattr(tokenize, 'ENCODING', None):
2281 continue
2282 elif type_ in [tokenize.NAME, tokenize.NUMBER]:
2283 current_selector = FormatSelector(SINGLE, string_, [])
2284 elif type_ == tokenize.OP:
2285 if string_ == ')':
2286 if not inside_group:
2287 # ')' will be handled by the parentheses group
2288 tokens.restore_last_token()
2289 break
2290 elif inside_merge and string_ in ['/', ',']:
2291 tokens.restore_last_token()
2292 break
2293 elif inside_choice and string_ == ',':
2294 tokens.restore_last_token()
2295 break
2296 elif string_ == ',':
2297 if not current_selector:
2298 raise syntax_error('"," must follow a format selector', start)
2299 selectors.append(current_selector)
2300 current_selector = None
2301 elif string_ == '/':
2302 if not current_selector:
2303 raise syntax_error('"/" must follow a format selector', start)
2304 first_choice = current_selector
2305 second_choice = _parse_format_selection(tokens, inside_choice=True)
2306 current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
2307 elif string_ == '[':
2308 if not current_selector:
2309 current_selector = FormatSelector(SINGLE, 'best', [])
2310 format_filter = _parse_filter(tokens)
2311 current_selector.filters.append(format_filter)
2312 elif string_ == '(':
2313 if current_selector:
2314 raise syntax_error('Unexpected "("', start)
2315 group = _parse_format_selection(tokens, inside_group=True)
2316 current_selector = FormatSelector(GROUP, group, [])
2317 elif string_ == '+':
2318 if not current_selector:
2319 raise syntax_error('Unexpected "+"', start)
2320 selector_1 = current_selector
2321 selector_2 = _parse_format_selection(tokens, inside_merge=True)
2322 if not selector_2:
2323 raise syntax_error('Expected a selector', start)
2324 current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
2325 else:
2326 raise syntax_error(f'Operator not recognized: "{string_}"', start)
2327 elif type_ == tokenize.ENDMARKER:
2328 break
2329 if current_selector:
2330 selectors.append(current_selector)
2331 return selectors
2333 def _merge(formats_pair):
2334 format_1, format_2 = formats_pair
2336 formats_info = []
2337 formats_info.extend(format_1.get('requested_formats', (format_1,)))
2338 formats_info.extend(format_2.get('requested_formats', (format_2,)))
2340 if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
2341 get_no_more = {'video': False, 'audio': False}
2342 for (i, fmt_info) in enumerate(formats_info):
2343 if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
2344 formats_info.pop(i)
2345 continue
2346 for aud_vid in ['audio', 'video']:
2347 if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
2348 if get_no_more[aud_vid]:
2349 formats_info.pop(i)
2350 break
2351 get_no_more[aud_vid] = True
2353 if len(formats_info) == 1:
2354 return formats_info[0]
2356 video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
2357 audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']
2359 the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
2360 the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None
2362 output_ext = get_compatible_ext(
2363 vcodecs=[f.get('vcodec') for f in video_fmts],
2364 acodecs=[f.get('acodec') for f in audio_fmts],
2365 vexts=[f['ext'] for f in video_fmts],
2366 aexts=[f['ext'] for f in audio_fmts],
2367 preferences=(try_call(lambda: self.params['merge_output_format'].split('/'))
2368 or (self.params.get('prefer_free_formats') and ('webm', 'mkv'))))
2370 filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info))
2372 new_dict = {
2373 'requested_formats': formats_info,
2374 'format': '+'.join(filtered('format')),
2375 'format_id': '+'.join(filtered('format_id')),
2376 'ext': output_ext,
2377 'protocol': '+'.join(map(determine_protocol, formats_info)),
2378 'language': '+'.join(orderedSet(filtered('language'))) or None,
2379 'format_note': '+'.join(orderedSet(filtered('format_note'))) or None,
2380 'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None,
2381 'tbr': sum(filtered('tbr', 'vbr', 'abr')),
2382 }
2384 if the_only_video:
2385 new_dict.update({
2386 'width': the_only_video.get('width'),
2387 'height': the_only_video.get('height'),
2388 'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
2389 'fps': the_only_video.get('fps'),
2390 'dynamic_range': the_only_video.get('dynamic_range'),
2391 'vcodec': the_only_video.get('vcodec'),
2392 'vbr': the_only_video.get('vbr'),
2393 'stretched_ratio': the_only_video.get('stretched_ratio'),
2394 'aspect_ratio': the_only_video.get('aspect_ratio'),
2395 })
2397 if the_only_audio:
2398 new_dict.update({
2399 'acodec': the_only_audio.get('acodec'),
2400 'abr': the_only_audio.get('abr'),
2401 'asr': the_only_audio.get('asr'),
2402 'audio_channels': the_only_audio.get('audio_channels'),
2403 })
2405 return new_dict
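# An illustrative sketch (hypothetical format ids) of what merging one video
# and one audio format yields, roughly:
#
#   {'format_id': '137+140', 'protocol': 'https+https',
#    'requested_formats': [video_fmt, audio_fmt], 'ext': 'mp4', ...}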
2407 def _check_formats(formats):
2408 if self.params.get('check_formats') == 'selected':
2409 yield from self._check_formats(formats)
2410 return
2411 elif (self.params.get('check_formats') is not None
2412 or self.params.get('allow_unplayable_formats')):
2413 yield from formats
2414 return
2416 for f in formats:
2417 if f.get('has_drm') or f.get('__needs_testing'):
2418 yield from self._check_formats([f])
2419 else:
2420 yield f
2422 def _build_selector_function(selector):
2423 if isinstance(selector, list): # ,
2424 fs = [_build_selector_function(s) for s in selector]
2426 def selector_function(ctx):
2427 for f in fs:
2428 yield from f(ctx)
2429 return selector_function
2431 elif selector.type == GROUP: # ()
2432 selector_function = _build_selector_function(selector.selector)
2434 elif selector.type == PICKFIRST: # /
2435 fs = [_build_selector_function(s) for s in selector.selector]
2437 def selector_function(ctx):
2438 for f in fs:
2439 picked_formats = list(f(ctx))
2440 if picked_formats:
2441 return picked_formats
2442 return []
2444 elif selector.type == MERGE: # +
2445 selector_1, selector_2 = map(_build_selector_function, selector.selector)
2447 def selector_function(ctx):
2448 for pair in itertools.product(selector_1(ctx), selector_2(ctx)):
2449 yield _merge(pair)
2451 elif selector.type == SINGLE: # atom
2452 format_spec = selector.selector or 'best'
2454 # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
2455 if format_spec == 'all':
2456 def selector_function(ctx):
2457 yield from _check_formats(ctx['formats'][::-1])
2458 elif format_spec == 'mergeall':
2459 def selector_function(ctx):
2460 formats = list(_check_formats(
2461 f for f in ctx['formats'] if f.get('vcodec') != 'none' or f.get('acodec') != 'none'))
2462 if not formats:
2463 return
2464 merged_format = formats[-1]
2465 for f in formats[-2::-1]:
2466 merged_format = _merge((merged_format, f))
2467 yield merged_format
2469 else:
2470 format_fallback, separate_fallback, format_reverse, format_idx = False, None, True, 1
2471 mobj = re.match(
2472 r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
2473 format_spec)
2474 if mobj is not None:
2475 format_idx = int_or_none(mobj.group('n'), default=1)
2476 format_reverse = mobj.group('bw')[0] == 'b'
2477 format_type = (mobj.group('type') or [None])[0]
2478 not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
2479 format_modified = mobj.group('mod') is not None
2481 format_fallback = not format_type and not format_modified # for b, w
2482 _filter_f = (
2483 (lambda f: f.get(f'{format_type}codec') != 'none')
2484 if format_type and format_modified # bv*, ba*, wv*, wa*
2485 else (lambda f: f.get(f'{not_format_type}codec') == 'none')
2486 if format_type # bv, ba, wv, wa
2487 else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
2488 if not format_modified # b, w
2489 else lambda f: True) # b*, w*
2490 filter_f = lambda f: _filter_f(f) and (
2491 f.get('vcodec') != 'none' or f.get('acodec') != 'none')
2492 else:
2493 if format_spec in self._format_selection_exts['audio']:
2494 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none'
2495 elif format_spec in self._format_selection_exts['video']:
2496 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none'
2497 separate_fallback = lambda f: f.get('ext') == format_spec and f.get('vcodec') != 'none'
2498 elif format_spec in self._format_selection_exts['storyboards']:
2499 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none'
2500 else:
2501 filter_f = lambda f: f.get('format_id') == format_spec # id
2503 def selector_function(ctx):
2504 formats = list(ctx['formats'])
2505 matches = list(filter(filter_f, formats)) if filter_f is not None else formats
2506 if not matches:
2507 if format_fallback and ctx['incomplete_formats']:
2508 # for extractors with incomplete formats (audio-only (soundcloud)
2509 # or video-only (imgur)), best/worst will fall back to the
2510 # best/worst {video,audio}-only format
2511 matches = list(filter(lambda f: f.get('vcodec') != 'none' or f.get('acodec') != 'none', formats))
2512 elif separate_fallback and not ctx['has_merged_format']:
2513 # for compatibility with youtube-dl when there is no pre-merged format
2514 matches = list(filter(separate_fallback, formats))
2515 matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
2516 try:
2517 yield matches[format_idx - 1]
2518 except LazyList.IndexError:
2519 return
2521 filters = [self._build_format_filter(f) for f in selector.filters]
2523 def final_selector(ctx):
2524 ctx_copy = dict(ctx)
2525 for _filter in filters:
2526 ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
2527 return selector_function(ctx_copy)
2528 return final_selector
2530 # HACK: Python 3.12 changed the underlying parser, rendering '7_a' invalid
2531 # Prefix numbers with random letters to avoid it being classified as a number
2532 # See: https://github.com/yt-dlp/yt-dlp/pull/8797
2533 # TODO: Implement parser not reliant on tokenize.tokenize
2534 prefix = ''.join(random.choices(string.ascii_letters, k=32))
2535 stream = io.BytesIO(re.sub(r'\d[_\d]*', rf'{prefix}\g<0>', format_spec).encode())
2536 try:
2537 tokens = list(_remove_unused_ops(
2538 token._replace(string=token.string.replace(prefix, ''))
2539 for token in tokenize.tokenize(stream.readline)))
2540 except tokenize.TokenError:
2541 raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
2543 class TokenIterator:
2544 def __init__(self, tokens):
2545 self.tokens = tokens
2546 self.counter = 0
2548 def __iter__(self):
2549 return self
2551 def __next__(self):
2552 if self.counter >= len(self.tokens):
2553 raise StopIteration
2554 value = self.tokens[self.counter]
2555 self.counter += 1
2556 return value
2558 next = __next__
2560 def restore_last_token(self):
2561 self.counter -= 1
2563 parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
2564 return _build_selector_function(parsed_selector)
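# A minimal sketch of compiling and applying a documented format spec
# (best video no higher than 1080p plus best audio, else best single file):
#
#   selector = self.build_format_selector('bv*[height<=1080]+ba/b')
#   chosen = self._select_formats(formats, selector)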
2566 def _calc_headers(self, info_dict, load_cookies=False):
2567 res = HTTPHeaderDict(self.params['http_headers'], info_dict.get('http_headers'))
2568 clean_headers(res)
2570 if load_cookies: # For --load-info-json
2571 self._load_cookies(res.get('Cookie'), autoscope=info_dict['url']) # compat
2572 self._load_cookies(info_dict.get('cookies'), autoscope=False)
2573 # The `Cookie` header is removed to prevent leaks and unscoped cookies.
2574 # See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj
2575 res.pop('Cookie', None)
2576 cookies = self.cookiejar.get_cookies_for_url(info_dict['url'])
2577 if cookies:
2578 encoder = LenientSimpleCookie()
2579 values = []
2580 for cookie in cookies:
2581 _, value = encoder.value_encode(cookie.value)
2582 values.append(f'{cookie.name}={value}')
2583 if cookie.domain:
2584 values.append(f'Domain={cookie.domain}')
2585 if cookie.path:
2586 values.append(f'Path={cookie.path}')
2587 if cookie.secure:
2588 values.append('Secure')
2589 if cookie.expires:
2590 values.append(f'Expires={cookie.expires}')
2591 if cookie.version:
2592 values.append(f'Version={cookie.version}')
2593 info_dict['cookies'] = '; '.join(values)
2595 if 'X-Forwarded-For' not in res:
2596 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2597 if x_forwarded_for_ip:
2598 res['X-Forwarded-For'] = x_forwarded_for_ip
2600 return res
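# An illustrative sketch (hypothetical cookie): given a matching jar entry for
# the format URL, the loop above serializes info_dict['cookies'] roughly as:
#
#   'sid=abc123; Domain=.example.com; Path=/; Secure'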
2602 def _calc_cookies(self, url):
2603 self.deprecation_warning('"YoutubeDL._calc_cookies" is deprecated and may be removed in a future version')
2604 return self.cookiejar.get_cookie_header(url)
2606 def _sort_thumbnails(self, thumbnails):
2607 thumbnails.sort(key=lambda t: (
2608 t.get('preference') if t.get('preference') is not None else -1,
2609 t.get('width') if t.get('width') is not None else -1,
2610 t.get('height') if t.get('height') is not None else -1,
2611 t.get('id') if t.get('id') is not None else '',
2612 t.get('url')))
2614 def _sanitize_thumbnails(self, info_dict):
2615 thumbnails = info_dict.get('thumbnails')
2616 if thumbnails is None:
2617 thumbnail = info_dict.get('thumbnail')
2618 if thumbnail:
2619 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
2620 if not thumbnails:
2621 return
2623 def check_thumbnails(thumbnails):
2624 for t in thumbnails:
2625 self.to_screen(f'[info] Testing thumbnail {t["id"]}')
2626 try:
2627 self.urlopen(HEADRequest(t['url']))
2628 except network_exceptions as err:
2629 self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...')
2630 continue
2631 yield t
2633 self._sort_thumbnails(thumbnails)
2634 for i, t in enumerate(thumbnails):
2635 if t.get('id') is None:
2636 t['id'] = str(i)
2637 if t.get('width') and t.get('height'):
2638 t['resolution'] = '%dx%d' % (t['width'], t['height'])
2639 t['url'] = sanitize_url(t['url'])
2641 if self.params.get('check_formats') is True:
2642 info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1]), reverse=True)
2643 else:
2644 info_dict['thumbnails'] = thumbnails
2646 def _fill_common_fields(self, info_dict, final=True):
2647 # TODO: move sanitization here
2648 if final:
2649 title = info_dict['fulltitle'] = info_dict.get('title')
2650 if not title:
2651 if title == '':
2652 self.write_debug('Extractor gave empty title. Creating a generic title')
2653 else:
2654 self.report_warning('Extractor failed to obtain "title". Creating a generic title instead')
2655 info_dict['title'] = f'{info_dict["extractor"].replace(":", "-")} video #{info_dict["id"]}'
2657 if info_dict.get('duration') is not None:
2658 info_dict['duration_string'] = formatSeconds(info_dict['duration'])
2660 for ts_key, date_key in (
2661 ('timestamp', 'upload_date'),
2662 ('release_timestamp', 'release_date'),
2663 ('modified_timestamp', 'modified_date'),
2664 ):
2665 if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
2666 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
2667 # see http://bugs.python.org/issue1646728)
2668 with contextlib.suppress(ValueError, OverflowError, OSError):
2669 upload_date = dt.datetime.fromtimestamp(info_dict[ts_key], dt.timezone.utc)
2670 info_dict[date_key] = upload_date.strftime('%Y%m%d')
2672 if not info_dict.get('release_year'):
2673 info_dict['release_year'] = traverse_obj(info_dict, ('release_date', {lambda x: int(x[:4])}))
2675 live_keys = ('is_live', 'was_live')
2676 live_status = info_dict.get('live_status')
2677 if live_status is None:
2678 for key in live_keys:
2679 if info_dict.get(key) is False:
2680 continue
2681 if info_dict.get(key):
2682 live_status = key
2683 break
2684 if all(info_dict.get(key) is False for key in live_keys):
2685 live_status = 'not_live'
2686 if live_status:
2687 info_dict['live_status'] = live_status
2688 for key in live_keys:
2689 if info_dict.get(key) is None:
2690 info_dict[key] = (live_status == key)
2691 if live_status == 'post_live':
2692 info_dict['was_live'] = True
2694 # Auto generate title fields corresponding to the *_number fields when missing
2695 # in order to always have clean titles. This is very common for TV series.
2696 for field in ('chapter', 'season', 'episode'):
2697 if final and info_dict.get(f'{field}_number') is not None and not info_dict.get(field):
2698 info_dict[field] = '%s %d' % (field.capitalize(), info_dict[f'{field}_number'])
2700 for old_key, new_key in self._deprecated_multivalue_fields.items():
2701 if new_key in info_dict and old_key in info_dict:
2702 if '_version' not in info_dict: # HACK: Do not warn when using --load-info-json
2703 self.deprecation_warning(f'Do not return {old_key!r} when {new_key!r} is present')
2704 elif old_value := info_dict.get(old_key):
2705 info_dict[new_key] = old_value.split(', ')
2706 elif new_value := info_dict.get(new_key):
2707 info_dict[old_key] = ', '.join(v.replace(',', '\N{FULLWIDTH COMMA}') for v in new_value)
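# A worked example of the timestamp -> date derivation above
# (hypothetical info dict):
#
#   info = {'id': 'x', 'extractor': 'generic', 'title': 'T',
#           'timestamp': 1577836800}   # 2020-01-01T00:00:00Z
#   self._fill_common_fields(info)     # sets info['upload_date'] = '20200101'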
2709 def _raise_pending_errors(self, info):
2710 err = info.pop('__pending_error', None)
2711 if err:
2712 self.report_error(err, tb=False)
2714 def sort_formats(self, info_dict):
2715 formats = self._get_formats(info_dict)
2716 formats.sort(key=FormatSorter(
2717 self, info_dict.get('_format_sort_fields') or []).calculate_preference)
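# A minimal sketch: the user-side equivalent of these sort fields is
# --format-sort (-S); e.g. prefer the largest resolution no higher than 1080p,
# then higher fps:
#
#   ydl = yt_dlp.YoutubeDL({'format_sort': ['res:1080', 'fps']})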
2719 def process_video_result(self, info_dict, download=True):
2720 assert info_dict.get('_type', 'video') == 'video'
2721 self._num_videos += 1
2723 if 'id' not in info_dict:
2724 raise ExtractorError('Missing "id" field in extractor result', ie=info_dict['extractor'])
2725 elif not info_dict.get('id'):
2726 raise ExtractorError('Extractor failed to obtain "id"', ie=info_dict['extractor'])
2728 def report_force_conversion(field, field_not, conversion):
2729 self.report_warning(
2730 f'"{field}" field is not {field_not} - forcing {conversion} conversion, '
2731 'there is an error in extractor')
2733 def sanitize_string_field(info, string_field):
2734 field = info.get(string_field)
2735 if field is None or isinstance(field, str):
2736 return
2737 report_force_conversion(string_field, 'a string', 'string')
2738 info[string_field] = str(field)
2740 def sanitize_numeric_fields(info):
2741 for numeric_field in self._NUMERIC_FIELDS:
2742 field = info.get(numeric_field)
2743 if field is None or isinstance(field, (int, float)):
2744 continue
2745 report_force_conversion(numeric_field, 'numeric', 'int')
2746 info[numeric_field] = int_or_none(field)
2748 sanitize_string_field(info_dict, 'id')
2749 sanitize_numeric_fields(info_dict)
2750 if info_dict.get('section_end') and info_dict.get('section_start') is not None:
2751 info_dict['duration'] = round(info_dict['section_end'] - info_dict['section_start'], 3)
2752 if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None):
2753 self.report_warning('"duration" field is negative, there is an error in extractor')
2755 chapters = info_dict.get('chapters') or []
2756 if chapters and chapters[0].get('start_time'):
2757 chapters.insert(0, {'start_time': 0})
2759 dummy_chapter = {'end_time': 0, 'start_time': info_dict.get('duration')}
2760 for idx, (prev, current, next_) in enumerate(zip(
2761 (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter)), 1):
2762 if current.get('start_time') is None:
2763 current['start_time'] = prev.get('end_time')
2764 if not current.get('end_time'):
2765 current['end_time'] = next_.get('start_time')
2766 if not current.get('title'):
2767 current['title'] = f'<Untitled Chapter {idx}>'
2769 if 'playlist' not in info_dict:
2770 # It isn't part of a playlist
2771 info_dict['playlist'] = None
2772 info_dict['playlist_index'] = None
2774 self._sanitize_thumbnails(info_dict)
2776 thumbnail = info_dict.get('thumbnail')
2777 thumbnails = info_dict.get('thumbnails')
2778 if thumbnail:
2779 info_dict['thumbnail'] = sanitize_url(thumbnail)
2780 elif thumbnails:
2781 info_dict['thumbnail'] = thumbnails[-1]['url']
2783 if info_dict.get('display_id') is None and 'id' in info_dict:
2784 info_dict['display_id'] = info_dict['id']
2786 self._fill_common_fields(info_dict)
2788 for cc_kind in ('subtitles', 'automatic_captions'):
2789 cc = info_dict.get(cc_kind)
2790 if cc:
2791 for _, subtitle in cc.items():
2792 for subtitle_format in subtitle:
2793 if subtitle_format.get('url'):
2794 subtitle_format['url'] = sanitize_url(subtitle_format['url'])
2795 if subtitle_format.get('ext') is None:
2796 subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
2798 automatic_captions = info_dict.get('automatic_captions')
2799 subtitles = info_dict.get('subtitles')
2801 info_dict['requested_subtitles'] = self.process_subtitles(
2802 info_dict['id'], subtitles, automatic_captions)
2804 formats = self._get_formats(info_dict)
2806 # Backward compatibility with InfoExtractor._sort_formats
2807 field_preference = (formats or [{}])[0].pop('__sort_fields', None)
2808 if field_preference:
2809 info_dict['_format_sort_fields'] = field_preference
2811 info_dict['_has_drm'] = any( # or None ensures --clean-infojson removes it
2812 f.get('has_drm') and f['has_drm'] != 'maybe' for f in formats) or None
2813 if not self.params.get('allow_unplayable_formats'):
2814 formats = [f for f in formats if not f.get('has_drm') or f['has_drm'] == 'maybe']
2816 if formats and all(f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
2817 self.report_warning(
2818 f'{"This video is DRM protected and " if info_dict["_has_drm"] else ""}'
2819 'only images are available for download. Use --list-formats to see them'.capitalize())
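# Non-live videos are always downloaded from the start; for live
# streams, downloading from the start requires --live-from-start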
2821 get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
2822 if not get_from_start:
2823 info_dict['title'] += ' ' + dt.datetime.now().strftime('%Y-%m-%d %H:%M')
2824 if info_dict.get('is_live') and formats:
2825 formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
2826 if get_from_start and not formats:
2827 self.raise_no_formats(info_dict, msg=(
2828 '--live-from-start is passed, but there are no formats that can be downloaded from the start. '
2829 'If you want to download from the current time, use --no-live-from-start'))
2831 def is_wellformed(f):
2832 url = f.get('url')
2833 if not url:
2834 self.report_warning(
2835 '"url" field is missing or empty - skipping format, '
2836 'there is an error in the extractor')
2837 return False
2838 if isinstance(url, bytes):
2839 sanitize_string_field(f, 'url')
2840 return True
2842 # Filter out malformed formats for better extraction robustness
2843 formats = list(filter(is_wellformed, formats or []))
2845 if not formats:
2846 self.raise_no_formats(info_dict)
2848 for fmt in formats:
2849 sanitize_string_field(fmt, 'format_id')
2850 sanitize_numeric_fields(fmt)
2851 fmt['url'] = sanitize_url(fmt['url'])
2852 FormatSorter._fill_sorting_fields(fmt)
2853 if fmt['ext'] in ('aac', 'opus', 'mp3', 'flac', 'vorbis'):
2854 if fmt.get('acodec') is None:
2855 fmt['acodec'] = fmt['ext']
2856 if fmt.get('resolution') is None:
2857 fmt['resolution'] = self.format_resolution(fmt, default=None)
2858 if fmt.get('dynamic_range') is None and fmt.get('vcodec') != 'none':
2859 fmt['dynamic_range'] = 'SDR'
2860 if fmt.get('aspect_ratio') is None:
2861 fmt['aspect_ratio'] = try_call(lambda: round(fmt['width'] / fmt['height'], 2))
2862 # For fragmented formats, "tbr" is often max bitrate and not average
2863 if (('manifest-filesize-approx' in self.params['compat_opts'] or not fmt.get('manifest_url'))
2864 and not fmt.get('filesize') and not fmt.get('filesize_approx')):
2865 fmt['filesize_approx'] = filesize_from_tbr(fmt.get('tbr'), info_dict.get('duration'))
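# Resolve the per-format HTTP headers; the ChainMap lets format fields
# shadow the corresponding info_dict fields, and load_cookies=True
# additionally routes any explicit cookies through the cookiejar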
2866 fmt['http_headers'] = self._calc_headers(collections.ChainMap(fmt, info_dict), load_cookies=True)
2868 # Safeguard against old/insecure infojson when using --load-info-json
2869 if info_dict.get('http_headers'):
2870 info_dict['http_headers'] = HTTPHeaderDict(info_dict['http_headers'])
2871 info_dict['http_headers'].pop('Cookie', None)
2873 # This is copied to http_headers by the above _calc_headers and can now be removed
2874 if '__x_forwarded_for_ip' in info_dict:
2875 del info_dict['__x_forwarded_for_ip']
2877 self.sort_formats({
2878 'formats': formats,
2879 '_format_sort_fields': info_dict.get('_format_sort_fields'),
2880 })
2882 # Sanitize and group by format_id
2883 formats_dict = {}
2884 for i, fmt in enumerate(formats):
2885 if not fmt.get('format_id'):
2886 fmt['format_id'] = str(i)
2887 else:
2888 # Sanitize format_id from characters used in format selector expression
2889 fmt['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', fmt['format_id'])
2890 formats_dict.setdefault(fmt['format_id'], []).append(fmt)
2892 # Make sure all formats have unique format_id
2893 common_exts = set(itertools.chain(*self._format_selection_exts.values()))
2894 for format_id, ambiguous_formats in formats_dict.items():
2895 ambiguous_id = len(ambiguous_formats) > 1
2896 for i, fmt in enumerate(ambiguous_formats):
2897 if ambiguous_id:
2898 fmt['format_id'] = f'{format_id}-{i}'
2899 # Ensure there is no conflict between id and ext in format selection
2900 # See https://github.com/yt-dlp/yt-dlp/issues/1282
2901 if fmt['format_id'] != fmt['ext'] and fmt['format_id'] in common_exts:
2902 fmt['format_id'] = 'f{}'.format(fmt['format_id'])
2904 if fmt.get('format') is None:
2905 fmt['format'] = '{id} - {res}{note}'.format(
2906 id=fmt['format_id'],
2907 res=self.format_resolution(fmt),
2908 note=format_field(fmt, 'format_note', ' (%s)'),
2909 )
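# _check_formats tests the formats lazily, starting from the best (last)
# one; reversing the input and wrapping in LazyList(reverse=True)
# preserves the original ordering for later format selection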
2911 if self.params.get('check_formats') is True:
2912 formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
2914 if not formats or formats[0] is not info_dict:
2915 # Only set the 'formats' field if the original info_dict lists them;
2916 # otherwise we would end up with a circular reference: the first (and only)
2917 # element of the 'formats' field in info_dict would be info_dict itself,
2918 # which can't be exported to JSON
2919 info_dict['formats'] = formats
2921 info_dict, _ = self.pre_process(info_dict)
2923 if self._match_entry(info_dict, incomplete=self._format_fields) is not None:
2924 return info_dict
2926 self.post_extract(info_dict)
2927 info_dict, _ = self.pre_process(info_dict, 'after_filter')
2929 # The pre-processors may have modified the formats
2930 formats = self._get_formats(info_dict)
2932 list_only = self.params.get('simulate') == 'list_only'
2933 interactive_format_selection = not list_only and self.format_selector == '-'
2934 if self.params.get('list_thumbnails'):
2935 self.list_thumbnails(info_dict)
2936 if self.params.get('listsubtitles'):
2937 if 'automatic_captions' in info_dict:
2938 self.list_subtitles(
2939 info_dict['id'], automatic_captions, 'automatic captions')
2940 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
2941 if self.params.get('listformats') or interactive_format_selection:
2942 self.list_formats(info_dict)
2943 if list_only:
2944 # Without this printing, -F --print-json will not work
2945 self.__forced_printings(info_dict)
2946 return info_dict
2948 format_selector = self.format_selector
2949 while True:
2950 if interactive_format_selection:
2951 req_format = input(self._format_screen('\nEnter format selector ', self.Styles.EMPHASIS)
2952 + '(Press ENTER for default, or Ctrl+C to quit)'
2953 + self._format_screen(': ', self.Styles.EMPHASIS))
2954 try:
2955 format_selector = self.build_format_selector(req_format) if req_format else None
2956 except SyntaxError as err:
2957 self.report_error(err, tb=False, is_error=False)
2958 continue
2960 if format_selector is None:
2961 req_format = self._default_format_spec(info_dict)
2962 self.write_debug(f'Default format spec: {req_format}')
2963 format_selector = self.build_format_selector(req_format)
2965 formats_to_download = self._select_formats(formats, format_selector)
2966 if interactive_format_selection and not formats_to_download:
2967 self.report_error('Requested format is not available', tb=False, is_error=False)
2968 continue
2969 break
2971 if not formats_to_download:
2972 if not self.params.get('ignore_no_formats_error'):
2973 raise ExtractorError(
2974 'Requested format is not available. Use --list-formats for a list of available formats',
2975 expected=True, video_id=info_dict['id'], ie=info_dict['extractor'])
2976 self.report_warning('Requested format is not available')
2977 # Process what we can, even without any available formats.
2978 formats_to_download = [{}]
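# 'download_ranges' is a callable returning the sections to download;
# the default returns a single empty dict, i.e. the entire video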
2980 requested_ranges = tuple(self.params.get('download_ranges', lambda *_: [{}])(info_dict, self))
2981 best_format, downloaded_formats = formats_to_download[-1], []
2982 if download:
2983 if best_format and requested_ranges:
2984 def to_screen(*msg):
2985 self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}')
2987 to_screen(f'Downloading {len(formats_to_download)} format(s):',
2988 (f['format_id'] for f in formats_to_download))
2989 if requested_ranges != ({}, ):
2990 to_screen(f'Downloading {len(requested_ranges)} time ranges:',
2991 (f'{c["start_time"]:.1f}-{c["end_time"]:.1f}' for c in requested_ranges))
2992 max_downloads_reached = False
2994 for fmt, chapter in itertools.product(formats_to_download, requested_ranges):
2995 new_info = self._copy_infodict(info_dict)
2996 new_info.update(fmt)
2997 offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf')
2998 end_time = offset + min(chapter.get('end_time', duration), duration)
2999 # duration may not be accurate. So allow deviations <1sec
3000 if end_time == float('inf') or end_time > offset + duration + 1:
3001 end_time = None
3002 if chapter or offset:
3003 new_info.update({
3004 'section_start': offset + chapter.get('start_time', 0),
3005 'section_end': end_time,
3006 'section_title': chapter.get('title'),
3007 'section_number': chapter.get('index'),
3008 })
3009 downloaded_formats.append(new_info)
3010 try:
3011 self.process_info(new_info)
3012 except MaxDownloadsReached:
3013 max_downloads_reached = True
3014 self._raise_pending_errors(new_info)
3015 # Remove copied info
3016 for key, val in tuple(new_info.items()):
3017 if info_dict.get(key) == val:
3018 new_info.pop(key)
3019 if max_downloads_reached:
3020 break
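# Record the video in the download archive only if at least one
# downloaded format requested it and none vetoed it (entries marked
# 'ignore' are neutral)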
3022 write_archive = {f.get('__write_download_archive', False) for f in downloaded_formats}
3023 assert write_archive.issubset({True, False, 'ignore'})
3024 if True in write_archive and False not in write_archive:
3025 self.record_download_archive(info_dict)
3027 info_dict['requested_downloads'] = downloaded_formats
3028 info_dict = self.run_all_pps('after_video', info_dict)
3029 if max_downloads_reached:
3030 raise MaxDownloadsReached
3032 # We update the info dict with the selected best quality format (backwards compatibility)
3033 info_dict.update(best_format)
3034 return info_dict
3036 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
3037 """Select the requested subtitles and their format"""
3038 available_subs, normal_sub_langs = {}, []
3039 if normal_subtitles and self.params.get('writesubtitles'):
3040 available_subs.update(normal_subtitles)
3041 normal_sub_langs = tuple(normal_subtitles.keys())
3042 if automatic_captions and self.params.get('writeautomaticsub'):
3043 for lang, cap_info in automatic_captions.items():
3044 if lang not in available_subs:
3045 available_subs[lang] = cap_info
3047 if not available_subs or (
3048 not self.params.get('writesubtitles')
3049 and not self.params.get('writeautomaticsub')):
3050 return None
3052 all_sub_langs = tuple(available_subs.keys())
3053 if self.params.get('allsubtitles', False):
3054 requested_langs = all_sub_langs
3055 elif self.params.get('subtitleslangs', False):
3056 try:
3057 requested_langs = orderedSet_from_options(
3058 self.params.get('subtitleslangs'), {'all': all_sub_langs}, use_regex=True)
3059 except re.error as e:
3060 raise ValueError(f'Wrong regex for subtitleslangs: {e.pattern}')
3061 else:
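# No subtitle languages were explicitly requested: prefer English
# (exact match, then en* variants) from manual subtitles, then from
# automatic captions, and finally fall back to the first available language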
3062 requested_langs = LazyList(itertools.chain(
3063 ['en'] if 'en' in normal_sub_langs else [],
3064 filter(lambda f: f.startswith('en'), normal_sub_langs),
3065 ['en'] if 'en' in all_sub_langs else [],
3066 filter(lambda f: f.startswith('en'), all_sub_langs),
3067 normal_sub_langs, all_sub_langs,
3068 ))[:1]
3069 if requested_langs:
3070 self.to_screen(f'[info] {video_id}: Downloading subtitles: {", ".join(requested_langs)}')
3072 formats_query = self.params.get('subtitlesformat', 'best')
3073 formats_preference = formats_query.split('/') if formats_query else []
3074 subs = {}
3075 for lang in requested_langs:
3076 formats = available_subs.get(lang)
3077 if formats is None:
3078 self.report_warning(f'{lang} subtitles not available for {video_id}')
3079 continue
3080 for ext in formats_preference:
3081 if ext == 'best':
3082 f = formats[-1]
3083 break
3084 matches = list(filter(lambda f: f['ext'] == ext, formats))
3085 if matches:
3086 f = matches[-1]
3087 break
3088 else:
3089 f = formats[-1]
3090 self.report_warning(
3091 'No subtitle format found matching "{}" for language {}, '
3092 'using {}. Use --list-subs for a list of available subtitles'.format(formats_query, lang, f['ext']))
3093 subs[lang] = f
3094 return subs
3096 def _forceprint(self, key, info_dict):
3097 if info_dict is None:
3098 return
3099 info_copy = info_dict.copy()
3100 info_copy.setdefault('filename', self.prepare_filename(info_dict))
3101 if info_dict.get('requested_formats') is not None:
3102 # For RTMP URLs, also include the playpath
3103 info_copy['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
3104 elif info_dict.get('url'):
3105 info_copy['urls'] = info_dict['url'] + info_dict.get('play_path', '')
3106 info_copy['formats_table'] = self.render_formats_table(info_dict)
3107 info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict)
3108 info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
3109 info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
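# Translate the shorthand accepted by --print into a full output
# template: bare field names become '%(field)s', '{...}' groups are
# rendered as JSON, and a trailing '=' prints 'field = <json value>'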
3111 def format_tmpl(tmpl):
3112 mobj = re.fullmatch(r'([\w.:,]|-\d|(?P<dict>{([\w.:,]|-\d)+}))+=?', tmpl)
3113 if not mobj:
3114 return tmpl
3116 fmt = '%({})s'
3117 if tmpl.startswith('{'):
3118 tmpl, fmt = f'.{tmpl}', '%({})j'
3119 if tmpl.endswith('='):
3120 tmpl, fmt = tmpl[:-1], '{0} = %({0})#j'
3121 return '\n'.join(map(fmt.format, [tmpl] if mobj.group('dict') else tmpl.split(',')))
3123 for tmpl in self.params['forceprint'].get(key, []):
3124 self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
3126 for tmpl, file_tmpl in self.params['print_to_file'].get(key, []):
3127 filename = self.prepare_filename(info_dict, outtmpl=file_tmpl)
3128 tmpl = format_tmpl(tmpl)
3129 self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
3130 if self._ensure_dir_exists(filename):
3131 with open(filename, 'a', encoding='utf-8', newline='') as f:
3132 f.write(self.evaluate_outtmpl(tmpl, info_copy) + os.linesep)
3134 return info_copy
3136 def __forced_printings(self, info_dict, filename=None, incomplete=True):
3137 if (self.params.get('forcejson')
3138 or self.params['forceprint'].get('video')
3139 or self.params['print_to_file'].get('video')):
3140 self.post_extract(info_dict)
3141 if filename:
3142 info_dict['filename'] = filename
3143 info_copy = self._forceprint('video', info_dict)
3145 def print_field(field, actual_field=None, optional=False):
3146 if actual_field is None:
3147 actual_field = field
3148 if self.params.get(f'force{field}') and (
3149 info_copy.get(field) is not None or (not optional and not incomplete)):
3150 self.to_stdout(info_copy[actual_field])
3152 print_field('title')
3153 print_field('id')
3154 print_field('url', 'urls')
3155 print_field('thumbnail', optional=True)
3156 print_field('description', optional=True)
3157 print_field('filename')
3158 if self.params.get('forceduration') and info_copy.get('duration') is not None:
3159 self.to_stdout(formatSeconds(info_copy['duration']))
3160 print_field('format')
3162 if self.params.get('forcejson'):
3163 self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
3165 def dl(self, name, info, subtitle=False, test=False):
3166 if not info.get('url'):
3167 self.raise_no_formats(info, True)
3169 if test:
3170 verbose = self.params.get('verbose')
3171 quiet = self.params.get('quiet') or not verbose
3172 params = {
3173 'test': True,
3174 'quiet': quiet,
3175 'verbose': verbose,
3176 'noprogress': quiet,
3177 'nopart': True,
3178 'skip_unavailable_fragments': False,
3179 'keep_fragments': False,
3180 'overwrites': True,
3181 '_no_ytdl_file': True,
3182 }
3183 else:
3184 params = self.params
3185 fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
3186 if not test:
3187 for ph in self._progress_hooks:
3188 fd.add_progress_hook(ph)
3189 urls = '", "'.join(
3190 (f['url'].split(',')[0] + ',<data>' if f['url'].startswith('data:') else f['url'])
3191 for f in info.get('requested_formats', []) or [info])
3192 self.write_debug(f'Invoking {fd.FD_NAME} downloader on "{urls}"')
3194 # Note: Ideally, info should be deep-copied so that hooks cannot modify it.
3195 # But it may contain objects that are not deep-copyable
3196 new_info = self._copy_infodict(info)
3197 if new_info.get('http_headers') is None:
3198 new_info['http_headers'] = self._calc_headers(new_info)
3199 return fd.download(name, new_info, subtitle)
3201 def existing_file(self, filepaths, *, default_overwrite=True):
3202 existing_files = list(filter(os.path.exists, orderedSet(filepaths)))
3203 if existing_files and not self.params.get('overwrites', default_overwrite):
3204 return existing_files[0]
3206 for file in existing_files:
3207 self.report_file_delete(file)
3208 os.remove(file)
3209 return None
3211 @_catch_unsafe_extension_error
3212 def process_info(self, info_dict):
3213 """Process a single resolved IE result. (Modifies it in-place)"""
3215 assert info_dict.get('_type', 'video') == 'video'
3216 original_infodict = info_dict
3218 if 'format' not in info_dict and 'ext' in info_dict:
3219 info_dict['format'] = info_dict['ext']
3221 if self._match_entry(info_dict) is not None:
3222 info_dict['__write_download_archive'] = 'ignore'
3223 return
3225 # Does nothing under normal operation - for backward compatibility of process_info
3226 self.post_extract(info_dict)
3228 def replace_info_dict(new_info):
3229 nonlocal info_dict
3230 if new_info == info_dict:
3231 return
3232 info_dict.clear()
3233 info_dict.update(new_info)
3235 new_info, _ = self.pre_process(info_dict, 'video')
3236 replace_info_dict(new_info)
3237 self._num_downloads += 1
3239 # info_dict['_filename'] needs to be set for backward compatibility
3240 info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
3241 temp_filename = self.prepare_filename(info_dict, 'temp')
3242 files_to_move = {}
3244 # Forced printings
3245 self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
3247 def check_max_downloads():
3248 if self._num_downloads >= float(self.params.get('max_downloads') or 'inf'):
3249 raise MaxDownloadsReached
3251 if self.params.get('simulate'):
3252 info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
3253 check_max_downloads()
3254 return
3256 if full_filename is None:
3257 return
3258 if not self._ensure_dir_exists(full_filename):
3259 return
3260 if not self._ensure_dir_exists(temp_filename):
3261 return
3263 if self._write_description('video', info_dict,
3264 self.prepare_filename(info_dict, 'description')) is None:
3265 return
3267 sub_files = self._write_subtitles(info_dict, temp_filename)
3268 if sub_files is None:
3269 return
3270 files_to_move.update(dict(sub_files))
3272 thumb_files = self._write_thumbnails(
3273 'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
3274 if thumb_files is None:
3275 return
3276 files_to_move.update(dict(thumb_files))
3278 infofn = self.prepare_filename(info_dict, 'infojson')
3279 _infojson_written = self._write_info_json('video', info_dict, infofn)
3280 if _infojson_written:
3281 info_dict['infojson_filename'] = infofn
3282 # For backward compatibility, even though it was a private field
3283 info_dict['__infojson_filename'] = infofn
3284 elif _infojson_written is None:
3285 return
3287 # Note: Annotations are deprecated
3288 annofn = None
3289 if self.params.get('writeannotations', False):
3290 annofn = self.prepare_filename(info_dict, 'annotation')
3291 if annofn:
3292 if not self._ensure_dir_exists(annofn):
3293 return
3294 if not self.params.get('overwrites', True) and os.path.exists(annofn):
3295 self.to_screen('[info] Video annotations are already present')
3296 elif not info_dict.get('annotations'):
3297 self.report_warning('There are no annotations to write.')
3298 else:
3299 try:
3300 self.to_screen('[info] Writing video annotations to: ' + annofn)
3301 with open(annofn, 'w', encoding='utf-8') as annofile:
3302 annofile.write(info_dict['annotations'])
3303 except (KeyError, TypeError):
3304 self.report_warning('There are no annotations to write.')
3305 except OSError:
3306 self.report_error('Cannot write annotations file: ' + annofn)
3307 return
3309 # Write internet shortcut files
3310 def _write_link_file(link_type):
3311 url = try_get(info_dict['webpage_url'], iri_to_uri)
3312 if not url:
3313 self.report_warning(
3314 f'Cannot write internet shortcut file because the actual URL of "{info_dict["webpage_url"]}" is unknown')
3315 return True
3316 linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext'))
3317 if not self._ensure_dir_exists(linkfn):
3318 return False
3319 if not self.params.get('overwrites', True) and os.path.exists(linkfn):
3320 self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present')
3321 return True
3322 try:
3323 self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}')
3324 with open(to_high_limit_path(linkfn), 'w', encoding='utf-8',
3325 newline='\r\n' if link_type == 'url' else '\n') as linkfile:
3326 template_vars = {'url': url}
3327 if link_type == 'desktop':
3328 template_vars['filename'] = linkfn[:-(len(link_type) + 1)]
3329 linkfile.write(LINK_TEMPLATES[link_type] % template_vars)
3330 except OSError:
3331 self.report_error(f'Cannot write internet shortcut {linkfn}')
3332 return False
3333 return True
3335 write_links = {
3336 'url': self.params.get('writeurllink'),
3337 'webloc': self.params.get('writewebloclink'),
3338 'desktop': self.params.get('writedesktoplink'),
3339 }
3340 if self.params.get('writelink'):
3341 link_type = ('webloc' if sys.platform == 'darwin'
3342 else 'desktop' if sys.platform.startswith('linux')
3343 else 'url')
3344 write_links[link_type] = True
3346 if any(should_write and not _write_link_file(link_type)
3347 for link_type, should_write in write_links.items()):
3348 return
3350 new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
3351 replace_info_dict(new_info)
3353 if self.params.get('skip_download'):
3354 info_dict['filepath'] = temp_filename
3355 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(full_filename))
3356 info_dict['__files_to_move'] = files_to_move
3357 replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict))
3358 info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
3359 else:
3360 # Download
3361 info_dict.setdefault('__postprocessors', [])
3362 try:
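# Look for an already-downloaded file under both the final extension
# (final_ext, e.g. after a recode/remux) and the original extension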
3364 def existing_video_file(*filepaths):
3365 ext = info_dict.get('ext')
3366 converted = lambda file: replace_extension(file, self.params.get('final_ext') or ext, ext)
3367 file = self.existing_file(itertools.chain(*zip(map(converted, filepaths), filepaths)),
3368 default_overwrite=False)
3369 if file:
3370 info_dict['ext'] = os.path.splitext(file)[1][1:]
3371 return file
3373 fd, success = None, True
3374 if info_dict.get('protocol') or info_dict.get('url'):
3375 fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
3376 if fd != FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and (
3377 info_dict.get('section_start') or info_dict.get('section_end')):
3378 msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
3379 else 'You have requested downloading the video partially, but ffmpeg is not installed')
3380 self.report_error(f'{msg}. Aborting')
3381 return
3383 if info_dict.get('requested_formats') is not None:
3384 old_ext = info_dict['ext']
3385 if self.params.get('merge_output_format') is None:
3386 if (info_dict['ext'] == 'webm'
3387 and info_dict.get('thumbnails')
3388 # check with type instead of pp_key, __name__, or isinstance
3389 # since we don't want any custom PPs to trigger this
3390 and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])): # noqa: E721
3391 info_dict['ext'] = 'mkv'
3392 self.report_warning(
3393 'webm doesn\'t support embedding a thumbnail, mkv will be used')
3394 new_ext = info_dict['ext']
3396 def correct_ext(filename, ext=new_ext):
3397 if filename == '-':
3398 return filename
3399 filename_real_ext = os.path.splitext(filename)[1][1:]
3400 filename_wo_ext = (
3401 os.path.splitext(filename)[0]
3402 if filename_real_ext in (old_ext, new_ext)
3403 else filename)
3404 return f'{filename_wo_ext}.{ext}'
3406 # Ensure filename always has a correct extension for successful merge
3407 full_filename = correct_ext(full_filename)
3408 temp_filename = correct_ext(temp_filename)
3409 dl_filename = existing_video_file(full_filename, temp_filename)
3411 info_dict['__real_download'] = False
3412 # NOTE: Copy so that original format dicts are not modified
3413 info_dict['requested_formats'] = list(map(dict, info_dict['requested_formats']))
3415 merger = FFmpegMergerPP(self)
3416 downloaded = []
3417 if dl_filename is not None:
3418 self.report_file_already_downloaded(dl_filename)
3419 elif fd:
3420 for f in info_dict['requested_formats'] if fd != FFmpegFD else []:
3421 f['filepath'] = fname = prepend_extension(
3422 correct_ext(temp_filename, info_dict['ext']),
3423 'f{}'.format(f['format_id']), info_dict['ext'])
3424 downloaded.append(fname)
3425 info_dict['url'] = '\n'.join(f['url'] for f in info_dict['requested_formats'])
3426 success, real_download = self.dl(temp_filename, info_dict)
3427 info_dict['__real_download'] = real_download
3428 else:
3429 if self.params.get('allow_unplayable_formats'):
3430 self.report_warning(
3431 'You have requested merging of multiple formats '
3432 'while also allowing unplayable formats to be downloaded. '
3433 'The formats won\'t be merged to prevent data corruption.')
3434 elif not merger.available:
3435 msg = 'You have requested merging of multiple formats but ffmpeg is not installed'
3436 if not self.params.get('ignoreerrors'):
3437 self.report_error(f'{msg}. Aborting due to --abort-on-error')
3438 return
3439 self.report_warning(f'{msg}. The formats won\'t be merged')
3441 if temp_filename == '-':
3442 reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict, self.params)
3443 else 'but the formats are incompatible for simultaneous download' if merger.available
3444 else 'but ffmpeg is not installed')
3445 self.report_warning(
3446 f'You have requested downloading multiple formats to stdout {reason}. '
3447 'The formats will be streamed one after the other')
3448 fname = temp_filename
3449 for f in info_dict['requested_formats']:
3450 new_info = dict(info_dict)
3451 del new_info['requested_formats']
3452 new_info.update(f)
3453 if temp_filename != '-':
3454 fname = prepend_extension(
3455 correct_ext(temp_filename, new_info['ext']),
3456 'f{}'.format(f['format_id']), new_info['ext'])
3457 if not self._ensure_dir_exists(fname):
3458 return
3459 f['filepath'] = fname
3460 downloaded.append(fname)
3461 partial_success, real_download = self.dl(fname, new_info)
3462 info_dict['__real_download'] = info_dict['__real_download'] or real_download
3463 success = success and partial_success
3465 if downloaded and merger.available and not self.params.get('allow_unplayable_formats'):
3466 info_dict['__postprocessors'].append(merger)
3467 info_dict['__files_to_merge'] = downloaded
3468 # Even if there were no downloads, the merge is only being performed now
3469 info_dict['__real_download'] = True
3470 else:
3471 for file in downloaded:
3472 files_to_move[file] = None
3473 else:
3474 # Just a single file
3475 dl_filename = existing_video_file(full_filename, temp_filename)
3476 if dl_filename is None or dl_filename == temp_filename:
3477 # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
3478 # So we should try to resume the download
3479 success, real_download = self.dl(temp_filename, info_dict)
3480 info_dict['__real_download'] = real_download
3481 else:
3482 self.report_file_already_downloaded(dl_filename)
3484 dl_filename = dl_filename or temp_filename
3485 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(full_filename))
3487 except network_exceptions as err:
3488 self.report_error(f'unable to download video data: {err}')
3489 return
3490 except OSError as err:
3491 raise UnavailableVideoError(err)
3492 except ContentTooShortError as err:
3493 self.report_error(f'content too short (expected {err.expected} bytes and served {err.downloaded})')
3494 return
3496 self._raise_pending_errors(info_dict)
3497 if success and full_filename != '-':
3499 def fixup():
3500 do_fixup = True
3501 fixup_policy = self.params.get('fixup')
3502 vid = info_dict['id']
3504 if fixup_policy in ('ignore', 'never'):
3505 return
3506 elif fixup_policy == 'warn':
3507 do_fixup = 'warn'
3508 elif fixup_policy != 'force':
3509 assert fixup_policy in ('detect_or_warn', None)
3510 if not info_dict.get('__real_download'):
3511 do_fixup = False
3513 def ffmpeg_fixup(cndn, msg, cls):
3514 if not (do_fixup and cndn):
3515 return
3516 elif do_fixup == 'warn':
3517 self.report_warning(f'{vid}: {msg}')
3518 return
3519 pp = cls(self)
3520 if pp.available:
3521 info_dict['__postprocessors'].append(pp)
3522 else:
3523 self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')
3525 stretched_ratio = info_dict.get('stretched_ratio')
3526 ffmpeg_fixup(stretched_ratio not in (1, None),
3527 f'Non-uniform pixel ratio {stretched_ratio}',
3528 FFmpegFixupStretchedPP)
3530 downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None
3531 downloader = downloader.FD_NAME if downloader else None
3533 ext = info_dict.get('ext')
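# Skip the container-related fixups when ffmpeg is going to remux the
# file anyway, i.e. when formats are merged or a video convertor
# re-encodes into a different container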
3534 postprocessed_by_ffmpeg = info_dict.get('requested_formats') or any((
3535 isinstance(pp, FFmpegVideoConvertorPP)
3536 and resolve_recode_mapping(ext, pp.mapping)[0] not in (ext, None)
3537 ) for pp in self._pps['post_process'])
3539 if not postprocessed_by_ffmpeg:
3540 ffmpeg_fixup(fd != FFmpegFD and ext == 'm4a'
3541 and info_dict.get('container') == 'm4a_dash',
3542 'writing DASH m4a. Only some players support this container',
3543 FFmpegFixupM4aPP)
3544 ffmpeg_fixup((downloader == 'hlsnative' and not self.params.get('hls_use_mpegts'))
3545 or (info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None),
3546 'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
3547 FFmpegFixupM3u8PP)
3548 ffmpeg_fixup(downloader == 'dashsegments'
3549 and (info_dict.get('is_live') or info_dict.get('is_dash_periods')),
3550 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP)
3552 ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP)
3553 ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed duration detected', FFmpegFixupDurationPP)
3555 fixup()
3556 try:
3557 replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move))
3558 except PostProcessingError as err:
3559 self.report_error(f'Postprocessing: {err}')
3560 return
3561 try:
3562 for ph in self._post_hooks:
3563 ph(info_dict['filepath'])
3564 except Exception as err:
3565 self.report_error(f'post hooks: {err}')
3566 return
3567 info_dict['__write_download_archive'] = True
3569 assert info_dict is original_infodict # Make sure the info_dict was modified in-place
3570 if self.params.get('force_write_download_archive'):
3571 info_dict['__write_download_archive'] = True
3572 check_max_downloads()
3574 def __download_wrapper(self, func):
3575 @functools.wraps(func)
3576 def wrapper(*args, **kwargs):
3577 try:
3578 res = func(*args, **kwargs)
3579 except CookieLoadError:
3580 raise
3581 except UnavailableVideoError as e:
3582 self.report_error(e)
3583 except DownloadCancelled as e:
3584 self.to_screen(f'[info] {e}')
3585 if not self.params.get('break_per_url'):
3586 raise
3587 self._num_downloads = 0
3588 else:
3589 if self.params.get('dump_single_json', False):
3590 self.post_extract(res)
3591 self.to_stdout(json.dumps(self.sanitize_info(res)))
3592 return wrapper
3594 def download(self, url_list):
3595 """Download a given list of URLs."""
3596 url_list = variadic(url_list) # Passing a single URL is a common mistake
3597 outtmpl = self.params['outtmpl']['default']
3598 if (len(url_list) > 1
3599 and outtmpl != '-'
3600 and '%' not in outtmpl
3601 and self.params.get('max_downloads') != 1):
3602 raise SameFileError(outtmpl)
3604 for url in url_list:
3605 self.__download_wrapper(self.extract_info)(
3606 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
3608 return self._download_retcode
3610 def download_with_info_file(self, info_filename):
3611 with contextlib.closing(fileinput.FileInput(
3612 [info_filename], mode='r',
3613 openhook=fileinput.hook_encoded('utf-8'))) as f:
3614 # FileInput doesn't have a read method, so we can't call json.load
3615 infos = [self.sanitize_info(info, self.params.get('clean_infojson', True))
3616 for info in variadic(json.loads('\n'.join(f)))]
3617 for info in infos:
3618 try:
3619 self.__download_wrapper(self.process_ie_result)(info, download=True)
3620 except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e:
3621 if not isinstance(e, EntryNotInPlaylist):
3622 self.to_stderr('\r')
3623 webpage_url = info.get('webpage_url')
3624 if webpage_url is None:
3625 raise
3626 self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}')
3627 self.download([webpage_url])
3628 except ExtractorError as e:
3629 self.report_error(e)
3630 return self._download_retcode
3632 @staticmethod
3633 def sanitize_info(info_dict, remove_private_keys=False):
3634 """ Sanitize the infodict for converting to json """
3635 if info_dict is None:
3636 return info_dict
3637 info_dict.setdefault('epoch', int(time.time()))
3638 info_dict.setdefault('_type', 'video')
3639 info_dict.setdefault('_version', {
3640 'version': __version__,
3641 'current_git_head': current_git_head(),
3642 'release_git_head': RELEASE_GIT_HEAD,
3643 'repository': ORIGIN,
3644 })
3646 if remove_private_keys:
3647 reject = lambda k, v: v is None or k.startswith('__') or k in {
3648 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
3649 'entries', 'filepath', '_filename', 'filename', 'infojson_filename', 'original_url',
3650 'playlist_autonumber',
3651 }
3652 else:
3653 reject = lambda k, v: False
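# Recursively sanitize the values: JSON-native types are kept as-is,
# anything else is repr()'d so the result is always JSON-serializable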
3655 def filter_fn(obj):
3656 if isinstance(obj, dict):
3657 return {k: filter_fn(v) for k, v in obj.items() if not reject(k, v)}
3658 elif isinstance(obj, (list, tuple, set, LazyList)):
3659 return list(map(filter_fn, obj))
3660 elif obj is None or isinstance(obj, (str, int, float, bool)):
3661 return obj
3662 else:
3663 return repr(obj)
3665 return filter_fn(info_dict)
3667 @staticmethod
3668 def filter_requested_info(info_dict, actually_filter=True):
3669 """ Alias of sanitize_info for backward compatibility """
3670 return YoutubeDL.sanitize_info(info_dict, actually_filter)
3672 def _delete_downloaded_files(self, *files_to_delete, info={}, msg=None):
3673 for filename in set(filter(None, files_to_delete)):
3674 if msg:
3675 self.to_screen(msg % filename)
3676 try:
3677 os.remove(filename)
3678 except OSError:
3679 self.report_warning(f'Unable to delete file {filename}')
3680 if filename in info.get('__files_to_move', []): # NB: Delete even if None
3681 del info['__files_to_move'][filename]
3683 @staticmethod
3684 def post_extract(info_dict):
3685 def actual_post_extract(info_dict):
3686 if info_dict.get('_type') in ('playlist', 'multi_video'):
3687 for video_dict in info_dict.get('entries', {}):
3688 actual_post_extract(video_dict or {})
3689 return
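# '__post_extractor' is an optional callable set by the extractor for
# deferred extraction; merge whatever it returns (defaults to an empty dict)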
3691 post_extractor = info_dict.pop('__post_extractor', None) or dict
3692 info_dict.update(post_extractor())
3694 actual_post_extract(info_dict or {})
3696 def run_pp(self, pp, infodict):
3697 files_to_delete = []
3698 if '__files_to_move' not in infodict:
3699 infodict['__files_to_move'] = {}
3700 try:
3701 files_to_delete, infodict = pp.run(infodict)
3702 except PostProcessingError as e:
3703 # Must be True and not 'only_download'
3704 if self.params.get('ignoreerrors') is True:
3705 self.report_error(e)
3706 return infodict
3707 raise
3709 if not files_to_delete:
3710 return infodict
3711 if self.params.get('keepvideo', False):
3712 for f in files_to_delete:
3713 infodict['__files_to_move'].setdefault(f, '')
3714 else:
3715 self._delete_downloaded_files(
3716 *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)')
3717 return infodict
3719 def run_all_pps(self, key, info, *, additional_pps=None):
3720 if key != 'video':
3721 self._forceprint(key, info)
3722 for pp in (additional_pps or []) + self._pps[key]:
3723 info = self.run_pp(pp, info)
3724 return info
3726 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
3727 info = dict(ie_info)
3728 info['__files_to_move'] = files_to_move or {}
3729 try:
3730 info = self.run_all_pps(key, info)
3731 except PostProcessingError as err:
3732 msg = f'Preprocessing: {err}'
3733 info.setdefault('__pending_error', msg)
3734 self.report_error(msg, is_error=False)
3735 return info, info.pop('__files_to_move', None)
3737 def post_process(self, filename, info, files_to_move=None):
3738 """Run all the postprocessors on the given file."""
3739 info['filepath'] = filename
3740 info['__files_to_move'] = files_to_move or {}
3741 info = self.run_all_pps('post_process', info, additional_pps=info.get('__postprocessors'))
3742 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
3743 del info['__files_to_move']
3744 return self.run_all_pps('after_move', info)
3746 def _make_archive_id(self, info_dict):
3747 video_id = info_dict.get('id')
3748 if not video_id:
3749 return
3750 # Future-proof against any change in case
3751 # and backwards compatibility with prior versions
3752 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
3753 if extractor is None:
3754 url = str_or_none(info_dict.get('url'))
3755 if not url:
3756 return
3757 # Try to find matching extractor for the URL and take its ie_key
3758 for ie_key, ie in self._ies.items():
3759 if ie.suitable(url):
3760 extractor = ie_key
3761 break
3762 else:
3763 return
3764 return make_archive_id(extractor, video_id)
3766 def in_download_archive(self, info_dict):
3767 if not self.archive:
3768 return False
3770 vid_ids = [self._make_archive_id(info_dict)]
3771 vid_ids.extend(info_dict.get('_old_archive_ids') or [])
3772 return any(id_ in self.archive for id_ in vid_ids)
3774 def record_download_archive(self, info_dict):
3775 fn = self.params.get('download_archive')
3776 if fn is None:
3777 return
3778 vid_id = self._make_archive_id(info_dict)
3779 assert vid_id
3781 self.write_debug(f'Adding to archive: {vid_id}')
3782 if is_path_like(fn):
3783 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3784 archive_file.write(vid_id + '\n')
3785 self.archive.add(vid_id)
3787 @staticmethod
3788 def format_resolution(format, default='unknown'):
3789 if format.get('vcodec') == 'none' and format.get('acodec') != 'none':
3790 return 'audio only'
3791 if format.get('resolution') is not None:
3792 return format['resolution']
3793 if format.get('width') and format.get('height'):
3794 return '%dx%d' % (format['width'], format['height'])
3795 elif format.get('height'):
3796 return '{}p'.format(format['height'])
3797 elif format.get('width'):
3798 return '%dx?' % format['width']
3799 return default
3801 def _list_format_headers(self, *headers):
3802 if self.params.get('listformats_table', True) is not False:
3803 return [self._format_out(header, self.Styles.HEADERS) for header in headers]
3804 return headers
3806 def _format_note(self, fdict):
3807 res = ''
3808 if fdict.get('ext') in ['f4f', 'f4m']:
3809 res += '(unsupported)'
3810 if fdict.get('language'):
3811 if res:
3812 res += ' '
3813 res += '[{}]'.format(fdict['language'])
3814 if fdict.get('format_note') is not None:
3815 if res:
3816 res += ' '
3817 res += fdict['format_note']
3818 if fdict.get('tbr') is not None:
3819 if res:
3820 res += ', '
3821 res += '%4dk' % fdict['tbr']
3822 if fdict.get('container') is not None:
3823 if res:
3824 res += ', '
3825 res += '{} container'.format(fdict['container'])
3826 if (fdict.get('vcodec') is not None
3827 and fdict.get('vcodec') != 'none'):
3828 if res:
3829 res += ', '
3830 res += fdict['vcodec']
3831 if fdict.get('vbr') is not None:
3832 res += '@'
3833 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
3834 res += 'video@'
3835 if fdict.get('vbr') is not None:
3836 res += '%4dk' % fdict['vbr']
3837 if fdict.get('fps') is not None:
3838 if res:
3839 res += ', '
3840 res += '{}fps'.format(fdict['fps'])
3841 if fdict.get('acodec') is not None:
3842 if res:
3843 res += ', '
3844 if fdict['acodec'] == 'none':
3845 res += 'video only'
3846 else:
3847 res += '%-5s' % fdict['acodec']
3848 elif fdict.get('abr') is not None:
3849 if res:
3850 res += ', '
3851 res += 'audio'
3852 if fdict.get('abr') is not None:
3853 res += '@%3dk' % fdict['abr']
3854 if fdict.get('asr') is not None:
3855 res += ' (%5dHz)' % fdict['asr']
3856 if fdict.get('filesize') is not None:
3857 if res:
3858 res += ', '
3859 res += format_bytes(fdict['filesize'])
3860 elif fdict.get('filesize_approx') is not None:
3861 if res:
3862 res += ', '
3863 res += '~' + format_bytes(fdict['filesize_approx'])
3864 return res
3866 def _get_formats(self, info_dict):
3867 if info_dict.get('formats') is None:
3868 if info_dict.get('url') and info_dict.get('_type', 'video') == 'video':
3869 return [info_dict]
3870 return []
3871 return info_dict['formats']
3873 def render_formats_table(self, info_dict):
3874 formats = self._get_formats(info_dict)
3875 if not formats:
3876 return
3877 if self.params.get('listformats_table', True) is False:
3878 table = [
3879 [
3880 format_field(f, 'format_id'),
3881 format_field(f, 'ext'),
3882 self.format_resolution(f),
3883 self._format_note(f),
3884 ] for f in formats if (f.get('preference') or 0) >= -1000]
3885 return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1)
3887 def simplified_codec(f, field):
3888 assert field in ('acodec', 'vcodec')
3889 codec = f.get(field)
3890 if not codec:
3891 return 'unknown'
3892 elif codec != 'none':
3893 return '.'.join(codec.split('.')[:4])
3895 if field == 'vcodec' and f.get('acodec') == 'none':
3896 return 'images'
3897 elif field == 'acodec' and f.get('vcodec') == 'none':
3898 return ''
3899 return self._format_out('audio only' if field == 'vcodec' else 'video only',
3900 self.Styles.SUPPRESS)
3902 delim = self._format_out('\u2502', self.Styles.DELIM, '|', test_encoding=True)
3903 table = [
3904 [
3905 self._format_out(format_field(f, 'format_id'), self.Styles.ID),
3906 format_field(f, 'ext'),
3907 format_field(f, func=self.format_resolution, ignore=('audio only', 'images')),
3908 format_field(f, 'fps', '\t%d', func=round),
3909 format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
3910 format_field(f, 'audio_channels', '\t%s'),
3911 delim, (
3912 format_field(f, 'filesize', ' \t%s', func=format_bytes)
3913 or format_field(f, 'filesize_approx', '≈\t%s', func=format_bytes)
3914 or format_field(filesize_from_tbr(f.get('tbr'), info_dict.get('duration')), None,
3915 self._format_out('~\t%s', self.Styles.SUPPRESS), func=format_bytes)),
3916 format_field(f, 'tbr', '\t%dk', func=round),
3917 shorten_protocol_name(f.get('protocol', '')),
3918 delim,
3919 simplified_codec(f, 'vcodec'),
3920 format_field(f, 'vbr', '\t%dk', func=round),
3921 simplified_codec(f, 'acodec'),
3922 format_field(f, 'abr', '\t%dk', func=round),
3923 format_field(f, 'asr', '\t%s', func=format_decimal_suffix),
3924 join_nonempty(format_field(f, 'language', '[%s]'), join_nonempty(
3925 self._format_out('UNSUPPORTED', self.Styles.BAD_FORMAT) if f.get('ext') in ('f4f', 'f4m') else None,
3926 (self._format_out('Maybe DRM', self.Styles.WARNING) if f.get('has_drm') == 'maybe'
3927 else self._format_out('DRM', self.Styles.BAD_FORMAT) if f.get('has_drm') else None),
3928 format_field(f, 'format_note'),
3929 format_field(f, 'container', ignore=(None, f.get('ext'))),
3930 delim=', '), delim=' '),
3931 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3932 header_line = self._list_format_headers(
3933 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', 'CH', delim, '\tFILESIZE', '\tTBR', 'PROTO',
3934 delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO')
3936 return render_table(
3937 header_line, table, hide_empty=True,
3938 delim=self._format_out('\u2500', self.Styles.DELIM, '-', test_encoding=True))
3940 def render_thumbnails_table(self, info_dict):
3941 thumbnails = list(info_dict.get('thumbnails') or [])
3942 if not thumbnails:
3943 return None
3944 return render_table(
3945 self._list_format_headers('ID', 'Width', 'Height', 'URL'),
3946 [[t.get('id'), t.get('width') or 'unknown', t.get('height') or 'unknown', t['url']] for t in thumbnails])
3948 def render_subtitles_table(self, video_id, subtitles):
3949 def _row(lang, formats):
3950 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3951 if len(set(names)) == 1:
3952 names = [] if names[0] == 'unknown' else names[:1]
3953 return [lang, ', '.join(names), ', '.join(exts)]
3955 if not subtitles:
3956 return None
3957 return render_table(
3958 self._list_format_headers('Language', 'Name', 'Formats'),
3959 [_row(lang, formats) for lang, formats in subtitles.items()],
3960 hide_empty=True)
3962 def __list_table(self, video_id, name, func, *args):
3963 table = func(*args)
3964 if not table:
3965 self.to_screen(f'{video_id} has no {name}')
3966 return
3967 self.to_screen(f'[info] Available {name} for {video_id}:')
3968 self.to_stdout(table)
3970 def list_formats(self, info_dict):
3971 self.__list_table(info_dict['id'], 'formats', self.render_formats_table, info_dict)
3973 def list_thumbnails(self, info_dict):
3974 self.__list_table(info_dict['id'], 'thumbnails', self.render_thumbnails_table, info_dict)
3976 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3977 self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles)
3979 def print_debug_header(self):
3980 if not self.params.get('verbose'):
3981 return
3983 from . import _IN_CLI # Must be delayed import
3985 # These imports can be slow. So import them only as needed
3986 from .extractor.extractors import _LAZY_LOADER
3987 from .extractor.extractors import (
3988 _PLUGIN_CLASSES as plugin_ies,
3989 _PLUGIN_OVERRIDES as plugin_ie_overrides,
3990 )
3992 def get_encoding(stream):
3993 ret = str(getattr(stream, 'encoding', f'missing ({type(stream).__name__})'))
3994 additional_info = []
3995 if os.environ.get('TERM', '').lower() == 'dumb':
3996 additional_info.append('dumb')
3997 if not supports_terminal_sequences(stream):
3998 from .utils import WINDOWS_VT_MODE # Must be imported locally
3999 additional_info.append('No VT' if WINDOWS_VT_MODE is False else 'No ANSI')
4000 if additional_info:
4001 ret = f'{ret} ({",".join(additional_info)})'
4002 return ret
4004 encoding_str = 'Encodings: locale {}, fs {}, pref {}, {}'.format(
4005 locale.getpreferredencoding(),
4006 sys.getfilesystemencoding(),
4007 self.get_encoding(),
4008 ', '.join(
4009 f'{key} {get_encoding(stream)}' for key, stream in self._out_files.items_
4010 if stream is not None and key != 'console'),
4011 )
4013 logger = self.params.get('logger')
4014 if logger:
4015 write_debug = lambda msg: logger.debug(f'[debug] {msg}')
4016 write_debug(encoding_str)
4017 else:
4018 write_string(f'[debug] {encoding_str}\n', encoding=None)
4019 write_debug = lambda msg: self._write_string(f'[debug] {msg}\n')
4021 source = detect_variant()
4022 if VARIANT not in (None, 'pip'):
4023 source += '*'
4024 klass = type(self)
4025 write_debug(join_nonempty(
4026 f'{REPOSITORY.rpartition("/")[2]} version',
4027 _make_label(ORIGIN, CHANNEL.partition('@')[2] or __version__, __version__),
4028 f'[{RELEASE_GIT_HEAD[:9]}]' if RELEASE_GIT_HEAD else '',
4029 '' if source == 'unknown' else f'({source})',
4030 '' if _IN_CLI else 'API' if klass == YoutubeDL else f'API:{self.__module__}.{klass.__qualname__}',
4031 delim=' '))
4033 if not _IN_CLI:
4034 write_debug(f'params: {self.params}')
4036 if not _LAZY_LOADER:
4037 if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
4038 write_debug('Lazy loading extractors is forcibly disabled')
4039 else:
4040 write_debug('Lazy loading extractors is disabled')
4041 if self.params['compat_opts']:
4042 write_debug('Compatibility options: {}'.format(', '.join(self.params['compat_opts'])))
4044 if current_git_head():
4045 write_debug(f'Git HEAD: {current_git_head()}')
4046 write_debug(system_identifier())
4048 exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self)
4049 ffmpeg_features = {key for key, val in ffmpeg_features.items() if val}
4050 if ffmpeg_features:
4051 exe_versions['ffmpeg'] += ' ({})'.format(','.join(sorted(ffmpeg_features)))
4053 exe_versions['rtmpdump'] = rtmpdump_version()
4054 exe_versions['phantomjs'] = PhantomJSwrapper._version()
4055 exe_str = ', '.join(
4056 f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
4057 ) or 'none'
4058 write_debug(f'exe versions: {exe_str}')
4060 from .compat.compat_utils import get_package_info
4061 from .dependencies import available_dependencies
4063 write_debug('Optional libraries: %s' % (', '.join(sorted({
4064 join_nonempty(*get_package_info(m)) for m in available_dependencies.values()
4065 })) or 'none'))
4067 write_debug(f'Proxy map: {self.proxies}')
4068 write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}')
4069 if os.environ.get('YTDLP_NO_PLUGINS'):
4070 write_debug('Plugins are forcibly disabled')
4071 return
4073 for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items():
4074 display_list = ['{}{}'.format(
4075 klass.__name__, '' if klass.__name__ == name else f' as {name}')
4076 for name, klass in plugins.items()]
4077 if plugin_type == 'Extractor':
4078 display_list.extend(f'{plugins[-1].IE_NAME.partition("+")[2]} ({parent.__name__})'
4079 for parent, plugins in plugin_ie_overrides.items())
4080 if not display_list:
4081 continue
4082 write_debug(f'{plugin_type} Plugins: {", ".join(sorted(display_list))}')
4084 plugin_dirs = plugin_directories()
4085 if plugin_dirs:
4086 write_debug(f'Plugin directories: {plugin_dirs}')
4088 @functools.cached_property
4089 def proxies(self):
4090 """Global proxy configuration"""
4091 opts_proxy = self.params.get('proxy')
4092 if opts_proxy is not None:
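# An empty --proxy disables proxying entirely; '__noproxy__' is the
# internal sentinel understood by the request handlers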
4093 if opts_proxy == '':
4094 opts_proxy = '__noproxy__'
4095 proxies = {'all': opts_proxy}
4096 else:
4097 proxies = urllib.request.getproxies()
4098 # compat. Set HTTPS_PROXY to __noproxy__ to revert
4099 if 'http' in proxies and 'https' not in proxies:
4100 proxies['https'] = proxies['http']
4102 return proxies
4104 @functools.cached_property
4105 def cookiejar(self):
4106 """Global cookiejar instance"""
4107 try:
4108 return load_cookies(
4109 self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self)
4110 except CookieLoadError as error:
4111 cause = error.__context__
4112 # compat: <=py3.9: `traceback.format_exception` has a different signature
4113 self.report_error(str(cause), tb=''.join(traceback.format_exception(None, cause, cause.__traceback__)))
4114 raise
4116 @property
4117 def _opener(self):
4118 """
4119 Get a urllib OpenerDirector from the Urllib handler (deprecated).
4120 """
4121 self.deprecation_warning('YoutubeDL._opener is deprecated, use YoutubeDL.urlopen()')
4122 handler = self._request_director.handlers['Urllib']
4123 return handler._get_instance(cookiejar=self.cookiejar, proxies=self.proxies)
4125 def _get_available_impersonate_targets(self):
4126 # TODO(future): make available as public API
4127 return [
4128 (target, rh.RH_NAME)
4129 for rh in self._request_director.handlers.values()
4130 if isinstance(rh, ImpersonateRequestHandler)
4131 for target in rh.supported_targets
4132 ]
4134 def _impersonate_target_available(self, target):
4135 # TODO(future): make available as public API
4136 return any(
4137 rh.is_supported_target(target)
4138 for rh in self._request_director.handlers.values()
4139 if isinstance(rh, ImpersonateRequestHandler))
4141 def urlopen(self, req):
4142 """ Start an HTTP download """
4143 if isinstance(req, str):
4144 req = Request(req)
4145 elif isinstance(req, urllib.request.Request):
4146 self.deprecation_warning(
4147 'Passing a urllib.request.Request object to YoutubeDL.urlopen() is deprecated. '
4148 'Use yt_dlp.networking.common.Request instead.')
4149 req = urllib_req_to_req(req)
4150 assert isinstance(req, Request)
4152 # compat: Assume user:pass url params are basic auth
4153 url, basic_auth_header = extract_basic_auth(req.url)
4154 if basic_auth_header:
4155 req.headers['Authorization'] = basic_auth_header
4156 req.url = sanitize_url(url)
4158 clean_proxies(proxies=req.proxies, headers=req.headers)
4159 clean_headers(req.headers)
4161 try:
4162 return self._request_director.send(req)
4163 except NoSupportingHandlers as e:
4164 for ue in e.unsupported_errors:
4165 # FIXME: This depends on the order of errors.
4166 if not (ue.handler and ue.msg):
4167 continue
4168 if ue.handler.RH_KEY == 'Urllib' and 'unsupported url scheme: "file"' in ue.msg.lower():
4169 raise RequestError(
4170 'file:// URLs are disabled by default in yt-dlp for security reasons. '
4171 'Use --enable-file-urls to enable at your own risk.', cause=ue) from ue
4172 if (
4173 'unsupported proxy type: "https"' in ue.msg.lower()
4174 and 'requests' not in self._request_director.handlers
4175 and 'curl_cffi' not in self._request_director.handlers
4176 ):
4177 raise RequestError(
4178 'To use an HTTPS proxy for this request, one of the following dependencies needs to be installed: requests, curl_cffi')
4180 elif (
4181 re.match(r'unsupported url scheme: "wss?"', ue.msg.lower())
4182 and 'websockets' not in self._request_director.handlers
4183 ):
4184 raise RequestError(
4185 'This request requires WebSocket support. '
4186 'Ensure one of the following dependencies is installed: websockets',
4187 cause=ue) from ue
4189 elif re.match(r'unsupported (?:extensions: impersonate|impersonate target)', ue.msg.lower()):
4190 raise RequestError(
4191 f'Impersonate target "{req.extensions["impersonate"]}" is not available.'
4192 f' See --list-impersonate-targets for available targets.'
4193 f' This request requires browser impersonation, however you may be missing dependencies'
4194 f' required to support this target.')
4195 raise
4196 except SSLError as e:
4197 if 'UNSAFE_LEGACY_RENEGOTIATION_DISABLED' in str(e):
4198 raise RequestError('UNSAFE_LEGACY_RENEGOTIATION_DISABLED: Try using --legacy-server-connect', cause=e) from e
4199 elif 'SSLV3_ALERT_HANDSHAKE_FAILURE' in str(e):
4200 raise RequestError(
4201 'SSLV3_ALERT_HANDSHAKE_FAILURE: The server may not support the current cipher list. '
4202 'Try using --legacy-server-connect', cause=e) from e
4203 raise
4205 def build_request_director(self, handlers, preferences=None):
4206 logger = _YDLLogger(self)
4207 headers = self.params['http_headers'].copy()
4208 proxies = self.proxies.copy()
4209 clean_headers(headers)
4210 clean_proxies(proxies, headers)
4212 director = RequestDirector(logger=logger, verbose=self.params.get('debug_printtraffic'))
4213 for handler in handlers:
4214 director.add_handler(handler(
4215 logger=logger,
4216 headers=headers,
4217 cookiejar=self.cookiejar,
4218 proxies=proxies,
4219 prefer_system_certs='no-certifi' in self.params['compat_opts'],
4220 verify=not self.params.get('nocheckcertificate'),
4221 **traverse_obj(self.params, {
4222 'verbose': 'debug_printtraffic',
4223 'source_address': 'source_address',
4224 'timeout': 'socket_timeout',
4225 'legacy_ssl_support': 'legacyserverconnect',
4226 'enable_file_urls': 'enable_file_urls',
4227 'impersonate': 'impersonate',
4228 'client_cert': {
4229 'client_certificate': 'client_certificate',
4230 'client_certificate_key': 'client_certificate_key',
4231 'client_certificate_password': 'client_certificate_password',
4232 },
4233 }),
4234 ))
4235 director.preferences.update(preferences or [])
4236 if 'prefer-legacy-http-handler' in self.params['compat_opts']:
4237 director.preferences.add(lambda rh, _: 500 if rh.RH_KEY == 'Urllib' else 0)
4238 return director
4240 @functools.cached_property
4241 def _request_director(self):
4242 return self.build_request_director(_REQUEST_HANDLERS.values(), _RH_PREFERENCES)
4244 def encode(self, s):
4245 if isinstance(s, bytes):
4246 return s # Already encoded
4248 try:
4249 return s.encode(self.get_encoding())
4250 except UnicodeEncodeError as err:
4251 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
4252 raise
4254 def get_encoding(self):
4255 encoding = self.params.get('encoding')
4256 if encoding is None:
4257 encoding = preferredencoding()
4258 return encoding
4260 def _write_info_json(self, label, ie_result, infofn, overwrite=None):
4261 """ Write infojson and returns True = written, 'exists' = Already exists, False = skip, None = error """
4262 if overwrite is None:
4263 overwrite = self.params.get('overwrites', True)
4264 if not self.params.get('writeinfojson'):
4265 return False
4266 elif not infofn:
4267 self.write_debug(f'Skipping writing {label} infojson')
4268 return False
4269 elif not self._ensure_dir_exists(infofn):
4270 return None
4271 elif not overwrite and os.path.exists(infofn):
4272 self.to_screen(f'[info] {label.title()} metadata is already present')
4273 return 'exists'
        self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
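        # sanitize_info() makes the dict JSON-serializable; with clean_infojson
        # (the default), internal/private fields are stripped as well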
        try:
            write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
            return True
        except OSError:
            self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
            return None

    def _write_description(self, label, ie_result, descfn):
        """ Write description and return True = written, False = skip, None = error """
        if not self.params.get('writedescription'):
            return False
        elif not descfn:
            self.write_debug(f'Skipping writing {label} description')
            return False
        elif not self._ensure_dir_exists(descfn):
            return None
        elif not self.params.get('overwrites', True) and os.path.exists(descfn):
            self.to_screen(f'[info] {label.title()} description is already present')
        elif ie_result.get('description') is None:
            self.to_screen(f'[info] There\'s no {label} description to write')
            return False
        else:
            try:
                self.to_screen(f'[info] Writing {label} description to: {descfn}')
                with open(descfn, 'w', encoding='utf-8') as descfile:
                    descfile.write(ie_result['description'])
            except OSError:
                self.report_error(f'Cannot write {label} description file {descfn}')
                return None
        return True

    def _write_subtitles(self, info_dict, filename):
        """ Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error """
        ret = []
        subtitles = info_dict.get('requested_subtitles')
        if not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
            # Subtitle download errors are already handled as non-fatal by the relevant IE,
            # so this silently continues when used with an IE that does not support subtitles
            return ret
        elif not subtitles:
            self.to_screen('[info] There are no subtitles for the requested languages')
            return ret
        sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
        if not sub_filename_base:
            self.to_screen('[info] Skipping writing video subtitles')
            return ret

        for sub_lang, sub_info in subtitles.items():
            sub_format = sub_info['ext']
            sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
            sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
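            # Reuse an already-downloaded subtitle under either the final or the
            # intermediate name; existing_file() yields nothing when overwrites are enabled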
            existing_sub = self.existing_file((sub_filename_final, sub_filename))
            if existing_sub:
                self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
                sub_info['filepath'] = existing_sub
                ret.append((existing_sub, sub_filename_final))
                continue

            self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
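            # The extractor may supply the subtitle body inline ('data');
            # otherwise it is downloaded from the subtitle URL below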
            if sub_info.get('data') is not None:
                try:
                    # Use newline='' to prevent conversion of newline characters
                    # See https://github.com/ytdl-org/youtube-dl/issues/10268
                    with open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
                        subfile.write(sub_info['data'])
                    sub_info['filepath'] = sub_filename
                    ret.append((sub_filename, sub_filename_final))
                    continue
                except OSError:
                    self.report_error(f'Cannot write video subtitles file {sub_filename}')
                    return None

            try:
                sub_copy = sub_info.copy()
                sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
                self.dl(sub_filename, sub_copy, subtitle=True)
                sub_info['filepath'] = sub_filename
                ret.append((sub_filename, sub_filename_final))
            except (DownloadError, ExtractorError, OSError, ValueError, *network_exceptions) as err:
                msg = f'Unable to download video subtitles for {sub_lang!r}: {err}'
                if self.params.get('ignoreerrors') is not True:  # False or 'only_download'
                    if not self.params.get('ignoreerrors'):
                        self.report_error(msg)
                    raise DownloadError(msg)
                self.report_warning(msg)
        return ret

    def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
        """ Write thumbnails to file and return list of (thumb_filename, final_thumb_filename); or None if error """
        write_all = self.params.get('write_all_thumbnails', False)
        thumbnails, ret = [], []
        if write_all or self.params.get('writethumbnail', False):
            thumbnails = info_dict.get('thumbnails') or []
        if not thumbnails:
            self.to_screen(f'[info] There are no {label} thumbnails to download')
            return ret
        multiple = write_all and len(thumbnails) > 1

        if thumb_filename_base is None:
            thumb_filename_base = filename
        if thumbnails and not thumb_filename_base:
            self.write_debug(f'Skipping writing {label} thumbnail')
            return ret

        if thumbnails and not self._ensure_dir_exists(filename):
            return None

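        # Thumbnails are sorted worst-to-best, so iterate in reverse to try the
        # best one first; entries that fail to download are dropped from the list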
        for idx, t in list(enumerate(thumbnails))[::-1]:
            thumb_ext = t.get('ext') or determine_ext(t['url'], 'jpg')
            if multiple:
                thumb_ext = f'{t["id"]}.{thumb_ext}'
            thumb_display_id = f'{label} thumbnail {t["id"]}'
            thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
            thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))

            existing_thumb = self.existing_file((thumb_filename_final, thumb_filename))
            if existing_thumb:
                self.to_screen('[info] {} is already present'.format((
                    thumb_display_id if multiple else f'{label} thumbnail').capitalize()))
                t['filepath'] = existing_thumb
                ret.append((existing_thumb, thumb_filename_final))
            else:
                self.to_screen(f'[info] Downloading {thumb_display_id} ...')
                try:
                    uf = self.urlopen(Request(t['url'], headers=t.get('http_headers', {})))
                    self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
                    with open(thumb_filename, 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                    ret.append((thumb_filename, thumb_filename_final))
                    t['filepath'] = thumb_filename
                except network_exceptions as err:
                    if isinstance(err, HTTPError) and err.status == 404:
                        self.to_screen(f'[info] {thumb_display_id.title()} does not exist')
                    else:
                        self.report_warning(f'Unable to download {thumb_display_id}: {err}')
                    thumbnails.pop(idx)
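            # Without write_all, the first successfully downloaded thumbnail
            # (the best available) is enough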
            if ret and not write_all:
                break
        return ret