[ie/tiktok] Fix and deprioritize JSON subtitles (#10516)
[yt-dlp3.git] / yt_dlp / YoutubeDL.py
import collections
import contextlib
import copy
import datetime as dt
import errno
import fileinput
import functools
import http.cookiejar
import io
import itertools
import json
import locale
import operator
import os
import random
import re
import shutil
import string
import subprocess
import sys
import tempfile
import time
import tokenize
import traceback
import unicodedata

from .cache import Cache
from .compat import urllib  # isort: split
from .compat import compat_os_name, urllib_req_to_req
from .cookies import LenientSimpleCookie, load_cookies
from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
from .downloader.rtmp import rtmpdump_version
from .extractor import gen_extractor_classes, get_info_extractor
from .extractor.common import UnsupportedURLIE
from .extractor.openload import PhantomJSwrapper
from .minicurses import format_text
from .networking import HEADRequest, Request, RequestDirector
from .networking.common import _REQUEST_HANDLERS, _RH_PREFERENCES
from .networking.exceptions import (
    HTTPError,
    NoSupportingHandlers,
    RequestError,
    SSLError,
    network_exceptions,
)
from .networking.impersonate import ImpersonateRequestHandler
from .plugins import directories as plugin_directories
from .postprocessor import _PLUGIN_CLASSES as plugin_pps
from .postprocessor import (
    EmbedThumbnailPP,
    FFmpegFixupDuplicateMoovPP,
    FFmpegFixupDurationPP,
    FFmpegFixupM3u8PP,
    FFmpegFixupM4aPP,
    FFmpegFixupStretchedPP,
    FFmpegFixupTimestampPP,
    FFmpegMergerPP,
    FFmpegPostProcessor,
    FFmpegVideoConvertorPP,
    MoveFilesAfterDownloadPP,
    get_postprocessor,
)
from .postprocessor.ffmpeg import resolve_mapping as resolve_recode_mapping
from .update import (
    REPOSITORY,
    _get_system_deprecation,
    _make_label,
    current_git_head,
    detect_variant,
)
from .utils import (
    DEFAULT_OUTTMPL,
    IDENTITY,
    LINK_TEMPLATES,
    MEDIA_EXTENSIONS,
    NO_DEFAULT,
    NUMBER_RE,
    OUTTMPL_TYPES,
    POSTPROCESS_WHEN,
    STR_FORMAT_RE_TMPL,
    STR_FORMAT_TYPES,
    ContentTooShortError,
    DateRange,
    DownloadCancelled,
    DownloadError,
    EntryNotInPlaylist,
    ExistingVideoReached,
    ExtractorError,
    FormatSorter,
    GeoRestrictedError,
    ISO3166Utils,
    LazyList,
    MaxDownloadsReached,
    Namespace,
    PagedList,
    PlaylistEntries,
    Popen,
    PostProcessingError,
    ReExtractInfo,
    RejectedVideoReached,
    SameFileError,
    UnavailableVideoError,
    UserNotLive,
    YoutubeDLError,
    age_restricted,
    bug_reports_message,
    date_from_str,
    deprecation_warning,
    determine_ext,
    determine_protocol,
    encode_compat_str,
    encodeFilename,
    escapeHTML,
    expand_path,
    extract_basic_auth,
    filter_dict,
    float_or_none,
    format_bytes,
    format_decimal_suffix,
    format_field,
    formatSeconds,
    get_compatible_ext,
    get_domain,
    int_or_none,
    iri_to_uri,
    is_path_like,
    join_nonempty,
    locked_file,
    make_archive_id,
    make_dir,
    number_of_digits,
    orderedSet,
    orderedSet_from_options,
    parse_filesize,
    preferredencoding,
    prepend_extension,
    remove_terminal_sequences,
    render_table,
    replace_extension,
    sanitize_filename,
    sanitize_path,
    sanitize_url,
    shell_quote,
    str_or_none,
    strftime_or_none,
    subtitles_filename,
    supports_terminal_sequences,
    system_identifier,
    filesize_from_tbr,
    timetuple_from_msec,
    to_high_limit_path,
    traverse_obj,
    try_call,
    try_get,
    url_basename,
    variadic,
    version_tuple,
    windows_enable_vt_mode,
    write_json_file,
    write_string,
)
from .utils._utils import _UnsafeExtensionError, _YDLLogger
from .utils.networking import (
    HTTPHeaderDict,
    clean_headers,
    clean_proxies,
    std_headers,
)
from .version import CHANNEL, ORIGIN, RELEASE_GIT_HEAD, VARIANT, __version__

if compat_os_name == 'nt':
    import ctypes
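

# Note on the decorator below: it wraps a YoutubeDL method so that an
# _UnsafeExtensionError raised while building a filename is downgraded to
# a reported download error instead of propagating out of the method.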
def _catch_unsafe_extension_error(func):
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        try:
            return func(self, *args, **kwargs)
        except _UnsafeExtensionError as error:
            self.report_error(
                f'The extracted extension ({error.extension!r}) is unusual '
                'and will be skipped for safety reasons. '
                f'If you believe this is an error{bug_reports_message(",")}')

    return wrapper


class YoutubeDL:
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how
    to extract all the needed information (a task that InfoExtractors
    do), it has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".
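
    A minimal usage sketch of this API (the URL below is a placeholder):

        from yt_dlp import YoutubeDL

        with YoutubeDL({'format': 'bestvideo+bestaudio/best'}) as ydl:
            ydl.download(['https://example.com/some-video'])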

    Available options (a combined example follows this list):

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    ap_mso:            Adobe Pass multiple-system operator identifier.
    ap_username:       Multiple-system operator account username.
    ap_password:       Multiple-system operator account password.
    usenetrc:          Use netrc for authentication instead.
    netrc_location:    Location of the netrc file. Defaults to ~/.netrc.
    netrc_cmd:         Use a shell command to get credentials
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceprint:        A dict with keys WHEN mapped to a list of templates to
                       print to stdout. The allowed keys are video or any of the
                       items in utils.POSTPROCESS_WHEN.
                       For compatibility, a single list is also accepted
    print_to_file:     A dict with keys WHEN (same as forceprint) mapped to
                       a list of tuples with (template, filename)
    forcejson:         Force printing info_dict as JSON.
    dump_single_json:  Force printing the info_dict of the whole playlist
                       (or video) as a single JSON line.
    force_write_download_archive: Force writing download archive regardless
                       of 'skip_download' or 'simulate'.
    simulate:          Do not download the video files. If unset (or None),
                       simulate only if listsubtitles, listformats or list_thumbnails is used
    format:            Video format code. See "FORMAT SELECTION" for more details.
                       You can also pass a function. The function takes 'ctx' as
                       argument and returns the formats to download.
                       See "build_format_selector" for an implementation
    allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
    ignore_no_formats_error: Ignore "No video formats" error. Useful for
                       extracting metadata even if the video is not actually
                       available for download (experimental)
    format_sort:       A list of fields by which to sort the video formats.
                       See "Sorting Formats" for more details.
    format_sort_force: Force the given format_sort. See "Sorting Formats"
                       for more details.
    prefer_free_formats: Whether to prefer video formats with free containers
                       over non-free ones of the same quality.
    allow_multiple_video_streams: Allow multiple video streams to be merged
                       into a single file
    allow_multiple_audio_streams: Allow multiple audio streams to be merged
                       into a single file
    check_formats:     Whether to test if the formats are downloadable.
                       Can be True (check all), False (check none),
                       'selected' (check selected formats),
                       or None (check only if requested by extractor)
    paths:             Dictionary of output paths. The allowed keys are 'home',
                       'temp' and the keys of OUTTMPL_TYPES (in utils/_utils.py)
    outtmpl:           Dictionary of templates for output names. Allowed keys
                       are 'default' and the keys of OUTTMPL_TYPES (in utils/_utils.py).
                       For compatibility with youtube-dl, a single string can also be used
    outtmpl_na_placeholder: Placeholder for unavailable meta fields.
    restrictfilenames: Do not allow "&" and spaces in file names
    trim_file_name:    Limit length of filename (extension excluded)
    windowsfilenames:  Force the filenames to be windows compatible
    ignoreerrors:      Do not stop on download/postprocessing errors.
                       Can be 'only_download' to ignore only download errors.
                       Default is 'only_download' for CLI, but False for API
    skip_playlist_after_errors: Number of allowed failures until the rest of
                       the playlist is skipped
    allowed_extractors: List of regexes to match against extractor names that are allowed
    overwrites:        Overwrite all video and metadata files if True,
                       overwrite only non-video files if None
                       and don't overwrite any file if False
    playlist_items:    Specific indices of playlist to download.
    playlistrandom:    Download playlist items in random order.
    lazy_playlist:     Process playlist entries as they are received.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Print everything to stderr instead of stdout.
    consoletitle:      Display progress in console window's titlebar.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video metadata to a .info.json file
    clean_infojson:    Remove internal metadata from the infojson
    getcomments:       Extract video comments. This will not be written to disk
                       unless writeinfojson is also given
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    allow_playlist_files: Whether to write playlists' description, infojson etc.
                       also to disk when using the 'write*' options
    write_all_thumbnails: Write all thumbnail formats to files
    writelink:         Write an internet shortcut file, depending on the
                       current platform (.url/.webloc/.desktop)
    writeurllink:      Write a Windows internet shortcut file (.url)
    writewebloclink:   Write a macOS internet shortcut file (.webloc)
    writedesktoplink:  Write a Linux internet shortcut file (.desktop)
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatically generated subtitles to a file
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download (can be regex).
                       The list may contain "all" to refer to all the available
                       subtitles. The language can be prefixed with a "-" to
                       exclude it from the requested languages, e.g. ['all', '-live_chat']
    keepvideo:         Keep the video file after post-processing
    daterange:         A utils.DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       False to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  A set, or the name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded again.
    break_on_existing: Stop the download process after attempting to download a
                       file that is in the archive.
    break_per_url:     Whether break_on_reject and break_on_existing
                       should act on each input URL as opposed to for the entire queue
    cookiefile:        File name or text stream from where cookies should be read and dumped to
    cookiesfrombrowser: A tuple containing the name of the browser, the profile
                       name/path from where cookies are loaded, the name of the keyring,
                       and the container name, e.g. ('chrome', ) or
                       ('vivaldi', 'default', 'BASICTEXT') or ('firefox', 'default', None, 'Meta')
    legacyserverconnect: Explicitly allow HTTPS connection to servers that do not
                       support RFC 5746 secure renegotiation
    nocheckcertificate: Do not verify SSL certificates
    client_certificate: Path to client certificate file in PEM format. May include the private key
    client_certificate_key: Path to private key file for client certificate
    client_certificate_password: Password for client certificate private key, if encrypted.
                       If not provided and the key is encrypted, yt-dlp will ask interactively
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       (Only supported by some extractors)
    enable_file_urls:  Enable file:// URLs. This is disabled by default for security reasons.
    http_headers:      A dictionary of custom headers to be used for all requests
    proxy:             URL of the proxy server to use
    geo_verification_proxy: URL of the proxy to use for IP address verification
                       on geo-restricted sites.
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic: Print out sent and received HTTP traffic
    default_search:    Prepend this string if an input URL is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified.
    extract_flat:      Whether to resolve and process url_results further
                       * False: Always process. Default for API
                       * True: Never process
                       * 'in_playlist': Do not process inside playlist/multi_video
                       * 'discard': Always process, but don't return the result
                         from inside playlist/multi_video
                       * 'discard_in_playlist': Same as "discard", but only for
                         playlists (not multi_video). Default for CLI
    wait_for_video:    If given, wait for scheduled streams to become available.
                       The value should be a tuple containing the range
                       (min_secs, max_secs) to wait between retries
    postprocessors:    A list of dictionaries, each with an entry
                       * key:  The name of the postprocessor. See
                               yt_dlp/postprocessor/__init__.py for a list.
                       * when: When to run the postprocessor. Allowed values are
                               the entries of utils.POSTPROCESS_WHEN
                               Assumed to be 'post_process' if not given
    progress_hooks:    A list of functions that get called on download
                       progress, with a dictionary with the entries
                       * status: One of "downloading", "error", or "finished".
                                 Check this first and ignore unknown values.
                       * info_dict: The extracted info_dict

                       If status is one of "downloading", or "finished", the
                       following properties may also be present:
                       * filename: The final filename (always present)
                       * tmpfilename: The filename we're currently writing to
                       * downloaded_bytes: Bytes on disk
                       * total_bytes: Size of the whole file, None if unknown
                       * total_bytes_estimate: Guess of the eventual file size,
                                               None if unavailable.
                       * elapsed: The number of seconds since download started.
                       * eta: The estimated time in seconds, None if unknown
                       * speed: The download speed in bytes/second, None if
                                unknown
                       * fragment_index: The counter of the currently
                                         downloaded video fragment.
                       * fragment_count: The number of fragments (= individual
                                         files that will be merged)

                       Progress hooks are guaranteed to be called at least once
                       (with status "finished") if the download is successful.
    postprocessor_hooks: A list of functions that get called on postprocessing
                       progress, with a dictionary with the entries
                       * status: One of "started", "processing", or "finished".
                                 Check this first and ignore unknown values.
                       * postprocessor: Name of the postprocessor
                       * info_dict: The extracted info_dict

                       Progress hooks are guaranteed to be called at least twice
                       (with status "started" and "finished") if the processing is successful.
    merge_output_format: "/" separated list of extensions to use when merging formats.
    final_ext:         Expected final extension; used to detect when the file was
                       already downloaded and converted
    fixup:             Automatically correct known faults of the file.
                       One of:
                       - "never": do nothing
                       - "warn": only emit a warning
                       - "detect_or_warn": check whether we can do anything
                         about it, warn otherwise (default)
    source_address:    Client-side IP address to bind to.
    impersonate:       Client to impersonate for requests.
                       An ImpersonateTarget (from yt_dlp.networking.impersonate)
    sleep_interval_requests: Number of seconds to sleep between requests
                       during extraction
    sleep_interval:    Number of seconds to sleep before each download when
                       used alone or a lower bound of a range for randomized
                       sleep before each download (minimum possible number
                       of seconds to sleep) when used along with
                       max_sleep_interval.
    max_sleep_interval: Upper bound of a range for randomized sleep before each
                       download (maximum possible number of seconds to sleep).
                       Must only be used along with sleep_interval.
                       Actual sleep time will be a random float from range
                       [sleep_interval; max_sleep_interval].
    sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
    match_filter:      A function that gets called for every video with the signature
                       (info_dict, *, incomplete: bool) -> Optional[str]
                       For backward compatibility with youtube-dl, the signature
                       (info_dict) -> Optional[str] is also allowed.
                       - If it returns a message, the video is ignored.
                       - If it returns None, the video is downloaded.
                       - If it returns utils.NO_DEFAULT, the user is interactively
                         asked whether to download the video.
                       - Raise utils.DownloadCancelled(msg) to abort remaining
                         downloads when a video is rejected.
                       match_filter_func in utils/_utils.py is one example of this.
    color:             A Dictionary with output stream names as keys
                       and their respective color policy as values.
                       Can also just be a single color policy,
                       in which case it applies to all outputs.
                       Valid stream names are 'stdout' and 'stderr'.
                       Valid color policies are one of 'always', 'auto',
                       'no_color', 'never', 'auto-tty' or 'no_color-tty'.
    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
                       HTTP header
    geo_bypass_country:
                       Two-letter ISO 3166-1 alpha-2 country code that will be used for
                       explicit geographic restriction bypassing via faking
                       X-Forwarded-For HTTP header
    geo_bypass_ip_block:
                       IP range in CIDR notation that will be used similarly to
                       geo_bypass_country
    external_downloader: A dictionary of protocol keys and the executable of the
                       external downloader to use for it. The allowed protocols
                       are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
                       Set the value to 'native' to use the native downloader
    compat_opts:       Compatibility options. See "Differences in default behavior".
                       The following options do not work when used through the API:
                       filename, abort-on-error, multistreams, no-live-chat,
                       format-sort, no-clean-infojson, no-playlist-metafiles,
                       no-keep-subs, no-attach-info-json, allow-unsafe-ext.
                       Refer to __init__.py for their implementation
    progress_template: Dictionary of templates for progress outputs.
                       Allowed keys are 'download', 'postprocess',
                       'download-title' (console title) and 'postprocess-title'.
                       The template is mapped on a dictionary with keys 'progress' and 'info'
    retry_sleep_functions: Dictionary of functions that take the number of attempts
                       as argument and return the time to sleep in seconds.
                       Allowed keys are 'http', 'fragment', 'file_access'
    download_ranges:   A callback function that gets called for every video with
                       the signature (info_dict, ydl) -> Iterable[Section].
                       Only the returned sections will be downloaded.
                       Each Section is a dict with the following keys:
                       * start_time: Start time of the section in seconds
                       * end_time: End time of the section in seconds
                       * title: Section title (Optional)
                       * index: Section number (Optional)
    force_keyframes_at_cuts: Re-encode the video when downloading ranges to get precise cuts
    noprogress:        Do not print the progress bar
    live_from_start:   Whether to download livestream videos from the start
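
    For illustration, a params dict combining several of the options above
    might look like this (values are only examples, not recommendations):

        params = {
            'format': 'bv*+ba/b',
            'outtmpl': {'default': '%(title)s [%(id)s].%(ext)s'},
            'paths': {'home': '~/Videos'},
            'writesubtitles': True,
            'subtitleslangs': ['en', '-live_chat'],
            'postprocessors': [{'key': 'FFmpegMetadata'}],
        }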

    The following parameters are not used by YoutubeDL itself; they are used by
    the downloader (see yt_dlp/downloader/common.py):
    nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
    max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries,
    continuedl, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
    external_downloader_args, concurrent_fragment_downloads, progress_delta.

    The following options are used by the post processors:
    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
                       to the binary or its containing directory.
    postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
                       and a list of additional command-line arguments for the
                       postprocessor/executable. The dict can also have "PP+EXE" keys
                       which are used when the given exe is used by the given PP.
                       Use 'default' as the name for arguments to be passed to all PP
                       For compatibility with youtube-dl, a single list of args
                       can also be used

    The following options are used by the extractors:
    extractor_retries: Number of times to retry for known errors (default: 3)
    dynamic_mpd:       Whether to process dynamic DASH manifests (default: True)
    hls_split_discontinuity: Split HLS playlists to different formats at
                       discontinuities such as ad breaks (default: False)
    extractor_args:    A dictionary of arguments to be passed to the extractors.
                       See "EXTRACTOR ARGUMENTS" for details.
                       E.g. {'youtube': {'skip': ['dash', 'hls']}}
    mark_watched:      Mark videos watched (even with --simulate). Only for YouTube

    The following options are deprecated and may be removed in the future:

    break_on_reject:   Stop the download process when encountering a video that
                       has been filtered out.
                       - `raise DownloadCancelled(msg)` in match_filter instead
    force_generic_extractor: Force downloader to use the generic extractor
                       - Use allowed_extractors = ['generic', 'default']
    playliststart:     - Use playlist_items
                       Playlist item to start at.
    playlistend:       - Use playlist_items
                       Playlist item to end at.
    playlistreverse:   - Use playlist_items
                       Download playlist items in reverse order.
    forceurl:          - Use forceprint
                       Force printing final URL.
    forcetitle:        - Use forceprint
                       Force printing title.
    forceid:           - Use forceprint
                       Force printing ID.
    forcethumbnail:    - Use forceprint
                       Force printing thumbnail URL.
    forcedescription:  - Use forceprint
                       Force printing description.
    forcefilename:     - Use forceprint
                       Force printing final filename.
    forceduration:     - Use forceprint
                       Force printing duration.
    allsubtitles:      - Use subtitleslangs = ['all']
                       Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    include_ads:       - Doesn't work
                       Download ads as well
    call_home:         - Not implemented
                       Boolean, true iff we are allowed to contact the
                       yt-dlp servers for debugging.
    post_hooks:        - Register a custom postprocessor
                       A list of functions that get called as the final step
                       for each video file, after all postprocessors have been
                       called. The filename will be passed as the only argument.
    hls_prefer_native: - Use external_downloader = {'m3u8': 'native'} or {'m3u8': 'ffmpeg'}.
                       Use the native HLS downloader instead of ffmpeg/avconv
                       if True, otherwise use ffmpeg/avconv if False, otherwise
                       use downloader suggested by extractor if None.
    prefer_ffmpeg:     - avconv support is deprecated
                       If False, use avconv instead of ffmpeg if both are available,
                       otherwise prefer ffmpeg.
    youtube_include_dash_manifest: - Use extractor_args
                       If True (default), DASH manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about DASH. (only for youtube)
    youtube_include_hls_manifest: - Use extractor_args
                       If True (default), HLS manifests and related
                       data will be downloaded and processed by extractor.
                       You can reduce network I/O by disabling it if you don't
                       care about HLS. (only for youtube)
    no_color:          Same as `color='no_color'`
    no_overwrites:     Same as `overwrites=False`
    """

    _NUMERIC_FIELDS = {
        'width', 'height', 'asr', 'audio_channels', 'fps',
        'tbr', 'abr', 'vbr', 'filesize', 'filesize_approx',
        'timestamp', 'release_timestamp',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
    }

    _format_fields = {
        # NB: Keep in sync with the docstring of extractor/common.py
        'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note',
        'width', 'height', 'aspect_ratio', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels',
        'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns',
        'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'is_dash_periods', 'request_data',
        'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies',
        'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'extra_param_to_key_url',
        'hls_aes', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version',
        'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time',
    }
    _deprecated_multivalue_fields = {
        'album_artist': 'album_artists',
        'artist': 'artists',
        'composer': 'composers',
        'creator': 'creators',
        'genre': 'genres',
    }
    _format_selection_exts = {
        'audio': set(MEDIA_EXTENSIONS.common_audio),
        'video': {*MEDIA_EXTENSIONS.common_video, '3gp'},
        'storyboards': set(MEDIA_EXTENSIONS.storyboards),
    }

    def __init__(self, params=None, auto_init=True):
        """Create a YoutubeDL object with the given options.
        @param auto_init    Whether to load the default extractors and print header (if verbose).
                            Set to 'no_verbose_header' to not print the header
        """
        if params is None:
            params = {}
        self.params = params
        self._ies = {}
        self._ies_instances = {}
        self._pps = {k: [] for k in POSTPROCESS_WHEN}
        self._printed_messages = set()
        self._first_webpage_request = True
        self._post_hooks = []
        self._progress_hooks = []
        self._postprocessor_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        self._num_videos = 0
        self._playlist_level = 0
        self._playlist_urls = set()
        self.cache = Cache(self)
        self.__header_cookies = []

        stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout
        self._out_files = Namespace(
            out=stdout,
            error=sys.stderr,
            screen=sys.stderr if self.params.get('quiet') else stdout,
            console=None if compat_os_name == 'nt' else next(
                filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None),
        )

        try:
            windows_enable_vt_mode()
        except Exception as e:
            self.write_debug(f'Failed to enable VT mode: {e}')

        if self.params.get('no_color'):
            if self.params.get('color') is not None:
                self.params.setdefault('_warnings', []).append(
                    'Overwriting params from "color" with "no_color"')
            self.params['color'] = 'no_color'

        term_allow_color = os.getenv('TERM', '').lower() != 'dumb'
        base_no_color = bool(os.getenv('NO_COLOR'))

        def process_color_policy(stream):
            stream_name = {sys.stdout: 'stdout', sys.stderr: 'stderr'}[stream]
            policy = traverse_obj(self.params, ('color', (stream_name, None), {str}, any)) or 'auto'
            if policy in ('auto', 'auto-tty', 'no_color-tty'):
                no_color = base_no_color
                if policy.endswith('tty'):
                    no_color = policy.startswith('no_color')
                if term_allow_color and supports_terminal_sequences(stream):
                    return 'no_color' if no_color else True
                return False
            assert policy in ('always', 'never', 'no_color'), policy
            return {'always': True, 'never': False}.get(policy, policy)
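
        # For example (per the "color" option in the class docstring), params
        # like {'color': 'never'} or {'color': {'stderr': 'no_color'}} resolve
        # here to False / 'no_color' for the matching stream.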
        self._allow_colors = Namespace(**{
            name: process_color_policy(stream)
            for name, stream in self._out_files.items_ if name != 'console'
        })

        system_deprecation = _get_system_deprecation()
        if system_deprecation:
            self.deprecated_feature(system_deprecation.replace('\n', '\n '))

        if self.params.get('allow_unplayable_formats'):
            self.report_warning(
                f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. '
                'This is a developer option intended for debugging. \n'
                ' If you experience any issues while using this option, '
                f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report')

        if self.params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = shutil.get_terminal_size().columns
                width_args = [] if width is None else ['-w', str(width)]
                sp_kwargs = {'stdin': subprocess.PIPE, 'stdout': slave, 'stderr': self._out_files.error}
                try:
                    self._output_process = Popen(['bidiv', *width_args], **sp_kwargs)
                except OSError:
                    self._output_process = Popen(['fribidi', '-c', 'UTF-8', *width_args], **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning(
                        'Could not find fribidi executable, ignoring --bidi-workaround. '
                        'Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        self.params['compat_opts'] = set(self.params.get('compat_opts', ()))
        self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers'))
        self._load_cookies(self.params['http_headers'].get('Cookie'))  # compat
        self.params['http_headers'].pop('Cookie', None)

        if auto_init and auto_init != 'no_verbose_header':
            self.print_debug_header()

        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning(f'{option} is deprecated. Use {suggestion} instead')
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
        check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"')

        for msg in self.params.get('_warnings', []):
            self.report_warning(msg)
        for msg in self.params.get('_deprecation_warnings', []):
            self.deprecated_feature(msg)

        if impersonate_target := self.params.get('impersonate'):
            if not self._impersonate_target_available(impersonate_target):
                raise YoutubeDLError(
                    f'Impersonate target "{impersonate_target}" is not available. '
                    f'Use --list-impersonate-targets to see available targets. '
                    f'You may be missing dependencies required to support this target.')

        if 'list-formats' in self.params['compat_opts']:
            self.params['listformats_table'] = False

        if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None:
            # nooverwrites was unnecessarily changed to overwrites
            # in 0c3d0f51778b153f65c21906031c2e091fcfb641
            # This ensures compatibility with both keys
            self.params['overwrites'] = not self.params['nooverwrites']
        elif self.params.get('overwrites') is None:
            self.params.pop('overwrites', None)
        else:
            self.params['nooverwrites'] = not self.params['overwrites']

        if self.params.get('simulate') is None and any((
            self.params.get('list_thumbnails'),
            self.params.get('listformats'),
            self.params.get('listsubtitles'),
        )):
            self.params['simulate'] = 'list_only'

        self.params.setdefault('forceprint', {})
        self.params.setdefault('print_to_file', {})

        # Compatibility with older syntax
        if not isinstance(params['forceprint'], dict):
            self.params['forceprint'] = {'video': params['forceprint']}

        if auto_init:
            self.add_default_info_extractors()

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not self.params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        self._parse_outtmpl()

        # Creating format selector here allows us to catch syntax errors before the extraction
        self.format_selector = (
            self.params.get('format') if self.params.get('format') in (None, '-')
            else self.params['format'] if callable(self.params['format'])
            else self.build_format_selector(self.params['format']))
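        # For example, a selector string like 'bv*+ba/b' (see the "format"
        # option in the class docstring) is compiled once above; a callable
        # is stored as-is and invoked per video at extraction time.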

        hooks = {
            'post_hooks': self.add_post_hook,
            'progress_hooks': self.add_progress_hook,
            'postprocessor_hooks': self.add_postprocessor_hook,
        }
        for opt, fn in hooks.items():
            for ph in self.params.get(opt, []):
                fn(ph)

        for pp_def_raw in self.params.get('postprocessors', []):
            pp_def = dict(pp_def_raw)
            when = pp_def.pop('when', 'post_process')
            self.add_post_processor(
                get_postprocessor(pp_def.pop('key'))(self, **pp_def),
                when=when)

        def preload_download_archive(fn):
            """Preload the archive, if any is specified"""
            archive = set()
            if fn is None:
                return archive
            elif not is_path_like(fn):
                return fn

            self.write_debug(f'Loading archive file {fn!r}')
            try:
                with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                    for line in archive_file:
                        archive.add(line.strip())
            except OSError as ioe:
                if ioe.errno != errno.ENOENT:
                    raise
            return archive
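
        # Each archive line is '<extractor> <video id>' (see "download_archive"
        # in the class docstring and make_archive_id in utils), e.g.:
        #   youtube dQw4w9WgXcQ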
        self.archive = preload_download_archive(self.params.get('download_archive'))

    def warn_if_short_id(self, argv):
        # short YouTube ID starting with dash?
        idxs = [
            i for i, a in enumerate(argv)
            if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
        if idxs:
            correct_argv = (
                ['yt-dlp']
                + [a for i, a in enumerate(argv) if i not in idxs]
                + ['--'] + [argv[i] for i in idxs]
            )
            self.report_warning(
                'Long argument string detected. '
                f'Use -- to separate parameters and URLs, like this:\n{shell_quote(correct_argv)}')

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        ie_key = ie.ie_key()
        self._ies[ie_key] = ie
        if not isinstance(ie, type):
            self._ies_instances[ie_key] = ie
            ie.set_downloader(self)

    def get_info_extractor(self, ie_key):
        """
        Get an instance of an IE with name ie_key. It will try to get one from
        the _ies list; if there is no instance, it will create a new one and add
        it to the extractor list.
        """
        ie = self._ies_instances.get(ie_key)
        if ie is None:
            ie = get_info_extractor(ie_key)()
            self.add_info_extractor(ie)
        return ie

    def add_default_info_extractors(self):
        """
        Add the InfoExtractors returned by gen_extractors to the end of the list
        """
        all_ies = {ie.IE_NAME.lower(): ie for ie in gen_extractor_classes()}
        all_ies['end'] = UnsupportedURLIE()
        try:
            ie_names = orderedSet_from_options(
                self.params.get('allowed_extractors', ['default']), {
                    'all': list(all_ies),
                    'default': [name for name, ie in all_ies.items() if ie._ENABLED],
                }, use_regex=True)
        except re.error as e:
            raise ValueError(f'Wrong regex for allowed_extractors: {e.pattern}')
        for name in ie_names:
            self.add_info_extractor(all_ies[name])
        self.write_debug(f'Loaded {len(ie_names)} extractors')

    def add_post_processor(self, pp, when='post_process'):
        """Add a PostProcessor object to the end of the chain."""
        assert when in POSTPROCESS_WHEN, f'Invalid when={when}'
        self._pps[when].append(pp)
        pp.set_downloader(self)

    def add_post_hook(self, ph):
        """Add the post hook"""
        self._post_hooks.append(ph)

    def add_progress_hook(self, ph):
        """Add the download progress hook"""
        self._progress_hooks.append(ph)

    def add_postprocessor_hook(self, ph):
        """Add the postprocessing progress hook"""
        self._postprocessor_hooks.append(ph)
        for pps in self._pps.values():
            for pp in pps:
                pp.add_progress_hook(ph)

    def _bidi_workaround(self, message):
        if not hasattr(self, '_output_channel'):
            return message

        assert hasattr(self, '_output_process')
        assert isinstance(message, str)
        line_count = message.count('\n') + 1
        self._output_process.stdin.write((message + '\n').encode())
        self._output_process.stdin.flush()
        res = ''.join(self._output_channel.readline().decode()
                      for _ in range(line_count))
        return res[:-len('\n')]

    def _write_string(self, message, out=None, only_once=False):
        if only_once:
            if message in self._printed_messages:
                return
            self._printed_messages.add(message)
        write_string(message, out=out, encoding=self.params.get('encoding'))

    def to_stdout(self, message, skip_eol=False, quiet=None):
        """Print message to stdout"""
        if quiet is not None:
            self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. '
                                     'Use "YoutubeDL.to_screen" instead')
        if skip_eol is not False:
            self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument skip_eol. '
                                     'Use "YoutubeDL.to_screen" instead')
        self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out)

    def to_screen(self, message, skip_eol=False, quiet=None, only_once=False):
        """Print message to screen if not in quiet mode"""
        if self.params.get('logger'):
            self.params['logger'].debug(message)
            return
        if (self.params.get('quiet') if quiet is None else quiet) and not self.params.get('verbose'):
            return
        self._write_string(
            '{}{}'.format(self._bidi_workaround(message), ('' if skip_eol else '\n')),
            self._out_files.screen, only_once=only_once)

    def to_stderr(self, message, only_once=False):
        """Print message to stderr"""
        assert isinstance(message, str)
        if self.params.get('logger'):
            self.params['logger'].error(message)
        else:
            self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once)

    def _send_console_code(self, code):
        if compat_os_name == 'nt' or not self._out_files.console:
            return
        self._write_string(code, self._out_files.console)

    def to_console_title(self, message):
        if not self.params.get('consoletitle', False):
            return
        message = remove_terminal_sequences(message)
        if compat_os_name == 'nt':
            if ctypes.windll.kernel32.GetConsoleWindow():
                # c_wchar_p() might not be necessary if `message` is
                # already of type unicode()
                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        else:
            self._send_console_code(f'\033]0;{message}\007')

    def save_console_title(self):
        if not self.params.get('consoletitle') or self.params.get('simulate'):
            return
        self._send_console_code('\033[22;0t')  # Save the title on stack

    def restore_console_title(self):
        if not self.params.get('consoletitle') or self.params.get('simulate'):
            return
        self._send_console_code('\033[23;0t')  # Restore the title from stack

    def __enter__(self):
        self.save_console_title()
        return self

    def save_cookies(self):
        if self.params.get('cookiefile') is not None:
            self.cookiejar.save()

    def __exit__(self, *args):
        self.restore_console_title()
        self.close()

    def close(self):
        self.save_cookies()
        if '_request_director' in self.__dict__:
            self._request_director.close()
            del self._request_director

    def trouble(self, message=None, tb=None, is_error=True):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may raise an exception or
        not when errors are found, after printing the message.

        @param tb          If given, is additional traceback information
        @param is_error    Whether to raise error according to ignoreerrors
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            if tb:
                self.to_stderr(tb)
        if not is_error:
            return
        if not self.params.get('ignoreerrors'):
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1

    Styles = Namespace(
        HEADERS='yellow',
        EMPHASIS='light blue',
        FILENAME='green',
        ID='green',
        DELIM='blue',
        ERROR='red',
        BAD_FORMAT='light red',
        WARNING='yellow',
        SUPPRESS='light black',
    )

    def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False):
        text = str(text)
        if test_encoding:
            original_text = text
            # handle.encoding can be None. See https://github.com/yt-dlp/yt-dlp/issues/2711
            encoding = self.params.get('encoding') or getattr(handle, 'encoding', None) or 'ascii'
            text = text.encode(encoding, 'ignore').decode(encoding)
            if fallback is not None and text != original_text:
                text = fallback
        return format_text(text, f) if allow_colors is True else text if fallback is None else fallback

    def _format_out(self, *args, **kwargs):
        return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs)

    def _format_screen(self, *args, **kwargs):
        return self._format_text(self._out_files.screen, self._allow_colors.screen, *args, **kwargs)

    def _format_err(self, *args, **kwargs):
        return self._format_text(self._out_files.error, self._allow_colors.error, *args, **kwargs)

    def report_warning(self, message, only_once=False):
        """
        Print the message to stderr; it will be prefixed with 'WARNING:'.
        If stderr is a tty file, the 'WARNING:' will be colored
        """
        if self.params.get('logger') is not None:
            self.params['logger'].warning(message)
        else:
            if self.params.get('no_warnings'):
                return
            self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once)

    def deprecation_warning(self, message, *, stacklevel=0):
        deprecation_warning(
            message, stacklevel=stacklevel + 1, printer=self.report_error, is_error=False)

    def deprecated_feature(self, message):
        if self.params.get('logger') is not None:
            self.params['logger'].warning(f'Deprecated Feature: {message}')
        self.to_stderr(f'{self._format_err("Deprecated Feature:", self.Styles.ERROR)} {message}', True)

    def report_error(self, message, *args, **kwargs):
        """
        Do the same as trouble, but prefix the message with 'ERROR:', colored
        in red if stderr is a tty file.
        """
        self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', *args, **kwargs)

    def write_debug(self, message, only_once=False):
        """Log a debug message, or print it to stderr"""
        if not self.params.get('verbose', False):
            return
        message = f'[debug] {message}'
        if self.params.get('logger'):
            self.params['logger'].debug(message)
        else:
            self.to_stderr(message, only_once)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(f'[download] {file_name} has already been downloaded')
        except UnicodeEncodeError:
            self.to_screen('[download] The file has already been downloaded')

    def report_file_delete(self, file_name):
        """Report that existing file will be deleted."""
        try:
            self.to_screen(f'Deleting existing file {file_name}')
        except UnicodeEncodeError:
            self.to_screen('Deleting existing file')

    def raise_no_formats(self, info, forced=False, *, msg=None):
        has_drm = info.get('_has_drm')
        ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg)
        msg = msg or has_drm and 'This video is DRM protected' or 'No video formats found!'
        if forced or not ignored:
            raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
                                 expected=has_drm or ignored or expected)
        else:
            self.report_warning(msg)

    def parse_outtmpl(self):
        self.deprecation_warning('"YoutubeDL.parse_outtmpl" is deprecated and may be removed in a future version')
        self._parse_outtmpl()
        return self.params['outtmpl']

    def _parse_outtmpl(self):
        sanitize = IDENTITY
        if self.params.get('restrictfilenames'):  # Remove spaces in the default template
            sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-')

        outtmpl = self.params.setdefault('outtmpl', {})
        if not isinstance(outtmpl, dict):
            self.params['outtmpl'] = outtmpl = {'default': outtmpl}
        outtmpl.update({k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() if outtmpl.get(k) is None})

    def get_output_path(self, dir_type='', filename=None):
        paths = self.params.get('paths', {})
        assert isinstance(paths, dict), '"paths" parameter must be a dictionary'
        path = os.path.join(
            expand_path(paths.get('home', '').strip()),
            expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
            filename or '')
        return sanitize_path(path, force=self.params.get('windowsfilenames'))

    @staticmethod
    def _outtmpl_expandpath(outtmpl):
        # expand_path translates '%%' into '%' and '$$' into '$'
        # correspondingly that is not what we want since we need to keep
        # '%%' intact for template dict substitution step. Working around
        # with boundary-alike separator hack.
        sep = ''.join(random.choices(string.ascii_letters, k=32))
        outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$')

        # outtmpl should be expand_path'ed before template dict substitution
        # because meta fields may contain env variables we don't want to
        # be expanded. E.g. for outtmpl "%(title)s.%(ext)s" and
        # title "Hello $PATH", we don't want `$PATH` to be expanded.
        return expand_path(outtmpl).replace(sep, '')

    @staticmethod
    def escape_outtmpl(outtmpl):
        """ Escape any remaining strings like %s, %abc% etc. """
        return re.sub(
            STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
            lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
            outtmpl)
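
    # escape_outtmpl example: a stray '%d' with no key left in the template
    # becomes '%%d', so the final `template % info_dict` step keeps it literal.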

    @classmethod
    def validate_outtmpl(cls, outtmpl):
        """ @return None or Exception object """
        outtmpl = re.sub(
            STR_FORMAT_RE_TMPL.format('[^)]*', '[ljhqBUDS]'),
            lambda mobj: f'{mobj.group(0)[:-1]}s',
            cls._outtmpl_expandpath(outtmpl))
        try:
            cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
            return None
        except ValueError as err:
            return err

    @staticmethod
    def _copy_infodict(info_dict):
        info_dict = dict(info_dict)
        info_dict.pop('__postprocessors', None)
        info_dict.pop('__pending_error', None)
        return info_dict

    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
        """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict
        @param sanitize    Whether to sanitize the output as a filename.
                           For backward compatibility, a function can also be passed
        """

        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

        info_dict = self._copy_infodict(info_dict)
        info_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
            formatSeconds(info_dict['duration'], '-' if sanitize else ':')
            if info_dict.get('duration', None) is not None
            else None)
        info_dict['autonumber'] = int(self.params.get('autonumber_start', 1) - 1 + self._num_downloads)
        info_dict['video_autonumber'] = self._num_videos
        if info_dict.get('resolution') is None:
            info_dict['resolution'] = self.format_resolution(info_dict, default=None)

        # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
        # of %(field)s to %(field)0Nd for backward compatibility
        field_size_compat_map = {
            'playlist_index': number_of_digits(info_dict.get('__last_playlist_index') or 0),
            'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0),
            'autonumber': self.params.get('autonumber_size') or 5,
        }

        TMPL_DICT = {}
        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljhqBUDS]'))
        MATH_FUNCTIONS = {
            '+': float.__add__,
            '-': float.__sub__,
            '*': float.__mul__,
        }
        # Field is of the form key1.key2...
        # where keys (except first) can be string, int, slice or "{field, ...}"
        FIELD_INNER_RE = r'(?:\w+|%(num)s|%(num)s?(?::%(num)s?){1,2})' % {'num': r'(?:-?\d+)'}  # noqa: UP031
        FIELD_RE = r'\w*(?:\.(?:%(inner)s|{%(field)s(?:,%(field)s)*}))*' % {  # noqa: UP031
            'inner': FIELD_INNER_RE,
            'field': rf'\w*(?:\.{FIELD_INNER_RE})*',
        }
        MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})'
        MATH_OPERATORS_RE = r'(?:{})'.format('|'.join(map(re.escape, MATH_FUNCTIONS.keys())))
        INTERNAL_FORMAT_RE = re.compile(rf'''(?xs)
            (?P<negate>-)?
            (?P<fields>{FIELD_RE})
            (?P<maths>(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*)
            (?:>(?P<strf_format>.+?))?
            (?P<remaining>
                (?P<alternate>(?<!\\),[^|&)]+)?
                (?:&(?P<replacement>.*?))?
                (?:\|(?P<default>.*?))?
            )$''')
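
        # Illustrative template keys this regex accepts (see "OUTPUT TEMPLATE"
        # in the README for the full syntax):
        #   %(title)s                    plain field
        #   %(duration>%H-%M-%S)s        strftime-style formatting via '>'
        #   %(view_count,like_count|0)s  alternate fields and a '|' default
        #   %(epoch-3600>%H-%M-%S)s      arithmetic on numeric fields
        #   %(tags.0)s                   object traversal with '.'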
1248 def _from_user_input(field):
1249 if field == ':':
1250 return ...
1251 elif ':' in field:
1252 return slice(*map(int_or_none, field.split(':')))
1253 elif int_or_none(field) is not None:
1254 return int(field)
1255 return field
1257 def _traverse_infodict(fields):
1258 fields = [f for x in re.split(r'\.({.+?})\.?', fields)
1259 for f in ([x] if x.startswith('{') else x.split('.'))]
1260 for i in (0, -1):
1261 if fields and not fields[i]:
1262 fields.pop(i)
1264 for i, f in enumerate(fields):
1265 if not f.startswith('{'):
1266 fields[i] = _from_user_input(f)
1267 continue
1268 assert f.endswith('}'), f'No closing brace for {f} in {fields}'
1269 fields[i] = {k: list(map(_from_user_input, k.split('.'))) for k in f[1:-1].split(',')}
1271 return traverse_obj(info_dict, fields, traverse_string=True)
1273 def get_value(mdict):
1274 # Object traversal
1275 value = _traverse_infodict(mdict['fields'])
1276 # Negative
1277 if mdict['negate']:
1278 value = float_or_none(value)
1279 if value is not None:
1280 value *= -1
1281 # Do maths
1282 offset_key = mdict['maths']
1283 if offset_key:
1284 value = float_or_none(value)
1285 operator = None
1286 while offset_key:
1287 item = re.match(
1288 MATH_FIELD_RE if operator else MATH_OPERATORS_RE,
1289 offset_key).group(0)
1290 offset_key = offset_key[len(item):]
1291 if operator is None:
1292 operator = MATH_FUNCTIONS[item]
1293 continue
1294 item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1)
1295 offset = float_or_none(item)
1296 if offset is None:
1297 offset = float_or_none(_traverse_infodict(item))
1298 try:
1299 value = operator(value, multiplier * offset)
1300 except (TypeError, ZeroDivisionError):
1301 return None
1302 operator = None
1303 # Datetime formatting
1304 if mdict['strf_format']:
1305 value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))
1307 # XXX: Workaround for https://github.com/yt-dlp/yt-dlp/issues/4485
1308 if sanitize and value == '':
1309 value = None
1310 return value
1312 na = self.params.get('outtmpl_na_placeholder', 'NA')
1314 def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
1315 return sanitize_filename(str(value), restricted=restricted, is_id=(
1316 bool(re.search(r'(^|[_.])id(\.|$)', key))
1317 if 'filename-sanitization' in self.params['compat_opts']
1318 else NO_DEFAULT))
1320 sanitizer = sanitize if callable(sanitize) else filename_sanitizer
1321 sanitize = bool(sanitize)
1323 def _dumpjson_default(obj):
1324 if isinstance(obj, (set, LazyList)):
1325 return list(obj)
1326 return repr(obj)
1328 class _ReplacementFormatter(string.Formatter):
1329 def get_field(self, field_name, args, kwargs):
1330 if field_name.isdigit():
1331 return args[0], -1
1332 raise ValueError('Unsupported field')
1334 replacement_formatter = _ReplacementFormatter()
1336 def create_key(outer_mobj):
1337 if not outer_mobj.group('has_key'):
1338 return outer_mobj.group(0)
1339 key = outer_mobj.group('key')
1340 mobj = re.match(INTERNAL_FORMAT_RE, key)
1341 value, replacement, default, last_field = None, None, na, ''
1342 while mobj:
1343 mobj = mobj.groupdict()
1344 default = mobj['default'] if mobj['default'] is not None else default
1345 value = get_value(mobj)
1346 last_field, replacement = mobj['fields'], mobj['replacement']
1347 if value is None and mobj['alternate']:
1348 mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:])
1349 else:
1350 break
1352 if None not in (value, replacement):
1353 try:
1354 value = replacement_formatter.format(replacement, value)
1355 except ValueError:
1356 value, default = None, na
1358 fmt = outer_mobj.group('format')
1359 if fmt == 's' and last_field in field_size_compat_map and isinstance(value, int):
1360 fmt = f'0{field_size_compat_map[last_field]:d}d'
1362 flags = outer_mobj.group('conversion') or ''
1363 str_fmt = f'{fmt[:-1]}s'
1364 if value is None:
1365 value, fmt = default, 's'
1366 elif fmt[-1] == 'l': # list
1367 delim = '\n' if '#' in flags else ', '
1368 value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt
1369 elif fmt[-1] == 'j': # json
1370 value, fmt = json.dumps(
1371 value, default=_dumpjson_default,
1372 indent=4 if '#' in flags else None, ensure_ascii='+' not in flags), str_fmt
1373 elif fmt[-1] == 'h': # html
1374 value, fmt = escapeHTML(str(value)), str_fmt
1375 elif fmt[-1] == 'q': # quoted
1376 value = map(str, variadic(value) if '#' in flags else [value])
1377 value, fmt = shell_quote(value, shell=True), str_fmt
1378 elif fmt[-1] == 'B': # bytes
1379 value = f'%{str_fmt}'.encode() % str(value).encode()
1380 value, fmt = value.decode('utf-8', 'ignore'), 's'
1381 elif fmt[-1] == 'U': # unicode normalized
1382 value, fmt = unicodedata.normalize(
1383 # "+" = compatibility equivalence, "#" = NFD
1384 'NF{}{}'.format('K' if '+' in flags else '', 'D' if '#' in flags else 'C'),
1385 value), str_fmt
1386 elif fmt[-1] == 'D': # decimal suffix
1387 num_fmt, fmt = fmt[:-1].replace('#', ''), 's'
1388 value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s',
1389 factor=1024 if '#' in flags else 1000)
1390 elif fmt[-1] == 'S': # filename sanitization
1391 value, fmt = filename_sanitizer(last_field, value, restricted='#' in flags), str_fmt
1392 elif fmt[-1] == 'c':
1393 if value:
1394 value = str(value)[0]
1395 else:
1396 fmt = str_fmt
1397 elif fmt[-1] not in 'rsa': # numeric
1398 value = float_or_none(value)
1399 if value is None:
1400 value, fmt = default, 's'
1402 if sanitize:
1403 # If value is an object, sanitize might convert it to a string
1404 # So we convert it to repr first
1405 if fmt[-1] == 'r':
1406 value, fmt = repr(value), str_fmt
1407 elif fmt[-1] == 'a':
1408 value, fmt = ascii(value), str_fmt
1409 if fmt[-1] in 'csra':
1410 value = sanitizer(last_field, value)
1412 key = '{}\0{}'.format(key.replace('%', '%\0'), outer_mobj.group('format'))
1413 TMPL_DICT[key] = value
1414 return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix'))
1416 return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT
1418 def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs):
1419 outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
1420 return self.escape_outtmpl(outtmpl) % info_dict
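# Illustrative usage (hypothetical info_dict):
#   self.evaluate_outtmpl('%(title)s [%(id)s].%(ext)s',
#                         {'id': 'abc123', 'title': 'Example', 'ext': 'mp4'})
# would yield roughly 'Example [abc123].mp4'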
1422 @_catch_unsafe_extension_error
1423 def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None):
1424 assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive'
1425 if outtmpl is None:
1426 outtmpl = self.params['outtmpl'].get(tmpl_type or 'default', self.params['outtmpl']['default'])
1427 try:
1428 outtmpl = self._outtmpl_expandpath(outtmpl)
1429 filename = self.evaluate_outtmpl(outtmpl, info_dict, True)
1430 if not filename:
1431 return None
1433 if tmpl_type in ('', 'temp'):
1434 final_ext, ext = self.params.get('final_ext'), info_dict.get('ext')
1435 if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'):
1436 filename = replace_extension(filename, ext, final_ext)
1437 elif tmpl_type:
1438 force_ext = OUTTMPL_TYPES[tmpl_type]
1439 if force_ext:
1440 filename = replace_extension(filename, force_ext, info_dict.get('ext'))
1442 # https://github.com/blackjack4494/youtube-dlc/issues/85
1443 trim_file_name = self.params.get('trim_file_name', False)
1444 if trim_file_name:
1445 no_ext, *ext = filename.rsplit('.', 2)
1446 filename = join_nonempty(no_ext[:trim_file_name], *ext, delim='.')
1448 return filename
1449 except ValueError as err:
1450 self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
1451 return None
1453 def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False):
1454 """Generate the output filename"""
1455 if outtmpl:
1456 assert not dir_type, 'outtmpl and dir_type are mutually exclusive'
1457 dir_type = None
1458 filename = self._prepare_filename(info_dict, tmpl_type=dir_type, outtmpl=outtmpl)
1459 if not filename and dir_type not in ('', 'temp'):
1460 return ''
1462 if warn:
1463 if not self.params.get('paths'):
1464 pass
1465 elif filename == '-':
1466 self.report_warning('--paths is ignored when outputting to stdout', only_once=True)
1467 elif os.path.isabs(filename):
1468 self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
1469 if filename == '-' or not filename:
1470 return filename
1472 return self.get_output_path(dir_type, filename)
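# Note: the returned value is the evaluated template combined with --paths via
# get_output_path(); '-' (stdout) and empty filenames are passed through as-is.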
1474 def _match_entry(self, info_dict, incomplete=False, silent=False):
1475 """Returns None if the file should be downloaded"""
1476 _type = 'video' if 'playlist-match-filter' in self.params['compat_opts'] else info_dict.get('_type', 'video')
1477 assert incomplete or _type == 'video', 'Only video result can be considered complete'
1479 video_title = info_dict.get('title', info_dict.get('id', 'entry'))
1481 def check_filter():
1482 if _type in ('playlist', 'multi_video'):
1483 return
1484 elif _type in ('url', 'url_transparent') and not try_call(
1485 lambda: self.get_info_extractor(info_dict['ie_key']).is_single_video(info_dict['url'])):
1486 return
1488 if 'title' in info_dict:
1489 # This can happen when we're just evaluating the playlist
1490 title = info_dict['title']
1491 matchtitle = self.params.get('matchtitle', False)
1492 if matchtitle:
1493 if not re.search(matchtitle, title, re.IGNORECASE):
1494 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
1495 rejecttitle = self.params.get('rejecttitle', False)
1496 if rejecttitle:
1497 if re.search(rejecttitle, title, re.IGNORECASE):
1498 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
1500 date = info_dict.get('upload_date')
1501 if date is not None:
1502 date_range = self.params.get('daterange', DateRange())
1503 if date not in date_range:
1504 return f'{date_from_str(date).isoformat()} upload date is not in range {date_range}'
1505 view_count = info_dict.get('view_count')
1506 if view_count is not None:
1507 min_views = self.params.get('min_views')
1508 if min_views is not None and view_count < min_views:
1509 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
1510 max_views = self.params.get('max_views')
1511 if max_views is not None and view_count > max_views:
1512 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
1513 if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
1514 return f'Skipping "{video_title}" because it is age restricted'
1516 match_filter = self.params.get('match_filter')
1517 if match_filter is None:
1518 return None
1520 cancelled = None
1521 try:
1522 try:
1523 ret = match_filter(info_dict, incomplete=incomplete)
1524 except TypeError:
1525 # For backward compatibility
1526 ret = None if incomplete else match_filter(info_dict)
1527 except DownloadCancelled as err:
1528 if err.msg is not NO_DEFAULT:
1529 raise
1530 ret, cancelled = err.msg, err
1532 if ret is NO_DEFAULT:
1533 while True:
1534 filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME)
1535 reply = input(self._format_screen(
1536 f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip()
1537 if reply in {'y', ''}:
1538 return None
1539 elif reply == 'n':
1540 if cancelled:
1541 raise type(cancelled)(f'Skipping {video_title}')
1542 return f'Skipping {video_title}'
1543 return ret
1545 if self.in_download_archive(info_dict):
1546 reason = ''.join((
1547 format_field(info_dict, 'id', f'{self._format_screen("%s", self.Styles.ID)}: '),
1548 format_field(info_dict, 'title', f'{self._format_screen("%s", self.Styles.EMPHASIS)} '),
1549 'has already been recorded in the archive'))
1550 break_opt, break_err = 'break_on_existing', ExistingVideoReached
1551 else:
1552 try:
1553 reason = check_filter()
1554 except DownloadCancelled as e:
1555 reason, break_opt, break_err = e.msg, 'match_filter', type(e)
1556 else:
1557 break_opt, break_err = 'break_on_reject', RejectedVideoReached
1558 if reason is not None:
1559 if not silent:
1560 self.to_screen('[download] ' + reason)
1561 if self.params.get(break_opt, False):
1562 raise break_err()
1563 return reason
1565 @staticmethod
1566 def add_extra_info(info_dict, extra_info):
1567 """Set the keys from extra_info in info dict if they are missing"""
1568 for key, value in extra_info.items():
1569 info_dict.setdefault(key, value)
1571 def extract_info(self, url, download=True, ie_key=None, extra_info=None,
1572 process=True, force_generic_extractor=False):
1573 """
1574 Extract and return the information dictionary of the URL
1576 Arguments:
1577 @param url URL to extract
1579 Keyword arguments:
1580 @param download Whether to download videos
1581 @param process Whether to resolve all unresolved references (URLs, playlist items).
1582 Must be True for download to work
1583 @param ie_key Use only the extractor with this key
1585 @param extra_info Dictionary containing the extra values to add to the info (For internal use only)
1586 @force_generic_extractor Force using the generic extractor (Deprecated; use ie_key='Generic')
1587 """
1589 if extra_info is None:
1590 extra_info = {}
1592 if not ie_key and force_generic_extractor:
1593 ie_key = 'Generic'
1595 if ie_key:
1596 ies = {ie_key: self._ies[ie_key]} if ie_key in self._ies else {}
1597 else:
1598 ies = self._ies
1600 for key, ie in ies.items():
1601 if not ie.suitable(url):
1602 continue
1604 if not ie.working():
1605 self.report_warning('The program functionality for this site has been marked as broken, '
1606 'and will probably not work.')
1608 temp_id = ie.get_temp_id(url)
1609 if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}):
1610 self.to_screen(f'[download] {self._format_screen(temp_id, self.Styles.ID)}: '
1611 'has already been recorded in the archive')
1612 if self.params.get('break_on_existing', False):
1613 raise ExistingVideoReached
1614 break
1615 return self.__extract_info(url, self.get_info_extractor(key), download, extra_info, process)
1616 else:
1617 extractors_restricted = self.params.get('allowed_extractors') not in (None, ['default'])
1618 self.report_error(f'No suitable extractor{format_field(ie_key, None, " (%s)")} found for URL {url}',
1619 tb=False if extractors_restricted else None)
1621 def _handle_extraction_exceptions(func):
1622 @functools.wraps(func)
1623 def wrapper(self, *args, **kwargs):
1624 while True:
1625 try:
1626 return func(self, *args, **kwargs)
1627 except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError):
1628 raise
1629 except ReExtractInfo as e:
1630 if e.expected:
1631 self.to_screen(f'{e}; Re-extracting data')
1632 else:
1633 self.to_stderr('\r')
1634 self.report_warning(f'{e}; Re-extracting data')
1635 continue
1636 except GeoRestrictedError as e:
1637 msg = e.msg
1638 if e.countries:
1639 msg += '\nThis video is available in {}.'.format(', '.join(
1640 map(ISO3166Utils.short2full, e.countries)))
1641 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to work around this.'
1642 self.report_error(msg)
1643 except ExtractorError as e: # An error we somewhat expected
1644 self.report_error(str(e), e.format_traceback())
1645 except Exception as e:
1646 if self.params.get('ignoreerrors'):
1647 self.report_error(str(e), tb=encode_compat_str(traceback.format_exc()))
1648 else:
1649 raise
1650 break
1651 return wrapper
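# Behaviour of the wrapper above: ReExtractInfo restarts the wrapped call via the
# while-loop's `continue`; DownloadCancelled and the lazy-iteration IndexErrors
# propagate; any other exception is reported instead of raised only when
# 'ignoreerrors' is enabled.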
1653 def _wait_for_video(self, ie_result={}):
1654 if (not self.params.get('wait_for_video')
1655 or ie_result.get('_type', 'video') != 'video'
1656 or ie_result.get('formats') or ie_result.get('url')):
1657 return
1659 format_dur = lambda dur: '%02d:%02d:%02d' % timetuple_from_msec(dur * 1000)[:-1]
1660 last_msg = ''
1662 def progress(msg):
1663 nonlocal last_msg
1664 full_msg = f'{msg}\n'
1665 if not self.params.get('noprogress'):
1666 full_msg = msg + ' ' * (len(last_msg) - len(msg)) + '\r'
1667 elif last_msg:
1668 return
1669 self.to_screen(full_msg, skip_eol=True)
1670 last_msg = msg
1672 min_wait, max_wait = self.params.get('wait_for_video')
1673 diff = try_get(ie_result, lambda x: x['release_timestamp'] - time.time())
1674 if diff is None and ie_result.get('live_status') == 'is_upcoming':
1675 diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0)
1676 self.report_warning('Release time of video is not known')
1677 elif ie_result and (diff or 0) <= 0:
1678 self.report_warning('Video should already be available according to extracted info')
1679 diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf'))
1680 self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now')
1682 wait_till = time.time() + diff
1683 try:
1684 while True:
1685 diff = wait_till - time.time()
1686 if diff <= 0:
1687 progress('')
1688 raise ReExtractInfo('[wait] Wait period ended', expected=True)
1689 progress(f'[wait] Remaining time until next attempt: {self._format_screen(format_dur(diff), self.Styles.EMPHASIS)}')
1690 time.sleep(1)
1691 except KeyboardInterrupt:
1692 progress('')
1693 raise ReExtractInfo('[wait] Interrupted by user', expected=True)
1694 except BaseException as e:
1695 if not isinstance(e, ReExtractInfo):
1696 self.to_screen('')
1697 raise
1699 def _load_cookies(self, data, *, autoscope=True):
1700 """Loads cookies from a `Cookie` header
1702 This tries to work around the security vulnerability of passing cookies to every domain.
1703 See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj
1705 @param data The Cookie header as string to load the cookies from
1706 @param autoscope If `False`, scope cookies using Set-Cookie syntax and error for cookie without domains
1707 If `True`, save cookies for later to be stored in the jar with a limited scope
1708 If a URL, save cookies in the jar with the domain of the URL
1709 """
1710 for cookie in LenientSimpleCookie(data).values():
1711 if autoscope and any(cookie.values()):
1712 raise ValueError('Invalid syntax in Cookie Header')
1714 domain = cookie.get('domain') or ''
1715 expiry = cookie.get('expires')
1716 if expiry == '': # 0 is valid
1717 expiry = None
1718 prepared_cookie = http.cookiejar.Cookie(
1719 cookie.get('version') or 0, cookie.key, cookie.value, None, False,
1720 domain, True, True, cookie.get('path') or '', bool(cookie.get('path')),
1721 cookie.get('secure') or False, expiry, False, None, None, {})
1723 if domain:
1724 self.cookiejar.set_cookie(prepared_cookie)
1725 elif autoscope is True:
1726 self.deprecated_feature(
1727 'Passing cookies as a header is a potential security risk; '
1728 'they will be scoped to the domain of the downloaded urls. '
1729 'Please consider loading cookies from a file or browser instead.')
1730 self.__header_cookies.append(prepared_cookie)
1731 elif autoscope:
1732 self.report_warning(
1733 'The extractor result contains an unscoped cookie as an HTTP header. '
1734 f'If you are using yt-dlp with an input URL{bug_reports_message(before=",")}',
1735 only_once=True)
1736 self._apply_header_cookies(autoscope, [prepared_cookie])
1737 else:
1738 self.report_error('Unscoped cookies are not allowed; please specify some sort of scoping',
1739 tb=False, is_error=False)
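# Illustrative calls (hypothetical cookie values):
#   _load_cookies('id=abc; Domain=.example.com', autoscope=False)  # pre-scoped, stored directly
#   _load_cookies('id=abc', autoscope='https://example.com/')      # scoped to example.com
#   _load_cookies('id=abc')                                        # kept aside, scoped per download URL later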
1741 def _apply_header_cookies(self, url, cookies=None):
1742 """Applies stray header cookies to the provided url
1744 This loads header cookies and scopes them to the domain provided in `url`.
1745 While this is not ideal, it helps reduce the risk of them being sent
1746 to an unintended destination while mostly maintaining compatibility.
1747 """
1748 parsed = urllib.parse.urlparse(url)
1749 if not parsed.hostname:
1750 return
1752 for cookie in map(copy.copy, cookies or self.__header_cookies):
1753 cookie.domain = f'.{parsed.hostname}'
1754 self.cookiejar.set_cookie(cookie)
1756 @_handle_extraction_exceptions
1757 def __extract_info(self, url, ie, download, extra_info, process):
1758 self._apply_header_cookies(url)
1760 try:
1761 ie_result = ie.extract(url)
1762 except UserNotLive as e:
1763 if process:
1764 if self.params.get('wait_for_video'):
1765 self.report_warning(e)
1766 self._wait_for_video()
1767 raise
1768 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
1769 self.report_warning(f'Extractor {ie.IE_NAME} returned nothing{bug_reports_message()}')
1770 return
1771 if isinstance(ie_result, list):
1772 # Backwards compatibility: old IE result format
1773 ie_result = {
1774 '_type': 'compat_list',
1775 'entries': ie_result,
1776 }
1777 if extra_info.get('original_url'):
1778 ie_result.setdefault('original_url', extra_info['original_url'])
1779 self.add_default_extra_info(ie_result, ie, url)
1780 if process:
1781 self._wait_for_video(ie_result)
1782 return self.process_ie_result(ie_result, download, extra_info)
1783 else:
1784 return ie_result
1786 def add_default_extra_info(self, ie_result, ie, url):
1787 if url is not None:
1788 self.add_extra_info(ie_result, {
1789 'webpage_url': url,
1790 'original_url': url,
1791 })
1792 webpage_url = ie_result.get('webpage_url')
1793 if webpage_url:
1794 self.add_extra_info(ie_result, {
1795 'webpage_url_basename': url_basename(webpage_url),
1796 'webpage_url_domain': get_domain(webpage_url),
1797 })
1798 if ie is not None:
1799 self.add_extra_info(ie_result, {
1800 'extractor': ie.IE_NAME,
1801 'extractor_key': ie.ie_key(),
1802 })
1804 def process_ie_result(self, ie_result, download=True, extra_info=None):
1805 """
1806 Take the result of the ie (may be modified) and resolve all unresolved
1807 references (URLs, playlist items).
1809 It will also download the videos if 'download'.
1810 Returns the resolved ie_result.
1811 """
1812 if extra_info is None:
1813 extra_info = {}
1814 result_type = ie_result.get('_type', 'video')
1816 if result_type in ('url', 'url_transparent'):
1817 ie_result['url'] = sanitize_url(
1818 ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
1819 if ie_result.get('original_url') and not extra_info.get('original_url'):
1820 extra_info = {'original_url': ie_result['original_url'], **extra_info}
1822 extract_flat = self.params.get('extract_flat', False)
1823 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
1824 or extract_flat is True):
1825 info_copy = ie_result.copy()
1826 ie = try_get(ie_result.get('ie_key'), self.get_info_extractor)
1827 if ie and not ie_result.get('id'):
1828 info_copy['id'] = ie.get_temp_id(ie_result['url'])
1829 self.add_default_extra_info(info_copy, ie, ie_result['url'])
1830 self.add_extra_info(info_copy, extra_info)
1831 info_copy, _ = self.pre_process(info_copy)
1832 self._fill_common_fields(info_copy, False)
1833 self.__forced_printings(info_copy)
1834 self._raise_pending_errors(info_copy)
1835 if self.params.get('force_write_download_archive', False):
1836 self.record_download_archive(info_copy)
1837 return ie_result
1839 if result_type == 'video':
1840 self.add_extra_info(ie_result, extra_info)
1841 ie_result = self.process_video_result(ie_result, download=download)
1842 self._raise_pending_errors(ie_result)
1843 additional_urls = (ie_result or {}).get('additional_urls')
1844 if additional_urls:
1845 # TODO: Improve MetadataParserPP to allow setting a list
1846 if isinstance(additional_urls, str):
1847 additional_urls = [additional_urls]
1848 self.to_screen(
1849 '[info] {}: {} additional URL(s) requested'.format(ie_result['id'], len(additional_urls)))
1850 self.write_debug('Additional URLs: "{}"'.format('", "'.join(additional_urls)))
1851 ie_result['additional_entries'] = [
1852 self.extract_info(
1853 url, download, extra_info=extra_info,
1854 force_generic_extractor=self.params.get('force_generic_extractor'))
1855 for url in additional_urls
1856 ]
1857 return ie_result
1858 elif result_type == 'url':
1859 # We have to add extra_info to the results because it may be
1860 # contained in a playlist
1861 return self.extract_info(
1862 ie_result['url'], download,
1863 ie_key=ie_result.get('ie_key'),
1864 extra_info=extra_info)
1865 elif result_type == 'url_transparent':
1866 # Use the information from the embedding page
1867 info = self.extract_info(
1868 ie_result['url'], ie_key=ie_result.get('ie_key'),
1869 extra_info=extra_info, download=False, process=False)
1871 # extract_info may return None when ignoreerrors is enabled and
1872 # extraction failed with an error, don't crash and return early
1873 # in this case
1874 if not info:
1875 return info
1877 exempted_fields = {'_type', 'url', 'ie_key'}
1878 if not ie_result.get('section_end') and ie_result.get('section_start') is None:
1879 # For video clips, the id etc of the clip extractor should be used
1880 exempted_fields |= {'id', 'extractor', 'extractor_key'}
1882 new_result = info.copy()
1883 new_result.update(filter_dict(ie_result, lambda k, v: v is not None and k not in exempted_fields))
1885 # Extracted info may not be a video result (i.e.
1886 # info.get('_type', 'video') != video) but rather an url or
1887 # url_transparent. In such cases outer metadata (from ie_result)
1888 # should be propagated to inner one (info). For this to happen
1889 # _type of info should be overridden with url_transparent. This
1890 # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
1891 if new_result.get('_type') == 'url':
1892 new_result['_type'] = 'url_transparent'
1894 return self.process_ie_result(
1895 new_result, download=download, extra_info=extra_info)
1896 elif result_type in ('playlist', 'multi_video'):
1897 # Protect from infinite recursion due to recursively nested playlists
1898 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
1899 webpage_url = ie_result.get('webpage_url') # Playlists may not have a webpage_url
1900 if webpage_url and webpage_url in self._playlist_urls:
1901 self.to_screen(
1902 '[download] Skipping already downloaded playlist: {}'.format(
1903 ie_result.get('title') or ie_result.get('id')))
1904 return
1906 self._playlist_level += 1
1907 self._playlist_urls.add(webpage_url)
1908 self._fill_common_fields(ie_result, False)
1909 self._sanitize_thumbnails(ie_result)
1910 try:
1911 return self.__process_playlist(ie_result, download)
1912 finally:
1913 self._playlist_level -= 1
1914 if not self._playlist_level:
1915 self._playlist_urls.clear()
1916 elif result_type == 'compat_list':
1917 self.report_warning(
1918 'Extractor {} returned a compat_list result. '
1919 'It needs to be updated.'.format(ie_result.get('extractor')))
1921 def _fixup(r):
1922 self.add_extra_info(r, {
1923 'extractor': ie_result['extractor'],
1924 'webpage_url': ie_result['webpage_url'],
1925 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1926 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1927 'extractor_key': ie_result['extractor_key'],
1928 })
1929 return r
1930 ie_result['entries'] = [
1931 self.process_ie_result(_fixup(r), download, extra_info)
1932 for r in ie_result['entries']
1933 ]
1934 return ie_result
1935 else:
1936 raise Exception(f'Invalid result type: {result_type}')
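# Dispatch summary for process_ie_result(): 'video' is processed (and possibly
# downloaded) directly; 'url' re-enters extract_info(); 'url_transparent'
# overlays the outer metadata onto the inner result; 'playlist'/'multi_video'
# recurse through __process_playlist(); 'compat_list' is a deprecated legacy shape.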
1938 def _ensure_dir_exists(self, path):
1939 return make_dir(path, self.report_error)
1941 @staticmethod
1942 def _playlist_infodict(ie_result, strict=False, **kwargs):
1943 info = {
1944 'playlist_count': ie_result.get('playlist_count'),
1945 'playlist': ie_result.get('title') or ie_result.get('id'),
1946 'playlist_id': ie_result.get('id'),
1947 'playlist_title': ie_result.get('title'),
1948 'playlist_uploader': ie_result.get('uploader'),
1949 'playlist_uploader_id': ie_result.get('uploader_id'),
1950 'playlist_channel': ie_result.get('channel'),
1951 'playlist_channel_id': ie_result.get('channel_id'),
1952 **kwargs,
1953 }
1954 if strict:
1955 return info
1956 if ie_result.get('webpage_url'):
1957 info.update({
1958 'webpage_url': ie_result['webpage_url'],
1959 'webpage_url_basename': url_basename(ie_result['webpage_url']),
1960 'webpage_url_domain': get_domain(ie_result['webpage_url']),
1961 })
1962 return {
1963 **info,
1964 'playlist_index': 0,
1965 '__last_playlist_index': max(ie_result.get('requested_entries') or (0, 0)),
1966 'extractor': ie_result['extractor'],
1967 'extractor_key': ie_result['extractor_key'],
1968 }
1970 def __process_playlist(self, ie_result, download):
1971 """Process each entry in the playlist"""
1972 assert ie_result['_type'] in ('playlist', 'multi_video')
1974 common_info = self._playlist_infodict(ie_result, strict=True)
1975 title = common_info.get('playlist') or '<Untitled>'
1976 if self._match_entry(common_info, incomplete=True) is not None:
1977 return
1978 self.to_screen(f'[download] Downloading {ie_result["_type"]}: {title}')
1980 all_entries = PlaylistEntries(self, ie_result)
1981 entries = orderedSet(all_entries.get_requested_items(), lazy=True)
1983 lazy = self.params.get('lazy_playlist')
1984 if lazy:
1985 resolved_entries, n_entries = [], 'N/A'
1986 ie_result['requested_entries'], ie_result['entries'] = None, None
1987 else:
1988 entries = resolved_entries = list(entries)
1989 n_entries = len(resolved_entries)
1990 ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], [])
1991 if not ie_result.get('playlist_count'):
1992 # Better to do this after potentially exhausting entries
1993 ie_result['playlist_count'] = all_entries.get_full_count()
1995 extra = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries))
1996 ie_copy = collections.ChainMap(ie_result, extra)
1998 _infojson_written = False
1999 write_playlist_files = self.params.get('allow_playlist_files', True)
2000 if write_playlist_files and self.params.get('list_thumbnails'):
2001 self.list_thumbnails(ie_result)
2002 if write_playlist_files and not self.params.get('simulate'):
2003 _infojson_written = self._write_info_json(
2004 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
2005 if _infojson_written is None:
2006 return
2007 if self._write_description('playlist', ie_result,
2008 self.prepare_filename(ie_copy, 'pl_description')) is None:
2009 return
2010 # TODO: This should be passed to ThumbnailsConvertor if necessary
2011 self._write_thumbnails('playlist', ie_result, self.prepare_filename(ie_copy, 'pl_thumbnail'))
2013 if lazy:
2014 if self.params.get('playlistreverse') or self.params.get('playlistrandom'):
2015 self.report_warning('playlistreverse and playlistrandom are not supported with lazy_playlist', only_once=True)
2016 elif self.params.get('playlistreverse'):
2017 entries.reverse()
2018 elif self.params.get('playlistrandom'):
2019 random.shuffle(entries)
2021 self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} items'
2022 f'{format_field(ie_result, "playlist_count", " of %s")}')
2024 keep_resolved_entries = self.params.get('extract_flat') != 'discard'
2025 if self.params.get('extract_flat') == 'discard_in_playlist':
2026 keep_resolved_entries = ie_result['_type'] != 'playlist'
2027 if keep_resolved_entries:
2028 self.write_debug('The information of all playlist entries will be held in memory')
2030 failures = 0
2031 max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
2032 for i, (playlist_index, entry) in enumerate(entries):
2033 if lazy:
2034 resolved_entries.append((playlist_index, entry))
2035 if not entry:
2036 continue
2038 entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip')
2039 if not lazy and 'playlist-index' in self.params['compat_opts']:
2040 playlist_index = ie_result['requested_entries'][i]
2042 entry_copy = collections.ChainMap(entry, {
2043 **common_info,
2044 'n_entries': int_or_none(n_entries),
2045 'playlist_index': playlist_index,
2046 'playlist_autonumber': i + 1,
2047 })
2049 if self._match_entry(entry_copy, incomplete=True) is not None:
2050 # For compatibility with youtube-dl. See https://github.com/yt-dlp/yt-dlp/issues/4369
2051 resolved_entries[i] = (playlist_index, NO_DEFAULT)
2052 continue
2054 self.to_screen(
2055 f'[download] Downloading item {self._format_screen(i + 1, self.Styles.ID)} '
2056 f'of {self._format_screen(n_entries, self.Styles.EMPHASIS)}')
2058 entry_result = self.__process_iterable_entry(entry, download, collections.ChainMap({
2059 'playlist_index': playlist_index,
2060 'playlist_autonumber': i + 1,
2061 }, extra))
2062 if not entry_result:
2063 failures += 1
2064 if failures >= max_failures:
2065 self.report_error(
2066 f'Skipping the remaining entries in playlist "{title}" since {failures} items failed extraction')
2067 break
2068 if keep_resolved_entries:
2069 resolved_entries[i] = (playlist_index, entry_result)
2071 # Update with processed data
2072 ie_result['entries'] = [e for _, e in resolved_entries if e is not NO_DEFAULT]
2073 ie_result['requested_entries'] = [i for i, e in resolved_entries if e is not NO_DEFAULT]
2074 if ie_result['requested_entries'] == try_call(lambda: list(range(1, ie_result['playlist_count'] + 1))):
2075 # Do not set for full playlist
2076 ie_result.pop('requested_entries')
2078 # Write the updated info to json
2079 if _infojson_written is True and self._write_info_json(
2080 'updated playlist', ie_result,
2081 self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
2082 return
2084 ie_result = self.run_all_pps('playlist', ie_result)
2085 self.to_screen(f'[download] Finished downloading playlist: {title}')
2086 return ie_result
2088 @_handle_extraction_exceptions
2089 def __process_iterable_entry(self, entry, download, extra_info):
2090 return self.process_ie_result(
2091 entry, download=download, extra_info=extra_info)
2093 def _build_format_filter(self, filter_spec):
2094 " Returns a function to filter the formats according to the filter_spec "
2096 OPERATORS = {
2097 '<': operator.lt,
2098 '<=': operator.le,
2099 '>': operator.gt,
2100 '>=': operator.ge,
2101 '=': operator.eq,
2102 '!=': operator.ne,
2103 }
2104 operator_rex = re.compile(r'''(?x)\s*
2105 (?P<key>[\w.-]+)\s*
2106 (?P<op>{})(?P<none_inclusive>\s*\?)?\s*
2107 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
2108 '''.format('|'.join(map(re.escape, OPERATORS.keys()))))
2109 m = operator_rex.fullmatch(filter_spec)
2110 if m:
2111 try:
2112 comparison_value = int(m.group('value'))
2113 except ValueError:
2114 comparison_value = parse_filesize(m.group('value'))
2115 if comparison_value is None:
2116 comparison_value = parse_filesize(m.group('value') + 'B')
2117 if comparison_value is None:
2118 raise ValueError(
2119 'Invalid value {!r} in format specification {!r}'.format(
2120 m.group('value'), filter_spec))
2121 op = OPERATORS[m.group('op')]
2123 if not m:
2124 STR_OPERATORS = {
2125 '=': operator.eq,
2126 '^=': lambda attr, value: attr.startswith(value),
2127 '$=': lambda attr, value: attr.endswith(value),
2128 '*=': lambda attr, value: value in attr,
2129 '~=': lambda attr, value: value.search(attr) is not None,
2130 }
2131 str_operator_rex = re.compile(r'''(?x)\s*
2132 (?P<key>[a-zA-Z0-9._-]+)\s*
2133 (?P<negation>!\s*)?(?P<op>{})\s*(?P<none_inclusive>\?\s*)?
2134 (?P<quote>["'])?
2135 (?P<value>(?(quote)(?:(?!(?P=quote))[^\\]|\\.)+|[\w.-]+))
2136 (?(quote)(?P=quote))\s*
2137 '''.format('|'.join(map(re.escape, STR_OPERATORS.keys()))))
2138 m = str_operator_rex.fullmatch(filter_spec)
2139 if m:
2140 if m.group('op') == '~=':
2141 comparison_value = re.compile(m.group('value'))
2142 else:
2143 comparison_value = re.sub(r'''\\([\\"'])''', r'\1', m.group('value'))
2144 str_op = STR_OPERATORS[m.group('op')]
2145 if m.group('negation'):
2146 op = lambda attr, value: not str_op(attr, value)
2147 else:
2148 op = str_op
2150 if not m:
2151 raise SyntaxError(f'Invalid filter specification {filter_spec!r}')
2153 def _filter(f):
2154 actual_value = f.get(m.group('key'))
2155 if actual_value is None:
2156 return m.group('none_inclusive')
2157 return op(actual_value, comparison_value)
2158 return _filter
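# Example filter specs accepted above (from the documented -f filtering syntax):
#   height<=720        - numeric comparison; filesize suffixes such as 500K also parse
#   ext=mp4            - string comparison ('!' before the operator negates;
#                        '^=', '$=', '*=' match prefix/suffix/substring)
#   vcodec~='(?i)avc'  - regex match; a '?' after the operator lets formats
#                        with the field missing pass the filter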
2160 def _check_formats(self, formats):
2161 for f in formats:
2162 working = f.get('__working')
2163 if working is not None:
2164 if working:
2165 yield f
2166 continue
2167 self.to_screen('[info] Testing format {}'.format(f['format_id']))
2168 path = self.get_output_path('temp')
2169 if not self._ensure_dir_exists(f'{path}/'):
2170 continue
2171 temp_file = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False, dir=path or None)
2172 temp_file.close()
2173 try:
2174 success, _ = self.dl(temp_file.name, f, test=True)
2175 except (DownloadError, OSError, ValueError, *network_exceptions):
2176 success = False
2177 finally:
2178 if os.path.exists(temp_file.name):
2179 try:
2180 os.remove(temp_file.name)
2181 except OSError:
2182 self.report_warning(f'Unable to delete temporary file "{temp_file.name}"')
2183 f['__working'] = success
2184 if success:
2185 yield f
2186 else:
2187 self.to_screen('[info] Unable to download format {}. Skipping...'.format(f['format_id']))
2189 def _select_formats(self, formats, selector):
2190 return list(selector({
2191 'formats': formats,
2192 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
2193 'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video
2194 or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio
2195 }))
2197 def _default_format_spec(self, info_dict):
2198 prefer_best = (
2199 self.params['outtmpl']['default'] == '-'
2200 or info_dict.get('is_live') and not self.params.get('live_from_start'))
2202 def can_merge():
2203 merger = FFmpegMergerPP(self)
2204 return merger.available and merger.can_merge()
2206 if not prefer_best and not can_merge():
2207 prefer_best = True
2208 formats = self._get_formats(info_dict)
2209 evaluate_formats = lambda spec: self._select_formats(formats, self.build_format_selector(spec))
2210 if evaluate_formats('b/bv+ba') != evaluate_formats('bv*+ba/b'):
2211 self.report_warning('ffmpeg not found. The downloaded format may not be the best available. '
2212 'Installing ffmpeg is strongly recommended: https://github.com/yt-dlp/yt-dlp#dependencies')
2214 compat = (self.params.get('allow_multiple_audio_streams')
2215 or 'format-spec' in self.params['compat_opts'])
2217 return ('best/bestvideo+bestaudio' if prefer_best
2218 else 'bestvideo+bestaudio/best' if compat
2219 else 'bestvideo*+bestaudio/best')
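# i.e. 'best/bestvideo+bestaudio' when merging is unavailable or undesirable
# (stdout output, or a live stream not downloaded from the start),
# 'bestvideo+bestaudio/best' under compat options, and
# 'bestvideo*+bestaudio/best' otherwise.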
2221 def build_format_selector(self, format_spec):
2222 def syntax_error(note, start):
2223 message = (
2224 'Invalid format specification: '
2225 '{}\n\t{}\n\t{}^'.format(note, format_spec, ' ' * start[1]))
2226 return SyntaxError(message)
2228 PICKFIRST = 'PICKFIRST'
2229 MERGE = 'MERGE'
2230 SINGLE = 'SINGLE'
2231 GROUP = 'GROUP'
2232 FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
2234 allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
2235 'video': self.params.get('allow_multiple_video_streams', False)}
2237 def _parse_filter(tokens):
2238 filter_parts = []
2239 for type_, string_, _start, _, _ in tokens:
2240 if type_ == tokenize.OP and string_ == ']':
2241 return ''.join(filter_parts)
2242 else:
2243 filter_parts.append(string_)
2245 def _remove_unused_ops(tokens):
2246 # Remove operators that we don't use and join them with the surrounding strings.
2247 # E.g. 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
2248 ALLOWED_OPS = ('/', '+', ',', '(', ')')
2249 last_string, last_start, last_end, last_line = None, None, None, None
2250 for type_, string_, start, end, line in tokens:
2251 if type_ == tokenize.OP and string_ == '[':
2252 if last_string:
2253 yield tokenize.NAME, last_string, last_start, last_end, last_line
2254 last_string = None
2255 yield type_, string_, start, end, line
2256 # everything inside brackets will be handled by _parse_filter
2257 for type_, string_, start, end, line in tokens:
2258 yield type_, string_, start, end, line
2259 if type_ == tokenize.OP and string_ == ']':
2260 break
2261 elif type_ == tokenize.OP and string_ in ALLOWED_OPS:
2262 if last_string:
2263 yield tokenize.NAME, last_string, last_start, last_end, last_line
2264 last_string = None
2265 yield type_, string_, start, end, line
2266 elif type_ in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
2267 if not last_string:
2268 last_string = string_
2269 last_start = start
2270 last_end = end
2271 else:
2272 last_string += string_
2273 if last_string:
2274 yield tokenize.NAME, last_string, last_start, last_end, last_line
2276 def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
2277 selectors = []
2278 current_selector = None
2279 for type_, string_, start, _, _ in tokens:
2280 # ENCODING is only defined in Python 3.x
2281 if type_ == getattr(tokenize, 'ENCODING', None):
2282 continue
2283 elif type_ in [tokenize.NAME, tokenize.NUMBER]:
2284 current_selector = FormatSelector(SINGLE, string_, [])
2285 elif type_ == tokenize.OP:
2286 if string_ == ')':
2287 if not inside_group:
2288 # ')' will be handled by the parentheses group
2289 tokens.restore_last_token()
2290 break
2291 elif inside_merge and string_ in ['/', ',']:
2292 tokens.restore_last_token()
2293 break
2294 elif inside_choice and string_ == ',':
2295 tokens.restore_last_token()
2296 break
2297 elif string_ == ',':
2298 if not current_selector:
2299 raise syntax_error('"," must follow a format selector', start)
2300 selectors.append(current_selector)
2301 current_selector = None
2302 elif string_ == '/':
2303 if not current_selector:
2304 raise syntax_error('"/" must follow a format selector', start)
2305 first_choice = current_selector
2306 second_choice = _parse_format_selection(tokens, inside_choice=True)
2307 current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
2308 elif string_ == '[':
2309 if not current_selector:
2310 current_selector = FormatSelector(SINGLE, 'best', [])
2311 format_filter = _parse_filter(tokens)
2312 current_selector.filters.append(format_filter)
2313 elif string_ == '(':
2314 if current_selector:
2315 raise syntax_error('Unexpected "("', start)
2316 group = _parse_format_selection(tokens, inside_group=True)
2317 current_selector = FormatSelector(GROUP, group, [])
2318 elif string_ == '+':
2319 if not current_selector:
2320 raise syntax_error('Unexpected "+"', start)
2321 selector_1 = current_selector
2322 selector_2 = _parse_format_selection(tokens, inside_merge=True)
2323 if not selector_2:
2324 raise syntax_error('Expected a selector', start)
2325 current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
2326 else:
2327 raise syntax_error(f'Operator not recognized: "{string_}"', start)
2328 elif type_ == tokenize.ENDMARKER:
2329 break
2330 if current_selector:
2331 selectors.append(current_selector)
2332 return selectors
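# _merge() below collapses a (video, audio) pair produced by the '+' operator
# into one pseudo-format: the parts are kept in 'requested_formats', the ids are
# joined (e.g. a hypothetical '137+140'), and the container ext is chosen by
# get_compatible_ext().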
2334 def _merge(formats_pair):
2335 format_1, format_2 = formats_pair
2337 formats_info = []
2338 formats_info.extend(format_1.get('requested_formats', (format_1,)))
2339 formats_info.extend(format_2.get('requested_formats', (format_2,)))
2341 if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']:
2342 get_no_more = {'video': False, 'audio': False}
2343 for (i, fmt_info) in enumerate(formats_info):
2344 if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none':
2345 formats_info.pop(i)
2346 continue
2347 for aud_vid in ['audio', 'video']:
2348 if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none':
2349 if get_no_more[aud_vid]:
2350 formats_info.pop(i)
2351 break
2352 get_no_more[aud_vid] = True
2354 if len(formats_info) == 1:
2355 return formats_info[0]
2357 video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none']
2358 audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none']
2360 the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
2361 the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None
2363 output_ext = get_compatible_ext(
2364 vcodecs=[f.get('vcodec') for f in video_fmts],
2365 acodecs=[f.get('acodec') for f in audio_fmts],
2366 vexts=[f['ext'] for f in video_fmts],
2367 aexts=[f['ext'] for f in audio_fmts],
2368 preferences=(try_call(lambda: self.params['merge_output_format'].split('/'))
2369 or self.params.get('prefer_free_formats') and ('webm', 'mkv')))
2371 filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info))
2373 new_dict = {
2374 'requested_formats': formats_info,
2375 'format': '+'.join(filtered('format')),
2376 'format_id': '+'.join(filtered('format_id')),
2377 'ext': output_ext,
2378 'protocol': '+'.join(map(determine_protocol, formats_info)),
2379 'language': '+'.join(orderedSet(filtered('language'))) or None,
2380 'format_note': '+'.join(orderedSet(filtered('format_note'))) or None,
2381 'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None,
2382 'tbr': sum(filtered('tbr', 'vbr', 'abr')),
2383 }
2385 if the_only_video:
2386 new_dict.update({
2387 'width': the_only_video.get('width'),
2388 'height': the_only_video.get('height'),
2389 'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
2390 'fps': the_only_video.get('fps'),
2391 'dynamic_range': the_only_video.get('dynamic_range'),
2392 'vcodec': the_only_video.get('vcodec'),
2393 'vbr': the_only_video.get('vbr'),
2394 'stretched_ratio': the_only_video.get('stretched_ratio'),
2395 'aspect_ratio': the_only_video.get('aspect_ratio'),
2396 })
2398 if the_only_audio:
2399 new_dict.update({
2400 'acodec': the_only_audio.get('acodec'),
2401 'abr': the_only_audio.get('abr'),
2402 'asr': the_only_audio.get('asr'),
2403 'audio_channels': the_only_audio.get('audio_channels'),
2404 })
2406 return new_dict
2408 def _check_formats(formats):
2409 if self.params.get('check_formats') == 'selected':
2410 yield from self._check_formats(formats)
2411 return
2412 elif (self.params.get('check_formats') is not None
2413 or self.params.get('allow_unplayable_formats')):
2414 yield from formats
2415 return
2417 for f in formats:
2418 if f.get('has_drm') or f.get('__needs_testing'):
2419 yield from self._check_formats([f])
2420 else:
2421 yield f
2423 def _build_selector_function(selector):
2424 if isinstance(selector, list): # ,
2425 fs = [_build_selector_function(s) for s in selector]
2427 def selector_function(ctx):
2428 for f in fs:
2429 yield from f(ctx)
2430 return selector_function
2432 elif selector.type == GROUP: # ()
2433 selector_function = _build_selector_function(selector.selector)
2435 elif selector.type == PICKFIRST: # /
2436 fs = [_build_selector_function(s) for s in selector.selector]
2438 def selector_function(ctx):
2439 for f in fs:
2440 picked_formats = list(f(ctx))
2441 if picked_formats:
2442 return picked_formats
2443 return []
2445 elif selector.type == MERGE: # +
2446 selector_1, selector_2 = map(_build_selector_function, selector.selector)
2448 def selector_function(ctx):
2449 for pair in itertools.product(selector_1(ctx), selector_2(ctx)):
2450 yield _merge(pair)
2452 elif selector.type == SINGLE: # atom
2453 format_spec = selector.selector or 'best'
2455 # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
2456 if format_spec == 'all':
2457 def selector_function(ctx):
2458 yield from _check_formats(ctx['formats'][::-1])
2459 elif format_spec == 'mergeall':
2460 def selector_function(ctx):
2461 formats = list(_check_formats(
2462 f for f in ctx['formats'] if f.get('vcodec') != 'none' or f.get('acodec') != 'none'))
2463 if not formats:
2464 return
2465 merged_format = formats[-1]
2466 for f in formats[-2::-1]:
2467 merged_format = _merge((merged_format, f))
2468 yield merged_format
2470 else:
2471 format_fallback, separate_fallback, format_reverse, format_idx = False, None, True, 1
2472 mobj = re.match(
2473 r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
2474 format_spec)
2475 if mobj is not None:
2476 format_idx = int_or_none(mobj.group('n'), default=1)
2477 format_reverse = mobj.group('bw')[0] == 'b'
2478 format_type = (mobj.group('type') or [None])[0]
2479 not_format_type = {'v': 'a', 'a': 'v'}.get(format_type)
2480 format_modified = mobj.group('mod') is not None
2482 format_fallback = not format_type and not format_modified # for b, w
2483 _filter_f = (
2484 (lambda f: f.get(f'{format_type}codec') != 'none')
2485 if format_type and format_modified # bv*, ba*, wv*, wa*
2486 else (lambda f: f.get(f'{not_format_type}codec') == 'none')
2487 if format_type # bv, ba, wv, wa
2488 else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none')
2489 if not format_modified # b, w
2490 else lambda f: True) # b*, w*
2491 filter_f = lambda f: _filter_f(f) and (
2492 f.get('vcodec') != 'none' or f.get('acodec') != 'none')
2493 else:
2494 if format_spec in self._format_selection_exts['audio']:
2495 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none'
2496 elif format_spec in self._format_selection_exts['video']:
2497 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none'
2498 separate_fallback = lambda f: f.get('ext') == format_spec and f.get('vcodec') != 'none'
2499 elif format_spec in self._format_selection_exts['storyboards']:
2500 filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none'
2501 else:
2502 filter_f = lambda f: f.get('format_id') == format_spec # id
2504 def selector_function(ctx):
2505 formats = list(ctx['formats'])
2506 matches = list(filter(filter_f, formats)) if filter_f is not None else formats
2507 if not matches:
2508 if format_fallback and ctx['incomplete_formats']:
2509 # for extractors with incomplete formats (audio only (soundcloud)
2510 # or video only (imgur)) best/worst will fallback to
2511 # best/worst {video,audio}-only format
2512 matches = list(filter(lambda f: f.get('vcodec') != 'none' or f.get('acodec') != 'none', formats))
2513 elif separate_fallback and not ctx['has_merged_format']:
2514 # for compatibility with youtube-dl when there is no pre-merged format
2515 matches = list(filter(separate_fallback, formats))
2516 matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
2517 try:
2518 yield matches[format_idx - 1]
2519 except LazyList.IndexError:
2520 return
2522 filters = [self._build_format_filter(f) for f in selector.filters]
2524 def final_selector(ctx):
2525 ctx_copy = dict(ctx)
2526 for _filter in filters:
2527 ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
2528 return selector_function(ctx_copy)
2529 return final_selector
2531 # HACK: Python 3.12 changed the underlying parser, rendering '7_a' invalid
2532 # Prefix numbers with random letters to avoid it being classified as a number
2533 # See: https://github.com/yt-dlp/yt-dlp/pulls/8797
2534 # TODO: Implement parser not reliant on tokenize.tokenize
2535 prefix = ''.join(random.choices(string.ascii_letters, k=32))
2536 stream = io.BytesIO(re.sub(r'\d[_\d]*', rf'{prefix}\g<0>', format_spec).encode())
2537 try:
2538 tokens = list(_remove_unused_ops(
2539 token._replace(string=token.string.replace(prefix, ''))
2540 for token in tokenize.tokenize(stream.readline)))
2541 except tokenize.TokenError:
2542 raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
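# e.g. the substitution above rewrites a hypothetical '137+140' as
# '<prefix>137+<prefix>140' before tokenizing, and the prefix is stripped from
# every token afterwards, so ids like '7_a' are not mis-parsed by
# Python 3.12's stricter tokenizer.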
2544 class TokenIterator:
2545 def __init__(self, tokens):
2546 self.tokens = tokens
2547 self.counter = 0
2549 def __iter__(self):
2550 return self
2552 def __next__(self):
2553 if self.counter >= len(self.tokens):
2554 raise StopIteration
2555 value = self.tokens[self.counter]
2556 self.counter += 1
2557 return value
2559 next = __next__
2561 def restore_last_token(self):
2562 self.counter -= 1
2564 parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
2565 return _build_selector_function(parsed_selector)
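# Illustrative specs this parser accepts:
#   bv*+ba/b                          - merge best video with best audio, else best single file
#   bestvideo[height<=1080]+bestaudio
#   (mp4,webm)[height<480]            - grouped alternatives sharing one filter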
2567 def _calc_headers(self, info_dict, load_cookies=False):
2568 res = HTTPHeaderDict(self.params['http_headers'], info_dict.get('http_headers'))
2569 clean_headers(res)
2571 if load_cookies: # For --load-info-json
2572 self._load_cookies(res.get('Cookie'), autoscope=info_dict['url']) # compat
2573 self._load_cookies(info_dict.get('cookies'), autoscope=False)
2574 # The `Cookie` header is removed to prevent leaks and unscoped cookies.
2575 # See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj
2576 res.pop('Cookie', None)
2577 cookies = self.cookiejar.get_cookies_for_url(info_dict['url'])
2578 if cookies:
2579 encoder = LenientSimpleCookie()
2580 values = []
2581 for cookie in cookies:
2582 _, value = encoder.value_encode(cookie.value)
2583 values.append(f'{cookie.name}={value}')
2584 if cookie.domain:
2585 values.append(f'Domain={cookie.domain}')
2586 if cookie.path:
2587 values.append(f'Path={cookie.path}')
2588 if cookie.secure:
2589 values.append('Secure')
2590 if cookie.expires:
2591 values.append(f'Expires={cookie.expires}')
2592 if cookie.version:
2593 values.append(f'Version={cookie.version}')
2594 info_dict['cookies'] = '; '.join(values)
2596 if 'X-Forwarded-For' not in res:
2597 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
2598 if x_forwarded_for_ip:
2599 res['X-Forwarded-For'] = x_forwarded_for_ip
2601 return res
2603 def _calc_cookies(self, url):
2604 self.deprecation_warning('"YoutubeDL._calc_cookies" is deprecated and may be removed in a future version')
2605 return self.cookiejar.get_cookie_header(url)
2607 def _sort_thumbnails(self, thumbnails):
2608 thumbnails.sort(key=lambda t: (
2609 t.get('preference') if t.get('preference') is not None else -1,
2610 t.get('width') if t.get('width') is not None else -1,
2611 t.get('height') if t.get('height') is not None else -1,
2612 t.get('id') if t.get('id') is not None else '',
2613 t.get('url')))
2615 def _sanitize_thumbnails(self, info_dict):
2616 thumbnails = info_dict.get('thumbnails')
2617 if thumbnails is None:
2618 thumbnail = info_dict.get('thumbnail')
2619 if thumbnail:
2620 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
2621 if not thumbnails:
2622 return
2624 def check_thumbnails(thumbnails):
2625 for t in thumbnails:
2626 self.to_screen(f'[info] Testing thumbnail {t["id"]}')
2627 try:
2628 self.urlopen(HEADRequest(t['url']))
2629 except network_exceptions as err:
2630 self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...')
2631 continue
2632 yield t
2634 self._sort_thumbnails(thumbnails)
2635 for i, t in enumerate(thumbnails):
2636 if t.get('id') is None:
2637 t['id'] = str(i)
2638 if t.get('width') and t.get('height'):
2639 t['resolution'] = '%dx%d' % (t['width'], t['height'])
2640 t['url'] = sanitize_url(t['url'])
2642 if self.params.get('check_formats') is True:
2643 info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1]), reverse=True)
2644 else:
2645 info_dict['thumbnails'] = thumbnails
2647 def _fill_common_fields(self, info_dict, final=True):
2648 # TODO: move sanitization here
2649 if final:
2650 title = info_dict['fulltitle'] = info_dict.get('title')
2651 if not title:
2652 if title == '':
2653 self.write_debug('Extractor gave empty title. Creating a generic title')
2654 else:
2655 self.report_warning('Extractor failed to obtain "title". Creating a generic title instead')
2656 info_dict['title'] = f'{info_dict["extractor"].replace(":", "-")} video #{info_dict["id"]}'
2658 if info_dict.get('duration') is not None:
2659 info_dict['duration_string'] = formatSeconds(info_dict['duration'])
2661 for ts_key, date_key in (
2662 ('timestamp', 'upload_date'),
2663 ('release_timestamp', 'release_date'),
2664 ('modified_timestamp', 'modified_date'),
2665 ):
2666 if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
2667 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
2668 # see http://bugs.python.org/issue1646728)
2669 with contextlib.suppress(ValueError, OverflowError, OSError):
2670 upload_date = dt.datetime.fromtimestamp(info_dict[ts_key], dt.timezone.utc)
2671 info_dict[date_key] = upload_date.strftime('%Y%m%d')
2673 if not info_dict.get('release_year'):
2674 info_dict['release_year'] = traverse_obj(info_dict, ('release_date', {lambda x: int(x[:4])}))
2676 live_keys = ('is_live', 'was_live')
2677 live_status = info_dict.get('live_status')
2678 if live_status is None:
2679 for key in live_keys:
2680 if info_dict.get(key) is False:
2681 continue
2682 if info_dict.get(key):
2683 live_status = key
2684 break
2685 if all(info_dict.get(key) is False for key in live_keys):
2686 live_status = 'not_live'
2687 if live_status:
2688 info_dict['live_status'] = live_status
2689 for key in live_keys:
2690 if info_dict.get(key) is None:
2691 info_dict[key] = (live_status == key)
2692 if live_status == 'post_live':
2693 info_dict['was_live'] = True
2695 # Auto generate title fields corresponding to the *_number fields when missing
2696 # in order to always have clean titles. This is very common for TV series.
2697 for field in ('chapter', 'season', 'episode'):
2698 if final and info_dict.get(f'{field}_number') is not None and not info_dict.get(field):
2699 info_dict[field] = '%s %d' % (field.capitalize(), info_dict[f'{field}_number'])
2701 for old_key, new_key in self._deprecated_multivalue_fields.items():
2702 if new_key in info_dict and old_key in info_dict:
2703 if '_version' not in info_dict: # HACK: Do not warn when using --load-info-json
2704 self.deprecation_warning(f'Do not return {old_key!r} when {new_key!r} is present')
2705 elif old_value := info_dict.get(old_key):
2706 info_dict[new_key] = old_value.split(', ')
2707 elif new_value := info_dict.get(new_key):
2708 info_dict[old_key] = ', '.join(v.replace(',', '\N{FULLWIDTH COMMA}') for v in new_value)
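# e.g. assuming ('artist', 'artists') is one such deprecated pair: a lone
# 'artists': ['A', 'B'] backfills artist as 'A, B', with literal commas in the
# values swapped for fullwidth commas so the two representations stay in sync.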
2710 def _raise_pending_errors(self, info):
2711 err = info.pop('__pending_error', None)
2712 if err:
2713 self.report_error(err, tb=False)
2715 def sort_formats(self, info_dict):
2716 formats = self._get_formats(info_dict)
2717 formats.sort(key=FormatSorter(
2718 self, info_dict.get('_format_sort_fields') or []).calculate_preference)
2720 def process_video_result(self, info_dict, download=True):
2721 assert info_dict.get('_type', 'video') == 'video'
2722 self._num_videos += 1
2724 if 'id' not in info_dict:
2725 raise ExtractorError('Missing "id" field in extractor result', ie=info_dict['extractor'])
2726 elif not info_dict.get('id'):
2727 raise ExtractorError('Extractor failed to obtain "id"', ie=info_dict['extractor'])
2729 def report_force_conversion(field, field_not, conversion):
2730 self.report_warning(
2731 f'"{field}" field is not {field_not} - forcing {conversion} conversion, '
2732 'there is an error in extractor')
2734 def sanitize_string_field(info, string_field):
2735 field = info.get(string_field)
2736 if field is None or isinstance(field, str):
2737 return
2738 report_force_conversion(string_field, 'a string', 'string')
2739 info[string_field] = str(field)
2741 def sanitize_numeric_fields(info):
2742 for numeric_field in self._NUMERIC_FIELDS:
2743 field = info.get(numeric_field)
2744 if field is None or isinstance(field, (int, float)):
2745 continue
2746 report_force_conversion(numeric_field, 'numeric', 'int')
2747 info[numeric_field] = int_or_none(field)
2749 sanitize_string_field(info_dict, 'id')
2750 sanitize_numeric_fields(info_dict)
2751 if info_dict.get('section_end') and info_dict.get('section_start') is not None:
2752 info_dict['duration'] = round(info_dict['section_end'] - info_dict['section_start'], 3)
2753 if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None):
2754 self.report_warning('"duration" field is negative, there is an error in extractor')
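# The chapter fix-up below pads the list with a dummy chapter on both sides so
# each chapter can inherit a missing start_time/end_time from its neighbours;
# untitled chapters get a placeholder title.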
2756 chapters = info_dict.get('chapters') or []
2757 if chapters and chapters[0].get('start_time'):
2758 chapters.insert(0, {'start_time': 0})
2760 dummy_chapter = {'end_time': 0, 'start_time': info_dict.get('duration')}
2761 for idx, (prev, current, next_) in enumerate(zip(
2762 (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter)), 1):
2763 if current.get('start_time') is None:
2764 current['start_time'] = prev.get('end_time')
2765 if not current.get('end_time'):
2766 current['end_time'] = next_.get('start_time')
2767 if not current.get('title'):
2768 current['title'] = f'<Untitled Chapter {idx}>'
2770 if 'playlist' not in info_dict:
2771 # It isn't part of a playlist
2772 info_dict['playlist'] = None
2773 info_dict['playlist_index'] = None
2775 self._sanitize_thumbnails(info_dict)
2777 thumbnail = info_dict.get('thumbnail')
2778 thumbnails = info_dict.get('thumbnails')
2779 if thumbnail:
2780 info_dict['thumbnail'] = sanitize_url(thumbnail)
2781 elif thumbnails:
2782 info_dict['thumbnail'] = thumbnails[-1]['url']
2784 if info_dict.get('display_id') is None and 'id' in info_dict:
2785 info_dict['display_id'] = info_dict['id']
2787 self._fill_common_fields(info_dict)
2789 for cc_kind in ('subtitles', 'automatic_captions'):
2790 cc = info_dict.get(cc_kind)
2791 if cc:
2792 for _, subtitle in cc.items():
2793 for subtitle_format in subtitle:
2794 if subtitle_format.get('url'):
2795 subtitle_format['url'] = sanitize_url(subtitle_format['url'])
2796 if subtitle_format.get('ext') is None:
2797 subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
2799 automatic_captions = info_dict.get('automatic_captions')
2800 subtitles = info_dict.get('subtitles')
2802 info_dict['requested_subtitles'] = self.process_subtitles(
2803 info_dict['id'], subtitles, automatic_captions)
2805 formats = self._get_formats(info_dict)
2807 # Backward compatibility with InfoExtractor._sort_formats
2808 field_preference = (formats or [{}])[0].pop('__sort_fields', None)
2809 if field_preference:
2810 info_dict['_format_sort_fields'] = field_preference
2812 info_dict['_has_drm'] = any( # or None ensures --clean-infojson removes it
2813 f.get('has_drm') and f['has_drm'] != 'maybe' for f in formats) or None
2814 if not self.params.get('allow_unplayable_formats'):
2815 formats = [f for f in formats if not f.get('has_drm') or f['has_drm'] == 'maybe']
2817 if formats and all(f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
2818 self.report_warning(
2819 'This video is DRM protected and only images are available for download. Use --list-formats to see them'
2820 if info_dict['_has_drm'] else 'Only images are available for download. Use --list-formats to see them')
2822 get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
2823 if not get_from_start:
2824 info_dict['title'] += ' ' + dt.datetime.now().strftime('%Y-%m-%d %H:%M')
2825 if info_dict.get('is_live') and formats:
2826 formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
2827 if get_from_start and not formats:
2828 self.raise_no_formats(info_dict, msg=(
2829 '--live-from-start is passed, but there are no formats that can be downloaded from the start. '
2830 'If you want to download from the current time, use --no-live-from-start'))
2832 def is_wellformed(f):
2833 url = f.get('url')
2834 if not url:
2835 self.report_warning(
2836 '"url" field is missing or empty - skipping format, '
2837 'there is an error in extractor')
2838 return False
2839 if isinstance(url, bytes):
2840 sanitize_string_field(f, 'url')
2841 return True
2843 # Filter out malformed formats for better extraction robustness
2844 formats = list(filter(is_wellformed, formats or []))
2846 if not formats:
2847 self.raise_no_formats(info_dict)
2849 for fmt in formats:
2850 sanitize_string_field(fmt, 'format_id')
2851 sanitize_numeric_fields(fmt)
2852 fmt['url'] = sanitize_url(fmt['url'])
2853 if fmt.get('ext') is None:
2854 fmt['ext'] = determine_ext(fmt['url']).lower()
2855 if fmt['ext'] in ('aac', 'opus', 'mp3', 'flac', 'vorbis'):
2856 if fmt.get('acodec') is None:
2857 fmt['acodec'] = fmt['ext']
2858 if fmt.get('protocol') is None:
2859 fmt['protocol'] = determine_protocol(fmt)
2860 if fmt.get('resolution') is None:
2861 fmt['resolution'] = self.format_resolution(fmt, default=None)
2862 if fmt.get('dynamic_range') is None and fmt.get('vcodec') != 'none':
2863 fmt['dynamic_range'] = 'SDR'
2864 if fmt.get('aspect_ratio') is None:
2865 fmt['aspect_ratio'] = try_call(lambda: round(fmt['width'] / fmt['height'], 2))
2866 # For fragmented formats, "tbr" is often max bitrate and not average
2867 if (('manifest-filesize-approx' in self.params['compat_opts'] or not fmt.get('manifest_url'))
2868 and not fmt.get('filesize') and not fmt.get('filesize_approx')):
2869 fmt['filesize_approx'] = filesize_from_tbr(fmt.get('tbr'), info_dict.get('duration'))
2870 fmt['http_headers'] = self._calc_headers(collections.ChainMap(fmt, info_dict), load_cookies=True)
2872 # Safeguard against old/insecure infojson when using --load-info-json
2873 if info_dict.get('http_headers'):
2874 info_dict['http_headers'] = HTTPHeaderDict(info_dict['http_headers'])
2875 info_dict['http_headers'].pop('Cookie', None)
2877 # This is copied to http_headers by the above _calc_headers and can now be removed
2878 if '__x_forwarded_for_ip' in info_dict:
2879 del info_dict['__x_forwarded_for_ip']
2881 self.sort_formats({
2882 'formats': formats,
2883 '_format_sort_fields': info_dict.get('_format_sort_fields'),
2884 })
2886 # Sanitize and group by format_id
2887 formats_dict = {}
2888 for i, fmt in enumerate(formats):
2889 if not fmt.get('format_id'):
2890 fmt['format_id'] = str(i)
2891 else:
2892 # Sanitize format_id from characters used in format selector expression
2893 fmt['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', fmt['format_id'])
2894 formats_dict.setdefault(fmt['format_id'], []).append(fmt)
2896 # Make sure all formats have unique format_id
2897 common_exts = set(itertools.chain(*self._format_selection_exts.values()))
2898 for format_id, ambiguous_formats in formats_dict.items():
2899 ambiguous_id = len(ambiguous_formats) > 1
2900 for i, fmt in enumerate(ambiguous_formats):
2901 if ambiguous_id:
2902 fmt['format_id'] = f'{format_id}-{i}'
2903 # Ensure there is no conflict between id and ext in format selection
2904 # See https://github.com/yt-dlp/yt-dlp/issues/1282
2905 if fmt['format_id'] != fmt['ext'] and fmt['format_id'] in common_exts:
2906 fmt['format_id'] = 'f{}'.format(fmt['format_id'])
2908 if fmt.get('format') is None:
2909 fmt['format'] = '{id} - {res}{note}'.format(
2910 id=fmt['format_id'],
2911 res=self.format_resolution(fmt),
2912 note=format_field(fmt, 'format_note', ' (%s)'),
2913 )
2915 if self.params.get('check_formats') is True:
2916 formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
2918 if not formats or formats[0] is not info_dict:
2919 # only set the 'formats' field if the original info_dict lists them
2920 # otherwise we end up with a circular reference, the first (and unique)
2921 # element in the 'formats' field in info_dict is info_dict itself,
2922 # which can't be exported to json
2923 info_dict['formats'] = formats
2925 info_dict, _ = self.pre_process(info_dict)
2927 if self._match_entry(info_dict, incomplete=self._format_fields) is not None:
2928 return info_dict
2930 self.post_extract(info_dict)
2931 info_dict, _ = self.pre_process(info_dict, 'after_filter')
2933 # The pre-processors may have modified the formats
2934 formats = self._get_formats(info_dict)
2936 list_only = self.params.get('simulate') == 'list_only'
2937 interactive_format_selection = not list_only and self.format_selector == '-'
2938 if self.params.get('list_thumbnails'):
2939 self.list_thumbnails(info_dict)
2940 if self.params.get('listsubtitles'):
2941 if 'automatic_captions' in info_dict:
2942 self.list_subtitles(
2943 info_dict['id'], automatic_captions, 'automatic captions')
2944 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
2945 if self.params.get('listformats') or interactive_format_selection:
2946 self.list_formats(info_dict)
2947 if list_only:
2948 # Without this printing, -F --print-json will not work
2949 self.__forced_printings(info_dict)
2950 return info_dict
2952 format_selector = self.format_selector
2953 while True:
2954 if interactive_format_selection:
2955 req_format = input(self._format_screen('\nEnter format selector ', self.Styles.EMPHASIS)
2956 + '(Press ENTER for default, or Ctrl+C to quit)'
2957 + self._format_screen(': ', self.Styles.EMPHASIS))
2958 try:
2959 format_selector = self.build_format_selector(req_format) if req_format else None
2960 except SyntaxError as err:
2961 self.report_error(err, tb=False, is_error=False)
2962 continue
2964 if format_selector is None:
2965 req_format = self._default_format_spec(info_dict)
2966 self.write_debug(f'Default format spec: {req_format}')
2967 format_selector = self.build_format_selector(req_format)
2969 formats_to_download = self._select_formats(formats, format_selector)
2970 if interactive_format_selection and not formats_to_download:
2971 self.report_error('Requested format is not available', tb=False, is_error=False)
2972 continue
2973 break
2975 if not formats_to_download:
2976 if not self.params.get('ignore_no_formats_error'):
2977 raise ExtractorError(
2978 'Requested format is not available. Use --list-formats for a list of available formats',
2979 expected=True, video_id=info_dict['id'], ie=info_dict['extractor'])
2980 self.report_warning('Requested format is not available')
2981 # Process what we can, even without any available formats.
2982 formats_to_download = [{}]
2984 requested_ranges = tuple(self.params.get('download_ranges', lambda *_: [{}])(info_dict, self))
2985 best_format, downloaded_formats = formats_to_download[-1], []
2986 if download:
2987 if best_format and requested_ranges:
2988 def to_screen(*msg):
2989 self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}')
2991 to_screen(f'Downloading {len(formats_to_download)} format(s):',
2992 (f['format_id'] for f in formats_to_download))
2993 if requested_ranges != ({}, ):
2994 to_screen(f'Downloading {len(requested_ranges)} time ranges:',
2995 (f'{c["start_time"]:.1f}-{c["end_time"]:.1f}' for c in requested_ranges))
2996 max_downloads_reached = False
2998 for fmt, chapter in itertools.product(formats_to_download, requested_ranges):
2999 new_info = self._copy_infodict(info_dict)
3000 new_info.update(fmt)
3001 offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf')
3002 end_time = offset + min(chapter.get('end_time', duration), duration)
3003 # duration may not be accurate. So allow deviations <1sec
3004 if end_time == float('inf') or end_time > offset + duration + 1:
3005 end_time = None
3006 if chapter or offset:
3007 new_info.update({
3008 'section_start': offset + chapter.get('start_time', 0),
3009 'section_end': end_time,
3010 'section_title': chapter.get('title'),
3011 'section_number': chapter.get('index'),
3012 })
3013 downloaded_formats.append(new_info)
3014 try:
3015 self.process_info(new_info)
3016 except MaxDownloadsReached:
3017 max_downloads_reached = True
3018 self._raise_pending_errors(new_info)
3019 # Remove copied info
3020 for key, val in tuple(new_info.items()):
3021 if info_dict.get(key) == val:
3022 new_info.pop(key)
3023 if max_downloads_reached:
3024 break
3026 write_archive = {f.get('__write_download_archive', False) for f in downloaded_formats}
3027 assert write_archive.issubset({True, False, 'ignore'})
3028 if True in write_archive and False not in write_archive:
3029 self.record_download_archive(info_dict)
3031 info_dict['requested_downloads'] = downloaded_formats
3032 info_dict = self.run_all_pps('after_video', info_dict)
3033 if max_downloads_reached:
3034 raise MaxDownloadsReached
3036 # We update the info dict with the selected best quality format (backwards compatibility)
3037 info_dict.update(best_format)
3038 return info_dict
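# A hedged sketch of driving 'download_ranges' from the embedding API. The
# download_range_func helper and the 'force_keyframes_at_cuts' param are
# assumed from their use by --download-sections; verify against your version:
#
#   from yt_dlp import YoutubeDL
#   from yt_dlp.utils import download_range_func
#   with YoutubeDL({'download_ranges': download_range_func(None, [(10, 30)]),
#                   'force_keyframes_at_cuts': True}) as ydl:
#       ydl.download(['https://example.com/watch?v=xyz'])  # placeholder URL
#
# Each returned range becomes the section_start/section_end pair set above.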
3040 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
3041 """Select the requested subtitles and their format"""
3042 available_subs, normal_sub_langs = {}, []
3043 if normal_subtitles and self.params.get('writesubtitles'):
3044 available_subs.update(normal_subtitles)
3045 normal_sub_langs = tuple(normal_subtitles.keys())
3046 if automatic_captions and self.params.get('writeautomaticsub'):
3047 for lang, cap_info in automatic_captions.items():
3048 if lang not in available_subs:
3049 available_subs[lang] = cap_info
3051 if not available_subs or (
3052 not self.params.get('writesubtitles')
3053 and not self.params.get('writeautomaticsub')):
3054 return None
3056 all_sub_langs = tuple(available_subs.keys())
3057 if self.params.get('allsubtitles', False):
3058 requested_langs = all_sub_langs
3059 elif self.params.get('subtitleslangs', False):
3060 try:
3061 requested_langs = orderedSet_from_options(
3062 self.params.get('subtitleslangs'), {'all': all_sub_langs}, use_regex=True)
3063 except re.error as e:
3064 raise ValueError(f'Wrong regex for subtitleslangs: {e.pattern}')
3065 else:
3066 requested_langs = LazyList(itertools.chain(
3067 ['en'] if 'en' in normal_sub_langs else [],
3068 filter(lambda f: f.startswith('en'), normal_sub_langs),
3069 ['en'] if 'en' in all_sub_langs else [],
3070 filter(lambda f: f.startswith('en'), all_sub_langs),
3071 normal_sub_langs, all_sub_langs,
3072 ))[:1]
3073 if requested_langs:
3074 self.to_screen(f'[info] {video_id}: Downloading subtitles: {", ".join(requested_langs)}')
3076 formats_query = self.params.get('subtitlesformat', 'best')
3077 formats_preference = formats_query.split('/') if formats_query else []
3078 subs = {}
3079 for lang in requested_langs:
3080 formats = available_subs.get(lang)
3081 if formats is None:
3082 self.report_warning(f'{lang} subtitles not available for {video_id}')
3083 continue
3084 for ext in formats_preference:
3085 if ext == 'best':
3086 f = formats[-1]
3087 break
3088 matches = list(filter(lambda f: f['ext'] == ext, formats))
3089 if matches:
3090 f = matches[-1]
3091 break
3092 else:
3093 f = formats[-1]
3094 self.report_warning(
3095 'No subtitle format found matching "{}" for language {}, '
3096 'using {}. Use --list-subs for a list of available subtitles'.format(formats_query, lang, f['ext']))
3097 subs[lang] = f
3098 return subs
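# Illustrative embedding sketch of the selection logic above: English normal
# subtitles are preferred, then English automatic captions, then any language.
# The URL is a placeholder; 'subtitleslangs' takes the same regex-based specs
# as --sub-langs:
#
#   from yt_dlp import YoutubeDL
#   with YoutubeDL({'writesubtitles': True, 'writeautomaticsub': True,
#                   'subtitleslangs': ['en.*', 'ja']}) as ydl:
#       info = ydl.extract_info('https://example.com/watch?v=xyz', download=False)
#       print(info.get('requested_subtitles'))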
3100 def _forceprint(self, key, info_dict):
3101 if info_dict is None:
3102 return
3103 info_copy = info_dict.copy()
3104 info_copy.setdefault('filename', self.prepare_filename(info_dict))
3105 if info_dict.get('requested_formats') is not None:
3106 # For RTMP URLs, also include the playpath
3107 info_copy['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
3108 elif info_dict.get('url'):
3109 info_copy['urls'] = info_dict['url'] + info_dict.get('play_path', '')
3110 info_copy['formats_table'] = self.render_formats_table(info_dict)
3111 info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict)
3112 info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
3113 info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
3115 def format_tmpl(tmpl):
3116 mobj = re.fullmatch(r'([\w.:,]|-\d|(?P<dict>{([\w.:,]|-\d)+}))+=?', tmpl)
3117 if not mobj:
3118 return tmpl
3120 fmt = '%({})s'
3121 if tmpl.startswith('{'):
3122 tmpl, fmt = f'.{tmpl}', '%({})j'
3123 if tmpl.endswith('='):
3124 tmpl, fmt = tmpl[:-1], '{0} = %({0})#j'
3125 return '\n'.join(map(fmt.format, [tmpl] if mobj.group('dict') else tmpl.split(',')))
3127 for tmpl in self.params['forceprint'].get(key, []):
3128 self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
3130 for tmpl, file_tmpl in self.params['print_to_file'].get(key, []):
3131 filename = self.prepare_filename(info_dict, outtmpl=file_tmpl)
3132 tmpl = format_tmpl(tmpl)
3133 self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
3134 if self._ensure_dir_exists(filename):
3135 with open(filename, 'a', encoding='utf-8', newline='') as f:
3136 f.write(self.evaluate_outtmpl(tmpl, info_copy) + os.linesep)
3138 return info_copy
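# Worked examples of the --print shorthand expanded by format_tmpl() above
# (field names are arbitrary):
#   'title,id'   -> '%(title)s\n%(id)s'    (one output line per field)
#   'title='     -> 'title = %(title)#j'   (trailing "=" prints name and JSON value)
#   '{id,title}' -> '%(.{id,title})j'      (braces print the fields as a JSON dict)
# Templates that do not match the shorthand regex are used verbatim.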
3140 def __forced_printings(self, info_dict, filename=None, incomplete=True):
3141 if (self.params.get('forcejson')
3142 or self.params['forceprint'].get('video')
3143 or self.params['print_to_file'].get('video')):
3144 self.post_extract(info_dict)
3145 if filename:
3146 info_dict['filename'] = filename
3147 info_copy = self._forceprint('video', info_dict)
3149 def print_field(field, actual_field=None, optional=False):
3150 if actual_field is None:
3151 actual_field = field
3152 if self.params.get(f'force{field}') and (
3153 info_copy.get(field) is not None or (not optional and not incomplete)):
3154 self.to_stdout(info_copy[actual_field])
3156 print_field('title')
3157 print_field('id')
3158 print_field('url', 'urls')
3159 print_field('thumbnail', optional=True)
3160 print_field('description', optional=True)
3161 print_field('filename')
3162 if self.params.get('forceduration') and info_copy.get('duration') is not None:
3163 self.to_stdout(formatSeconds(info_copy['duration']))
3164 print_field('format')
3166 if self.params.get('forcejson'):
3167 self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
3169 def dl(self, name, info, subtitle=False, test=False):
3170 if not info.get('url'):
3171 self.raise_no_formats(info, True)
3173 if test:
3174 verbose = self.params.get('verbose')
3175 quiet = self.params.get('quiet') or not verbose
3176 params = {
3177 'test': True,
3178 'quiet': quiet,
3179 'verbose': verbose,
3180 'noprogress': quiet,
3181 'nopart': True,
3182 'skip_unavailable_fragments': False,
3183 'keep_fragments': False,
3184 'overwrites': True,
3185 '_no_ytdl_file': True,
3186 }
3187 else:
3188 params = self.params
3189 fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params)
3190 if not test:
3191 for ph in self._progress_hooks:
3192 fd.add_progress_hook(ph)
3193 urls = '", "'.join(
3194 (f['url'].split(',')[0] + ',<data>' if f['url'].startswith('data:') else f['url'])
3195 for f in info.get('requested_formats', []) or [info])
3196 self.write_debug(f'Invoking {fd.FD_NAME} downloader on "{urls}"')
3198 # Note: Ideally, info should be deep-copied so that hooks cannot modify it.
3199 # But it may contain objects that are not deep-copyable
3200 new_info = self._copy_infodict(info)
3201 if new_info.get('http_headers') is None:
3202 new_info['http_headers'] = self._calc_headers(new_info)
3203 return fd.download(name, new_info, subtitle)
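# When test=True (as used by --check-formats via _check_formats), only a
# small portion of the media is fetched to verify the URL actually works;
# 'nopart' avoids stray .part files and '_no_ytdl_file' skips resume metadata.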
3205 def existing_file(self, filepaths, *, default_overwrite=True):
3206 existing_files = list(filter(os.path.exists, orderedSet(filepaths)))
3207 if existing_files and not self.params.get('overwrites', default_overwrite):
3208 return existing_files[0]
3210 for file in existing_files:
3211 self.report_file_delete(file)
3212 os.remove(file)
3213 return None
3215 @_catch_unsafe_extension_error
3216 def process_info(self, info_dict):
3217 """Process a single resolved IE result. (Modifies it in-place)"""
3219 assert info_dict.get('_type', 'video') == 'video'
3220 original_infodict = info_dict
3222 if 'format' not in info_dict and 'ext' in info_dict:
3223 info_dict['format'] = info_dict['ext']
3225 if self._match_entry(info_dict) is not None:
3226 info_dict['__write_download_archive'] = 'ignore'
3227 return
3229 # Does nothing under normal operation - for backward compatibility of process_info
3230 self.post_extract(info_dict)
3232 def replace_info_dict(new_info):
3233 nonlocal info_dict
3234 if new_info == info_dict:
3235 return
3236 info_dict.clear()
3237 info_dict.update(new_info)
3239 new_info, _ = self.pre_process(info_dict, 'video')
3240 replace_info_dict(new_info)
3241 self._num_downloads += 1
3243 # info_dict['_filename'] needs to be set for backward compatibility
3244 info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
3245 temp_filename = self.prepare_filename(info_dict, 'temp')
3246 files_to_move = {}
3248 # Forced printings
3249 self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
3251 def check_max_downloads():
3252 if self._num_downloads >= float(self.params.get('max_downloads') or 'inf'):
3253 raise MaxDownloadsReached
3255 if self.params.get('simulate'):
3256 info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
3257 check_max_downloads()
3258 return
3260 if full_filename is None:
3261 return
3262 if not self._ensure_dir_exists(encodeFilename(full_filename)):
3263 return
3264 if not self._ensure_dir_exists(encodeFilename(temp_filename)):
3265 return
3267 if self._write_description('video', info_dict,
3268 self.prepare_filename(info_dict, 'description')) is None:
3269 return
3271 sub_files = self._write_subtitles(info_dict, temp_filename)
3272 if sub_files is None:
3273 return
3274 files_to_move.update(dict(sub_files))
3276 thumb_files = self._write_thumbnails(
3277 'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
3278 if thumb_files is None:
3279 return
3280 files_to_move.update(dict(thumb_files))
3282 infofn = self.prepare_filename(info_dict, 'infojson')
3283 _infojson_written = self._write_info_json('video', info_dict, infofn)
3284 if _infojson_written:
3285 info_dict['infojson_filename'] = infofn
3286 # For backward compatibility, even though it was a private field
3287 info_dict['__infojson_filename'] = infofn
3288 elif _infojson_written is None:
3289 return
3291 # Note: Annotations are deprecated
3292 annofn = None
3293 if self.params.get('writeannotations', False):
3294 annofn = self.prepare_filename(info_dict, 'annotation')
3295 if annofn:
3296 if not self._ensure_dir_exists(encodeFilename(annofn)):
3297 return
3298 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)):
3299 self.to_screen('[info] Video annotations are already present')
3300 elif not info_dict.get('annotations'):
3301 self.report_warning('There are no annotations to write.')
3302 else:
3303 try:
3304 self.to_screen('[info] Writing video annotations to: ' + annofn)
3305 with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
3306 annofile.write(info_dict['annotations'])
3307 except (KeyError, TypeError):
3308 self.report_warning('There are no annotations to write.')
3309 except OSError:
3310 self.report_error('Cannot write annotations file: ' + annofn)
3311 return
3313 # Write internet shortcut files
3314 def _write_link_file(link_type):
3315 url = try_get(info_dict['webpage_url'], iri_to_uri)
3316 if not url:
3317 self.report_warning(
3318 f'Cannot write internet shortcut file because the actual URL of "{info_dict["webpage_url"]}" is unknown')
3319 return True
3320 linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext'))
3321 if not self._ensure_dir_exists(encodeFilename(linkfn)):
3322 return False
3323 if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)):
3324 self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present')
3325 return True
3326 try:
3327 self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}')
3328 with open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8',
3329 newline='\r\n' if link_type == 'url' else '\n') as linkfile:
3330 template_vars = {'url': url}
3331 if link_type == 'desktop':
3332 template_vars['filename'] = linkfn[:-(len(link_type) + 1)]
3333 linkfile.write(LINK_TEMPLATES[link_type] % template_vars)
3334 except OSError:
3335 self.report_error(f'Cannot write internet shortcut {linkfn}')
3336 return False
3337 return True
3339 write_links = {
3340 'url': self.params.get('writeurllink'),
3341 'webloc': self.params.get('writewebloclink'),
3342 'desktop': self.params.get('writedesktoplink'),
3343 }
3344 if self.params.get('writelink'):
3345 link_type = ('webloc' if sys.platform == 'darwin'
3346 else 'desktop' if sys.platform.startswith('linux')
3347 else 'url')
3348 write_links[link_type] = True
3350 if any(should_write and not _write_link_file(link_type)
3351 for link_type, should_write in write_links.items()):
3352 return
3354 new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
3355 replace_info_dict(new_info)
3357 if self.params.get('skip_download'):
3358 info_dict['filepath'] = temp_filename
3359 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
3360 info_dict['__files_to_move'] = files_to_move
3361 replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict))
3362 info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
3363 else:
3364 # Download
3365 info_dict.setdefault('__postprocessors', [])
3366 try:
3368 def existing_video_file(*filepaths):
3369 ext = info_dict.get('ext')
3370 converted = lambda file: replace_extension(file, self.params.get('final_ext') or ext, ext)
3371 file = self.existing_file(itertools.chain(*zip(map(converted, filepaths), filepaths)),
3372 default_overwrite=False)
3373 if file:
3374 info_dict['ext'] = os.path.splitext(file)[1][1:]
3375 return file
3377 fd, success = None, True
3378 if info_dict.get('protocol') or info_dict.get('url'):
3379 fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
3380 if fd != FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and (
3381 info_dict.get('section_start') or info_dict.get('section_end')):
3382 msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
3383 else 'You have requested downloading the video partially, but ffmpeg is not installed')
3384 self.report_error(f'{msg}. Aborting')
3385 return
3387 if info_dict.get('requested_formats') is not None:
3388 old_ext = info_dict['ext']
3389 if self.params.get('merge_output_format') is None:
3390 if (info_dict['ext'] == 'webm'
3391 and info_dict.get('thumbnails')
3392 # check with type instead of pp_key, __name__, or isinstance
3393 # since we don't want any custom PPs to trigger this
3394 and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])): # noqa: E721
3395 info_dict['ext'] = 'mkv'
3396 self.report_warning(
3397 'webm doesn\'t support embedding a thumbnail, mkv will be used')
3398 new_ext = info_dict['ext']
3400 def correct_ext(filename, ext=new_ext):
3401 if filename == '-':
3402 return filename
3403 filename_real_ext = os.path.splitext(filename)[1][1:]
3404 filename_wo_ext = (
3405 os.path.splitext(filename)[0]
3406 if filename_real_ext in (old_ext, new_ext)
3407 else filename)
3408 return f'{filename_wo_ext}.{ext}'
3410 # Ensure filename always has a correct extension for successful merge
3411 full_filename = correct_ext(full_filename)
3412 temp_filename = correct_ext(temp_filename)
3413 dl_filename = existing_video_file(full_filename, temp_filename)
3415 info_dict['__real_download'] = False
3416 # NOTE: Copy so that original format dicts are not modified
3417 info_dict['requested_formats'] = list(map(dict, info_dict['requested_formats']))
3419 merger = FFmpegMergerPP(self)
3420 downloaded = []
3421 if dl_filename is not None:
3422 self.report_file_already_downloaded(dl_filename)
3423 elif fd:
3424 for f in info_dict['requested_formats'] if fd != FFmpegFD else []:
3425 f['filepath'] = fname = prepend_extension(
3426 correct_ext(temp_filename, info_dict['ext']),
3427 'f{}'.format(f['format_id']), info_dict['ext'])
3428 downloaded.append(fname)
3429 info_dict['url'] = '\n'.join(f['url'] for f in info_dict['requested_formats'])
3430 success, real_download = self.dl(temp_filename, info_dict)
3431 info_dict['__real_download'] = real_download
3432 else:
3433 if self.params.get('allow_unplayable_formats'):
3434 self.report_warning(
3435 'You have requested merging of multiple formats '
3436 'while also allowing unplayable formats to be downloaded. '
3437 'The formats won\'t be merged to prevent data corruption.')
3438 elif not merger.available:
3439 msg = 'You have requested merging of multiple formats but ffmpeg is not installed'
3440 if not self.params.get('ignoreerrors'):
3441 self.report_error(f'{msg}. Aborting due to --abort-on-error')
3442 return
3443 self.report_warning(f'{msg}. The formats won\'t be merged')
3445 if temp_filename == '-':
3446 reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict, self.params)
3447 else 'but the formats are incompatible for simultaneous download' if merger.available
3448 else 'but ffmpeg is not installed')
3449 self.report_warning(
3450 f'You have requested downloading multiple formats to stdout {reason}. '
3451 'The formats will be streamed one after the other')
3452 fname = temp_filename
3453 for f in info_dict['requested_formats']:
3454 new_info = dict(info_dict)
3455 del new_info['requested_formats']
3456 new_info.update(f)
3457 if temp_filename != '-':
3458 fname = prepend_extension(
3459 correct_ext(temp_filename, new_info['ext']),
3460 'f{}'.format(f['format_id']), new_info['ext'])
3461 if not self._ensure_dir_exists(fname):
3462 return
3463 f['filepath'] = fname
3464 downloaded.append(fname)
3465 partial_success, real_download = self.dl(fname, new_info)
3466 info_dict['__real_download'] = info_dict['__real_download'] or real_download
3467 success = success and partial_success
3469 if downloaded and merger.available and not self.params.get('allow_unplayable_formats'):
3470 info_dict['__postprocessors'].append(merger)
3471 info_dict['__files_to_merge'] = downloaded
3472 # Even if there were no downloads, the merge is only happening now
3473 info_dict['__real_download'] = True
3474 else:
3475 for file in downloaded:
3476 files_to_move[file] = None
3477 else:
3478 # Just a single file
3479 dl_filename = existing_video_file(full_filename, temp_filename)
3480 if dl_filename is None or dl_filename == temp_filename:
3481 # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
3482 # So we should try to resume the download
3483 success, real_download = self.dl(temp_filename, info_dict)
3484 info_dict['__real_download'] = real_download
3485 else:
3486 self.report_file_already_downloaded(dl_filename)
3488 dl_filename = dl_filename or temp_filename
3489 info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
3491 except network_exceptions as err:
3492 self.report_error(f'unable to download video data: {err}')
3493 return
3494 except OSError as err:
3495 raise UnavailableVideoError(err)
3496 except ContentTooShortError as err:
3497 self.report_error(f'content too short (expected {err.expected} bytes and served {err.downloaded})')
3498 return
3500 self._raise_pending_errors(info_dict)
3501 if success and full_filename != '-':
3503 def fixup():
3504 do_fixup = True
3505 fixup_policy = self.params.get('fixup')
3506 vid = info_dict['id']
3508 if fixup_policy in ('ignore', 'never'):
3509 return
3510 elif fixup_policy == 'warn':
3511 do_fixup = 'warn'
3512 elif fixup_policy != 'force':
3513 assert fixup_policy in ('detect_or_warn', None)
3514 if not info_dict.get('__real_download'):
3515 do_fixup = False
3517 def ffmpeg_fixup(cndn, msg, cls):
3518 if not (do_fixup and cndn):
3519 return
3520 elif do_fixup == 'warn':
3521 self.report_warning(f'{vid}: {msg}')
3522 return
3523 pp = cls(self)
3524 if pp.available:
3525 info_dict['__postprocessors'].append(pp)
3526 else:
3527 self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically')
3529 stretched_ratio = info_dict.get('stretched_ratio')
3530 ffmpeg_fixup(stretched_ratio not in (1, None),
3531 f'Non-uniform pixel ratio {stretched_ratio}',
3532 FFmpegFixupStretchedPP)
3534 downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None
3535 downloader = downloader.FD_NAME if downloader else None
3537 ext = info_dict.get('ext')
3538 postprocessed_by_ffmpeg = info_dict.get('requested_formats') or any((
3539 isinstance(pp, FFmpegVideoConvertorPP)
3540 and resolve_recode_mapping(ext, pp.mapping)[0] not in (ext, None)
3541 ) for pp in self._pps['post_process'])
3543 if not postprocessed_by_ffmpeg:
3544 ffmpeg_fixup(fd != FFmpegFD and ext == 'm4a'
3545 and info_dict.get('container') == 'm4a_dash',
3546 'writing DASH m4a. Only some players support this container',
3547 FFmpegFixupM4aPP)
3548 ffmpeg_fixup(downloader == 'hlsnative' and not self.params.get('hls_use_mpegts')
3549 or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None,
3550 'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
3551 FFmpegFixupM3u8PP)
3552 ffmpeg_fixup(downloader == 'dashsegments'
3553 and (info_dict.get('is_live') or info_dict.get('is_dash_periods')),
3554 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP)
3556 ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP)
3557 ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed duration detected', FFmpegFixupDurationPP)
3559 fixup()
3560 try:
3561 replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move))
3562 except PostProcessingError as err:
3563 self.report_error(f'Postprocessing: {err}')
3564 return
3565 try:
3566 for ph in self._post_hooks:
3567 ph(info_dict['filepath'])
3568 except Exception as err:
3569 self.report_error(f'post hooks: {err}')
3570 return
3571 info_dict['__write_download_archive'] = True
3573 assert info_dict is original_infodict # Make sure the info_dict was modified in-place
3574 if self.params.get('force_write_download_archive'):
3575 info_dict['__write_download_archive'] = True
3576 check_max_downloads()
3578 def __download_wrapper(self, func):
3579 @functools.wraps(func)
3580 def wrapper(*args, **kwargs):
3581 try:
3582 res = func(*args, **kwargs)
3583 except UnavailableVideoError as e:
3584 self.report_error(e)
3585 except DownloadCancelled as e:
3586 self.to_screen(f'[info] {e}')
3587 if not self.params.get('break_per_url'):
3588 raise
3589 self._num_downloads = 0
3590 else:
3591 if self.params.get('dump_single_json', False):
3592 self.post_extract(res)
3593 self.to_stdout(json.dumps(self.sanitize_info(res)))
3594 return wrapper
3596 def download(self, url_list):
3597 """Download a given list of URLs."""
3598 url_list = variadic(url_list) # Passing a single URL is a common mistake
3599 outtmpl = self.params['outtmpl']['default']
3600 if (len(url_list) > 1
3601 and outtmpl != '-'
3602 and '%' not in outtmpl
3603 and self.params.get('max_downloads') != 1):
3604 raise SameFileError(outtmpl)
3606 for url in url_list:
3607 self.__download_wrapper(self.extract_info)(
3608 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
3610 return self._download_retcode
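# Minimal embedding sketch, mirroring `yt-dlp URL` on the CLI (placeholder URL):
#
#   from yt_dlp import YoutubeDL
#   with YoutubeDL({'outtmpl': '%(title)s.%(ext)s'}) as ydl:
#       retcode = ydl.download(['https://example.com/watch?v=xyz'])
#
# A bare string also works, since url_list is normalized with variadic().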
3612 def download_with_info_file(self, info_filename):
3613 with contextlib.closing(fileinput.FileInput(
3614 [info_filename], mode='r',
3615 openhook=fileinput.hook_encoded('utf-8'))) as f:
3616 # FileInput doesn't have a read method, so we can't call json.load
3617 infos = [self.sanitize_info(info, self.params.get('clean_infojson', True))
3618 for info in variadic(json.loads('\n'.join(f)))]
3619 for info in infos:
3620 try:
3621 self.__download_wrapper(self.process_ie_result)(info, download=True)
3622 except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e:
3623 if not isinstance(e, EntryNotInPlaylist):
3624 self.to_stderr('\r')
3625 webpage_url = info.get('webpage_url')
3626 if webpage_url is None:
3627 raise
3628 self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}')
3629 self.download([webpage_url])
3630 except ExtractorError as e:
3631 self.report_error(e)
3632 return self._download_retcode
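# Programmatic equivalent of --load-info-json; the file may contain a single
# info dict or a JSON list of them (path below is hypothetical):
#
#   with YoutubeDL() as ydl:
#       ydl.download_with_info_file('video.info.json')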
3634 @staticmethod
3635 def sanitize_info(info_dict, remove_private_keys=False):
3636 """ Sanitize the infodict for converting to json """
3637 if info_dict is None:
3638 return info_dict
3639 info_dict.setdefault('epoch', int(time.time()))
3640 info_dict.setdefault('_type', 'video')
3641 info_dict.setdefault('_version', {
3642 'version': __version__,
3643 'current_git_head': current_git_head(),
3644 'release_git_head': RELEASE_GIT_HEAD,
3645 'repository': ORIGIN,
3646 })
3648 if remove_private_keys:
3649 reject = lambda k, v: v is None or k.startswith('__') or k in {
3650 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
3651 'entries', 'filepath', '_filename', 'filename', 'infojson_filename', 'original_url',
3652 'playlist_autonumber',
3653 }
3654 else:
3655 reject = lambda k, v: False
3657 def filter_fn(obj):
3658 if isinstance(obj, dict):
3659 return {k: filter_fn(v) for k, v in obj.items() if not reject(k, v)}
3660 elif isinstance(obj, (list, tuple, set, LazyList)):
3661 return list(map(filter_fn, obj))
3662 elif obj is None or isinstance(obj, (str, int, float, bool)):
3663 return obj
3664 else:
3665 return repr(obj)
3667 return filter_fn(info_dict)
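# Example: values json cannot represent are replaced by their repr(), so the
# result is always serializable; remove_private_keys=True additionally drops
# '__'-prefixed and other internal fields:
#
#   clean = YoutubeDL.sanitize_info(info, remove_private_keys=True)
#   json.dumps(clean)  # should not raise TypeError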
3669 @staticmethod
3670 def filter_requested_info(info_dict, actually_filter=True):
3671 """ Alias of sanitize_info for backward compatibility """
3672 return YoutubeDL.sanitize_info(info_dict, actually_filter)
3674 def _delete_downloaded_files(self, *files_to_delete, info={}, msg=None):
3675 for filename in set(filter(None, files_to_delete)):
3676 if msg:
3677 self.to_screen(msg % filename)
3678 try:
3679 os.remove(filename)
3680 except OSError:
3681 self.report_warning(f'Unable to delete file {filename}')
3682 if filename in info.get('__files_to_move', []): # NB: Delete even if None
3683 del info['__files_to_move'][filename]
3685 @staticmethod
3686 def post_extract(info_dict):
3687 def actual_post_extract(info_dict):
3688 if info_dict.get('_type') in ('playlist', 'multi_video'):
3689 for video_dict in info_dict.get('entries', {}):
3690 actual_post_extract(video_dict or {})
3691 return
3693 post_extractor = info_dict.pop('__post_extractor', None) or dict
3694 info_dict.update(post_extractor())
3696 actual_post_extract(info_dict or {})
3698 def run_pp(self, pp, infodict):
3699 files_to_delete = []
3700 if '__files_to_move' not in infodict:
3701 infodict['__files_to_move'] = {}
3702 try:
3703 files_to_delete, infodict = pp.run(infodict)
3704 except PostProcessingError as e:
3705 # Must be True and not 'only_download'
3706 if self.params.get('ignoreerrors') is True:
3707 self.report_error(e)
3708 return infodict
3709 raise
3711 if not files_to_delete:
3712 return infodict
3713 if self.params.get('keepvideo', False):
3714 for f in files_to_delete:
3715 infodict['__files_to_move'].setdefault(f, '')
3716 else:
3717 self._delete_downloaded_files(
3718 *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)')
3719 return infodict
3721 def run_all_pps(self, key, info, *, additional_pps=None):
3722 if key != 'video':
3723 self._forceprint(key, info)
3724 for pp in (additional_pps or []) + self._pps[key]:
3725 info = self.run_pp(pp, info)
3726 return info
3728 def pre_process(self, ie_info, key='pre_process', files_to_move=None):
3729 info = dict(ie_info)
3730 info['__files_to_move'] = files_to_move or {}
3731 try:
3732 info = self.run_all_pps(key, info)
3733 except PostProcessingError as err:
3734 msg = f'Preprocessing: {err}'
3735 info.setdefault('__pending_error', msg)
3736 self.report_error(msg, is_error=False)
3737 return info, info.pop('__files_to_move', None)
3739 def post_process(self, filename, info, files_to_move=None):
3740 """Run all the postprocessors on the given file."""
3741 info['filepath'] = filename
3742 info['__files_to_move'] = files_to_move or {}
3743 info = self.run_all_pps('post_process', info, additional_pps=info.get('__postprocessors'))
3744 info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
3745 del info['__files_to_move']
3746 return self.run_all_pps('after_move', info)
3748 def _make_archive_id(self, info_dict):
3749 video_id = info_dict.get('id')
3750 if not video_id:
3751 return
3752 # Future-proof against any change in the case of the extractor key
3753 # and keep backwards compatibility with prior versions
3754 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
3755 if extractor is None:
3756 url = str_or_none(info_dict.get('url'))
3757 if not url:
3758 return
3759 # Try to find matching extractor for the URL and take its ie_key
3760 for ie_key, ie in self._ies.items():
3761 if ie.suitable(url):
3762 extractor = ie_key
3763 break
3764 else:
3765 return
3766 return make_archive_id(extractor, video_id)
3768 def in_download_archive(self, info_dict):
3769 if not self.archive:
3770 return False
3772 vid_ids = [self._make_archive_id(info_dict)]
3773 vid_ids.extend(info_dict.get('_old_archive_ids') or [])
3774 return any(id_ in self.archive for id_ in vid_ids)
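# Archive entries are '<lowercased extractor key> <video id>' lines, e.g.
# 'youtube dQw4w9WgXcQ'; '_old_archive_ids' lets entries written under an
# older extractor name keep matching.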
3776 def record_download_archive(self, info_dict):
3777 fn = self.params.get('download_archive')
3778 if fn is None:
3779 return
3780 vid_id = self._make_archive_id(info_dict)
3781 assert vid_id
3783 self.write_debug(f'Adding to archive: {vid_id}')
3784 if is_path_like(fn):
3785 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
3786 archive_file.write(vid_id + '\n')
3787 self.archive.add(vid_id)
3789 @staticmethod
3790 def format_resolution(format, default='unknown'):
3791 if format.get('vcodec') == 'none' and format.get('acodec') != 'none':
3792 return 'audio only'
3793 if format.get('resolution') is not None:
3794 return format['resolution']
3795 if format.get('width') and format.get('height'):
3796 return '%dx%d' % (format['width'], format['height'])
3797 elif format.get('height'):
3798 return '{}p'.format(format['height'])
3799 elif format.get('width'):
3800 return '%dx?' % format['width']
3801 return default
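# e.g. {'width': 1920, 'height': 1080} -> '1920x1080'; {'height': 720} -> '720p';
# {'width': 640} -> '640x?'; audio-only formats -> 'audio only'; an explicit
# 'resolution' field takes precedence over width/height.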
3803 def _list_format_headers(self, *headers):
3804 if self.params.get('listformats_table', True) is not False:
3805 return [self._format_out(header, self.Styles.HEADERS) for header in headers]
3806 return headers
3808 def _format_note(self, fdict):
3809 res = ''
3810 if fdict.get('ext') in ['f4f', 'f4m']:
3811 res += '(unsupported)'
3812 if fdict.get('language'):
3813 if res:
3814 res += ' '
3815 res += '[{}]'.format(fdict['language'])
3816 if fdict.get('format_note') is not None:
3817 if res:
3818 res += ' '
3819 res += fdict['format_note']
3820 if fdict.get('tbr') is not None:
3821 if res:
3822 res += ', '
3823 res += '%4dk' % fdict['tbr']
3824 if fdict.get('container') is not None:
3825 if res:
3826 res += ', '
3827 res += '{} container'.format(fdict['container'])
3828 if (fdict.get('vcodec') is not None
3829 and fdict.get('vcodec') != 'none'):
3830 if res:
3831 res += ', '
3832 res += fdict['vcodec']
3833 if fdict.get('vbr') is not None:
3834 res += '@'
3835 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
3836 res += 'video@'
3837 if fdict.get('vbr') is not None:
3838 res += '%4dk' % fdict['vbr']
3839 if fdict.get('fps') is not None:
3840 if res:
3841 res += ', '
3842 res += '{}fps'.format(fdict['fps'])
3843 if fdict.get('acodec') is not None:
3844 if res:
3845 res += ', '
3846 if fdict['acodec'] == 'none':
3847 res += 'video only'
3848 else:
3849 res += '%-5s' % fdict['acodec']
3850 elif fdict.get('abr') is not None:
3851 if res:
3852 res += ', '
3853 res += 'audio'
3854 if fdict.get('abr') is not None:
3855 res += '@%3dk' % fdict['abr']
3856 if fdict.get('asr') is not None:
3857 res += ' (%5dHz)' % fdict['asr']
3858 if fdict.get('filesize') is not None:
3859 if res:
3860 res += ', '
3861 res += format_bytes(fdict['filesize'])
3862 elif fdict.get('filesize_approx') is not None:
3863 if res:
3864 res += ', '
3865 res += '~' + format_bytes(fdict['filesize_approx'])
3866 return res
3868 def _get_formats(self, info_dict):
3869 if info_dict.get('formats') is None:
3870 if info_dict.get('url') and info_dict.get('_type', 'video') == 'video':
3871 return [info_dict]
3872 return []
3873 return info_dict['formats']
3875 def render_formats_table(self, info_dict):
3876 formats = self._get_formats(info_dict)
3877 if not formats:
3878 return
3879 if self.params.get('listformats_table', True) is False:
3880 table = [
3881 [
3882 format_field(f, 'format_id'),
3883 format_field(f, 'ext'),
3884 self.format_resolution(f),
3885 self._format_note(f),
3886 ] for f in formats if (f.get('preference') or 0) >= -1000]
3887 return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1)
3889 def simplified_codec(f, field):
3890 assert field in ('acodec', 'vcodec')
3891 codec = f.get(field)
3892 if not codec:
3893 return 'unknown'
3894 elif codec != 'none':
3895 return '.'.join(codec.split('.')[:4])
3897 if field == 'vcodec' and f.get('acodec') == 'none':
3898 return 'images'
3899 elif field == 'acodec' and f.get('vcodec') == 'none':
3900 return ''
3901 return self._format_out('audio only' if field == 'vcodec' else 'video only',
3902 self.Styles.SUPPRESS)
3904 delim = self._format_out('\u2502', self.Styles.DELIM, '|', test_encoding=True)
3905 table = [
3906 [
3907 self._format_out(format_field(f, 'format_id'), self.Styles.ID),
3908 format_field(f, 'ext'),
3909 format_field(f, func=self.format_resolution, ignore=('audio only', 'images')),
3910 format_field(f, 'fps', '\t%d', func=round),
3911 format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
3912 format_field(f, 'audio_channels', '\t%s'),
3913 delim, (
3914 format_field(f, 'filesize', ' \t%s', func=format_bytes)
3915 or format_field(f, 'filesize_approx', '≈\t%s', func=format_bytes)
3916 or format_field(filesize_from_tbr(f.get('tbr'), info_dict.get('duration')), None,
3917 self._format_out('~\t%s', self.Styles.SUPPRESS), func=format_bytes)),
3918 format_field(f, 'tbr', '\t%dk', func=round),
3919 shorten_protocol_name(f.get('protocol', '')),
3920 delim,
3921 simplified_codec(f, 'vcodec'),
3922 format_field(f, 'vbr', '\t%dk', func=round),
3923 simplified_codec(f, 'acodec'),
3924 format_field(f, 'abr', '\t%dk', func=round),
3925 format_field(f, 'asr', '\t%s', func=format_decimal_suffix),
3926 join_nonempty(format_field(f, 'language', '[%s]'), join_nonempty(
3927 self._format_out('UNSUPPORTED', self.Styles.BAD_FORMAT) if f.get('ext') in ('f4f', 'f4m') else None,
3928 (self._format_out('Maybe DRM', self.Styles.WARNING) if f.get('has_drm') == 'maybe'
3929 else self._format_out('DRM', self.Styles.BAD_FORMAT) if f.get('has_drm') else None),
3930 format_field(f, 'format_note'),
3931 format_field(f, 'container', ignore=(None, f.get('ext'))),
3932 delim=', '), delim=' '),
3933 ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
3934 header_line = self._list_format_headers(
3935 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', 'CH', delim, '\tFILESIZE', '\tTBR', 'PROTO',
3936 delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO')
3938 return render_table(
3939 header_line, table, hide_empty=True,
3940 delim=self._format_out('\u2500', self.Styles.DELIM, '-', test_encoding=True))
3942 def render_thumbnails_table(self, info_dict):
3943 thumbnails = list(info_dict.get('thumbnails') or [])
3944 if not thumbnails:
3945 return None
3946 return render_table(
3947 self._list_format_headers('ID', 'Width', 'Height', 'URL'),
3948 [[t.get('id'), t.get('width') or 'unknown', t.get('height') or 'unknown', t['url']] for t in thumbnails])
3950 def render_subtitles_table(self, video_id, subtitles):
3951 def _row(lang, formats):
3952 exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
3953 if len(set(names)) == 1:
3954 names = [] if names[0] == 'unknown' else names[:1]
3955 return [lang, ', '.join(names), ', '.join(exts)]
3957 if not subtitles:
3958 return None
3959 return render_table(
3960 self._list_format_headers('Language', 'Name', 'Formats'),
3961 [_row(lang, formats) for lang, formats in subtitles.items()],
3962 hide_empty=True)
3964 def __list_table(self, video_id, name, func, *args):
3965 table = func(*args)
3966 if not table:
3967 self.to_screen(f'{video_id} has no {name}')
3968 return
3969 self.to_screen(f'[info] Available {name} for {video_id}:')
3970 self.to_stdout(table)
3972 def list_formats(self, info_dict):
3973 self.__list_table(info_dict['id'], 'formats', self.render_formats_table, info_dict)
3975 def list_thumbnails(self, info_dict):
3976 self.__list_table(info_dict['id'], 'thumbnails', self.render_thumbnails_table, info_dict)
3978 def list_subtitles(self, video_id, subtitles, name='subtitles'):
3979 self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles)
3981 def print_debug_header(self):
3982 if not self.params.get('verbose'):
3983 return
3985 from . import _IN_CLI # Must be delayed import
3987 # These imports can be slow. So import them only as needed
3988 from .extractor.extractors import _LAZY_LOADER
3989 from .extractor.extractors import (
3990 _PLUGIN_CLASSES as plugin_ies,
3991 _PLUGIN_OVERRIDES as plugin_ie_overrides,
3992 )
3994 def get_encoding(stream):
3995 ret = str(getattr(stream, 'encoding', f'missing ({type(stream).__name__})'))
3996 additional_info = []
3997 if os.environ.get('TERM', '').lower() == 'dumb':
3998 additional_info.append('dumb')
3999 if not supports_terminal_sequences(stream):
4000 from .utils import WINDOWS_VT_MODE # Must be imported locally
4001 additional_info.append('No VT' if WINDOWS_VT_MODE is False else 'No ANSI')
4002 if additional_info:
4003 ret = f'{ret} ({",".join(additional_info)})'
4004 return ret
4006 encoding_str = 'Encodings: locale {}, fs {}, pref {}, {}'.format(
4007 locale.getpreferredencoding(),
4008 sys.getfilesystemencoding(),
4009 self.get_encoding(),
4010 ', '.join(
4011 f'{key} {get_encoding(stream)}' for key, stream in self._out_files.items_
4012 if stream is not None and key != 'console'),
4013 )
4015 logger = self.params.get('logger')
4016 if logger:
4017 write_debug = lambda msg: logger.debug(f'[debug] {msg}')
4018 write_debug(encoding_str)
4019 else:
4020 write_string(f'[debug] {encoding_str}\n', encoding=None)
4021 write_debug = lambda msg: self._write_string(f'[debug] {msg}\n')
4023 source = detect_variant()
4024 if VARIANT not in (None, 'pip'):
4025 source += '*'
4026 klass = type(self)
4027 write_debug(join_nonempty(
4028 f'{REPOSITORY.rpartition("/")[2]} version',
4029 _make_label(ORIGIN, CHANNEL.partition('@')[2] or __version__, __version__),
4030 f'[{RELEASE_GIT_HEAD[:9]}]' if RELEASE_GIT_HEAD else '',
4031 '' if source == 'unknown' else f'({source})',
4032 '' if _IN_CLI else 'API' if klass == YoutubeDL else f'API:{self.__module__}.{klass.__qualname__}',
4033 delim=' '))
4035 if not _IN_CLI:
4036 write_debug(f'params: {self.params}')
4038 if not _LAZY_LOADER:
4039 if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
4040 write_debug('Lazy loading extractors is forcibly disabled')
4041 else:
4042 write_debug('Lazy loading extractors is disabled')
4043 if self.params['compat_opts']:
4044 write_debug('Compatibility options: {}'.format(', '.join(self.params['compat_opts'])))
4046 if current_git_head():
4047 write_debug(f'Git HEAD: {current_git_head()}')
4048 write_debug(system_identifier())
4050 exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self)
4051 ffmpeg_features = {key for key, val in ffmpeg_features.items() if val}
4052 if ffmpeg_features:
4053 exe_versions['ffmpeg'] += ' ({})'.format(','.join(sorted(ffmpeg_features)))
4055 exe_versions['rtmpdump'] = rtmpdump_version()
4056 exe_versions['phantomjs'] = PhantomJSwrapper._version()
4057 exe_str = ', '.join(
4058 f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v
4059 ) or 'none'
4060 write_debug(f'exe versions: {exe_str}')
4062 from .compat.compat_utils import get_package_info
4063 from .dependencies import available_dependencies
4065 write_debug('Optional libraries: %s' % (', '.join(sorted({
4066 join_nonempty(*get_package_info(m)) for m in available_dependencies.values()
4067 })) or 'none'))
4069 write_debug(f'Proxy map: {self.proxies}')
4070 write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}')
4071 for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items():
4072 display_list = ['{}{}'.format(
4073 klass.__name__, '' if klass.__name__ == name else f' as {name}')
4074 for name, klass in plugins.items()]
4075 if plugin_type == 'Extractor':
4076 display_list.extend(f'{plugins[-1].IE_NAME.partition("+")[2]} ({parent.__name__})'
4077 for parent, plugins in plugin_ie_overrides.items())
4078 if not display_list:
4079 continue
4080 write_debug(f'{plugin_type} Plugins: {", ".join(sorted(display_list))}')
4082 plugin_dirs = plugin_directories()
4083 if plugin_dirs:
4084 write_debug(f'Plugin directories: {plugin_dirs}')
4086 # Not implemented
4087 if False and self.params.get('call_home'):
4088 ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode()
4089 write_debug(f'Public IP address: {ipaddr}')
4090 latest_version = self.urlopen(
4091 'https://yt-dl.org/latest/version').read().decode()
4092 if version_tuple(latest_version) > version_tuple(__version__):
4093 self.report_warning(
4094 f'You are using an outdated version (newest version: {latest_version})! '
4095 'See https://yt-dl.org/update if you need help updating.')
4097 @functools.cached_property
4098 def proxies(self):
4099 """Global proxy configuration"""
4100 opts_proxy = self.params.get('proxy')
4101 if opts_proxy is not None:
4102 if opts_proxy == '':
4103 opts_proxy = '__noproxy__'
4104 proxies = {'all': opts_proxy}
4105 else:
4106 proxies = urllib.request.getproxies()
4107 # compat. Set HTTPS_PROXY to __noproxy__ to revert
4108 if 'http' in proxies and 'https' not in proxies:
4109 proxies['https'] = proxies['http']
4111 return proxies
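# Resolution order: an explicit --proxy (empty string forces direct
# connections) applies to all requests; otherwise the standard *_PROXY
# environment variables are read via urllib, with HTTP_PROXY also reused for
# https:// unless HTTPS_PROXY (or the value __noproxy__) overrides it.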
4113 @functools.cached_property
4114 def cookiejar(self):
4115 """Global cookiejar instance"""
4116 return load_cookies(
4117 self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self)
4119 @property
4120 def _opener(self):
4121 """
4122 Get a urllib OpenerDirector from the Urllib handler (deprecated).
4123 """
4124 self.deprecation_warning('YoutubeDL._opener is deprecated, use YoutubeDL.urlopen()')
4125 handler = self._request_director.handlers['Urllib']
4126 return handler._get_instance(cookiejar=self.cookiejar, proxies=self.proxies)
4128 def _get_available_impersonate_targets(self):
4129 # TODO(future): make available as public API
4130 return [
4131 (target, rh.RH_NAME)
4132 for rh in self._request_director.handlers.values()
4133 if isinstance(rh, ImpersonateRequestHandler)
4134 for target in rh.supported_targets
4135 ]
4137 def _impersonate_target_available(self, target):
4138 # TODO(future): make available as public API
4139 return any(
4140 rh.is_supported_target(target)
4141 for rh in self._request_director.handlers.values()
4142 if isinstance(rh, ImpersonateRequestHandler))
4144 def urlopen(self, req):
4145 """ Start an HTTP download """
4146 if isinstance(req, str):
4147 req = Request(req)
4148 elif isinstance(req, urllib.request.Request):
4149 self.deprecation_warning(
4150 'Passing a urllib.request.Request object to YoutubeDL.urlopen() is deprecated. '
4151 'Use yt_dlp.networking.common.Request instead.')
4152 req = urllib_req_to_req(req)
4153 assert isinstance(req, Request)
4155 # compat: Assume user:pass url params are basic auth
4156 url, basic_auth_header = extract_basic_auth(req.url)
4157 if basic_auth_header:
4158 req.headers['Authorization'] = basic_auth_header
4159 req.url = sanitize_url(url)
4161 clean_proxies(proxies=req.proxies, headers=req.headers)
4162 clean_headers(req.headers)
4164 try:
4165 return self._request_director.send(req)
4166 except NoSupportingHandlers as e:
4167 for ue in e.unsupported_errors:
4168 # FIXME: This depends on the order of errors.
4169 if not (ue.handler and ue.msg):
4170 continue
4171 if ue.handler.RH_KEY == 'Urllib' and 'unsupported url scheme: "file"' in ue.msg.lower():
4172 raise RequestError(
4173 'file:// URLs are disabled by default in yt-dlp for security reasons. '
4174 'Use --enable-file-urls to enable at your own risk.', cause=ue) from ue
4175 if (
4176 'unsupported proxy type: "https"' in ue.msg.lower()
4177 and 'requests' not in self._request_director.handlers
4178 and 'curl_cffi' not in self._request_director.handlers
4179 ):
4180 raise RequestError(
4181 'To use an HTTPS proxy for this request, one of the following dependencies needs to be installed: requests, curl_cffi')
4183 elif (
4184 re.match(r'unsupported url scheme: "wss?"', ue.msg.lower())
4185 and 'websockets' not in self._request_director.handlers
4186 ):
4187 raise RequestError(
4188 'This request requires WebSocket support. '
4189 'Ensure one of the following dependencies are installed: websockets',
4190 cause=ue) from ue
4192 elif re.match(r'unsupported (?:extensions: impersonate|impersonate target)', ue.msg.lower()):
4193 raise RequestError(
4194 f'Impersonate target "{req.extensions["impersonate"]}" is not available.'
4195 f' See --list-impersonate-targets for available targets.'
4196 f' This request requires browser impersonation, however you may be missing dependencies'
4197 f' required to support this target.')
4198 raise
4199 except SSLError as e:
4200 if 'UNSAFE_LEGACY_RENEGOTIATION_DISABLED' in str(e):
4201 raise RequestError('UNSAFE_LEGACY_RENEGOTIATION_DISABLED: Try using --legacy-server-connect', cause=e) from e
4202 elif 'SSLV3_ALERT_HANDSHAKE_FAILURE' in str(e):
4203 raise RequestError(
4204 'SSLV3_ALERT_HANDSHAKE_FAILURE: The server may not support the current cipher list. '
4205 'Try using --legacy-server-connect', cause=e) from e
4206 raise
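# Minimal usage sketch (placeholder URL):
#
#   from yt_dlp.networking import Request
#   resp = ydl.urlopen(Request('https://example.com', headers={'X-Demo': '1'}))
#   body = resp.read()
#
# Plain string URLs are accepted too and are wrapped in a Request automatically.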
4208 def build_request_director(self, handlers, preferences=None):
4209 logger = _YDLLogger(self)
4210 headers = self.params['http_headers'].copy()
4211 proxies = self.proxies.copy()
4212 clean_headers(headers)
4213 clean_proxies(proxies, headers)
4215 director = RequestDirector(logger=logger, verbose=self.params.get('debug_printtraffic'))
4216 for handler in handlers:
4217 director.add_handler(handler(
4218 logger=logger,
4219 headers=headers,
4220 cookiejar=self.cookiejar,
4221 proxies=proxies,
4222 prefer_system_certs='no-certifi' in self.params['compat_opts'],
4223 verify=not self.params.get('nocheckcertificate'),
4224 **traverse_obj(self.params, {
4225 'verbose': 'debug_printtraffic',
4226 'source_address': 'source_address',
4227 'timeout': 'socket_timeout',
4228 'legacy_ssl_support': 'legacyserverconnect',
4229 'enable_file_urls': 'enable_file_urls',
4230 'impersonate': 'impersonate',
4231 'client_cert': {
4232 'client_certificate': 'client_certificate',
4233 'client_certificate_key': 'client_certificate_key',
4234 'client_certificate_password': 'client_certificate_password',
4235 },
4236 }),
4237 ))
4238 director.preferences.update(preferences or [])
4239 if 'prefer-legacy-http-handler' in self.params['compat_opts']:
4240 director.preferences.add(lambda rh, _: 500 if rh.RH_KEY == 'Urllib' else 0)
4241 return director
4243 @functools.cached_property
4244 def _request_director(self):
4245 return self.build_request_director(_REQUEST_HANDLERS.values(), _RH_PREFERENCES)
4247 def encode(self, s):
4248 if isinstance(s, bytes):
4249 return s # Already encoded
4251 try:
4252 return s.encode(self.get_encoding())
4253 except UnicodeEncodeError as err:
4254 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
4255 raise
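# E.g. (editorial sketch): with get_encoding() == 'utf-8',
#   ydl.encode('naïve') == b'na\xc3\xafve'
# bytes input is returned unchanged, so the method is idempotent.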
4257 def get_encoding(self):
4258 encoding = self.params.get('encoding')
4259 if encoding is None:
4260 encoding = preferredencoding()
4261 return encoding
4263 def _write_info_json(self, label, ie_result, infofn, overwrite=None):
4264 """ Write infojson and return True = written, 'exists' = already exists, False = skipped, None = error """
4265 if overwrite is None:
4266 overwrite = self.params.get('overwrites', True)
4267 if not self.params.get('writeinfojson'):
4268 return False
4269 elif not infofn:
4270 self.write_debug(f'Skipping writing {label} infojson')
4271 return False
4272 elif not self._ensure_dir_exists(infofn):
4273 return None
4274 elif not overwrite and os.path.exists(infofn):
4275 self.to_screen(f'[info] {label.title()} metadata is already present')
4276 return 'exists'
4278 self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
4279 try:
4280 write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
4281 return True
4282 except OSError:
4283 self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
4284 return None
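# Editorial sketch of the tri-state contract above (names are illustrative):
#
#   status = ydl._write_info_json('video', info_dict, infofn)
#   if status is None:        # error; already reported via report_error()
#       ...
#   elif status is False:     # skipped: writing disabled or no filename
#       ...
#   else:                     # True or 'exists': the file is on disk
#       ...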
4286 def _write_description(self, label, ie_result, descfn):
4287 """ Write description and return True = written (or already present), False = skipped, None = error """
4288 if not self.params.get('writedescription'):
4289 return False
4290 elif not descfn:
4291 self.write_debug(f'Skipping writing {label} description')
4292 return False
4293 elif not self._ensure_dir_exists(descfn):
4294 return None
4295 elif not self.params.get('overwrites', True) and os.path.exists(descfn):
4296 self.to_screen(f'[info] {label.title()} description is already present')  # falls through to the final `return True`
4297 elif ie_result.get('description') is None:
4298 self.to_screen(f'[info] There\'s no {label} description to write')
4299 return False
4300 else:
4301 try:
4302 self.to_screen(f'[info] Writing {label} description to: {descfn}')
4303 with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
4304 descfile.write(ie_result['description'])
4305 except OSError:
4306 self.report_error(f'Cannot write {label} description file {descfn}')
4307 return None
4308 return True
4310 def _write_subtitles(self, info_dict, filename):
4311 """ Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error """
4312 ret = []
4313 subtitles = info_dict.get('requested_subtitles')
4314 if not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
4315 # Subtitle download errors are already handled by the relevant IE,
4316 # so this silently continues when used with an IE that lacks subtitle support
4317 return ret
4318 elif not subtitles:
4319 self.to_screen('[info] There are no subtitles for the requested languages')
4320 return ret
4321 sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
4322 if not sub_filename_base:
4323 self.to_screen('[info] Skipping writing video subtitles')
4324 return ret
4326 for sub_lang, sub_info in subtitles.items():
4327 sub_format = sub_info['ext']
4328 sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
4329 sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
4330 existing_sub = self.existing_file((sub_filename_final, sub_filename))
4331 if existing_sub:
4332 self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
4333 sub_info['filepath'] = existing_sub
4334 ret.append((existing_sub, sub_filename_final))
4335 continue
4337 self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
4338 if sub_info.get('data') is not None:
4339 try:
4340 # Use newline='' to prevent conversion of newline characters
4341 # See https://github.com/ytdl-org/youtube-dl/issues/10268
4342 with open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
4343 subfile.write(sub_info['data'])
4344 sub_info['filepath'] = sub_filename
4345 ret.append((sub_filename, sub_filename_final))
4346 continue
4347 except OSError:
4348 self.report_error(f'Cannot write video subtitles file {sub_filename}')
4349 return None
4351 try:
4352 sub_copy = sub_info.copy()
4353 sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
4354 self.dl(sub_filename, sub_copy, subtitle=True)
4355 sub_info['filepath'] = sub_filename
4356 ret.append((sub_filename, sub_filename_final))
4357 except (DownloadError, ExtractorError, OSError, ValueError, *network_exceptions) as err:
4358 msg = f'Unable to download video subtitles for {sub_lang!r}: {err}'
4359 if self.params.get('ignoreerrors') is not True: # False or 'only_download'
4360 if not self.params.get('ignoreerrors'):
4361 self.report_error(msg)
4362 raise DownloadError(msg)
4363 self.report_warning(msg)
4364 return ret
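# Editorial note: each sub_info consumed above is expected to carry at least
# 'ext' plus either inline 'data' (written directly, e.g. pre-fetched JSON
# subtitles) or a 'url' with optional 'http_headers' (fetched via self.dl()).
# This shape is inferred from the accesses in this method, not from a schema.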
4366 def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
4367 """ Write thumbnails to file and return list of (thumb_filename, final_thumb_filename); or None if error """
4368 write_all = self.params.get('write_all_thumbnails', False)
4369 thumbnails, ret = [], []
4370 if write_all or self.params.get('writethumbnail', False):
4371 thumbnails = info_dict.get('thumbnails') or []
4372 if not thumbnails:
4373 self.to_screen(f'[info] There are no {label} thumbnails to download')
4374 return ret
4375 multiple = write_all and len(thumbnails) > 1
4377 if thumb_filename_base is None:
4378 thumb_filename_base = filename
4379 if thumbnails and not thumb_filename_base:
4380 self.write_debug(f'Skipping writing {label} thumbnail')
4381 return ret
4383 if thumbnails and not self._ensure_dir_exists(filename):
4384 return None
4386 for idx, t in list(enumerate(thumbnails))[::-1]:
4387 thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
4388 thumb_display_id = f'{label} thumbnail {t["id"]}'
4389 thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
4390 thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
4392 existing_thumb = self.existing_file((thumb_filename_final, thumb_filename))
4393 if existing_thumb:
4394 self.to_screen('[info] {} is already present'.format((
4395 thumb_display_id if multiple else f'{label} thumbnail').capitalize()))
4396 t['filepath'] = existing_thumb
4397 ret.append((existing_thumb, thumb_filename_final))
4398 else:
4399 self.to_screen(f'[info] Downloading {thumb_display_id} ...')
4400 try:
4401 uf = self.urlopen(Request(t['url'], headers=t.get('http_headers', {})))
4402 self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
4403 with open(encodeFilename(thumb_filename), 'wb') as thumbf:
4404 shutil.copyfileobj(uf, thumbf)
4405 ret.append((thumb_filename, thumb_filename_final))
4406 t['filepath'] = thumb_filename
4407 except network_exceptions as err:
4408 if isinstance(err, HTTPError) and err.status == 404:
4409 self.to_screen(f'[info] {thumb_display_id.title()} does not exist')
4410 else:
4411 self.report_warning(f'Unable to download {thumb_display_id}: {err}')
4412 thumbnails.pop(idx)
4413 if ret and not write_all:
4414 break
4415 return ret
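# Editorial sketch: thumbnail entries are expected to look roughly like
#   {'id': '0', 'url': 'https://...', 'http_headers': {...}}  # headers optional
# Assuming the usual worst-to-best ordering of the thumbnails list, walking it
# in reverse tries the best candidate first, and unless write_all is set, the
# loop stops after the first successful download.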