[ie/youtube] Fix `uploader_id` extraction (#11818)
[yt-dlp.git] / yt_dlp / extractor / bluesky.py
blob0e58a0932d0114f773575833f507d18742a26809
1 from .common import InfoExtractor
2 from ..utils import (
3 ExtractorError,
4 format_field,
5 int_or_none,
6 mimetype2ext,
7 orderedSet,
8 parse_iso8601,
9 truncate_string,
10 update_url_query,
11 url_basename,
12 url_or_none,
13 variadic,
15 from ..utils.traversal import traverse_obj
class BlueskyIE(InfoExtractor):
    # Accepts bsky.app / main.bsky.dev post pages as well as raw at:// post URIs.
    # Both patterns capture the author identifier as `handle` and the post key as `id`.
    _VALID_URL = [
        r'https?://(?:www\.)?(?:bsky\.app|main\.bsky\.dev)/profile/(?P<handle>[\w.:%-]+)/post/(?P<id>\w+)',
        r'at://(?P<handle>[\w.:%-]+)/app\.bsky\.feed\.post/(?P<id>\w+)',
    ]
    _TESTS = [{
        # Post with a video, title and description
        'url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g',
        'md5': '375539c1930ab05d15585ed772ab54fd',
        'info_dict': {
            'id': '3l4omssdl632g',
            'ext': 'mp4',
            'uploader': 'Blu3Blu3Lilith',
            'uploader_id': 'blu3blue.bsky.social',
            'uploader_url': 'https://bsky.app/profile/blu3blue.bsky.social',
            'channel_id': 'did:plc:pzdr5ylumf7vmvwasrpr5bf2',
            'channel_url': 'https://bsky.app/profile/did:plc:pzdr5ylumf7vmvwasrpr5bf2',
            'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
            'title': 'OMG WE HAVE VIDEOS NOW',
            'description': 'OMG WE HAVE VIDEOS NOW',
            'upload_date': '20240921',
            'timestamp': 1726940605,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'tags': [],
        },
    }, {
        # Video with an alt text and subtitles
        'url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g',
        'md5': 'b9e344fdbce9f2852c668a97efefb105',
        'info_dict': {
            'id': '3l3vgf77uco2g',
            'ext': 'mp4',
            'uploader': 'Bluesky',
            'uploader_id': 'bsky.app',
            'uploader_url': 'https://bsky.app/profile/bsky.app',
            'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur',
            'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur',
            'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
            'title': 'Bluesky now has video! Update your app to versi...',
            'alt_title': 'Bluesky video feature announcement',
            'description': r're:(?s)Bluesky now has video! .{239}',
            'upload_date': '20240911',
            'timestamp': 1726074716,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'tags': [],
            'subtitles': {
                'en': 'mincount:1',
            },
        },
    }, {
        # main.bsky.dev host; no description is expected and the title is the generic one
        'url': 'https://main.bsky.dev/profile/souris.moe/post/3l4qhp7bcs52c',
        'md5': '5f2df8c200b5633eb7fb2c984d29772f',
        'info_dict': {
            'id': '3l4qhp7bcs52c',
            'ext': 'mp4',
            'uploader': 'souris',
            'uploader_id': 'souris.moe',
            'uploader_url': 'https://bsky.app/profile/souris.moe',
            'channel_id': 'did:plc:tj7g244gl5v6ai6cm4f4wlqp',
            'channel_url': 'https://bsky.app/profile/did:plc:tj7g244gl5v6ai6cm4f4wlqp',
            'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
            'title': 'Bluesky video #3l4qhp7bcs52c',
            'upload_date': '20240922',
            'timestamp': 1727003838,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'tags': [],
        },
    }, {
        # Author identified by a did:web DID
        'url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e',
        'md5': '1af9c7fda061cf7593bbffca89e43d1c',
        'info_dict': {
            'id': '3l3w4tnezek2e',
            'ext': 'mp4',
            'uploader': 'clean',
            'uploader_id': 'de1.pds.tentacle.expert',
            'uploader_url': 'https://bsky.app/profile/de1.pds.tentacle.expert',
            'channel_id': 'did:web:de1.tentacle.expert',
            'channel_url': 'https://bsky.app/profile/did:web:de1.tentacle.expert',
            'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
            'title': 'Bluesky video #3l3w4tnezek2e',
            'upload_date': '20240911',
            'timestamp': 1726098823,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'tags': [],
        },
    }, {
        # External link handed over to the Youtube extractor
        'url': 'https://bsky.app/profile/yunayuispink.bsky.social/post/3l7gqcfes742o',
        'info_dict': {
            'id': 'XxK3t_5V3ao',
            'ext': 'mp4',
            'uploader': 'yunayu',
            'uploader_id': '@yunayuispink',
            'uploader_url': 'https://www.youtube.com/@yunayuispink',
            'channel': 'yunayu',
            'channel_id': 'UCPLvXnHa7lTyNoR_dGsU14w',
            'channel_url': 'https://www.youtube.com/channel/UCPLvXnHa7lTyNoR_dGsU14w',
            'thumbnail': 'https://i.ytimg.com/vi_webp/XxK3t_5V3ao/maxresdefault.webp',
            'description': r're:Have a good goodx10000day',
            'title': '5min vs 5hours drawing',
            'availability': 'public',
            'live_status': 'not_live',
            'playable_in_embed': True,
            'upload_date': '20241026',
            'timestamp': 1729967784,
            'duration': 321,
            'age_limit': 0,
            'like_count': int,
            'view_count': int,
            'comment_count': int,
            'channel_follower_count': int,
            'categories': ['Entertainment'],
            'tags': [],
        },
        'add_ie': ['Youtube'],
    }, {
        # External link handed over to the Bandcamp extractor
        'url': 'https://bsky.app/profile/endshark.bsky.social/post/3jzxjkcemae2m',
        'info_dict': {
            'id': '222792849',
            'ext': 'mp3',
            'uploader': 'LASERBAT',
            'uploader_id': 'laserbatx',
            'uploader_url': 'https://laserbatx.bandcamp.com',
            'artists': ['LASERBAT'],
            'album_artists': ['LASERBAT'],
            'album': 'Hari Nezumi [EP]',
            'track': 'Forward to the End',
            'title': 'LASERBAT - Forward to the End',
            'thumbnail': 'https://f4.bcbits.com/img/a2507705510_5.jpg',
            'duration': 228.571,
            'track_id': '222792849',
            'release_date': '20230423',
            'upload_date': '20230423',
            'timestamp': 1682276040.0,
            'release_timestamp': 1682276040.0,
            'track_number': 1,
        },
        'add_ie': ['Bandcamp'],
    }, {
        # The expected id/uploader belong to the bsky.app post, not to the post in the URL
        'url': 'https://bsky.app/profile/dannybhoix.bsky.social/post/3l6oe5mtr2c2j',
        'md5': 'b9e344fdbce9f2852c668a97efefb105',
        'info_dict': {
            'id': '3l3vgf77uco2g',
            'ext': 'mp4',
            'uploader': 'Bluesky',
            'uploader_id': 'bsky.app',
            'uploader_url': 'https://bsky.app/profile/bsky.app',
            'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur',
            'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur',
            'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
            'title': 'Bluesky now has video! Update your app to versi...',
            'alt_title': 'Bluesky video feature announcement',
            'description': r're:(?s)Bluesky now has video! .{239}',
            'upload_date': '20240911',
            'timestamp': 1726074716,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'tags': [],
            'subtitles': {
                'en': 'mincount:1',
            },
        },
    }, {
        # Labelled post: the 'sexual' label is expected to yield age_limit 18
        'url': 'https://bsky.app/profile/alt.bun.how/post/3l7rdfxhyds2f',
        'md5': '8775118b235cf9fa6b5ad30f95cda75c',
        'info_dict': {
            'id': '3l7rdfxhyds2f',
            'ext': 'mp4',
            'uploader': 'cinnamon',
            'uploader_id': 'alt.bun.how',
            'uploader_url': 'https://bsky.app/profile/alt.bun.how',
            'channel_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide',
            'channel_url': 'https://bsky.app/profile/did:plc:7x6rtuenkuvxq3zsvffp2ide',
            'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
            'title': 'crazy that i look like this tbh',
            'description': 'crazy that i look like this tbh',
            'upload_date': '20241030',
            'timestamp': 1730332128,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'tags': ['sexual'],
            'age_limit': 18,
        },
    }, {
        # Raw at:// URI addressed by DID
        'url': 'at://did:plc:ia76kvnndjutgedggx2ibrem/app.bsky.feed.post/3l6zrz6zyl2dr',
        'md5': '71b0eb6d85d03145e6af6642c7fc6d78',
        'info_dict': {
            'id': '3l6zrz6zyl2dr',
            'ext': 'mp4',
            'uploader': 'mary🐇',
            'uploader_id': 'mary.my.id',
            'uploader_url': 'https://bsky.app/profile/mary.my.id',
            'channel_id': 'did:plc:ia76kvnndjutgedggx2ibrem',
            'channel_url': 'https://bsky.app/profile/did:plc:ia76kvnndjutgedggx2ibrem',
            'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
            'title': 'Bluesky video #3l6zrz6zyl2dr',
            'upload_date': '20241021',
            'timestamp': 1729523172,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'tags': [],
        },
    }, {
        # Post yielding two videos (from two different authors), returned as a playlist
        'url': 'https://bsky.app/profile/purpleicetea.bsky.social/post/3l7gv55dc2o2w',
        'info_dict': {
            'id': '3l7gv55dc2o2w',
        },
        'playlist': [{
            'info_dict': {
                'id': '3l7gv55dc2o2w',
                'ext': 'mp4',
                'upload_date': '20241026',
                'description': 'One of my favorite videos',
                'comment_count': int,
                'uploader_url': 'https://bsky.app/profile/purpleicetea.bsky.social',
                'uploader': 'Purple.Ice.Tea',
                'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
                'channel_url': 'https://bsky.app/profile/did:plc:bjh5ffwya5f53dfy47dezuwx',
                'like_count': int,
                'channel_id': 'did:plc:bjh5ffwya5f53dfy47dezuwx',
                'repost_count': int,
                'timestamp': 1729973202,
                'tags': [],
                'uploader_id': 'purpleicetea.bsky.social',
                'title': 'One of my favorite videos',
            },
        }, {
            'info_dict': {
                'id': '3l77u64l7le2e',
                'ext': 'mp4',
                'title': 'hearing people on twitter say that bluesky isn\'...',
                'like_count': int,
                'uploader_id': 'thafnine.net',
                'uploader_url': 'https://bsky.app/profile/thafnine.net',
                'upload_date': '20241024',
                'channel_url': 'https://bsky.app/profile/did:plc:6ttyq36rhiyed7wu3ws7dmqj',
                'description': r're:(?s)hearing people on twitter say that bluesky .{93}',
                'tags': [],
                'alt_title': 'md5:9b1ee1937fb3d1a81e932f9ec14d560e',
                'uploader': 'T9',
                'channel_id': 'did:plc:6ttyq36rhiyed7wu3ws7dmqj',
                'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$',
                'timestamp': 1729731642,
                'comment_count': int,
                'repost_count': int,
            },
        }],
    }]
    # Formatted with a service endpoint; `did` and `cid` are appended as query parameters
    _BLOB_URL_TMPL = '{}/xrpc/com.atproto.sync.getBlob'

    def _get_service_endpoint(self, did, video_id):
        """Return the service endpoint URL listed for `did`, or https://bsky.social.

        The DID document is fetched from the host's /.well-known/did.json for
        `did:web:` identifiers and from plc.directory for everything else.
        The download is non-fatal, so any failure ends in the fallback URL.
        """
        did_doc_url = (
            f'https://{did[8:]}/.well-known/did.json' if did.startswith('did:web:')
            else f'https://plc.directory/{did}')
        did_doc = self._download_json(
            did_doc_url, video_id, 'Fetching service endpoint', 'Falling back to bsky.social', fatal=False)

        # First valid URL among the services typed as a personal data server
        pds_endpoint = traverse_obj(did_doc, (
            'service', lambda _, service: service['type'] == 'AtprotoPersonalDataServer',
            'serviceEndpoint', {url_or_none}, any))
        return pds_endpoint or 'https://bsky.social'

    def _real_extract(self, url):
        """Fetch the post behind `url` and return its video(s).

        Returns the single entry when exactly one is found, a playlist when
        there are several, and raises an expected ExtractorError when none is.
        """
        mobj = self._match_valid_url(url)
        handle = mobj.group('handle')
        video_id = mobj.group('id')

        thread_query = {
            'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
            'depth': 0,
            'parentHeight': 0,
        }
        thread_data = self._download_json(
            'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
            video_id, query=thread_query)
        post = thread_data['thread']['post']

        entries = [
            # app.bsky.embed.video.view/app.bsky.embed.external.view
            *self._extract_videos(post, video_id),
            # app.bsky.embed.recordWithMedia.view
            *self._extract_videos(
                post, video_id, embed_path=('embed', 'media'), record_subpath=('embed', 'media')),
        ]
        # app.bsky.embed.record.view
        nested_post = traverse_obj(post, ('embed', 'record', ('record', None), {dict}, any))
        if nested_post:
            entries += self._extract_videos(
                nested_post, video_id, embed_path=('embeds', 0), record_path='value')

        if not entries:
            raise ExtractorError('No video could be found in this post', expected=True)
        return entries[0] if len(entries) == 1 else self.playlist_result(entries, video_id)

    @staticmethod
    def _build_profile_url(path):
        """Return the bsky.app profile URL for `path` via format_field, with None as its default."""
        return format_field(path, template='https://bsky.app/profile/%s', default=None)

    def _extract_videos(self, root, video_id, embed_path='embed', record_path='record', record_subpath='embed'):
        """Collect the entries found in one post-like dict.

        @param root            dict that the paths below are resolved against
        @param video_id        id used for requests and as the fallback entry id
        @param embed_path      key or path in `root` leading to the embed view
        @param record_path     key or path in `root` leading to the post record
        @param record_subpath  key or path inside the record leading to its embed
        @returns               list with an optional url_result for an external
                               link, followed by an optional info dict for a video
        """
        path_atoms = (str, bytes, dict, set)
        embed_path = variadic(embed_path, path_atoms)
        record_path = variadic(record_path, path_atoms)
        record_subpath = variadic(record_subpath, path_atoms)
        record_embed_path = (*record_path, *record_subpath)

        entries = []
        # An external link may sit in the record's embed or in the embed view
        external_uri = traverse_obj(root, (
            (record_embed_path, embed_path), 'external', 'uri', {url_or_none}, any))
        if external_uri:
            entries.append(self.url_result(external_uri))

        # Without an HLS playlist URL no video entry is produced
        playlist = traverse_obj(root, (*embed_path, 'playlist', {url_or_none}))
        if not playlist:
            return entries
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            playlist, video_id, 'mp4', m3u8_id='hls', fatal=False)

        video_cid = traverse_obj(
            root, (*embed_path, 'cid', {str}),
            (*record_embed_path, 'video', 'ref', '$link', {str}))
        did = traverse_obj(root, ('author', 'did', {str}))

        if did and video_cid:
            # The blob URLs need both the author's DID and a content id
            endpoint = self._get_service_endpoint(did, video_id)
            blob_base_url = self._BLOB_URL_TMPL.format(endpoint)

            blob_url = update_url_query(blob_base_url, {'did': did, 'cid': video_cid})
            dimensions = traverse_obj(root, (*embed_path, 'aspectRatio', {
                'width': ('width', {int_or_none}),
                'height': ('height', {int_or_none}),
            }))
            blob_info = traverse_obj(root, (*record_embed_path, 'video', {
                'filesize': ('size', {int_or_none}),
                'ext': ('mimeType', {mimetype2ext}),
            }))
            formats.append({
                'format_id': 'blob',
                'url': blob_url,
                **dimensions,
                **blob_info,
            })

            # Only captions that carry a blob link are considered
            captions = traverse_obj(root, (
                *record_embed_path, 'captions', lambda _, v: v['file']['ref']['$link']))
            for caption in captions:
                lang = caption.get('lang') or 'und'
                caption_cid = caption['file']['ref']['$link']
                subtitles.setdefault(lang, []).append({
                    'url': update_url_query(blob_base_url, {'did': did, 'cid': caption_cid}),
                    'ext': traverse_obj(caption, ('file', 'mimeType', {mimetype2ext})),
                })

        # An 'id' found here (basename of the post URI) overrides the video_id passed in
        metadata = traverse_obj(root, {
            'id': ('uri', {url_basename}),
            'thumbnail': (*embed_path, 'thumbnail', {url_or_none}),
            'alt_title': (*embed_path, 'alt', {str}, filter),
            'uploader': ('author', 'displayName', {str}),
            'uploader_id': ('author', 'handle', {str}),
            'uploader_url': ('author', 'handle', {self._build_profile_url}),
            'channel_id': ('author', 'did', {str}),
            'channel_url': ('author', 'did', {self._build_profile_url}),
            'like_count': ('likeCount', {int_or_none}),
            'repost_count': ('repostCount', {int_or_none}),
            'comment_count': ('replyCount', {int_or_none}),
            'timestamp': ('indexedAt', {parse_iso8601}),
            'tags': ('labels', ..., 'val', {str}, all, {orderedSet}),
            'age_limit': (
                'labels', ..., 'val',
                {lambda label: 18 if label in ('sexual', 'porn', 'graphic-media') else None}, any),
            'description': (*record_path, 'text', {str}, filter),
            'title': (
                *record_path, 'text',
                {lambda text: text.replace('\n', ' ')}, {truncate_string(left=50)}),
        })
        entries.append({
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            **metadata,
        })
        return entries