[ie/dplay] Fix extractors (#10471)
[yt-dlp3.git] / yt_dlp / extractor / bandcamp.py
blob61cbab5a7af43d328e281ab40340d55efd4d99ae
1 import random
2 import re
3 import time
5 from .common import InfoExtractor
6 from ..utils import (
7 KNOWN_EXTENSIONS,
8 ExtractorError,
9 float_or_none,
10 int_or_none,
11 parse_filesize,
12 str_or_none,
13 try_get,
14 unified_strdate,
15 unified_timestamp,
16 update_url_query,
17 url_or_none,
18 urljoin,
class BandcampIE(InfoExtractor):
    # Single track pages: https://<uploader>.bandcamp.com/track/<slug>
    _VALID_URL = r'https?://(?P<uploader>[^/]+)\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
    _EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"']
    _TESTS = [{
        'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
        'md5': 'c557841d5e50261777a6585648adf439',
        'info_dict': {
            'id': '1812978515',
            'ext': 'mp3',
            'title': 'youtube-dl "\'/\\ä↭ - youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
            'duration': 9.8485,
            'uploader': 'youtube-dl "\'/\\ä↭',
            'upload_date': '20121129',
            'timestamp': 1354224127,
            'track': 'youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
            'album_artist': 'youtube-dl "\'/\\ä↭',
            'track_id': '1812978515',
            'artist': 'youtube-dl "\'/\\ä↭',
            'uploader_url': 'https://youtube-dl.bandcamp.com',
            'uploader_id': 'youtube-dl',
            'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg',
        },
        'skip': 'There is a limit of 200 free downloads / month for the test song',
    }, {
        # free download
        'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
        'info_dict': {
            'id': '2650410135',
            'ext': 'm4a',
            'acodec': r're:[fa]lac',
            'title': 'Ben Prunty - Lanius (Battle)',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Ben Prunty',
            'timestamp': 1396508491,
            'upload_date': '20140403',
            'release_timestamp': 1396483200,
            'release_date': '20140403',
            'duration': 260.877,
            'track': 'Lanius (Battle)',
            'track_number': 1,
            'track_id': '2650410135',
            'artist': 'Ben Prunty',
            'album_artist': 'Ben Prunty',
            'album': 'FTL: Advanced Edition Soundtrack',
            'uploader_url': 'https://benprunty.bandcamp.com',
            'uploader_id': 'benprunty',
        },
    }, {
        # no free download, mp3 128
        'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire',
        'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7',
        'info_dict': {
            'id': '2584466013',
            'ext': 'mp3',
            'title': 'Mastodon - Hail to Fire',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Mastodon',
            'timestamp': 1322005399,
            'upload_date': '20111122',
            'release_timestamp': 1076112000,
            'release_date': '20040207',
            'duration': 120.79,
            'track': 'Hail to Fire',
            'track_number': 5,
            'track_id': '2584466013',
            'artist': 'Mastodon',
            'album_artist': 'Mastodon',
            'album': 'Call of the Mastodon',
            'uploader_url': 'https://relapsealumni.bandcamp.com',
            'uploader_id': 'relapsealumni',
        },
    }, {
        # track from compilation album (artist/album_artist difference)
        'url': 'https://diskotopia.bandcamp.com/track/safehouse',
        'md5': '19c5337bca1428afa54129f86a2f6a69',
        'info_dict': {
            'id': '1978174799',
            'ext': 'mp3',
            'title': 'submerse - submerse - Safehouse',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'submerse',
            'timestamp': 1480779297,
            'upload_date': '20161203',
            'release_timestamp': 1481068800,
            'release_date': '20161207',
            'duration': 154.066,
            'track': 'submerse - Safehouse',
            'track_number': 3,
            'track_id': '1978174799',
            'artist': 'submerse',
            'album_artist': 'Diskotopia',
            'album': 'DSK F/W 2016-2017 Free Compilation',
            'uploader_url': 'https://diskotopia.bandcamp.com',
            'uploader_id': 'diskotopia',
        },
    }]

    def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
        """Parse the JSON object stored in a ``data-<attr>`` HTML attribute.

        @param webpage   HTML text to search
        @param video_id  ID passed through to the JSON parser for messages
        @param attr      attribute suffix; this file uses 'tralbum', 'embed'
                         and 'blob'
        @param fatal     forwarded to both the regex search and the JSON
                         parser, so that a non-fatal call returns None when
                         the attribute is absent rather than raising
        """
        data = self._html_search_regex(
            rf'data-{attr}=(["\'])({{.+?}})\1', webpage,
            attr + ' data', group=2, fatal=fatal)
        if not data:
            # Expected only for non-fatal lookups that found no attribute;
            # there is nothing to hand to the JSON parser
            return None
        return self._parse_json(data, video_id, fatal=fatal)

    def _real_extract(self, url):
        """Build the info dict for a single track page.

        Streaming formats come from the first ``trackinfo`` entry of the
        page's ``data-tralbum`` JSON. When that JSON carries a
        ``freeDownloadPage`` link, the download page is fetched too and the
        formats it offers are appended.
        """
        title, uploader = self._match_valid_url(url).group('id', 'uploader')
        webpage = self._download_webpage(url, title)
        tralbum = self._extract_data_attr(webpage, title)
        thumbnail = self._og_search_thumbnail(webpage)

        track_id = None
        track = None
        track_number = None
        duration = None

        formats = []
        track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
        if track_info:
            file_ = track_info.get('file')
            if isinstance(file_, dict):
                for format_id, format_url in file_.items():
                    if not url_or_none(format_url):
                        continue
                    # Format ids are split at the first '-' into extension
                    # and bitrate. partition() keeps working when there is
                    # no '-', where split('-', 1) would fail to unpack.
                    ext, _, abr_str = format_id.partition('-')
                    formats.append({
                        'format_id': format_id,
                        'url': self._proto_relative_url(format_url, 'http:'),
                        'ext': ext,
                        'vcodec': 'none',
                        'acodec': ext,
                        'abr': int_or_none(abr_str),
                    })
            track = track_info.get('title')
            track_id = str_or_none(
                track_info.get('track_id') or track_info.get('id'))
            track_number = int_or_none(track_info.get('track_num'))
            duration = float_or_none(track_info.get('duration'))

        # The embed data is looked up non-fatally and may come back as None;
        # normalise to a dict so the .get() calls below are always safe
        embed = self._extract_data_attr(webpage, title, 'embed', False) or {}
        current = tralbum.get('current') or {}
        artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
        album_artist = self._html_search_regex(
            r'<h3 class="albumTitle">[\S\s]*?by\s*<span>\s*<a href="[^>]+">\s*([^>]+?)\s*</a>',
            webpage, 'album artist', fatal=False)
        timestamp = unified_timestamp(
            current.get('publish_date') or tralbum.get('album_publish_date'))

        download_link = tralbum.get('freeDownloadPage')
        if download_link:
            track_id = str(tralbum['id'])

            download_webpage = self._download_webpage(
                download_link, track_id, 'Downloading free downloads page')

            blob = self._extract_data_attr(download_webpage, track_id, 'blob')

            info = try_get(
                blob, (lambda x: x['digital_items'][0],
                       lambda x: x['download_items'][0]), dict)
            if info:
                downloads = info.get('downloads')
                if isinstance(downloads, dict):
                    # The download item only fills in metadata that the
                    # track page itself did not provide
                    if not track:
                        track = info.get('title')
                    if not artist:
                        artist = info.get('artist')
                    if not thumbnail:
                        thumbnail = info.get('thumb_url')

                    # Maps download format name -> file extension (no dot)
                    download_formats = {}
                    download_formats_list = blob.get('download_formats')
                    if isinstance(download_formats_list, list):
                        # Iterate the list that was just type-checked
                        for f in download_formats_list:
                            name, ext = f.get('name'), f.get('file_extension')
                            if all(isinstance(x, str) for x in (name, ext)):
                                download_formats[name] = ext.strip('.')

                    for format_id, f in downloads.items():
                        format_url = f.get('url')
                        if not format_url:
                            continue
                        # Stat URL generation algorithm is reverse engineered from
                        # download_*_bundle_*.js
                        stat_url = update_url_query(
                            format_url.replace('/download/', '/statdownload/'), {
                                '.rand': int(time.time() * 1000 * random.random()),
                            })
                        format_id = f.get('encoding_name') or format_id
                        # transform_source trims the response down to the
                        # outermost {...} span before it is parsed as JSON
                        stat = self._download_json(
                            stat_url, track_id, f'Downloading {format_id} JSON',
                            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
                            fatal=False)
                        if not stat:
                            continue
                        retry_url = url_or_none(stat.get('retry_url'))
                        if not retry_url:
                            continue
                        formats.append({
                            'url': self._proto_relative_url(retry_url, 'http:'),
                            'ext': download_formats.get(format_id),
                            'format_id': format_id,
                            'format_note': f.get('description'),
                            'filesize': parse_filesize(f.get('size_mb')),
                            'vcodec': 'none',
                            'acodec': format_id.split('-')[0],
                        })

        title = f'{artist} - {track}' if artist else track

        if not duration:
            duration = float_or_none(self._html_search_meta(
                'duration', webpage, default=None))

        return {
            'id': track_id,
            'title': title,
            'thumbnail': thumbnail,
            'uploader': artist,
            'uploader_id': uploader,
            'uploader_url': f'https://{uploader}.bandcamp.com',
            'timestamp': timestamp,
            'release_timestamp': unified_timestamp(tralbum.get('album_release_date')),
            'duration': duration,
            'track': track,
            'track_number': track_number,
            'track_id': track_id,
            'artist': artist,
            'album': embed.get('album_title'),
            'album_artist': album_artist,
            'formats': formats,
        }
class BandcampAlbumIE(BandcampIE):  # XXX: Do not subclass from concrete IE
    IE_NAME = 'Bandcamp:album'
    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<id>[^/?#&]+)'

    _TESTS = [{
        'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
        'playlist': [
            {
                'md5': '39bc1eded3476e927c724321ddf116cf',
                'info_dict': {
                    'id': '1353101989',
                    'ext': 'mp3',
                    'title': 'Blazo - Intro',
                    'timestamp': 1311756226,
                    'upload_date': '20110727',
                    'uploader': 'Blazo',
                },
            },
            {
                'md5': '1a2c32e2691474643e912cc6cd4bffaa',
                'info_dict': {
                    'id': '38097443',
                    'ext': 'mp3',
                    'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
                    'timestamp': 1311757238,
                    'upload_date': '20110727',
                    'uploader': 'Blazo',
                },
            },
        ],
        'info_dict': {
            'title': 'Jazz Format Mixtape vol.1',
            'id': 'jazz-format-mixtape-vol-1',
            'uploader_id': 'blazo',
        },
        'params': {
            'playlistend': 2,
        },
        'skip': 'Bandcamp imposes download limits.',
    }, {
        'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
        'info_dict': {
            'title': 'Hierophany of the Open Grave',
            'uploader_id': 'nightbringer',
            'id': 'hierophany-of-the-open-grave',
        },
        'playlist_mincount': 9,
    }, {
        # with escaped quote in title
        'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
        'info_dict': {
            'title': '"Entropy" EP',
            'uploader_id': 'jstrecords',
            'id': 'entropy-ep',
            'description': 'md5:0ff22959c943622972596062f2f366a5',
        },
        'playlist_mincount': 3,
    }, {
        # not all tracks have songs
        'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
        'info_dict': {
            'id': 'we-are-the-plague',
            'title': 'WE ARE THE PLAGUE',
            'uploader_id': 'insulters',
            'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
        },
        'playlist_count': 2,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that the weekly-show or single-track extractor accepts."""
        if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Return a playlist with one URL entry per track that has a duration."""
        uploader_id, album_id = self._match_valid_url(url).groups()
        playlist_id = album_id or uploader_id
        webpage = self._download_webpage(url, playlist_id)
        tralbum = self._extract_data_attr(webpage, playlist_id)

        track_info = tralbum.get('trackinfo')
        if not track_info:
            raise ExtractorError('The page doesn\'t contain any tracks')

        entries = []
        for track in track_info:
            # Only tracks with duration info have songs
            if not track.get('duration'):
                continue
            track_url = urljoin(url, track['title_link'])
            entry_id = str_or_none(track.get('track_id') or track.get('id'))
            entries.append(self.url_result(
                track_url, BandcampIE.ie_key(), entry_id, track.get('title')))

        album = tralbum.get('current') or {}

        return {
            '_type': 'playlist',
            'uploader_id': uploader_id,
            'id': playlist_id,
            'title': album.get('title'),
            'description': album.get('about'),
            'entries': entries,
        }
class BandcampWeeklyIE(BandcampIE):  # XXX: Do not subclass from concrete IE
    IE_NAME = 'Bandcamp:weekly'
    _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://bandcamp.com/?show=224',
        'md5': 'b00df799c733cf7e0c567ed187dea0fd',
        'info_dict': {
            'id': '224',
            'ext': 'opus',
            'title': 'BC Weekly April 4th 2017 - Magic Moments',
            'description': 'md5:5d48150916e8e02d030623a48512c874',
            'duration': 5829.77,
            'release_date': '20170404',
            'series': 'Bandcamp Weekly',
            'episode': 'Magic Moments',
            'episode_id': '224',
        },
        'params': {
            'format': 'opus-lo',
        },
    }, {
        'url': 'https://bandcamp.com/?blah/blah@&show=228',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract one weekly show from the page's ``data-blob`` JSON."""
        show_id = self._match_id(url)
        webpage = self._download_webpage(url, show_id)

        blob = self._extract_data_attr(webpage, show_id, 'blob')
        show = blob['bcw_data'][show_id]

        # The extension is the first known extension found inside the
        # format id, or None when no known extension occurs in it
        formats = [{
            'format_id': stream_id,
            'url': stream_url,
            'ext': next(
                (candidate for candidate in KNOWN_EXTENSIONS if candidate in stream_id),
                None),
            'vcodec': 'none',
        } for stream_id, stream_url in show['audio_stream'].items()
            if url_or_none(stream_url)]

        episode = show.get('subtitle')
        title = show.get('audio_title') or 'Bandcamp Weekly'
        if episode:
            title = f'{title} - {episode}'

        return {
            'id': show_id,
            'title': title,
            'description': show.get('desc') or show.get('short_desc'),
            'duration': float_or_none(show.get('audio_duration')),
            'is_live': False,
            'release_date': unified_strdate(show.get('published_date')),
            'series': 'Bandcamp Weekly',
            'episode': episode,
            'episode_id': show_id,
            'formats': formats,
        }
class BandcampUserIE(InfoExtractor):
    IE_NAME = 'Bandcamp:user'
    _VALID_URL = r'https?://(?!www\.)(?P<id>[^.]+)\.bandcamp\.com(?:/music)?/?(?:[#?]|$)'

    _TESTS = [{
        # Type 1 Bandcamp user page.
        'url': 'https://adrianvonziegler.bandcamp.com',
        'info_dict': {
            'id': 'adrianvonziegler',
            'title': 'Discography of adrianvonziegler',
        },
        'playlist_mincount': 23,
    }, {
        # Bandcamp user page with only one album
        'url': 'http://dotscale.bandcamp.com',
        'info_dict': {
            'id': 'dotscale',
            'title': 'Discography of dotscale',
        },
        'playlist_count': 1,
    }, {
        # Type 2 Bandcamp user page.
        'url': 'https://nightcallofficial.bandcamp.com',
        'info_dict': {
            'id': 'nightcallofficial',
            'title': 'Discography of nightcallofficial',
        },
        'playlist_count': 4,
    }, {
        'url': 'https://steviasphere.bandcamp.com/music',
        'playlist_mincount': 47,
        'info_dict': {
            'id': 'steviasphere',
            'title': 'Discography of steviasphere',
        },
    }, {
        'url': 'https://coldworldofficial.bandcamp.com/music',
        'playlist_mincount': 10,
        'info_dict': {
            'id': 'coldworldofficial',
            'title': 'Discography of coldworldofficial',
        },
    }, {
        'url': 'https://nuclearwarnowproductions.bandcamp.com/music',
        'playlist_mincount': 399,
        'info_dict': {
            'id': 'nuclearwarnowproductions',
            'title': 'Discography of nuclearwarnowproductions',
        },
    }]

    def _real_extract(self, url):
        """Collect the release links on a user page into a playlist."""
        uploader = self._match_id(url)
        webpage = self._download_webpage(url, uploader)

        # First choice: list items carrying data-item-id, skipping links
        # whose first path segment is "merch"
        links = re.findall(
            r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage)
        if not links:
            # Fallback: the trackTitle <div> pattern
            links = re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)

        def absolute(path):
            # Resolve each matched href against the page URL
            return urljoin(url, path)

        return self.playlist_from_matches(
            links, uploader, f'Discography of {uploader}', getter=absolute)