[ie/soundcloud] Various fixes (#11820)
[yt-dlp.git] / yt_dlp / extractor / bandcamp.py
blob939c2800e67120dfcd21b59911494d404717cb51
1 import json
2 import random
3 import re
4 import time
6 from .common import InfoExtractor
7 from ..utils import (
8 KNOWN_EXTENSIONS,
9 ExtractorError,
10 extract_attributes,
11 float_or_none,
12 int_or_none,
13 parse_filesize,
14 str_or_none,
15 try_get,
16 unified_strdate,
17 unified_timestamp,
18 update_url_query,
19 url_or_none,
20 urljoin,
22 from ..utils.traversal import find_element, traverse_obj
class BandcampIE(InfoExtractor):
    """Extractor for a single Bandcamp track page (``<uploader>.bandcamp.com/track/<slug>``)."""

    _VALID_URL = r'https?://(?P<uploader>[^/]+)\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
    _EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"']
    _TESTS = [{
        'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
        'md5': 'c557841d5e50261777a6585648adf439',
        'info_dict': {
            'id': '1812978515',
            'ext': 'mp3',
            'title': 'youtube-dl "\'/\\ä↭ - youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
            'duration': 9.8485,
            'uploader': 'youtube-dl "\'/\\ä↭',
            'upload_date': '20121129',
            'timestamp': 1354224127,
            'track': 'youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
            'album_artist': 'youtube-dl "\'/\\ä↭',
            'track_id': '1812978515',
            'artist': 'youtube-dl "\'/\\ä↭',
            'uploader_url': 'https://youtube-dl.bandcamp.com',
            'uploader_id': 'youtube-dl',
            'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg',
            'artists': ['youtube-dl "\'/\\ä↭'],
            'album_artists': ['youtube-dl "\'/\\ä↭'],
        },
        'skip': 'There is a limit of 200 free downloads / month for the test song',
    }, {
        # free download
        'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
        'info_dict': {
            'id': '2650410135',
            'ext': 'm4a',
            'acodec': r're:[fa]lac',
            'title': 'Ben Prunty - Lanius (Battle)',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Ben Prunty',
            'timestamp': 1396508491,
            'upload_date': '20140403',
            'release_timestamp': 1396483200,
            'release_date': '20140403',
            'duration': 260.877,
            'track': 'Lanius (Battle)',
            'track_number': 1,
            'track_id': '2650410135',
            'artist': 'Ben Prunty',
            'album_artist': 'Ben Prunty',
            'album': 'FTL: Advanced Edition Soundtrack',
            'uploader_url': 'https://benprunty.bandcamp.com',
            'uploader_id': 'benprunty',
        },
    }, {
        # no free download, mp3 128
        'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire',
        'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7',
        'info_dict': {
            'id': '2584466013',
            'ext': 'mp3',
            'title': 'Mastodon - Hail to Fire',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Mastodon',
            'timestamp': 1322005399,
            'upload_date': '20111122',
            'release_timestamp': 1076112000,
            'release_date': '20040207',
            'duration': 120.79,
            'track': 'Hail to Fire',
            'track_number': 5,
            'track_id': '2584466013',
            'artist': 'Mastodon',
            'album_artist': 'Mastodon',
            'album': 'Call of the Mastodon',
            'uploader_url': 'https://relapsealumni.bandcamp.com',
            'uploader_id': 'relapsealumni',
        },
    }, {
        # track from compilation album (artist/album_artist difference)
        'url': 'https://diskotopia.bandcamp.com/track/safehouse',
        'md5': '19c5337bca1428afa54129f86a2f6a69',
        'info_dict': {
            'id': '1978174799',
            'ext': 'mp3',
            'title': 'submerse - submerse - Safehouse',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'submerse',
            'timestamp': 1480779297,
            'upload_date': '20161203',
            'release_timestamp': 1481068800,
            'release_date': '20161207',
            'duration': 154.066,
            'track': 'submerse - Safehouse',
            'track_number': 3,
            'track_id': '1978174799',
            'artist': 'submerse',
            'album_artist': 'Diskotopia',
            'album': 'DSK F/W 2016-2017 Free Compilation',
            'uploader_url': 'https://diskotopia.bandcamp.com',
            'uploader_id': 'diskotopia',
        },
    }]

    def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
        """Return the JSON object embedded in the page's ``data-<attr>`` HTML attribute.

        @param webpage   HTML of the page to search
        @param video_id  identifier used in log/error messages
        @param attr      attribute suffix, i.e. ``tralbum`` looks up ``data-tralbum``
        @param fatal     when false, a missing attribute or unparsable JSON
                         yields ``None`` instead of raising
        """
        # `fatal` has to reach the regex search as well: previously only the
        # JSON parsing honoured it, so a missing attribute raised regardless.
        raw_json = self._html_search_regex(
            rf'data-{attr}=(["\'])({{.+?}})\1', webpage,
            attr + ' data', fatal=fatal, group=2)
        if raw_json is None:
            return None
        return self._parse_json(raw_json, video_id, fatal=fatal)

    def _real_extract(self, url):
        """Build the info dict for one track.

        Formats come from two places: the streaming URLs listed in the page's
        ``data-tralbum`` blob and, when the track advertises a free download
        page, the download URLs resolved through that page.
        """
        title, uploader = self._match_valid_url(url).group('id', 'uploader')
        webpage = self._download_webpage(url, title)
        tralbum = self._extract_data_attr(webpage, title)
        thumbnail = self._og_search_thumbnail(webpage)

        track_id = None
        track = None
        track_number = None
        duration = None

        formats = []
        track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
        if track_info:
            file_ = track_info.get('file')
            if isinstance(file_, dict):
                for format_id, format_url in file_.items():
                    if not url_or_none(format_url):
                        continue
                    # Keys look like "<ext>-<bitrate>" (e.g. "mp3-128");
                    # partition() tolerates a key without the bitrate part
                    # where split() + unpacking would raise ValueError.
                    ext, _, abr_str = format_id.partition('-')
                    formats.append({
                        'format_id': format_id,
                        'url': self._proto_relative_url(format_url, 'http:'),
                        'ext': ext,
                        'vcodec': 'none',
                        'acodec': ext,
                        'abr': int_or_none(abr_str),
                    })
            track = track_info.get('title')
            track_id = str_or_none(
                track_info.get('track_id') or track_info.get('id'))
            track_number = int_or_none(track_info.get('track_num'))
            duration = float_or_none(track_info.get('duration'))

        # The embed blob is optional metadata; fall back to an empty dict so
        # the lookups below cannot fail on None.
        embed = self._extract_data_attr(webpage, title, 'embed', False) or {}
        current = tralbum.get('current') or {}
        artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
        album_artist = self._html_search_regex(
            r'<h3 class="albumTitle">[\S\s]*?by\s*<span>\s*<a href="[^>]+">\s*([^>]+?)\s*</a>',
            webpage, 'album artist', fatal=False)
        timestamp = unified_timestamp(
            current.get('publish_date') or tralbum.get('album_publish_date'))

        download_link = tralbum.get('freeDownloadPage')
        if download_link:
            track_id = str(tralbum['id'])

            download_webpage = self._download_webpage(
                download_link, track_id, 'Downloading free downloads page')

            blob = self._extract_data_attr(download_webpage, track_id, 'blob')

            info = try_get(
                blob, (lambda x: x['digital_items'][0],
                       lambda x: x['download_items'][0]), dict)
            if info:
                downloads = info.get('downloads')
                if isinstance(downloads, dict):
                    # The download page only fills in what the track page lacked
                    if not track:
                        track = info.get('title')
                    if not artist:
                        artist = info.get('artist')
                    if not thumbnail:
                        thumbnail = info.get('thumb_url')

                    # Map of format name -> file extension (without the dot)
                    download_formats = {}
                    download_formats_list = blob.get('download_formats')
                    if isinstance(download_formats_list, list):
                        for fmt in download_formats_list:
                            if not isinstance(fmt, dict):
                                continue
                            name, ext = fmt.get('name'), fmt.get('file_extension')
                            if all(isinstance(x, str) for x in (name, ext)):
                                download_formats[name] = ext.strip('.')

                    for format_id, f in downloads.items():
                        if not isinstance(f, dict):
                            continue
                        format_url = f.get('url')
                        if not format_url:
                            continue
                        # Stat URL generation algorithm is reverse engineered from
                        # download_*_bundle_*.js
                        stat_url = update_url_query(
                            format_url.replace('/download/', '/statdownload/'), {
                                '.rand': int(time.time() * 1000 * random.random()),
                            })
                        format_id = f.get('encoding_name') or format_id
                        stat = self._download_json(
                            stat_url, track_id, f'Downloading {format_id} JSON',
                            # Keep only the outermost {...} of the response body
                            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
                            fatal=False)
                        if not stat:
                            continue
                        retry_url = url_or_none(stat.get('retry_url'))
                        if not retry_url:
                            continue
                        formats.append({
                            'url': self._proto_relative_url(retry_url, 'http:'),
                            'ext': download_formats.get(format_id),
                            'format_id': format_id,
                            'format_note': f.get('description'),
                            'filesize': parse_filesize(f.get('size_mb')),
                            'vcodec': 'none',
                            'acodec': format_id.split('-')[0],
                        })

        title = f'{artist} - {track}' if artist else track

        if not duration:
            duration = float_or_none(self._html_search_meta(
                'duration', webpage, default=None))

        return {
            'id': track_id,
            'title': title,
            'thumbnail': thumbnail,
            'uploader': artist,
            'uploader_id': uploader,
            'uploader_url': f'https://{uploader}.bandcamp.com',
            'timestamp': timestamp,
            'release_timestamp': unified_timestamp(tralbum.get('album_release_date')),
            'duration': duration,
            'track': track,
            'track_number': track_number,
            'track_id': track_id,
            'artist': artist,
            'album': embed.get('album_title'),
            'album_artist': album_artist,
            'formats': formats,
        }
class BandcampAlbumIE(BandcampIE):  # XXX: Do not subclass from concrete IE
    """Extractor for a Bandcamp album page; yields one playlist entry per playable track."""

    IE_NAME = 'Bandcamp:album'
    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<id>[^/?#&]+)'

    _TESTS = [{
        'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
        'playlist': [
            {
                'md5': '39bc1eded3476e927c724321ddf116cf',
                'info_dict': {
                    'id': '1353101989',
                    'ext': 'mp3',
                    'title': 'Blazo - Intro',
                    'timestamp': 1311756226,
                    'upload_date': '20110727',
                    'uploader': 'Blazo',
                    'thumbnail': 'https://f4.bcbits.com/img/a1721150828_5.jpg',
                    'album_artists': ['Blazo'],
                    'uploader_url': 'https://blazo.bandcamp.com',
                    'release_date': '20110727',
                    'release_timestamp': 1311724800.0,
                    'track': 'Intro',
                    'uploader_id': 'blazo',
                    'track_number': 1,
                    'album': 'Jazz Format Mixtape vol.1',
                    'artists': ['Blazo'],
                    'duration': 19.335,
                    'track_id': '1353101989',
                },
            },
            {
                'md5': '1a2c32e2691474643e912cc6cd4bffaa',
                'info_dict': {
                    'id': '38097443',
                    'ext': 'mp3',
                    'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
                    'timestamp': 1311757238,
                    'upload_date': '20110727',
                    'uploader': 'Blazo',
                    'track': 'Kero One - Keep It Alive (Blazo remix)',
                    'release_date': '20110727',
                    'track_id': '38097443',
                    'track_number': 2,
                    'duration': 181.467,
                    'uploader_url': 'https://blazo.bandcamp.com',
                    'album': 'Jazz Format Mixtape vol.1',
                    'uploader_id': 'blazo',
                    'album_artists': ['Blazo'],
                    'artists': ['Blazo'],
                    'thumbnail': 'https://f4.bcbits.com/img/a1721150828_5.jpg',
                    'release_timestamp': 1311724800.0,
                },
            },
        ],
        'info_dict': {
            'title': 'Jazz Format Mixtape vol.1',
            'id': 'jazz-format-mixtape-vol-1',
            'uploader_id': 'blazo',
            'description': 'md5:38052a93217f3ffdc033cd5dbbce2989',
        },
        'params': {
            'playlistend': 2,
        },
        'skip': 'Bandcamp imposes download limits.',
    }, {
        'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
        'info_dict': {
            'title': 'Hierophany of the Open Grave',
            'uploader_id': 'nightbringer',
            'id': 'hierophany-of-the-open-grave',
        },
        'playlist_mincount': 9,
    }, {
        # with escaped quote in title
        'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
        'info_dict': {
            'title': '"Entropy" EP',
            'uploader_id': 'jstrecords',
            'id': 'entropy-ep',
            'description': 'md5:0ff22959c943622972596062f2f366a5',
        },
        'playlist_mincount': 3,
    }, {
        # not all tracks have songs
        'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
        'info_dict': {
            'id': 'we-are-the-plague',
            'title': 'WE ARE THE PLAGUE',
            'uploader_id': 'insulters',
            'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
        },
        'playlist_count': 2,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline URLs already claimed by the weekly-show or single-track extractors."""
        if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Return a playlist whose entries point at the album's individual track pages.

        Raises ExtractorError when the page's ``data-tralbum`` blob lists no tracks.
        """
        uploader_id, album_id = self._match_valid_url(url).groups()
        playlist_id = album_id or uploader_id
        page = self._download_webpage(url, playlist_id)
        tralbum = self._extract_data_attr(page, playlist_id)

        tracks = tralbum.get('trackinfo')
        if not tracks:
            raise ExtractorError('The page doesn\'t contain any tracks')

        entries = []
        for item in tracks:
            # Only tracks with duration info have songs
            if not item.get('duration'):
                continue
            entries.append(self.url_result(
                urljoin(url, item['title_link']), BandcampIE.ie_key(),
                str_or_none(item.get('track_id') or item.get('id')), item.get('title')))

        album_meta = tralbum.get('current') or {}

        return {
            '_type': 'playlist',
            'uploader_id': uploader_id,
            'id': playlist_id,
            'title': album_meta.get('title'),
            'description': album_meta.get('about'),
            'entries': entries,
        }
class BandcampWeeklyIE(BandcampIE):  # XXX: Do not subclass from concrete IE
    """Extractor for Bandcamp Weekly shows addressed as ``bandcamp.com/?show=<id>``."""

    IE_NAME = 'Bandcamp:weekly'
    _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://bandcamp.com/?show=224',
        'md5': '61acc9a002bed93986b91168aa3ab433',
        'info_dict': {
            'id': '224',
            'ext': 'mp3',
            'title': 'BC Weekly April 4th 2017 - Magic Moments',
            'description': 'md5:5d48150916e8e02d030623a48512c874',
            'duration': 5829.77,
            'release_date': '20170404',
            'series': 'Bandcamp Weekly',
            'episode': 'Magic Moments',
            'episode_id': '224',
        },
        'params': {
            'format': 'mp3-128',
        },
    }, {
        'url': 'https://bandcamp.com/?blah/blah@&show=228',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for one show from the ``bcw_data`` entry of the page's ``data-blob``."""
        show_id = self._match_id(url)
        page = self._download_webpage(url, show_id)
        page_blob = self._extract_data_attr(page, show_id, 'blob')
        show = page_blob['bcw_data'][show_id]

        formats = []
        for format_id, stream_url in show['audio_stream'].items():
            if not url_or_none(stream_url):
                continue
            # The first known extension that occurs in the format id, if any
            ext = next(
                (candidate for candidate in KNOWN_EXTENSIONS if candidate in format_id),
                None)
            formats.append({
                'format_id': format_id,
                'url': stream_url,
                'ext': ext,
                'vcodec': 'none',
            })

        episode = show.get('subtitle')
        title = show.get('audio_title') or 'Bandcamp Weekly'
        if episode:
            title = title + f' - {episode}'

        return {
            'id': show_id,
            'title': title,
            'description': show.get('desc') or show.get('short_desc'),
            'duration': float_or_none(show.get('audio_duration')),
            'is_live': False,
            'release_date': unified_strdate(show.get('published_date')),
            'series': 'Bandcamp Weekly',
            'episode': episode,
            'episode_id': show_id,
            'formats': formats,
        }
class BandcampUserIE(InfoExtractor):
    """Extractor for a Bandcamp user/label landing page, returned as a discography playlist."""

    IE_NAME = 'Bandcamp:user'
    _VALID_URL = r'https?://(?!www\.)(?P<id>[^.]+)\.bandcamp\.com(?:/music)?/?(?:[#?]|$)'

    _TESTS = [{
        # Type 1 Bandcamp user page.
        'url': 'https://adrianvonziegler.bandcamp.com',
        'info_dict': {
            'id': 'adrianvonziegler',
            'title': 'Discography of adrianvonziegler',
        },
        'playlist_mincount': 23,
    }, {
        # Bandcamp user page with only one album
        'url': 'http://dotscale.bandcamp.com',
        'info_dict': {
            'id': 'dotscale',
            'title': 'Discography of dotscale',
        },
        'playlist_count': 1,
    }, {
        # Type 2 Bandcamp user page.
        'url': 'https://nightcallofficial.bandcamp.com',
        'info_dict': {
            'id': 'nightcallofficial',
            'title': 'Discography of nightcallofficial',
        },
        'playlist_count': 4,
    }, {
        'url': 'https://steviasphere.bandcamp.com/music',
        'playlist_mincount': 47,
        'info_dict': {
            'id': 'steviasphere',
            'title': 'Discography of steviasphere',
        },
    }, {
        'url': 'https://coldworldofficial.bandcamp.com/music',
        'playlist_mincount': 7,
        'info_dict': {
            'id': 'coldworldofficial',
            'title': 'Discography of coldworldofficial',
        },
    }, {
        'url': 'https://nuclearwarnowproductions.bandcamp.com/music',
        'playlist_mincount': 399,
        'info_dict': {
            'id': 'nuclearwarnowproductions',
            'title': 'Discography of nuclearwarnowproductions',
        },
    }]

    def _yield_items(self, webpage):
        """Yield the (possibly relative) URLs of the releases listed on the page.

        Links are first taken from the ``<li data-item-id=...>`` entries, with
        merch links excluded by the pattern; only when none are found is the
        ``trackTitle`` markup searched instead. After that, the ``page_url``
        values from the ``data-client-items`` JSON attribute of the
        ``music-grid`` element are yielded as well.
        """
        links = re.findall(
            r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage)
        if not links:
            links = re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)
        yield from links

        yield from traverse_obj(webpage, (
            {find_element(id='music-grid', html=True)}, {extract_attributes},
            'data-client-items', {json.loads}, ..., 'page_url', {str}))

    def _real_extract(self, url):
        """Return a playlist titled "Discography of <user>" built from the page's release links."""
        user_id = self._match_id(url)
        page = self._download_webpage(url, user_id)

        return self.playlist_from_matches(
            self._yield_items(page), user_id, f'Discography of {user_id}',
            getter=urljoin(url))