[ie/youtube] Fix `uploader_id` extraction (#11818)
[yt-dlp.git] / yt_dlp / extractor / yandexmusic.py
blob12cc5ca28eea174c8fd20f7e8a7dbb2f2d063ab6
1 import hashlib
2 import itertools
4 from .common import InfoExtractor
5 from ..utils import (
6 ExtractorError,
7 float_or_none,
8 int_or_none,
9 try_get,
class YandexMusicBaseIE(InfoExtractor):
    """Common base for the Yandex Music extractors.

    Overrides the page/JSON download helpers so that API error payloads and
    CAPTCHA challenges are converted into an expected ``ExtractorError``,
    and provides ``_call_api`` for the ``handlers/<endpoint>.jsx`` endpoints.
    """

    _VALID_URL_BASE = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by|com)'

    @staticmethod
    def _handle_error(response):
        """Raise ``ExtractorError`` if a JSON ``response`` reports a failure.

        Only dict responses are inspected. A truthy ``error`` value is raised
        as the error message; a response whose ``type`` is ``'captcha'`` or
        which contains a ``captcha`` key triggers the CAPTCHA error.
        """
        if isinstance(response, dict):
            error = response.get('error')
            if error:
                raise ExtractorError(error, expected=True)
            if response.get('type') == 'captcha' or 'captcha' in response:
                YandexMusicBaseIE._raise_captcha()

    @staticmethod
    def _raise_captcha():
        """Raise the user-facing error explaining how to get past the CAPTCHA."""
        raise ExtractorError(
            'YandexMusic has considered yt-dlp requests automated and '
            'asks you to solve a CAPTCHA. You can either wait for some '
            'time until unblocked and optionally use --sleep-interval '
            'in future or alternatively you can go to https://music.yandex.ru/ '
            'solve CAPTCHA, then export cookies and pass cookie file to '
            'yt-dlp with --cookies',
            expected=True)

    def _download_webpage_handle(self, *args, **kwargs):
        """Download a page and fail early if the CAPTCHA page was served.

        The parent's return value is passed through unchanged.
        """
        result = super()._download_webpage_handle(*args, **kwargs)
        # The parent helper is documented to return a (content, url_handle)
        # pair, or False after a non-fatal failure. The marker must therefore
        # be searched for in the page text, not in the pair itself: a
        # substring test against the tuple never matches, and against False
        # it raises TypeError.
        if isinstance(result, str):
            content = result
        elif isinstance(result, (tuple, list)) and result:
            content = result[0]
        else:
            content = None
        captcha_marker = 'Нам очень жаль, но&nbsp;запросы, поступившие с&nbsp;вашего IP-адреса, похожи на&nbsp;автоматические.'
        if isinstance(content, str) and captcha_marker in content:
            self._raise_captcha()
        return result

    def _download_json(self, *args, **kwargs):
        """Download JSON and raise if the payload reports an error or CAPTCHA."""
        response = super()._download_json(*args, **kwargs)
        self._handle_error(response)
        return response

    def _call_api(self, ep, tld, url, item_id, note, query):
        """Query ``https://music.yandex.<tld>/handlers/<ep>.jsx``.

        ``url`` is sent as both the ``Referer`` and ``X-Retpath-Y`` headers.
        The request is made with ``fatal=False``, so callers must be prepared
        for a falsy result when the download itself fails.
        """
        return self._download_json(
            f'https://music.yandex.{tld}/handlers/{ep}.jsx',
            item_id, note,
            fatal=False,
            headers={
                'Referer': url,
                'X-Requested-With': 'XMLHttpRequest',
                'X-Retpath-Y': url,
            },
            query=query)
class YandexMusicTrackIE(YandexMusicBaseIE):
    """Extractor for a single Yandex Music track (``/album/<id>/track/<id>``)."""

    IE_NAME = 'yandexmusic:track'
    IE_DESC = 'Яндекс.Музыка - Трек'
    _VALID_URL = rf'{YandexMusicBaseIE._VALID_URL_BASE}/album/(?P<album_id>\d+)/track/(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://music.yandex.ru/album/540508/track/4878838',
        'md5': 'dec8b661f12027ceaba33318787fff76',
        'info_dict': {
            'id': '4878838',
            'ext': 'mp3',
            'title': 'md5:c63e19341fdbe84e43425a30bc777856',
            'filesize': int,
            'duration': 193.04,
            'track': 'md5:210508c6ffdfd67a493a6c378f22c3ff',
            'album': 'md5:cd04fb13c4efeafdfa0a6a6aca36d01a',
            'album_artist': 'md5:5f54c35462c07952df33d97cfb5fc200',
            'artist': 'md5:e6fd86621825f14dc0b25db3acd68160',
            'release_year': 2009,
        },
        # 'skip': 'Travis CI servers blocked by YandexMusic',
    }, {
        # multiple disks
        'url': 'http://music.yandex.ru/album/3840501/track/705105',
        'md5': '82a54e9e787301dd45aba093cf6e58c0',
        'info_dict': {
            'id': '705105',
            'ext': 'mp3',
            'title': 'md5:f86d4a9188279860a83000277024c1a6',
            'filesize': int,
            'duration': 239.27,
            'track': 'md5:40f887f0666ba1aa10b835aca44807d1',
            'album': 'md5:624f5224b14f5c88a8e812fd7fbf1873',
            'album_artist': 'md5:dd35f2af4e8927100cbe6f5e62e1fb12',
            'artist': 'md5:dd35f2af4e8927100cbe6f5e62e1fb12',
            'release_year': 2016,
            'genre': 'pop',
            'disc_number': 2,
            'track_number': 9,
        },
        # 'skip': 'Travis CI servers blocked by YandexMusic',
    }, {
        'url': 'http://music.yandex.com/album/540508/track/4878838',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_artist_name(artist):
        """Return the display name of one artist entry.

        When the entry has a ``decomposed`` list, the names of its dict
        elements and its plain string elements are appended, in order, to the
        artist's own name without any extra separator.
        """
        decomposed = artist.get('decomposed')
        if not isinstance(decomposed, list):
            return artist['name']
        parts = [artist['name']]
        for element in decomposed:
            if isinstance(element, dict) and element.get('name'):
                parts.append(element['name'])
            elif isinstance(element, str):
                parts.append(element)
        return ''.join(parts)

    @classmethod
    def _extract_artist(cls, artist_list):
        """Join the names of all named artists with ``', '``.

        Returns ``None`` when ``artist_list`` is not a non-empty list or when
        none of its entries carries a name.
        """
        if not artist_list or not isinstance(artist_list, list):
            return None
        # Skip entries that are not dicts instead of failing on ``.get``.
        names = [
            cls._extract_artist_name(artist) for artist in artist_list
            if isinstance(artist, dict) and artist.get('name')]
        return ', '.join(names) if names else None

    def _real_extract(self, url):
        """Build the info dict for a single track.

        Fetches the track metadata, resolves the download location in two
        further JSON requests, derives the signed MP3 URL, and fills in the
        album and artist metadata that is available.
        """
        mobj = self._match_valid_url(url)
        tld, album_id, track_id = mobj.group('tld'), mobj.group('album_id'), mobj.group('id')

        track = self._call_api(
            'track', tld, url, track_id, 'Downloading track JSON',
            {'track': f'{track_id}:{album_id}'})['track']
        track_title = track['title']

        download_data = self._download_json(
            f'https://music.yandex.ru/api/v2.1/handlers/track/{track_id}:{album_id}/web-album_track-track-track-main/download/m',
            track_id, 'Downloading track location url JSON', query={'hq': 1}, headers={'X-Retpath-Y': url})

        fd_data = self._download_json(
            download_data['src'], track_id,
            'Downloading track location JSON',
            query={'format': 'json'})
        # The URL key is the MD5 of a fixed salt, the path without its first
        # character, and the ``s`` value from the location response.
        key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + fd_data['path'][1:] + fd_data['s']).encode()).hexdigest()
        # No trailing whitespace: it would become part of the ``track-id`` value.
        f_url = 'http://{}/get-mp3/{}/{}?track-id={}'.format(
            fd_data['host'], key, fd_data['ts'] + fd_data['path'], track['id'])

        thumbnail = None
        # try_get tolerates a missing, empty or malformed ``albums`` list,
        # where direct indexing would raise.
        cover_uri = try_get(track, lambda x: x['albums'][0]['coverUri'], str)
        if cover_uri:
            thumbnail = cover_uri.replace('%%', 'orig')
            if not thumbnail.startswith('http'):
                thumbnail = 'http://' + thumbnail

        track_info = {
            'id': track_id,
            'ext': 'mp3',
            'url': f_url,
            'filesize': int_or_none(track.get('fileSize')),
            'duration': float_or_none(track.get('durationMs'), 1000),
            'thumbnail': thumbnail,
            'track': track_title,
            'acodec': download_data.get('codec'),
            'abr': int_or_none(download_data.get('bitrate')),
        }

        # Album-level metadata comes from the first album entry only.
        albums = track.get('albums')
        if albums and isinstance(albums, list):
            album = albums[0]
            if isinstance(album, dict):
                year = album.get('year')
                disc_number = int_or_none(try_get(
                    album, lambda x: x['trackPosition']['volume']))
                track_number = int_or_none(try_get(
                    album, lambda x: x['trackPosition']['index']))
                track_info.update({
                    'album': album.get('title'),
                    'album_artist': self._extract_artist(album.get('artists')),
                    'release_year': int_or_none(year),
                    'genre': album.get('genre'),
                    'disc_number': disc_number,
                    'track_number': track_number,
                })

        track_artist = self._extract_artist(track.get('artists'))
        if track_artist:
            track_info.update({
                'artist': track_artist,
                'title': f'{track_artist} - {track_title}',
            })
        else:
            track_info['title'] = track_title

        return track_info
class YandexMusicPlaylistBaseIE(YandexMusicBaseIE):
    """Shared helpers for extractors that yield a list of tracks."""

    # Missing tracks are requested in chunks to avoid exceeding the max HTTP
    # header size, see https://github.com/ytdl-org/youtube-dl/issues/27355
    _TRACKS_PER_CHUNK = 250

    def _extract_tracks(self, source, item_id, url, tld):
        """Return the track dicts of ``source``, fetching any that are missing.

        ``source['tracks']`` may hold fewer entries than ``source['trackIds']``
        lists; the missing ones are requested from the ``track-entries``
        endpoint and appended to the same list, which is then returned.
        """
        tracks = source['tracks']
        track_ids = [str(track_id) for track_id in source['trackIds']]

        # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
        # missing tracks should be retrieved manually.
        if len(tracks) < len(track_ids):
            present_track_ids = {
                str(track['id'])
                for track in tracks if track.get('id')}
            missing_track_ids = [
                track_id for track_id in track_ids
                if track_id not in present_track_ids]
            # Stepping through the list with range() issues no request at all
            # when nothing is actually missing (e.g. ``trackIds`` is longer
            # only because of duplicates), so no assertion on the chunk is
            # needed.
            chunk_starts = range(0, len(missing_track_ids), self._TRACKS_PER_CHUNK)
            for chunk_num, start in enumerate(chunk_starts, 1):
                missing_track_ids_req = missing_track_ids[start:start + self._TRACKS_PER_CHUNK]
                missing_tracks = self._call_api(
                    'track-entries', tld, url, item_id,
                    f'Downloading missing tracks JSON chunk {chunk_num}', {
                        'entries': ','.join(missing_track_ids_req),
                        'lang': tld,
                        'external-domain': f'music.yandex.{tld}',
                        'overembed': 'false',
                        'strict': 'true',
                    })
                # _call_api is non-fatal; a failed chunk is simply skipped.
                if missing_tracks:
                    tracks.extend(missing_tracks)

        return tracks

    def _build_playlist(self, tracks):
        """Turn track dicts into ``url_result`` entries for the track extractor.

        A track is skipped when it has no ``id``/``realId`` or when its first
        album entry is missing, is not a dict, or has no ``id``.
        """
        entries = []
        for track in tracks:
            track_id = track.get('id') or track.get('realId')
            if not track_id:
                continue
            albums = track.get('albums')
            if not albums or not isinstance(albums, list):
                continue
            album = albums[0]
            if not isinstance(album, dict):
                continue
            album_id = album.get('id')
            if not album_id:
                continue
            entries.append(self.url_result(
                f'http://music.yandex.ru/album/{album_id}/track/{track_id}',
                ie=YandexMusicTrackIE.ie_key(), video_id=track_id))
        return entries
class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
    """Extractor for a Yandex Music album page (``/album/<id>``)."""

    IE_NAME = 'yandexmusic:album'
    IE_DESC = 'Яндекс.Музыка - Альбом'
    _VALID_URL = rf'{YandexMusicBaseIE._VALID_URL_BASE}/album/(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://music.yandex.ru/album/540508',
        'info_dict': {
            'id': '540508',
            'title': 'md5:7ed1c3567f28d14be9f61179116f5571',
        },
        'playlist_count': 50,
        # 'skip': 'Travis CI servers blocked by YandexMusic',
    }, {
        'url': 'https://music.yandex.ru/album/3840501',
        'info_dict': {
            'id': '3840501',
            'title': 'md5:36733472cdaa7dcb1fd9473f7da8e50f',
        },
        'playlist_count': 33,
        # 'skip': 'Travis CI servers blocked by YandexMusic',
    }, {
        # empty artists
        'url': 'https://music.yandex.ru/album/9091882',
        'info_dict': {
            'id': '9091882',
            'title': 'ТЕД на русском',
        },
        'playlist_count': 187,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline URLs that the track extractor accepts."""
        if YandexMusicTrackIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Return a playlist with one entry per track across all volumes."""
        match = self._match_valid_url(url)
        tld, album_id = match.group('tld', 'id')

        album = self._call_api(
            'album', tld, url, album_id, 'Downloading album JSON',
            {'album': album_id})

        # Flatten the per-volume track lists into a single sequence.
        all_tracks = []
        for volume in album['volumes']:
            all_tracks.extend(volume)
        entries = self._build_playlist(all_tracks)

        # Title format: "<first artist> - <album title> (<year>)", where the
        # artist and year parts are added only if present.
        playlist_title = album['title']
        first_artist = try_get(album, lambda x: x['artists'][0]['name'], str)
        if first_artist:
            playlist_title = f'{first_artist} - {playlist_title}'
        release_year = album.get('year')
        if release_year:
            playlist_title += f' ({release_year})'

        return self.playlist_result(entries, str(album['id']), playlist_title)
class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
    """Extractor for a user playlist (``/users/<user>/playlists/<id>``)."""

    IE_NAME = 'yandexmusic:playlist'
    IE_DESC = 'Яндекс.Музыка - Плейлист'
    _VALID_URL = rf'{YandexMusicBaseIE._VALID_URL_BASE}/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',
        'info_dict': {
            'id': '1245',
            'title': 'md5:841559b3fe2b998eca88d0d2e22a3097',
            'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9',
        },
        'playlist_count': 5,
        # 'skip': 'Travis CI servers blocked by YandexMusic',
    }, {
        'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036',
        'only_matching': True,
    }, {
        # playlist exceeding the limit of 150 tracks (see
        # https://github.com/ytdl-org/youtube-dl/issues/6666)
        'url': 'https://music.yandex.ru/users/mesiaz/playlists/1364',
        'info_dict': {
            'id': '1364',
            'title': 'md5:b3b400f997d3f878a13ae0699653f7db',
        },
        'playlist_mincount': 437,
        # 'skip': 'Travis CI servers blocked by YandexMusic',
    }]

    def _real_extract(self, url):
        """Return the playlist's tracks with its title and description."""
        match = self._match_valid_url(url)
        tld, user, playlist_id = match.group('tld', 'user', 'id')

        api_query = {
            'owner': user,
            'kinds': playlist_id,
            'light': 'true',
            'lang': tld,
            'external-domain': f'music.yandex.{tld}',
            'overembed': 'false',
        }
        response = self._call_api(
            'playlist', tld, url, playlist_id, 'Downloading playlist JSON', api_query)
        playlist = response['playlist']

        # Also fetches tracks that the playlist payload lists only by ID.
        tracks = self._extract_tracks(playlist, playlist_id, url, tld)
        entries = self._build_playlist(tracks)

        return self.playlist_result(
            entries, str(playlist_id),
            playlist.get('title'), playlist.get('description'))
class YandexMusicArtistBaseIE(YandexMusicPlaylistBaseIE):
    """Base for the artist extractors.

    Reads ``self._ARTIST_WHAT`` and ``self._ARTIST_SORT``, which are not
    defined on this class itself.
    """

    def _call_artist(self, tld, url, artist_id):
        """Fetch the ``artist`` endpoint for the configured ``what``/``sort``."""
        what = self._ARTIST_WHAT
        api_query = {
            'artist': artist_id,
            'what': what,
            'sort': self._ARTIST_SORT or '',
            'dir': '',
            'period': '',
            'lang': tld,
            'external-domain': f'music.yandex.{tld}',
            'overembed': 'false',
        }
        return self._call_api(
            'artist', tld, url, artist_id,
            f'Downloading artist {what} JSON', api_query)

    def _real_extract(self, url):
        """Return a playlist of the artist's tracks, titled with the artist name."""
        match = self._match_valid_url(url)
        tld, artist_id = match.group('tld', 'id')

        data = self._call_artist(tld, url, artist_id)
        entries = self._build_playlist(
            self._extract_tracks(data, artist_id, url, tld))
        artist_name = try_get(data, lambda x: x['artist']['name'], str)

        return self.playlist_result(entries, artist_id, artist_name)
class YandexMusicArtistTracksIE(YandexMusicArtistBaseIE):
    """Extractor for an artist's track listing (``/artist/<id>/tracks``)."""

    IE_NAME = 'yandexmusic:artist:tracks'
    IE_DESC = 'Яндекс.Музыка - Артист - Треки'
    _VALID_URL = rf'{YandexMusicBaseIE._VALID_URL_BASE}/artist/(?P<id>\d+)/tracks'

    _TESTS = [{
        'url': 'https://music.yandex.ru/artist/617526/tracks',
        'info_dict': {
            'id': '617526',
            'title': 'md5:131aef29d45fd5a965ca613e708c040b',
        },
        'playlist_count': 507,
        # 'skip': 'Travis CI servers blocked by YandexMusic',
    }]

    _ARTIST_SORT = ''
    _ARTIST_WHAT = 'tracks'

    def _real_extract(self, url):
        """Return the artist's tracks as a playlist titled "<artist> - Треки"."""
        match = self._match_valid_url(url)
        tld, artist_id = match.group('tld', 'id')

        data = self._call_artist(tld, url, artist_id)
        track_list = self._extract_tracks(data, artist_id, url, tld)

        # Fall back to the numeric ID when no artist name is available.
        artist_name = try_get(data, lambda x: x['artist']['name'], str)
        label = artist_name or artist_id
        playlist_title = '{} - {}'.format(label, 'Треки')

        entries = self._build_playlist(track_list)
        return self.playlist_result(entries, artist_id, playlist_title)
class YandexMusicArtistAlbumsIE(YandexMusicArtistBaseIE):
    """Extractor for an artist's album listing (``/artist/<id>/albums``)."""

    IE_NAME = 'yandexmusic:artist:albums'
    IE_DESC = 'Яндекс.Музыка - Артист - Альбомы'
    _VALID_URL = rf'{YandexMusicBaseIE._VALID_URL_BASE}/artist/(?P<id>\d+)/albums'

    _TESTS = [{
        'url': 'https://music.yandex.ru/artist/617526/albums',
        'info_dict': {
            'id': '617526',
            'title': 'md5:55dc58d5c85699b7fb41ee926700236c',
        },
        'playlist_count': 8,
        # 'skip': 'Travis CI servers blocked by YandexMusic',
    }]

    _ARTIST_SORT = 'year'
    _ARTIST_WHAT = 'albums'

    def _real_extract(self, url):
        """Return a playlist whose entries point at the artist's album pages."""
        match = self._match_valid_url(url)
        tld, artist_id = match.group('tld', 'id')
        data = self._call_artist(tld, url, artist_id)

        # Only dict entries are considered; those without an ID are dropped
        # by the filter in the comprehension below.
        album_ids = (
            album.get('id') for album in data['albums']
            if isinstance(album, dict))
        entries = [
            self.url_result(
                f'http://music.yandex.ru/album/{album_id}',
                ie=YandexMusicAlbumIE.ie_key(), video_id=album_id)
            for album_id in album_ids if album_id]

        # Fall back to the numeric ID when no artist name is available.
        artist_name = try_get(data, lambda x: x['artist']['name'], str)
        label = artist_name or artist_id
        playlist_title = '{} - {}'.format(label, 'Альбомы')
        return self.playlist_result(entries, artist_id, playlist_title)