[ie/dplay] Fix extractors (#10471)
[yt-dlp3.git] / yt_dlp / extractor / monstercat.py
blob930c13e278812503531c81d0b7107b712af3b623
1 import re
3 from .common import InfoExtractor
4 from ..utils import (
5 clean_html,
6 extract_attributes,
7 get_element_by_class,
8 get_element_html_by_class,
9 get_element_text_and_html_by_tag,
10 int_or_none,
11 strip_or_none,
12 traverse_obj,
13 try_call,
14 unified_strdate,
# Extractor for Monstercat release pages: a release URL is turned into a
# playlist whose entries are the individual tracks listed on the page.
class MonstercatIE(InfoExtractor):
    _VALID_URL = r'https?://www\.monstercat\.com/release/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.monstercat.com/release/742779548009',
        'playlist_count': 20,
        'info_dict': {
            'title': 'The Secret Language of Trees',
            'id': '742779548009',
            'thumbnail': 'https://www.monstercat.com/release/742779548009/cover',
            'release_date': '20230711',
            'album': 'The Secret Language of Trees',
            'album_artist': 'BT',
        },
    }]

    def _extract_tracks(self, table, album_meta):
        """Yield one info dict per usable row of the tracklist table.

        `table` is the HTML of the tracklist table (an empty string yields
        nothing) and `album_meta` is the release-level metadata that every
        entry starts from. Rows whose play button lacks a track ID or a
        release ID are reported with a warning and skipped.
        """
        # There is no get_elements_by_tag helper, so each <tr> body is pulled
        # out with a (ChatGPT-suggested) regex instead.
        row_bodies = re.findall(r'<tr[^<]*>((?:(?!</tr>)[\w\W])+)', table)

        for row_html in row_bodies:
            # Everything before the first ' <span' of the name cell is the title
            raw_title = try_call(
                lambda: get_element_by_class('d-inline-flex flex-column', row_html).partition(' <span')[0])
            track_title = clean_html(raw_title)

            # The play button carries both IDs as data attributes
            button_html = try_call(
                lambda: get_element_html_by_class('btn-play cursor-pointer mr-small', row_html))
            button_attrs = extract_attributes(button_html or '')
            track_id = button_attrs.get('data-track-id')
            release_id = button_attrs.get('data-release-id')

            track_number = int_or_none(
                try_call(lambda: get_element_by_class('py-xsmall', row_html)))

            if not (track_id and release_id):
                self.report_warning(f'Skipping track {track_number}, ID(s) not found')
                self.write_debug(f'release_id={release_id!r} track_id={track_id!r}')
                continue

            artist = clean_html(
                try_call(lambda: get_element_by_class('d-block fs-xxsmall', row_html)))
            track_fields = {
                'title': track_title,
                'track': track_title,
                'track_number': track_number,
                'artist': artist,
                'url': f'https://www.monstercat.com/api/release/{release_id}/track-stream/{track_id}',
                'id': track_id,
                'ext': 'mp3',
            }
            # Track-level values take precedence over the album-level ones
            yield {**album_meta, **track_fields}

    def _real_extract(self, url):
        """Download the release page and return it as a playlist of its tracks."""
        url_id = self._match_id(url)
        webpage = self._download_webpage(url, url_id)

        def released_parts(page):
            # Split the release-info line around the 'Released ' marker
            info_line = get_element_by_class(
                'font-italic mb-medium d-tablet-none d-phone-block', page)
            return info_line.partition('Released ')

        # Every element lookup goes through `try_call` (or a traverse_obj
        # transform), since HTMLParser has problems with the site's html
        tracklist_table = try_call(
            lambda: get_element_by_class('table table-small', webpage)) or ''
        title = try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
        # Index 2 of the partition is whatever follows 'Released '
        date = traverse_obj(
            webpage, ({released_parts}, 2, {strip_or_none}, {unified_strdate}))
        album_artist = try_call(
            lambda: get_element_by_class('h-normal text-uppercase mb-desktop-medium mb-smallish', webpage))

        album_meta = {
            'title': title,
            'album': title,
            'thumbnail': f'https://www.monstercat.com/release/{url_id}/cover',
            'album_artist': album_artist,
            'release_date': date,
        }

        entries = self._extract_tracks(tracklist_table, album_meta)
        return self.playlist_result(entries, playlist_id=url_id, **album_meta)