[ie/soundcloud] Various fixes (#11820)
[yt-dlp.git] / yt_dlp / extractor / cpac.py
blob08d79a22f417191ff7a953472a17cfa17435a105
1 from .common import InfoExtractor
2 from ..utils import (
3 int_or_none,
4 str_or_none,
5 try_get,
6 unified_timestamp,
7 update_url_query,
8 urljoin,
class CPACIE(InfoExtractor):
    """Extractor for single episode pages on cpac.ca.

    Handles both the English (``/episode?id=...``) and the French
    (``/l-episode?id=...``) variants of an episode URL; the id is the UUID
    passed in the ``id`` query parameter.
    """

    IE_NAME = 'cpac'
    _VALID_URL = r'https?://(?:www\.)?cpac\.ca/(?P<fr>l-)?episode\?id=(?P<id>[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})'
    _TEST = {
        # 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909',
        'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f',
        'md5': 'e46ad699caafd7aa6024279f2614e8fa',
        'info_dict': {
            'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f',
            'ext': 'mp4',
            'upload_date': '20220215',
            'title': 'News Conference to Celebrate National Kindness Week – February 15, 2022',
            'description': 'md5:466a206abd21f3a6f776cdef290c23fb',
            'timestamp': 1644901200,
        },
        'params': {
            'format': 'bestvideo',
            'hls_prefer_native': True,
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the episode addressed by *url*.

        The episode metadata is fetched from the site's ``contentModel.json``
        API. All fields are read from the ``page.details`` object of the
        response; language-specific fields are selected with the language
        implied by the URL (``fr`` for ``/l-episode``, otherwise ``en``).
        """
        video_id = self._match_id(url)
        url_lang = 'fr' if '/l-episode?' in url else 'en'

        content = self._download_json(
            'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id,
            video_id)

        # Resolve the details object once and defensively. Previously the
        # metadata lookups below only worked when a video URL was present
        # (``title`` was unbound and ``content['details']`` raised otherwise).
        details = try_get(content, lambda x: x['page']['details'], dict) or {}
        video_url = try_get(details, lambda x: x['videoUrl'], str)

        formats = []
        if video_url:
            formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4')
            for fmt in formats:
                # prefer language to match URL
                fmt_lang = fmt.get('language')
                if fmt_lang == url_lang:
                    fmt['language_preference'] = 10
                elif not fmt_lang:
                    fmt['language_preference'] = -1
                else:
                    fmt['language_preference'] = -10
        # NOTE(review): with no video URL the result carries an empty format
        # list; presumably the framework reports that to the user - confirm.

        category = str_or_none(details.get(f'category_{url_lang}_t'))
        # ``is_live`` stays None (unknown) when the API gives no type at all.
        video_type = details.get('type')

        return {
            'id': video_id,
            'formats': formats,
            'title': str_or_none(details.get(f'title_{url_lang}_t')),
            'description': str_or_none(details.get(f'description_{url_lang}_t')),
            'timestamp': unified_timestamp(details.get('liveDateTime')),
            'categories': [category] if category else None,
            'thumbnail': urljoin(url, str_or_none(details.get(f'image_{url_lang}_s'))),
            'is_live': (video_type == 'live') if video_type is not None else None,
        }
class CPACPlaylistIE(InfoExtractor):
    """Extractor for cpac.ca program and search-result listings.

    Matches program pages (``/program``, French ``/emission``) and search
    pages (``/search``, French ``/rechercher``). The playlist id is the raw
    ``name=value`` query parameter (``id=``, ``programId=`` or ``key=``) that
    selects the listing.
    """

    IE_NAME = 'cpac:playlist'
    _VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?P<fr>emission|rechercher))\?(?:[^&]+&)*?(?P<id>(?:id=\d+|programId=\d+|key=[^&]+))'

    _TESTS = [{
        'url': 'https://www.cpac.ca/program?id=6',
        'info_dict': {
            'id': 'id=6',
            'title': 'Headline Politics',
            'description': 'Watch CPAC’s signature long-form coverage of the day’s pressing political events as they unfold.',
        },
        'playlist_count': 10,
    }, {
        'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc',
        'info_dict': {
            'id': 'key=hudson',
            'title': 'hudson',
        },
        'playlist_count': 22,
    }, {
        'url': 'https://www.cpac.ca/search?programId=50',
        'info_dict': {
            'id': 'programId=50',
            'title': '50',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://www.cpac.ca/emission?id=6',
        'only_matching': True,
    }, {
        'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist result listing every episode URL of the page.

        The first page of the listing is downloaded from the site's
        ``contentModel.json`` API (fatal on failure); further pages, as
        announced by ``totalPages``, are downloaded non-fatally. Playlist
        title and description come from the ``page.program`` object of the
        first page, falling back to the value part of the playlist id for
        the title.
        """
        video_id = self._match_id(url)

        # _VALID_URL is case-insensitive, so classify the page by the
        # lower-cased last path segment instead of a case-sensitive
        # substring test on the whole URL.
        page_name = url.partition('?')[0].rpartition('/')[2].lower()
        url_lang = 'fr' if page_name in ('emission', 'rechercher') else 'en'
        if page_name in ('program', 'emission'):
            pl_type, list_type = 'program', 'itemList'
        else:
            pl_type, list_type = 'search', 'searchResult'

        api_url = (
            f'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/{pl_type}/index.xml&crafterSite=cpacca&{video_id}')
        first_page = self._download_json(api_url, video_id)
        total_pages = int_or_none(
            try_get(first_page, lambda x: x['page'][list_type]['totalPages']), default=1)

        entries = []
        content = first_page
        for page in range(1, total_pages + 1):
            if page > 1:
                # A failed continuation yields a falsy value here; try_get
                # below then simply finds no items for that page.
                content = self._download_json(
                    update_url_query(api_url, {'page': page}), video_id,
                    note=f'Downloading continuation - {page}',
                    fatal=False)

            for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []:
                episode_url = urljoin(url, try_get(item, lambda x: x[f'url_{url_lang}_s']))
                if episode_url:
                    entries.append(episode_url)

        # Read the metadata from the first page: it is the only download that
        # is guaranteed to have succeeded, whereas the last continuation may
        # have failed and would silently drop title and description.
        program = try_get(first_page, lambda x: x['page']['program'], dict) or {}

        return self.playlist_result(
            (self.url_result(entry) for entry in entries),
            playlist_id=video_id,
            playlist_title=program.get(f'title_{url_lang}_t') or video_id.split('=')[-1],
            playlist_description=program.get(f'description_{url_lang}_t'),
        )