3 from .common
import InfoExtractor
, SearchInfoExtractor
16 class PRXBaseIE(InfoExtractor
):
# URL prefix shared by the PRX extractors: optional "beta." or "listen."
# subdomain followed by the prx.org host.  The trailing %s is filled in by
# each extractor with its own path pattern (e.g. r'stories/(?P<id>\d+)').
# The dot in the hostname is escaped so that only a literal "prx.org"
# matches, not any character in that position (e.g. "prxXorg").
PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'
def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
    """Download a JSON document from the PRX CMS API.

    @param item_id: identifier handed to ``_download_json`` for this request
    @param path:    path joined onto ``https://cms.prx.org/api/v1/``
    @param query:   optional query parameters, forwarded unchanged
    @param fatal:   forwarded unchanged to ``_download_json``
    @param note:    forwarded unchanged to ``_download_json``
    @returns        whatever ``_download_json`` returns
    """
    api_url = urljoin('https://cms.prx.org/api/v1/', path)
    request_options = {'query': query, 'fatal': fatal, 'note': note}
    return self._download_json(api_url, item_id, **request_options)
def _get_prx_embed_response(response, section):
    """Look up the ``prx:<section>`` entry under ``_embedded`` in *response*.

    The lookup is delegated to ``traverse_obj``, so the result for a missing
    key is whatever ``traverse_obj`` yields for an unmatched path.
    NOTE(review): takes no self/cls although callers invoke it through
    cls/self -- presumably decorated on a line outside this view; confirm.
    """
    embedded_key = f'prx:{section}'
    lookup_path = ('_embedded', embedded_key)
    return traverse_obj(response, lookup_path)
def _extract_file_link(response):
    """Return the enclosure link of *response*, validated by ``url_or_none``.

    Reads ``response['_links']['enclosure']['href']`` through ``traverse_obj``
    (accepting only ``str`` values) and passes the result to ``url_or_none``.
    NOTE(review): takes no self/cls although callers invoke it through
    cls/self -- presumably decorated on a line outside this view; confirm.
    """
    enclosure_href = traverse_obj(
        response, ('_links', 'enclosure', 'href'), expected_type=str)
    return url_or_none(enclosure_href)
33 def _extract_image(cls
, image_response
):
34 if not isinstance(image_response
, dict):
37 'id': str_or_none(image_response
.get('id')),
38 'filesize': image_response
.get('size'),
39 'width': image_response
.get('width'),
40 'height': image_response
.get('height'),
41 'url': cls
._extract
_file
_link
(image_response
),
45 def _extract_base_info(cls
, response
):
46 if not isinstance(response
, dict):
48 item_id
= str_or_none(response
.get('id'))
51 thumbnail_dict
= cls
._extract
_image
(cls
._get
_prx
_embed
_response
(response
, 'image'))
53 clean_html(response
.get('description'))
54 or response
.get('shortDescription'))
57 'title': response
.get('title') or item_id
,
58 'thumbnails': [thumbnail_dict
] if thumbnail_dict
else None,
59 'description': description
,
60 'release_timestamp': unified_timestamp(response
.get('releasedAt')),
61 'timestamp': unified_timestamp(response
.get('createdAt')),
62 'modified_timestamp': unified_timestamp(response
.get('updatedAt')),
63 'duration': int_or_none(response
.get('duration')),
64 'tags': response
.get('tags'),
65 'episode_number': int_or_none(response
.get('episodeIdentifier')),
66 'season_number': int_or_none(response
.get('seasonIdentifier')),
70 def _extract_series_info(cls
, series_response
):
71 base_info
= cls
._extract
_base
_info
(series_response
)
74 account_info
= cls
._extract
_account
_info
(
75 cls
._get
_prx
_embed
_response
(series_response
, 'account')) or {}
78 'channel_id': account_info
.get('channel_id'),
79 'channel_url': account_info
.get('channel_url'),
80 'channel': account_info
.get('channel'),
81 'series': base_info
.get('title'),
82 'series_id': base_info
.get('id'),
86 def _extract_account_info(cls
, account_response
):
87 base_info
= cls
._extract
_base
_info
(account_response
)
90 name
= account_response
.get('name')
94 'channel_id': base_info
.get('id'),
95 'channel_url': 'https://beta.prx.org/accounts/{}'.format(base_info
.get('id')),
100 def _extract_story_info(cls
, story_response
):
101 base_info
= cls
._extract
_base
_info
(story_response
)
104 series
= cls
._extract
_series
_info
(
105 cls
._get
_prx
_embed
_response
(story_response
, 'series')) or {}
106 account
= cls
._extract
_account
_info
(
107 cls
._get
_prx
_embed
_response
(story_response
, 'account')) or {}
110 'series': series
.get('series'),
111 'series_id': series
.get('series_id'),
112 'channel_id': account
.get('channel_id'),
113 'channel_url': account
.get('channel_url'),
114 'channel': account
.get('channel'),
117 def _entries(self
, item_id
, endpoint
, entry_func
, query
=None):
119 Extract entries from paginated list API
120 @param entry_func: Function to generate entry from response item
123 for page
in itertools
.count(1):
124 response
= self
._call
_api
(f
'{item_id}: page {page}', endpoint
, query
={
129 items
= self
._get
_prx
_embed
_response
(response
, 'items')
130 if not response
or not items
:
133 yield from filter(None, map(entry_func
, items
))
135 total
+= response
['count']
136 if total
>= response
['total']:
139 def _story_playlist_entry(self
, response
):
140 story
= self
._extract
_story
_info
(response
)
145 'url': 'https://beta.prx.org/stories/{}'.format(story
['id']),
146 'ie_key': PRXStoryIE
.ie_key(),
150 def _series_playlist_entry(self
, response
):
151 series
= self
._extract
_series
_info
(response
)
156 'url': 'https://beta.prx.org/series/{}'.format(series
['id']),
157 'ie_key': PRXSeriesIE
.ie_key(),
162 class PRXStoryIE(PRXBaseIE
):
163 _VALID_URL
= PRXBaseIE
.PRX_BASE_URL_RE
% r
'stories/(?P<id>\d+)'
167 # Story with season and episode details
168 'url': 'https://beta.prx.org/stories/399200',
171 'title': 'Fly Me To The Moon',
172 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
173 'release_timestamp': 1640250000,
174 'timestamp': 1640208972,
175 'modified_timestamp': 1641318202,
180 'series': 'AirSpace',
181 'series_id': '38057',
182 'channel_id': '220986',
183 'channel_url': 'https://beta.prx.org/accounts/220986',
184 'channel': 'Air and Space Museum',
188 'id': '399200_part1',
189 'title': 'Fly Me To The Moon',
190 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
191 'release_timestamp': 1640250000,
192 'timestamp': 1640208972,
193 'modified_timestamp': 1641318202,
198 'series': 'AirSpace',
199 'series_id': '38057',
200 'channel_id': '220986',
201 'channel_url': 'https://beta.prx.org/accounts/220986',
202 'channel': 'Air and Space Museum',
204 'upload_date': '20211222',
205 'episode': 'Episode 8',
206 'release_date': '20211223',
207 'season': 'Season 5',
208 'modified_date': '20220104',
212 'id': '399200_part2',
213 'title': 'Fly Me To The Moon',
214 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
215 'release_timestamp': 1640250000,
216 'timestamp': 1640208972,
217 'modified_timestamp': 1641318202,
222 'series': 'AirSpace',
223 'series_id': '38057',
224 'channel_id': '220986',
225 'channel_url': 'https://beta.prx.org/accounts/220986',
226 'channel': 'Air and Space Museum',
228 'upload_date': '20211222',
229 'episode': 'Episode 8',
230 'release_date': '20211223',
231 'season': 'Season 5',
232 'modified_date': '20220104',
238 # Story with only split audio
239 'url': 'https://beta.prx.org/stories/326414',
242 'title': 'Massachusetts v EPA',
243 'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
244 'timestamp': 1592509124,
245 'modified_timestamp': 1592510457,
248 'series': 'Outside/In',
249 'series_id': '36252',
251 'channel_url': 'https://beta.prx.org/accounts/206',
252 'channel': 'New Hampshire Public Radio',
256 # Story with single combined audio
257 'url': 'https://beta.prx.org/stories/400404',
260 'title': 'Cafe Chill (Episode 2022-01)',
261 'thumbnails': 'count:1',
262 'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
263 'timestamp': 1641233952,
264 'modified_timestamp': 1641234248,
266 'series': 'Café Chill',
267 'series_id': '37762',
268 'channel_id': '5767',
269 'channel_url': 'https://beta.prx.org/accounts/5767',
270 'channel': 'C89.5 - KNHC Seattle',
273 'thumbnail': r
're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
274 'upload_date': '20220103',
275 'modified_date': '20220103',
278 'url': 'https://listen.prx.org/stories/399200',
279 'only_matching': True,
283 def _extract_audio_pieces(self
, audio_response
):
285 'format_id': str_or_none(piece_response
.get('id')),
286 'format_note': str_or_none(piece_response
.get('label')),
287 'filesize': int_or_none(piece_response
.get('size')),
288 'duration': int_or_none(piece_response
.get('duration')),
289 'ext': mimetype2ext(piece_response
.get('contentType')),
290 'asr': int_or_none(piece_response
.get('frequency'), scale
=1000),
291 'abr': int_or_none(piece_response
.get('bitRate')),
292 'url': self
._extract
_file
_link
(piece_response
),
294 } for piece_response
in sorted(
295 self
._get
_prx
_embed
_response
(audio_response
, 'items') or [],
296 key
=lambda p
: int_or_none(p
.get('position')))]
298 def _extract_story(self
, story_response
):
299 info
= self
._extract
_story
_info
(story_response
)
302 audio_pieces
= self
._extract
_audio
_pieces
(
303 self
._get
_prx
_embed
_response
(story_response
, 'audio'))
304 if len(audio_pieces
) == 1:
306 'formats': audio_pieces
,
312 'id': '{}_part{}'.format(info
['id'], (idx
+ 1)),
314 } for idx
, fmt
in enumerate(audio_pieces
)]
316 '_type': 'multi_video',
321 def _real_extract(self
, url
):
322 story_id
= self
._match
_id
(url
)
323 response
= self
._call
_api
(story_id
, f
'stories/{story_id}')
324 return self
._extract
_story
(response
)
327 class PRXSeriesIE(PRXBaseIE
):
328 _VALID_URL
= PRXBaseIE
.PRX_BASE_URL_RE
% r
'series/(?P<id>\d+)'
331 'url': 'https://beta.prx.org/series/36252',
334 'title': 'Outside/In',
335 'thumbnails': 'count:1',
336 'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
337 'timestamp': 1470684964,
338 'modified_timestamp': 1582308830,
340 'channel_url': 'https://beta.prx.org/accounts/206',
341 'channel': 'New Hampshire Public Radio',
342 'series': 'Outside/In',
343 'series_id': '36252',
345 'playlist_mincount': 39,
348 'url': 'https://beta.prx.org/series/25038',
352 'timestamp': 1207612800,
353 'modified_timestamp': 1207612800,
355 'channel_url': 'https://beta.prx.org/accounts/206',
356 'channel': 'New Hampshire Public Radio',
358 'series_id': '25038',
364 def _extract_series(self
, series_response
):
365 info
= self
._extract
_series
_info
(series_response
)
368 'entries': self
._entries
(info
['id'], 'series/{}/stories'.format(info
['id']), self
._story
_playlist
_entry
),
372 def _real_extract(self
, url
):
373 series_id
= self
._match
_id
(url
)
374 response
= self
._call
_api
(series_id
, f
'series/{series_id}')
375 return self
._extract
_series
(response
)
378 class PRXAccountIE(PRXBaseIE
):
379 _VALID_URL
= PRXBaseIE
.PRX_BASE_URL_RE
% r
'accounts/(?P<id>\d+)'
381 'url': 'https://beta.prx.org/accounts/206',
384 'title': 'New Hampshire Public Radio',
385 'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
387 'channel_url': 'https://beta.prx.org/accounts/206',
388 'channel': 'New Hampshire Public Radio',
389 'thumbnails': 'count:1',
391 'playlist_mincount': 380,
394 def _extract_account(self
, account_response
):
395 info
= self
._extract
_account
_info
(account_response
)
396 series
= self
._entries
(
397 info
['id'], f
'accounts/{info["id"]}/series', self
._series
_playlist
_entry
)
398 stories
= self
._entries
(
399 info
['id'], f
'accounts/{info["id"]}/stories', self
._story
_playlist
_entry
)
402 'entries': itertools
.chain(series
, stories
),
406 def _real_extract(self
, url
):
407 account_id
= self
._match
_id
(url
)
408 response
= self
._call
_api
(account_id
, f
'accounts/{account_id}')
409 return self
._extract
_account
(response
)
class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    # Search extractor for PRX stories, registered under the search key
    # 'prxstories'.
    IE_DESC = 'PRX Stories Search'
    IE_NAME = 'prxstories:search'
    _SEARCH_KEY = 'prxstories'

    def _search_results(self, query):
        """Yield story entries for *query* from the 'stories/search' endpoint.

        Paging is left to ``_entries``; each returned item is converted by
        ``_story_playlist_entry``.
        """
        progress_label = f'query {query}'
        search_params = {'q': query}
        yield from self._entries(
            progress_label, 'stories/search', self._story_playlist_entry,
            query=search_params)
class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    # Search extractor for PRX series, registered under the search key
    # 'prxseries'.
    IE_DESC = 'PRX Series Search'
    IE_NAME = 'prxseries:search'
    _SEARCH_KEY = 'prxseries'

    def _search_results(self, query):
        """Yield series entries for *query* from the 'series/search' endpoint.

        Paging is left to ``_entries``; each returned item is converted by
        ``_series_playlist_entry``.
        """
        progress_label = f'query {query}'
        search_params = {'q': query}
        yield from self._entries(
            progress_label, 'series/search', self._series_playlist_entry,
            query=search_params)