1 from .common
import InfoExtractor
12 class ApplePodcastsIE(InfoExtractor
):
13 _VALID_URL
= r
'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
15 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
16 'md5': '41dc31cd650143e530d9423b6b5a344f',
18 'id': '1000482637777',
20 'title': '207 - Whitney Webb Returns',
21 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6',
22 'upload_date': '20200705',
23 'timestamp': 1593932400,
25 'series': 'The Tim Dillon Show',
26 'thumbnail': 're:.+[.](png|jpe?g|webp)',
29 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
30 'only_matching': True,
32 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
33 'only_matching': True,
35 'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
36 'only_matching': True,
39 def _real_extract(self
, url
):
40 episode_id
= self
._match
_id
(url
)
41 webpage
= self
._download
_webpage
(url
, episode_id
)
44 # new page type 2021-11
45 amp_data
= self
._parse
_json
(self
._search
_regex
(
46 r
'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<',
47 webpage
, 'AMP data', default
='{}'), episode_id
, fatal
=False) or {}
48 amp_data
= try_get(amp_data
,
49 lambda a
: self
._parse
_json
(
50 next(a
[x
] for x
in iter(a
) if episode_id
in x
),
53 amp_data
= amp_data
.get('d') or []
54 episode_data
= try_get(
56 lambda a
: next(x
for x
in a
57 if x
['type'] == 'podcast-episodes' and x
['id'] == episode_id
),
60 # try pre 2021-11 page type: TODO: consider deleting if no longer used
61 ember_data
= self
._parse
_json
(self
._search
_regex
(
62 r
'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
63 webpage
, 'ember data'), episode_id
) or {}
64 ember_data
= ember_data
.get(episode_id
) or ember_data
65 episode_data
= try_get(ember_data
, lambda x
: x
['data'], dict)
66 episode
= episode_data
['attributes']
67 description
= episode
.get('description') or {}
70 for inc
in (amp_data
or ember_data
.get('included') or []):
71 if inc
.get('type') == 'media/podcast':
72 series
= try_get(inc
, lambda x
: x
['attributes']['name'])
73 series
= series
or clean_html(get_element_by_class('podcast-header__identity', webpage
))
77 'title': episode
.get('name'),
78 'url': clean_podcast_url(episode
['assetUrl']),
79 'description': description
.get('standard') or description
.get('short'),
80 'timestamp': parse_iso8601(episode
.get('releaseDateTime')),
81 'duration': int_or_none(episode
.get('durationInMilliseconds'), 1000),
83 'thumbnail': self
._og
_search
_thumbnail
(webpage
),