5 from .common
import InfoExtractor
10 get_elements_by_class
,
# Helper that searches an HTML string for elements, optionally constrained by
# tag name, attribute name and attribute value.
# NOTE(review): callers in this file call .group('content') and .group('value')
# on the returned items, so the results appear to be regex match objects rather
# than plain strings as the docstring suggests - confirm against the full body.
18 def _get_elements_by_tag_and_attrib(html
, tag
=None, attribute
=None, value
=None, escape_value
=True):
19 """Return the content of the tag with the specified attribute in the passed HTML document"""
# Tag pattern accepting any run of name characters (letters, digits, ':', '.',
# '_', '-'); presumably the fallback when no tag is given - the guarding
# condition is not visible in this view.
22 tag
= '[a-zA-Z0-9:._-]+'
# The attribute name is regex-escaped and captured as the "attribute" group,
# preceded by at least one whitespace character.
26 attribute
= rf
'\s+(?P<attribute>{re.escape(attribute)})'
# Escape the value only when escape_value is true, so that a caller can pass a
# raw regular expression as the value instead of a literal string.
30 value
= re
.escape(value
) if escape_value
else value
# Match "=value" with an optional single or double quote on each side and
# capture the value as the "value" group.
31 value
= f
'=[\'"]?(?P<value>{value})[\'"]?'
# Iterate over every match of a verbose ((?x)), dot-matches-newline ((?s))
# pattern. The two visible pattern lines each lazily skip over other
# attributes (bare, unquoted, double-quoted or single-quoted).
34 for m
in re
.finditer(rf
'''(?xs)
36 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
38 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the first result of _get_elements_by_tag_and_attrib, or None.

    All arguments are forwarded unchanged, in the same order, to
    _get_elements_by_tag_and_attrib. When that call yields a falsy result
    (for example an empty list), None is returned instead of raising.
    """
    matches = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    if not matches:
        return None
    return matches[0]
# Extractor for a single duboku.io episode page (URLs under /vodplay/).
# NOTE(review): many lines of this class are not visible in this view; the
# comments below describe only what the visible statements do.
53 class DubokuIE(InfoExtractor
):
55 IE_DESC
= 'www.duboku.io'
# Matches http(s)://<subdomain>.duboku.io/vodplay/<id>.html followed by
# anything; the "id" group is digits, a dash, then further digits/dashes
# (e.g. "1575-1-1").
57 _VALID_URL
= r
'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
# Presumably entries of the extractor's test cases - the enclosing structure
# is not visible here.
59 'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
64 'title': 'contains:白色月光',
70 'episode': 'Episode 1',
73 'skip_download': 'm3u8 download',
76 'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
81 'title': 'contains:第1集',
84 'episode': 'Episode 1',
90 'skip_download': 'm3u8 download',
# Captures the brace-delimited literal assigned to player_data, up to the
# closing </script tag (group 1 includes the braces, group 2 the inside).
94 _PLAYER_DATA_PATTERN
= r
'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
96 def _real_extract(self
, url
):
97 video_id
= self
._match
_id
(url
)
# Split the id on '-'; presumably this yields the series/season/episode ids
# used further down - the unpacking is not visible here.
98 temp
= video_id
.split('-')
# The page URL is rebuilt from the id on the fixed w.duboku.io host rather
# than reusing the incoming url.
103 webpage_url
= f
'https://w.duboku.io/vodplay/{video_id}.html'
104 webpage_html
= self
._download
_webpage
(webpage_url
, video_id
)
# Pull the player_data literal out of the page, then parse it; js_to_json is
# handed to _parse_json (presumably as the transform applied before decoding).
108 player_data
= self
._search
_regex
(
109 self
._PLAYER
_DATA
_PATTERN
, webpage_html
, 'player_data')
110 player_data
= self
._parse
_json
(player_data
, video_id
, js_to_json
)
# Collect the page elements carrying class "title"; temp is reused here.
114 temp
= get_elements_by_class('title', webpage_html
)
# Look for an anchor element inside html; group(0) is the whole <a ...>...</a>.
# NOTE(review): html is presumably a loop variable over temp - the loop header
# is not visible here.
118 mobj
= re
.search(r
'<a\s+.*>(.*)</a>', html
)
120 href
= extract_attributes(mobj
.group(0)).get('href')
# The anchor is taken as the series link when its href contains
# "/<series_id>.html"; its cleaned text then becomes the series title.
122 mobj1
= re
.search(r
'/(\d+)\.html', href
)
123 if mobj1
and mobj1
.group(1) == series_id
:
124 series_title
= clean_html(mobj
.group(0))
# Collapse every run of whitespace into a single space.
125 series_title
= re
.sub(r
'[\s\r\n\t]+', ' ', series_title
)
126 title
= clean_html(html
)
# Same whitespace normalisation for the episode title.
127 title
= re
.sub(r
'[\s\r\n\t]+', ' ', title
)
130 data_url
= player_data
.get('url')
132 raise ExtractorError('Cannot find url in player_data')
# player_data['encrypt'] selects how data_url is encoded:
#   1 -> URL-quoted; 2 -> base64 of a URL-quoted ASCII string.
# No handling of other values is visible in this view.
133 player_encrypt
= player_data
.get('encrypt')
134 if player_encrypt
== 1:
135 data_url
= urllib
.parse
.unquote(data_url
)
136 elif player_encrypt
== 2:
137 data_url
= urllib
.parse
.unquote(base64
.b64decode(data_url
).decode('ascii'))
139 # if it is an embedded iframe, maybe it's an external source
# Referer header pointing at the episode page; reused below for the m3u8
# request and for the http_headers entry.
140 headers
= {'Referer': webpage_url
}
141 if player_data
.get('from') == 'iframe':
142 # use _type url_transparent to retain the meaningful details
145 '_type': 'url_transparent',
# The referer is attached to the URL via smuggle_url for the iframe case.
146 'url': smuggle_url(data_url
, {'referer': webpage_url
}),
149 'series': series_title
,
150 'season_number': int_or_none(season_id
),
151 'season_id': season_id
,
152 'episode_number': int_or_none(episode_id
),
153 'episode_id': episode_id
,
# Non-iframe path as far as visible: data_url is treated as an m3u8 manifest,
# requested with the Referer header ('mp4' is the third positional argument).
156 formats
= self
._extract
_m
3u8_formats
(data_url
, video_id
, 'mp4', headers
=headers
)
161 'series': series_title
,
162 'season_number': int_or_none(season_id
),
163 'season_id': season_id
,
164 'episode_number': int_or_none(episode_id
),
165 'episode_id': episode_id
,
167 'http_headers': headers
,
# Extractor for a duboku.io series page (URLs under /voddetail/); the visible
# return statement builds a playlist whose entries are handed to DubokuIE.
# NOTE(review): many lines of this class are not visible in this view; the
# comments below describe only what the visible statements do.
171 class DubokuPlaylistIE(InfoExtractor
):
172 IE_NAME
= 'duboku:list'
173 IE_DESC
= 'www.duboku.io entire series'
# Matches http(s)://<subdomain>.duboku.io/voddetail/<id>.html followed by
# anything; the "id" group is the numeric series id.
175 _VALID_URL
= r
'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
# Presumably entries of the extractor's test cases - the enclosing structure
# is not visible here.
177 'url': 'https://w.duboku.io/voddetail/1575.html',
179 'id': 'startswith:1575',
182 'playlist_count': 12,
184 'url': 'https://w.duboku.io/voddetail/1554.html',
186 'id': 'startswith:1554',
189 'playlist_mincount': 30,
192 def _real_extract(self
, url
):
193 mobj
= self
._match
_valid
_url
(url
)
195 raise ExtractorError(f
'Invalid URL: {url}')
196 series_id
= mobj
.group('id')
# The URL fragment (the part after '#') is used further down to pick one
# playlist by its id.
197 fragment
= urllib
.parse
.urlparse(url
).fragment
# The page URL is rebuilt from the series id on the fixed w.duboku.io host.
199 webpage_url
= f
'https://w.duboku.io/voddetail/{series_id}.html'
200 webpage_html
= self
._download
_webpage
(webpage_url
, series_id
)
# Title candidates visible here, in order: <h1 class="title">, the "keywords"
# meta tag, then the <title> element.
# NOTE(review): the conditions between these attempts are not visible -
# presumably each later one is a fallback when the previous gave nothing.
204 title
= _get_element_by_tag_and_attrib(webpage_html
, 'h1', 'class', 'title')
205 title
= unescapeHTML(title
.group('content')) if title
else None
207 title
= self
._html
_search
_meta
('keywords', webpage_html
)
209 title
= _get_element_by_tag_and_attrib(webpage_html
, 'title')
210 title
= unescapeHTML(title
.group('content')) if title
else None
# Every element whose id matches playlist\d+ is treated as one playlist; the
# value is passed as a raw regex, hence escape_value=False.
215 for div
in _get_elements_by_tag_and_attrib(
216 webpage_html
, attribute
='id', value
='playlist\\d+', escape_value
=False):
217 playlist_id
= div
.group('value')
# Each <a href=...> inside the playlist element contributes its href and its
# content (both HTML-unescaped).
219 for a
in _get_elements_by_tag_and_attrib(
220 div
.group('content'), 'a', 'href', value
='[^\'"]+?', escape_value
=False):
222 'href': unescapeHTML(a
.group('value')),
223 'title': unescapeHTML(a
.group('content')),
225 playlists
[playlist_id
] = playlist
227 # select the specified playlist if url fragment exists
231 playlist
= playlists
.get(fragment
)
232 playlist_id
= fragment
# First (id, playlist) pair in playlists, or None when playlists is empty;
# presumably the path taken when the URL has no fragment.
234 first
= next(iter(playlists
.items()), None)
236 (playlist_id
, playlist
) = first
# The error message names the fragment when one was given.
238 raise ExtractorError(
239 f
'Cannot find {fragment}' if fragment
else 'Cannot extract playlist')
# Each href is resolved against https://w.duboku.io and routed to DubokuIE;
# the playlist id is "<series_id>#<playlist_id>".
242 return self
.playlist_result([
244 urllib
.parse
.urljoin('https://w.duboku.io', x
['href']),
245 ie
=DubokuIE
.ie_key(), video_title
=x
.get('title'))
246 for x
in playlist
], series_id
+ '#' + playlist_id
, title
)