[ie/kenh14] Add extractor (#3996)
[yt-dlp3.git] / yt_dlp / extractor / kenh14.py
blob 3c46020e8b6a637a04ff0d7fdc70a80cb06d32c1
1 from .common import InfoExtractor
2 from ..utils import (
3 clean_html,
4 extract_attributes,
5 get_element_by_class,
6 get_element_html_by_attribute,
7 get_elements_html_by_class,
8 int_or_none,
9 parse_duration,
10 parse_iso8601,
11 remove_start,
12 strip_or_none,
13 unescapeHTML,
14 update_url,
15 url_or_none,
17 from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
    """Extractor for single video pages on video.kenh14.vn."""

    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for one video page.

        Fetches the page, reads the player element's attributes, then queries
        two optional JSON endpoints: one for metadata and one listing HLS/DASH
        manifests. The direct file URL is always offered as the 'http' format.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        player_html = get_element_html_by_attribute('type', 'VideoStream', webpage)
        player_attrs = extract_attributes(player_html or '')
        # Indexed directly on purpose: every format URL below is derived from
        # this value, so a page without it fails right here.
        media_path = player_attrs['data-vid']

        # Best-effort metadata lookup (fatal=False); the page itself supplies
        # fallbacks for the title further down.
        file_name = remove_start(media_path, 'kenh14cdn.com/')
        api_meta = self._download_json(
            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(file_name),
            video_id, fatal=False)

        formats = [{'url': f'https://{media_path}', 'format_id': 'http', 'quality': 1}]
        subtitles = {}

        # Best-effort manifest listing (fatal=False) fetched from "<file>.json".
        manifest_info = self._download_json(
            f'https://{media_path}.json', video_id, note='Downloading video data', fatal=False)

        # (key in manifest_info, extractor method, id keyword for that method)
        manifest_handlers = (
            ('hls', self._extract_m3u8_formats_and_subtitles, {'m3u8_id': 'hls'}),
            ('mpd', self._extract_mpd_formats_and_subtitles, {'mpd_id': 'dash'}),
        )
        for key, extract_manifest, id_kwargs in manifest_handlers:
            manifest_url = traverse_obj(manifest_info, (key, {url_or_none}))
            if not manifest_url:
                continue
            fmts, subs = extract_manifest(manifest_url, video_id, fatal=False, **id_kwargs)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        meta_fields = traverse_obj(api_meta, {
            'duration': ('duration', {parse_duration}),
            'uploader': ('author', {strip_or_none}),
            'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
            'view_count': ('views', {int_or_none}),
        })
        # Title preference: API metadata, then OpenGraph, then the page element.
        title = (
            traverse_obj(api_meta, ('title', {strip_or_none}))
            or clean_html(self._og_search_title(webpage))
            or clean_html(get_element_by_class('vdbw-title', webpage)))
        description = (
            clean_html(self._og_search_description(webpage))
            or clean_html(get_element_by_class('vdbw-sapo', webpage)))
        thumbnail = self._og_search_thumbnail(webpage) or player_attrs.get('data-thumb')
        # The keywords meta value is split on ';' and empty entries are dropped.
        tags = traverse_obj(self._html_search_meta('keywords', webpage), (
            {lambda keywords: keywords.split(';')}, ..., filter))

        return {
            **meta_fields,
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'description': description,
            'thumbnail': thumbnail,
            'tags': tags,
        }
class Kenh14PlaylistIE(InfoExtractor):
    """Extractor for playlist pages on video.kenh14.vn."""

    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        """Build a playlist result from the 'video-item' elements on the page.

        Title and description come from the 'category-detail' element when
        present, otherwise from the first JSON-LD entry that has both 'name'
        and 'alternateName'.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        detail_html = get_element_by_class('category-detail', webpage) or ''
        # First JSON-LD object with truthy 'name' and 'alternateName', else {}.
        json_ld = traverse_obj(
            self._yield_json_ld(webpage, playlist_id),
            (lambda _, v: v['name'] and v['alternateName'], any)) or {}

        def entry_url(item_html):
            # Each item is turned into a canonical video URL from its data-id.
            item_id = extract_attributes(item_html)['data-id']
            return 'https://video.kenh14.vn/video/video-{}.chn'.format(item_id)

        items = get_elements_html_by_class('video-item', webpage)
        title = (
            clean_html(get_element_by_class('name', detail_html))
            or unescapeHTML(json_ld.get('name')))
        description = (
            clean_html(get_element_by_class('description', detail_html))
            or unescapeHTML(json_ld.get('alternateName')))
        # The OpenGraph thumbnail URL is used with its query string removed.
        thumbnail = traverse_obj(
            self._og_search_thumbnail(webpage),
            ({url_or_none}, {update_url(query=None)}))

        return self.playlist_from_matches(
            items, playlist_id, title,
            getter=entry_url, ie=Kenh14VideoIE,
            playlist_description=description, thumbnail=thumbnail)