Release 2024.12.06
[yt-dlp3.git] / yt_dlp / extractor / thisoldhouse.py
blobfbc12d55d901df387af4de0ae2f8a0e63431aa15
1 import json
3 from .brightcove import BrightcoveNewIE
4 from .common import InfoExtractor
5 from .zype import ZypeIE
6 from ..networking import HEADRequest
7 from ..networking.exceptions import HTTPError
8 from ..utils import (
9 ExtractorError,
10 filter_dict,
11 parse_qs,
12 smuggle_url,
13 try_call,
14 urlencode_postdata,
class ThisOldHouseIE(InfoExtractor):
    """Extractor for video pages on thisoldhouse.com.

    Pages embed their video through an <iframe> that points either at a
    legacy Zype redirect URL, directly at a Brightcove player, or at a
    thisoldhouse.com Brightcove redirect URL. The actual extraction is
    delegated to ZypeIE / BrightcoveNewIE.
    """

    _NETRC_MACHINE = 'thisoldhouse'
    _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/?#]+/)?\d+)/(?P<id>[^/?#]+)'
    _TESTS = [{
        # Unresolved Brightcove URL embed (formerly Zype), free
        'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench',
        'info_dict': {
            'id': '6325298523112',
            'ext': 'mp4',
            'title': 'How to Build a Storage Bench',
            'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.',
            'timestamp': 1681793639,
            'upload_date': '20230418',
            'duration': 674.54,
            'tags': 'count:11',
            'uploader_id': '6314471934001',
            'thumbnail': r're:^https?://.*\.jpg',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # Brightcove embed, authwalled
        'url': 'https://www.thisoldhouse.com/glen-ridge-generational/99537/s45-e17-multi-generational',
        'info_dict': {
            'id': '6349675446112',
            'ext': 'mp4',
            'title': 'E17 | Glen Ridge Generational | Multi-Generational',
            'description': 'md5:53c6bc2e8031f3033d693d9a3563222c',
            'timestamp': 1711382202,
            'upload_date': '20240325',
            'duration': 1422.229,
            'tags': 'count:13',
            'uploader_id': '6314471934001',
            'thumbnail': r're:^https?://.*\.jpg',
        },
        'expected_warnings': ['Login with password is not supported for this website'],
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires subscription',
    }, {
        # Page no longer has video
        'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins',
        'only_matching': True,
    }, {
        # 404 Not Found
        'url': 'https://www.thisoldhouse.com/tv-episode/ask-toh-shelf-rough-electric',
        'only_matching': True,
    }, {
        # 404 Not Found
        'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench',
        'only_matching': True,
    }, {
        'url': 'https://www.thisoldhouse.com/21113884/s41-e13-paradise-lost',
        'only_matching': True,
    }, {
        # iframe www.thisoldhouse.com
        'url': 'https://www.thisoldhouse.com/21083431/seaside-transformation-the-westerly-project',
        'only_matching': True,
    }]

    _LOGIN_URL = 'https://login.thisoldhouse.com/usernamepassword/login'

    def _perform_login(self, username, password):
        """Log in with the given credentials.

        Raises ExtractorError (marked as expected) when the login endpoint
        answers the credential submission with HTTP 401; any other
        ExtractorError is re-raised unchanged.
        """
        # Prime the cookie jar before touching the login endpoints
        self._request_webpage(
            HEADRequest('https://www.thisoldhouse.com/insider'), None, 'Requesting session cookies')
        # The final URL of this request carries the query parameters that are
        # forwarded to the credential endpoint below
        urlh = self._request_webpage(
            'https://www.thisoldhouse.com/wp-login.php', None, 'Requesting login info',
            errnote='Unable to login', query={'redirect_to': 'https://www.thisoldhouse.com/insider'})

        try:
            auth_form = self._download_webpage(
                self._LOGIN_URL, None, 'Submitting credentials', headers={
                    'Content-Type': 'application/json',
                    'Referer': urlh.url,
                }, data=json.dumps(filter_dict({
                    # Forward every query parameter (first value only); the
                    # 'client' parameter is sent under the key 'client_id'
                    **{('client_id' if k == 'client' else k): v[0] for k, v in parse_qs(urlh.url).items()},
                    'tenant': 'thisoldhouse',
                    'username': username,
                    'password': password,
                    'popup_options': {},
                    'sso': True,
                    # NOTE(review): presumably try_call yields None when the
                    # cookie is absent and filter_dict then drops the key - confirm
                    '_csrf': try_call(lambda: self._get_cookies(self._LOGIN_URL)['_csrf'].value),
                    '_intstate': 'deprecated',
                }), separators=(',', ':')).encode())
        except ExtractorError as e:
            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
                raise ExtractorError('Invalid username or password', expected=True)
            raise

        # The response to the credential submission contains a form whose
        # hidden inputs must be posted to the callback to finish the login
        self._request_webpage(
            'https://login.thisoldhouse.com/login/callback', None, 'Completing login',
            data=urlencode_postdata(self._hidden_inputs(auth_form)))

    def _real_extract(self, url):
        """Locate the embedded player iframe and hand off to ZypeIE or BrightcoveNewIE.

        Raises a login-required error when the page shows the subscriber
        paywall marker.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        if 'To Unlock This content' in webpage:
            self.raise_login_required(
                'This video is only available for subscribers. '
                'Note that --cookies-from-browser may not work due to this site using session cookies')

        # Legacy Zype embeds: optional, so fall through when not present
        video_url, video_id = self._search_regex(
            r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]',
            webpage, 'zype url', group=(1, 2), default=(None, None))
        if video_url:
            # Use the final URL of a HEAD request as the URL passed to ZypeIE
            video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Zype URL').url
            return self.url_result(video_url, ZypeIE, video_id)

        # Brightcove embeds: either a direct player URL or a site-local URL.
        # The 'www.' prefix is optional here, consistent with _VALID_URL and
        # the Zype pattern above, so bare-domain iframe sources also match.
        video_url, video_id = self._search_regex([
            r'<iframe[^>]+src=[\'"]((?:https?:)?//players\.brightcove\.net/\d+/\w+/index\.html\?videoId=(\d+))',
            r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.com/videos/brightcove/(\d+))'],
            webpage, 'iframe url', group=(1, 2))
        if not parse_qs(video_url).get('videoId'):
            # Site-local URL without a videoId query: resolve it to its final URL first
            video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Brightcove URL').url
        return self.url_result(smuggle_url(video_url, {'referrer': url}), BrightcoveNewIE, video_id)