yt_dlp/extractor/thisoldhouse.py

   1 import json
   2
   3 from .brightcove import BrightcoveNewIE
   4 from .common import InfoExtractor
   5 from .zype import ZypeIE
   6 from ..networking import HEADRequest
   7 from ..networking.exceptions import HTTPError
   8 from ..utils import (
   9     ExtractorError,
  10     filter_dict,
  11     parse_qs,
  12     smuggle_url,
  13     try_call,
  14     urlencode_postdata,
  15 )
  16
  17
  18 class ThisOldHouseIE(InfoExtractor):
  19     _NETRC_MACHINE = 'thisoldhouse'
  20     _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/?#]+/)?\d+)/(?P<id>[^/?#]+)'
  21     _TESTS = [{
  22         # Unresolved Brightcove URL embed (formerly Zype), free
  23         'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench',
  24         'info_dict': {
  25             'id': '6325298523112',
  26             'ext': 'mp4',
  27             'title': 'How to Build a Storage Bench',
  28             'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.',
  29             'timestamp': 1681793639,
  30             'upload_date': '20230418',
  31             'duration': 674.54,
  32             'tags': 'count:11',
  33             'uploader_id': '6314471934001',
  34             'thumbnail': r're:^https?://.*\.jpg',
  35         },
  36         'params': {
  37             'skip_download': True,
  38         },
  39     }, {
  40         # Brightcove embed, authwalled
  41         'url': 'https://www.thisoldhouse.com/glen-ridge-generational/99537/s45-e17-multi-generational',
  42         'info_dict': {
  43             'id': '6349675446112',
  44             'ext': 'mp4',
  45             'title': 'E17 | Glen Ridge Generational | Multi-Generational',
  46             'description': 'md5:53c6bc2e8031f3033d693d9a3563222c',
  47             'timestamp': 1711382202,
  48             'upload_date': '20240325',
  49             'duration': 1422.229,
  50             'tags': 'count:13',
  51             'uploader_id': '6314471934001',
  52             'thumbnail': r're:^https?://.*\.jpg',
  53         },
  54         'expected_warnings': ['Login with password is not supported for this website'],
  55         'params': {
  56             'skip_download': True,
  57         },
  58         'skip': 'Requires subscription',
  59     }, {
  60         # Page no longer has video
  61         'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins',
  62         'only_matching': True,
  63     }, {
  64         # 404 Not Found
  65         'url': 'https://www.thisoldhouse.com/tv-episode/ask-toh-shelf-rough-electric',
  66         'only_matching': True,
  67     }, {
  68         # 404 Not Found
  69         'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench',
  70         'only_matching': True,
  71     }, {
  72         'url': 'https://www.thisoldhouse.com/21113884/s41-e13-paradise-lost',
  73         'only_matching': True,
  74     }, {
  75         # iframe www.thisoldhouse.com
  76         'url': 'https://www.thisoldhouse.com/21083431/seaside-transformation-the-westerly-project',
  77         'only_matching': True,
  78     }]
  79
  80     _LOGIN_URL = 'https://login.thisoldhouse.com/usernamepassword/login'
  81
  82     def _perform_login(self, username, password):
  83         self._request_webpage(
  84             HEADRequest('https://www.thisoldhouse.com/insider'), None, 'Requesting session cookies')
  85         urlh = self._request_webpage(
  86             'https://www.thisoldhouse.com/wp-login.php', None, 'Requesting login info',
  87             errnote='Unable to login', query={'redirect_to': 'https://www.thisoldhouse.com/insider'})
  88
  89         try:
  90             auth_form = self._download_webpage(
  91                 self._LOGIN_URL, None, 'Submitting credentials', headers={
  92                     'Content-Type': 'application/json',
  93                     'Referer': urlh.url,
  94                 }, data=json.dumps(filter_dict({
  95                     **{('client_id' if k == 'client' else k): v[0] for k, v in parse_qs(urlh.url).items()},
  96                     'tenant': 'thisoldhouse',
  97                     'username': username,
  98                     'password': password,
  99                     'popup_options': {},
 100                     'sso': True,
 101                     '_csrf': try_call(lambda: self._get_cookies(self._LOGIN_URL)['_csrf'].value),
 102                     '_intstate': 'deprecated',
 103                 }), separators=(',', ':')).encode())
 104         except ExtractorError as e:
 105             if isinstance(e.cause, HTTPError) and e.cause.status == 401:
 106                 raise ExtractorError('Invalid username or password', expected=True)
 107             raise
 108
 109         self._request_webpage(
 110             'https://login.thisoldhouse.com/login/callback', None, 'Completing login',
 111             data=urlencode_postdata(self._hidden_inputs(auth_form)))
 112
 113     def _real_extract(self, url):
 114         display_id = self._match_id(url)
 115         webpage = self._download_webpage(url, display_id)
 116         if 'To Unlock This content' in webpage:
 117             self.raise_login_required(
 118                 'This video is only available for subscribers. '
 119                 'Note that --cookies-from-browser may not work due to this site using session cookies')
 120
 121         video_url, video_id = self._search_regex(
 122             r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]',
 123             webpage, 'zype url', group=(1, 2), default=(None, None))
 124         if video_url:
 125             video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Zype URL').url
 126             return self.url_result(video_url, ZypeIE, video_id)
 127
 128         video_url, video_id = self._search_regex([
 129             r'<iframe[^>]+src=[\'"]((?:https?:)?//players\.brightcove\.net/\d+/\w+/index\.html\?videoId=(\d+))',
 130             r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)thisoldhouse\.com/videos/brightcove/(\d+))'],
 131             webpage, 'iframe url', group=(1, 2))
 132         if not parse_qs(video_url).get('videoId'):
 133             video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Brightcove URL').url
 134         return self.url_result(smuggle_url(video_url, {'referrer': url}), BrightcoveNewIE, video_id)