from __future__ import annotations

import functools
import http.client
import io
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    create_connection,
    create_socks_proxy_socket,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..utils import update_url_query
from ..utils.networking import normalize_url
# Content encodings we advertise in Accept-Encoding and can transparently decode.
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exceptions that may be raised while decoding a response body.
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

# brotli is an optional dependency (the import yields a falsy placeholder when
# unavailable — see the `encoding == 'br' and brotli` check below), so only
# advertise/handle it when it actually imported.
if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
54 def _create_http_connection(http_class
, source_address
, *args
, **kwargs
):
55 hc
= http_class(*args
, **kwargs
)
57 if hasattr(hc
, '_create_connection'):
58 hc
._create
_connection
= create_connection
60 if source_address
is not None:
61 hc
.source_address
= (source_address
, 0)
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context  # SSL context used for HTTPS connections

    @staticmethod
    def _make_conn_class(base, req):
        # Swap in a SOCKS-capable connection class when a socks proxy was
        # requested via the internal Ytdl-socks-proxy header.
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            # Raw deflate stream (no zlib header), which some servers send
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a standard zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        # wbits = MAX_WBITS | 16 selects gzip-format decoding
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        # Keep a handle on the raw response so its metadata can be copied
        # onto the decoded replacement below.
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(
                io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by Python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* whose socket is tunnelled through
    the given SOCKS proxy.

    *base_class* must be http.client.HTTPConnection or HTTPSConnection;
    *socks_proxy* is a proxy URL accepted by make_socks_proxy_opts().
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        _create_connection = create_connection

        # The socket-establishing statements must live in connect(); in the
        # original paste they were orphaned at class level.
        def connect(self):
            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=functools.partial(
                    create_socks_proxy_socket, (self.host, self.port), proxy_args))
            if isinstance(self, http.client.HTTPSConnection):
                # HTTPS: wrap the proxied socket in TLS for the real host
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    # Treat all common redirect codes with the same (fixed) 302 logic.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        # Carry the payload over by default; dropped below if the method changes.
        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
class ProxyHandler(urllib.request.BaseHandler):
    # Run before most other handlers, like urllib.request.ProxyHandler does.
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            # No proxy configured for this URL — let the next handler run.
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
class PUTRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
class HEADRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Return a copy of *req* with url/data/headers/query optionally replaced.

    The original request's HTTP method is preserved by picking the matching
    Request subclass (HEADRequest/PUTRequest), and any `timeout` attribute
    set by urllib internals is carried over.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            # Translate low-level read errors into the project's networking
            # exceptions; re-raise unchanged if no translation applies.
            handle_response_read_exceptions(e)
            raise e
def handle_sslerror(e: ssl.SSLError):
    """Re-raise an ssl.SSLError as the matching project exception.

    Anything that is not an ssl.SSLError is ignored, so callers may pass
    arbitrary exceptions through.
    """
    if not isinstance(e, ssl.SSLError):
        # Missing in the original paste: without this early return, non-SSL
        # errors would fall through and be wrongly wrapped as SSLError.
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e
def handle_response_read_exceptions(e):
    """Map exceptions raised while reading a response body to project errors.

    IncompleteRead/SSL errors get specific wrappers; generic OS/HTTP/decode
    errors become TransportError. Unrecognized exceptions pass through
    (no raise), letting the caller decide.
    """
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        # Delegate to the SSL-specific mapper (raises the proper subclass).
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """RequestHandler backend implemented on top of urllib.request."""

    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            # file:// is opt-in only, as it is a local-file-disclosure risk
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        # Extensions handled by this backend; pop so the base class does not
        # flag them as unsupported.
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)
        extensions.pop('legacy_ssl', None)

    def _create_instance(self, proxies, cookiejar, legacy_ssl_support=None):
        # Build a fresh OpenerDirector wired with our customized handlers.
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(legacy_ssl_support=legacy_ssl_support),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=self._get_proxies(request),
            cookiejar=self._get_cookiejar(request),
            legacy_ssl_support=request.extensions.get('legacy_ssl'),
        )
        try:
            res = opener.open(urllib_req, timeout=self._calculate_timeout(request))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected error shape — propagate unchanged
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unmapped exception — propagate unchanged
        return UrllibResponseAdapter(res)