1 """Convenient HTTP UserAgent class.
3 This is a subclass of urllib2.OpenerDirector.
6 Copyright 2003-2006 John J. Lee <jjl@pobox.com>
8 This code is free software; you can redistribute it and/or modify it under
9 the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
10 included with the distribution).
14 import sys
, warnings
, urllib2
23 class UserAgentBase(_opener
.OpenerDirector
):
24 """Convenient user-agent class.
26 Do not use .add_handler() to add a handler for something already dealt with
29 The only reason at present for the distinction between UserAgent and
30 UserAgentBase is so that classes that depend on .seek()able responses
31 (e.g. mechanize.Browser) can inherit from UserAgentBase. The subclass
32 UserAgent exposes a .set_seekable_responses() method that allows switching
33 off the adding of a .seek() method to responses.
37 addheaders: list of (name, value) pairs specifying headers to send with
38 every request, unless they are overridden in the Request instance.
40 >>> ua = UserAgentBase()
42 ... ("User-agent", "Mozilla/5.0 (compatible)"),
43 ... ("From", "responsible.person@example.com")]
49 "http": _urllib2
.HTTPHandler
,
50 # CacheFTPHandler is buggy, at least in 2.3, so we don't use it
51 "ftp": _urllib2
.FTPHandler
,
52 "file": _urllib2
.FileHandler
,
53 "gopher": _urllib2
.GopherHandler
,
56 "_unknown": _urllib2
.UnknownHandler
,
57 # HTTP{S,}Handler depend on HTTPErrorProcessor too
58 "_http_error": _urllib2
.HTTPErrorProcessor
,
59 "_http_request_upgrade": _urllib2
.HTTPRequestUpgradeProcessor
,
60 "_http_default_error": _urllib2
.HTTPDefaultErrorHandler
,
63 "_basicauth": _urllib2
.HTTPBasicAuthHandler
,
64 "_digestauth": _urllib2
.HTTPDigestAuthHandler
,
65 "_redirect": _urllib2
.HTTPRedirectHandler
,
66 "_cookies": _urllib2
.HTTPCookieProcessor
,
67 "_refresh": _urllib2
.HTTPRefreshProcessor
,
68 "_equiv": _urllib2
.HTTPEquivProcessor
,
69 "_proxy": _urllib2
.ProxyHandler
,
70 "_proxy_basicauth": _urllib2
.ProxyBasicAuthHandler
,
71 "_proxy_digestauth": _urllib2
.ProxyDigestAuthHandler
,
72 "_robots": _urllib2
.HTTPRobotRulesProcessor
,
73 "_gzip": _gzip
.HTTPGzipProcessor
, # experimental!
76 "_debug_redirect": _urllib2
.HTTPRedirectDebugProcessor
,
77 "_debug_response_body": _urllib2
.HTTPResponseDebugProcessor
,
80 default_schemes
= ["http", "ftp", "file", "gopher"]
81 default_others
= ["_unknown", "_http_error", "_http_request_upgrade",
82 "_http_default_error",
84 default_features
= ["_redirect", "_cookies",
86 "_basicauth", "_digestauth",
87 "_proxy", "_proxy_basicauth", "_proxy_digestauth",
90 if hasattr(_urllib2
, 'HTTPSHandler'):
91 handler_classes
["https"] = _urllib2
.HTTPSHandler
92 default_schemes
.append("https")
95 _opener
.OpenerDirector
.__init
__(self
)
97 ua_handlers
= self
._ua
_handlers
= {}
98 for scheme
in (self
.default_schemes
+
100 self
.default_features
):
101 klass
= self
.handler_classes
[scheme
]
102 ua_handlers
[scheme
] = klass()
103 for handler
in ua_handlers
.itervalues():
104 self
.add_handler(handler
)
107 # Ensure correct default constructor args were passed to
108 # HTTPRefreshProcessor and HTTPEquivProcessor.
109 if "_refresh" in ua_handlers
:
110 self
.set_handle_refresh(True)
111 if "_equiv" in ua_handlers
:
112 self
.set_handle_equiv(True)
113 # Ensure default password managers are installed.
115 if "_basicauth" in ua_handlers
or "_digestauth" in ua_handlers
:
116 pm
= _urllib2
.HTTPPasswordMgrWithDefaultRealm()
117 if ("_proxy_basicauth" in ua_handlers
or
118 "_proxy_digestauth" in ua_handlers
):
119 ppm
= _auth
.HTTPProxyPasswordMgr()
120 self
.set_password_manager(pm
)
121 self
.set_proxy_password_manager(ppm
)
122 # set default certificate manager
123 if "https" in ua_handlers
:
124 cm
= _urllib2
.HTTPSClientCertMgr()
125 self
.set_client_cert_manager(cm
)
128 _opener
.OpenerDirector
.close(self
)
129 self
._ua
_handlers
= None
132 ## def set_timeout(self, timeout):
133 ## self._timeout = timeout
134 ## def set_http_connection_cache(self, conn_cache):
135 ## self._http_conn_cache = conn_cache
136 ## def set_ftp_connection_cache(self, conn_cache):
137 ## # XXX ATM, FTP has cache as part of handler; should it be separate?
138 ## self._ftp_conn_cache = conn_cache
def set_handled_schemes(self, schemes):
    """Set sequence of URL scheme (protocol) strings.

    For example: ua.set_handled_schemes(["http", "ftp"])

    If this fails (with ValueError) because you've passed an unknown
    scheme, the set of handled schemes will not be changed.

    schemes: iterable of scheme name strings.  Names starting with "_"
     are rejected, as are names missing from self.handler_classes.

    """
    # Validate every requested scheme before touching any handler, so a
    # bad entry cannot leave the agent half-reconfigured.
    want = {}
    for scheme in schemes:
        if scheme.startswith("_"):
            raise ValueError("not a scheme '%s'" % scheme)
        if scheme not in self.handler_classes:
            raise ValueError("unknown scheme '%s'" % scheme)
        want[scheme] = None

    # get rid of scheme handlers we don't want
    # (walk a snapshot of the names, since the helpers called below work
    # on the same registry)
    for scheme in list(self._ua_handlers):
        if scheme.startswith("_"):
            continue  # not a scheme handler
        if scheme not in want:
            self._replace_handler(scheme, None)
        else:
            del want[scheme]  # already got it
    # add the scheme handlers that are missing
    for scheme in want:
        self._set_handler(scheme, True)
def set_cookiejar(self, cookiejar):
    """Install the cookie jar to use: a mechanize.CookieJar, or None.

    The jar is handed to self._set_handler() as the object for the
    "_cookies" handler.
    """
    handler_name = "_cookies"
    self._set_handler(handler_name, obj=cookiejar)
172 # XXX could use Greg Stein's httpx for some of this instead?
def set_proxies(self, proxies):
    """Set a dictionary mapping URL scheme to proxy specification, or None.

    e.g. {"http": "joe:password@myproxy.example.com:3128",
          "ftp": "proxy.example.com"}

    The mapping is handed to self._set_handler() as the object for the
    "_proxy" handler.
    """
    handler_name = "_proxy"
    self._set_handler(handler_name, obj=proxies)
def add_password(self, url, user, password, realm=None):
    """Register credentials with the installed password manager.

    Forwards to self._password_manager.add_password(), which takes the
    realm first: (realm, url, user, password).
    """
    manager = self._password_manager
    manager.add_password(realm, url, user, password)
def add_proxy_password(self, user, password, hostport=None, realm=None):
    """Register proxy credentials with the installed proxy password manager.

    Forwards to self._proxy_password_manager.add_password() with the
    arguments reordered to (realm, hostport, user, password).
    """
    manager = self._proxy_password_manager
    manager.add_password(realm, hostport, user, password)
def add_client_certificate(self, url, key_file, cert_file):
    """Add an SSL client certificate, for HTTPS client auth.

    key_file and cert_file must be filenames of the key and certificate
    files, in PEM format.  You can use e.g. OpenSSL to convert a p12 (PKCS
    12) file to PEM format:

    openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem
    openssl pkcs12 -nocerts -in cert.p12 -out key.pem

    Note that client certificate password input is very inflexible ATM.  At
    the moment this seems to be console only, which is presumably the
    default behaviour of libopenssl.  In future mechanize may support
    third-party libraries that (I assume) allow more options here.

    """
    cert_manager = self._client_cert_manager
    cert_manager.add_key_cert(url, key_file, cert_file)
208 # the following are rarely useful -- use add_password / add_proxy_password
def set_password_manager(self, password_manager):
    """Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None.

    The manager is remembered on the instance and then passed to
    self._set_handler() for the basic-auth and the digest-auth handler,
    in that order.
    """
    self._password_manager = password_manager
    for handler_name in ("_basicauth", "_digestauth"):
        self._set_handler(handler_name, obj=password_manager)
def set_proxy_password_manager(self, password_manager):
    """Set a mechanize.HTTPProxyPasswordMgr, or None.

    The manager is remembered on the instance and then passed to
    self._set_handler() for the proxy basic-auth and the proxy digest-auth
    handler, in that order.
    """
    self._proxy_password_manager = password_manager
    for handler_name in ("_proxy_basicauth", "_proxy_digestauth"):
        self._set_handler(handler_name, obj=password_manager)
def set_client_cert_manager(self, cert_manager):
    """Set a mechanize.HTTPClientCertMgr, or None.

    The manager is always remembered on the instance.  It is attached to
    the registered "https" handler only if there is one; when no https
    handler is registered this is a no-op for the handler rather than a
    KeyError raised after the instance attribute has been updated.
    """
    self._client_cert_manager = cert_manager
    handler = self._ua_handlers.get("https")
    if handler is not None:
        handler.client_cert_manager = cert_manager
226 # these methods all take a boolean parameter
def set_handle_robots(self, handle):
    """Set whether to observe rules from robots.txt.

    handle is passed straight through to self._set_handler() for the
    "_robots" handler.
    """
    handler_name = "_robots"
    self._set_handler(handler_name, handle)
def set_handle_redirect(self, handle):
    """Set whether to handle HTTP 30x redirections.

    handle is passed straight through to self._set_handler() for the
    "_redirect" handler.
    """
    handler_name = "_redirect"
    self._set_handler(handler_name, handle)
def set_handle_refresh(self, handle, max_time=None, honor_time=True):
    """Set whether to handle HTTP Refresh headers.

    max_time and honor_time are forwarded unchanged, as keyword arguments
    for the "_refresh" handler's constructor.
    """
    refresh_options = dict(max_time=max_time, honor_time=honor_time)
    self._set_handler("_refresh", handle, constructor_kwds=refresh_options)
def set_handle_equiv(self, handle, head_parser_class=None):
    """Set whether to treat HTML http-equiv headers like HTTP headers.

    Response objects may be .seek()able if this is set (currently returned
    responses are, raised HTTPError exception responses are not).

    head_parser_class: if not None, forwarded as the "head_parser_class"
     keyword argument for the "_equiv" handler's constructor.

    """
    # Always bind the keyword dict, so that the default call (no parser
    # class given) passes an empty mapping instead of an unbound name.
    constructor_kwds = {}
    if head_parser_class is not None:
        constructor_kwds["head_parser_class"] = head_parser_class
    self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)
def set_handle_gzip(self, handle):
    """Handle gzip transfer encoding.

    A warning is issued on every call because the feature is experimental;
    handle is then passed through to self._set_handler() for the "_gzip"
    handler.

    """
    # stacklevel=2 makes the warning point at the caller of this method.
    warnings.warn(
        "gzip transfer encoding is experimental!", stacklevel=2)
    self._set_handler("_gzip", handle)
def set_debug_redirects(self, handle):
    """Log information about HTTP redirects (including refreshes).

    Logging is performed using module logging.  The logger name is
    "mechanize.http_redirects".  To actually print some debug output,
    eg:

    import sys, logging
    logger = logging.getLogger("mechanize.http_redirects")
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.setLevel(logging.INFO)

    Other logger names relevant to this module:

    "mechanize.http_responses"
    "mechanize.cookies" (or "cookielib" if running Python 2.4)

    To turn on everything:

    import sys, logging
    logger = logging.getLogger("mechanize")
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.setLevel(logging.INFO)

    """
    handler_name = "_debug_redirect"
    self._set_handler(handler_name, handle)
def set_debug_responses(self, handle):
    """Log HTTP response bodies.

    See docstring for .set_debug_redirects() for details of logging.

    Response objects may be .seek()able if this is set (currently returned
    responses are, raised HTTPError exception responses are not).

    """
    handler_name = "_debug_response_body"
    self._set_handler(handler_name, handle)
def set_debug_http(self, handle):
    """Print HTTP headers to sys.stdout.

    handle is reduced to a debug level of 0 or 1, which is applied to the
    registered "http" and "https" handlers via set_http_debuglevel().
    """
    level = int(bool(handle))
    for scheme in ("http", "https"):
        h = self._ua_handlers.get(scheme)
        # Either scheme may have no registered handler; skip it instead of
        # calling a method on None.
        if h is not None:
            h.set_http_debuglevel(level)
def _set_handler(self, name, handle=None, obj=None,
                 constructor_args=(), constructor_kwds=None):
    """Install, replace or remove the handler registered under name.

    name: key into self.handler_classes.
    handle: whether the handler should be present.  If None, it is derived
     from obj: present exactly when obj is not None.
    obj: if not None, passed as the sole argument to the handler class.
    constructor_args, constructor_kwds: used to build the handler when obj
     is None.

    The resulting handler (or None, meaning removal) is given to
    self._replace_handler().
    """
    if handle is None:
        handle = obj is not None
    newhandler = None
    if handle:
        handler_class = self.handler_classes[name]
        if obj is not None:
            newhandler = handler_class(obj)
        else:
            # None stands in for "no keyword arguments", so that no dict
            # object is shared between calls through the default.
            newhandler = handler_class(
                *constructor_args, **(constructor_kwds or {}))
    self._replace_handler(name, newhandler)
def _replace_handler(self, name, newhandler=None):
    """Swap the handler registered under name for newhandler.

    Any handler currently recorded for name is removed from self.handlers;
    then newhandler, if not None, is added with self.add_handler() and
    recorded in self._ua_handlers.
    """
    # first, if handler was previously added, remove it
    if name is not None:
        handler = self._ua_handlers.get(name)
        if handler is not None:
            try:
                self.handlers.remove(handler)
            except ValueError:
                # The record in _ua_handlers is kept when a handler is
                # removed without a replacement (see below), so it may
                # name a handler that has already left self.handlers.
                pass
    # then add the replacement, if any
    if newhandler is not None:
        self.add_handler(newhandler)
        self._ua_handlers[name] = newhandler
330 class UserAgent(UserAgentBase
):
333 UserAgentBase
.__init
__(self
)
334 self
._seekable
= False
def set_seekable_responses(self, handle):
    """Make response objects .seek()able.

    The truth value of handle is stored on the instance as self._seekable.
    """
    seekable = True if handle else False
    self._seekable = seekable
340 def open(self
, fullurl
, data
=None):
342 def bound_open(fullurl
, data
=None):
343 return UserAgentBase
.open(self
, fullurl
, data
)
344 response
= _opener
.wrapped_open(
345 bound_open
, _response
.seek_wrapped_response
, fullurl
, data
)
347 response
= UserAgentBase
.open(self
, fullurl
, data
)