1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15 deals with digest authentication.
urlopen(url, data=None) -- basic usage is the same as the original
urllib.  Pass the URL and optionally the data to POST to an HTTP URL,
and get a file-like object back.  One difference is that you can also
pass a Request instance instead of a URL.  Raises a URLError (subclass
of IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.
build_opener -- function that creates a new OpenerDirector instance and
installs the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.
30 install_opener -- installs a new opener as the default opener.
32 objects of interest:
OpenerDirector -- manages a collection of Handlers and invokes the ones
needed to open the requested URL.
Request -- an object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.
BaseHandler -- the base class that individual protocol and error
Handlers derive from.
41 exceptions:
URLError -- a subclass of IOError; individual protocols have their own
specific subclasses.

HTTPError -- also a valid HTTP response, so you can treat an HTTP error
as an exceptional event or as a valid response.
48 internals:
49 BaseHandler and parent
50 _call_chain conventions
52 Example usage:
54 import urllib2
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password('realm', 'host', 'username', 'password')
60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
62 # build a new opener that adds authentication and caching FTP handlers
63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
65 # install it
66 urllib2.install_opener(opener)
68 f = urllib2.urlopen('http://www.python.org/')
71 """
73 # XXX issues:
# If an authentication error handler tries to perform authentication
# but fails, how should the error be signalled?  The client needs to
# know the HTTP error code.  But if the handler knows what the problem
# was, e.g., that it didn't know the hash algorithm requested in the
# challenge, it would be good to pass that information along to the
# client, too.
81 # XXX to do:
82 # name!
83 # documentation (getting there)
84 # complex proxies
85 # abstract factory for opener
86 # ftp errors aren't handled cleanly
87 # gopher can return a socket.error
88 # check digest against correct (i.e. non-apache) implementation
90 import socket
91 import httplib
92 import inspect
93 import re
94 import base64
95 import urlparse
96 import md5
97 import mimetypes
98 import mimetools
99 import rfc822
100 import ftplib
101 import sys
102 import time
103 import os
104 import gopherlib
105 import posixpath
107 try:
108 from cStringIO import StringIO
109 except ImportError:
110 from StringIO import StringIO
112 try:
113 import sha
114 except ImportError:
115 # need 1.5.2 final
116 sha = None
118 # not sure how many of these need to be gotten rid of
119 from urllib import unwrap, unquote, splittype, splithost, \
120 addinfourl, splitport, splitgophertype, splitquery, \
121 splitattr, ftpwrapper, noheaders
123 # support for proxies via environment variables
124 from urllib import getproxies
126 # support for FileHandler
127 from urllib import localhost, url2pathname
129 __version__ = "2.0a1"
131 _opener = None
132 def urlopen(url, data=None):
133 global _opener
134 if _opener is None:
135 _opener = build_opener()
136 return _opener.open(url, data)
138 def install_opener(opener):
139 global _opener
140 _opener = opener
142 # do these error classes make sense?
143 # make sure all of the IOError stuff is overridden. we just want to be
144 # subtypes.
146 class URLError(IOError):
147 # URLError is a sub-type of IOError, but it doesn't share any of
148 # the implementation. need to override __init__ and __str__
149 def __init__(self, reason):
150 self.reason = reason
152 def __str__(self):
153 return '<urlopen error %s>' % self.reason
155 class HTTPError(URLError, addinfourl):
    """Raised when an HTTP error occurs, but also acts like a non-error return"""
157 __super_init = addinfourl.__init__
159 def __init__(self, url, code, msg, hdrs, fp):
160 self.code = code
161 self.msg = msg
162 self.hdrs = hdrs
163 self.fp = fp
164 self.filename = url
165 # The addinfourl classes depend on fp being a valid file
166 # object. In some cases, the HTTPError may not have a valid
167 # file object. If this happens, the simplest workaround is to
168 # not initialize the base classes.
169 if fp is not None:
170 self.__super_init(fp, hdrs, url)
172 def __str__(self):
173 return 'HTTP Error %s: %s' % (self.code, self.msg)
175 def __del__(self):
176 # XXX is this safe? what if user catches exception, then
177 # extracts fp and discards exception?
178 if self.fp:
179 self.fp.close()
181 class GopherError(URLError):
182 pass
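# Illustrative only (not part of the module): because HTTPError is also a
# valid response object, callers can either treat it as an exception or
# read it.  A minimal sketch; the URL below is just an example:
#
#   try:
#       f = urlopen('http://www.python.org/this-may-404')
#   except HTTPError, e:
#       print e.code, e.msg          # e can still be read like a response
#       body = e.fp and e.fp.read()
#   except URLError, e:
#       print 'failed to reach server:', e.reason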
185 class Request:
187 def __init__(self, url, data=None, headers={}):
188 # unwrap('<URL:type://host/path>') --> 'type://host/path'
189 self.__original = unwrap(url)
190 self.type = None
191 # self.__r_type is what's left after doing the splittype
192 self.host = None
193 self.port = None
194 self.data = data
195 self.headers = {}
196 self.headers.update(headers)
198 def __getattr__(self, attr):
199 # XXX this is a fallback mechanism to guard against these
200 # methods getting called in a non-standard order. this may be
201 # too complicated and/or unnecessary.
202 # XXX should the __r_XXX attributes be public?
203 if attr[:12] == '_Request__r_':
204 name = attr[12:]
205 if hasattr(Request, 'get_' + name):
206 getattr(self, 'get_' + name)()
207 return getattr(self, attr)
208 raise AttributeError, attr
210 def get_method(self):
211 if self.has_data():
212 return "POST"
213 else:
214 return "GET"
216 def add_data(self, data):
217 self.data = data
219 def has_data(self):
220 return self.data is not None
222 def get_data(self):
223 return self.data
225 def get_full_url(self):
226 return self.__original
228 def get_type(self):
229 if self.type is None:
230 self.type, self.__r_type = splittype(self.__original)
231 if self.type is None:
232 raise ValueError, "unknown url type: %s" % self.__original
233 return self.type
235 def get_host(self):
236 if self.host is None:
237 self.host, self.__r_host = splithost(self.__r_type)
238 if self.host:
239 self.host = unquote(self.host)
240 return self.host
242 def get_selector(self):
243 return self.__r_host
245 def set_proxy(self, host, type):
246 self.host, self.type = host, type
247 self.__r_host = self.__original
249 def add_header(self, key, val):
250 # useful for something like authentication
251 self.headers[key] = val
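# Illustrative only (not part of the module): a Request can carry the URL,
# data to POST, and extra headers.  The URL and header values below are
# just examples.
#
#   req = Request('http://www.example.com/cgi-bin/query',
#                 data='q=spam', headers={'User-Agent': 'example/0.1'})
#   req.add_header('Accept', 'text/html')
#   # req.get_method() -> 'POST', because data was supplied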
253 class OpenerDirector:
254 def __init__(self):
255 server_version = "Python-urllib/%s" % __version__
256 self.addheaders = [('User-Agent', server_version)]
257 # manage the individual handlers
258 self.handlers = []
259 self.handle_open = {}
260 self.handle_error = {}
262 def add_handler(self, handler):
263 added = 0
264 for meth in dir(handler):
265 if meth[-5:] == '_open':
266 protocol = meth[:-5]
267 if protocol in self.handle_open:
268 self.handle_open[protocol].append(handler)
269 else:
270 self.handle_open[protocol] = [handler]
271 added = 1
272 continue
273 i = meth.find('_')
274 j = meth[i+1:].find('_') + i + 1
275 if j != -1 and meth[i+1:j] == 'error':
276 proto = meth[:i]
277 kind = meth[j+1:]
278 try:
279 kind = int(kind)
280 except ValueError:
281 pass
282 dict = self.handle_error.get(proto, {})
283 if kind in dict:
284 dict[kind].append(handler)
285 else:
286 dict[kind] = [handler]
287 self.handle_error[proto] = dict
288 added = 1
289 continue
290 if added:
291 self.handlers.append(handler)
292 handler.add_parent(self)
294 def __del__(self):
295 self.close()
297 def close(self):
298 for handler in self.handlers:
299 handler.close()
300 self.handlers = []
302 def _call_chain(self, chain, kind, meth_name, *args):
303 # XXX raise an exception if no one else should try to handle
304 # this url. return None if you can't but someone else could.
305 handlers = chain.get(kind, ())
306 for handler in handlers:
307 func = getattr(handler, meth_name)
309 result = func(*args)
310 if result is not None:
311 return result
313 def open(self, fullurl, data=None):
314 # accept a URL or a Request object
315 if isinstance(fullurl, basestring):
316 req = Request(fullurl, data)
317 else:
318 req = fullurl
319 if data is not None:
320 req.add_data(data)
321 assert isinstance(req, Request) # really only care about interface
323 result = self._call_chain(self.handle_open, 'default',
324 'default_open', req)
325 if result:
326 return result
328 type_ = req.get_type()
329 result = self._call_chain(self.handle_open, type_, type_ + \
330 '_open', req)
331 if result:
332 return result
334 return self._call_chain(self.handle_open, 'unknown',
335 'unknown_open', req)
337 def error(self, proto, *args):
338 if proto in ['http', 'https']:
339 # XXX http[s] protocols are special-cased
340 dict = self.handle_error['http'] # https is not different than http
341 proto = args[2] # YUCK!
342 meth_name = 'http_error_%d' % proto
343 http_err = 1
344 orig_args = args
345 else:
346 dict = self.handle_error
347 meth_name = proto + '_error'
348 http_err = 0
349 args = (dict, proto, meth_name) + args
350 result = self._call_chain(*args)
351 if result:
352 return result
354 if http_err:
355 args = (dict, 'default', 'http_error_default') + orig_args
356 return self._call_chain(*args)
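# Note on handler method names (see _call_chain above): add_handler wires a
# handler in by inspecting its method names.  A method called <proto>_open
# registers the handler for that protocol, and <proto>_error_<code> registers
# it for that protocol's error codes.  A hypothetical sketch:
#
#   class ExampleHandler(BaseHandler):
#       def http_open(self, req):            # called for http URLs
#           return None                      # None means "let someone else try"
#       def http_error_404(self, req, fp, code, msg, hdrs):
#           return None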
358 # XXX probably also want an abstract factory that knows things like
359 # the fact that a ProxyHandler needs to get inserted first.
360 # would also know when it makes sense to skip a superclass in favor of
361 # a subclass and when it might make sense to include both
363 def build_opener(*handlers):
364 """Create an opener object from a list of handlers.
366 The opener will use several default handlers, including support
367 for HTTP and FTP. If there is a ProxyHandler, it must be at the
368 front of the list of handlers. (Yuck.)
370 If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
374 opener = OpenerDirector()
375 default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
376 HTTPDefaultErrorHandler, HTTPRedirectHandler,
377 FTPHandler, FileHandler]
378 if hasattr(httplib, 'HTTPS'):
379 default_classes.append(HTTPSHandler)
380 skip = []
381 for klass in default_classes:
382 for check in handlers:
383 if inspect.isclass(check):
384 if issubclass(check, klass):
385 skip.append(klass)
386 elif isinstance(check, klass):
387 skip.append(klass)
388 for klass in skip:
389 default_classes.remove(klass)
391 for klass in default_classes:
392 opener.add_handler(klass())
394 for h in handlers:
395 if inspect.isclass(h):
396 h = h()
397 opener.add_handler(h)
398 return opener
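# Illustrative only (not part of the module): passing a Handler subclass to
# build_opener replaces the corresponding default.  For example, a
# hypothetical HTTPHandler subclass is installed instead of HTTPHandler:
#
#   class LoggingHTTPHandler(HTTPHandler):
#       def http_open(self, req):
#           print 'opening', req.get_full_url()
#           return HTTPHandler.http_open(self, req)
#
#   opener = build_opener(LoggingHTTPHandler)  # replaces the default HTTPHandler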
400 class BaseHandler:
401 def add_parent(self, parent):
402 self.parent = parent
403 def close(self):
404 self.parent = None
406 class HTTPDefaultErrorHandler(BaseHandler):
407 def http_error_default(self, req, fp, code, msg, hdrs):
408 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
410 class HTTPRedirectHandler(BaseHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a redirection
        response is received.  If a redirection should take place, return a new
        Request to allow http_error_30x to perform the redirect.  Otherwise,
        raise HTTPError if no one else should try to handle this url.  Return
        None if you can't but another Handler might.
        """
        if (code in (301, 302, 303, 307) and req.get_method() in ("GET", "HEAD")
            or code in (302, 303) and req.get_method() == "POST"):
            # Strictly (according to RFC 2616), 302 in response to a POST
            # MUST NOT cause a redirection without confirmation from the user
            # (of urllib2, in this case).  In practice, essentially all clients
            # do redirect in this case, so we do the same.
            return Request(newurl, headers=req.headers)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)
431 # Implementation note: To avoid the server sending us into an
432 # infinite loop, the request object needs to track what URLs we
433 # have already seen. Do this by adding a handler-specific
434 # attribute to the Request object.
435 def http_error_302(self, req, fp, code, msg, headers):
436 if 'location' in headers:
437 newurl = headers['location']
438 elif 'uri' in headers:
439 newurl = headers['uri']
440 else:
441 return
442 newurl = urlparse.urljoin(req.get_full_url(), newurl)
444 # XXX Probably want to forget about the state of the current
445 # request, although that might interact poorly with other
446 # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
448 if new is None:
449 return
451 # loop detection
452 new.error_302_dict = {}
453 if hasattr(req, 'error_302_dict'):
454 if len(req.error_302_dict)>10 or \
455 newurl in req.error_302_dict:
456 raise HTTPError(req.get_full_url(), code,
457 self.inf_msg + msg, headers, fp)
458 new.error_302_dict.update(req.error_302_dict)
459 new.error_302_dict[newurl] = newurl
461 # Don't close the fp until we are sure that we won't use it
462 # with HTTPError.
463 fp.read()
464 fp.close()
466 return self.parent.open(new)
468 http_error_301 = http_error_303 = http_error_307 = http_error_302
470 inf_msg = "The HTTP server returned a redirect error that would" \
471 "lead to an infinite loop.\n" \
472 "The last 302 error message was:\n"
474 class ProxyHandler(BaseHandler):
475 def __init__(self, proxies=None):
476 if proxies is None:
477 proxies = getproxies()
478 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
479 self.proxies = proxies
480 for type, url in proxies.items():
481 setattr(self, '%s_open' % type,
482 lambda r, proxy=url, type=type, meth=self.proxy_open: \
483 meth(r, proxy, type))
485 def proxy_open(self, req, proxy, type):
486 orig_type = req.get_type()
487 type, r_type = splittype(proxy)
488 host, XXX = splithost(r_type)
489 if '@' in host:
490 user_pass, host = host.split('@', 1)
491 if ':' in user_pass:
492 user, password = user_pass.split(':', 1)
493 user_pass = base64.encodestring('%s:%s' % (unquote(user),
494 unquote(password)))
495 req.add_header('Proxy-Authorization', 'Basic ' + user_pass)
496 host = unquote(host)
497 req.set_proxy(host, type)
498 if orig_type == type:
499 # let other handlers take care of it
500 # XXX this only makes sense if the proxy is before the
501 # other handlers
502 return None
503 else:
504 # need to start over, because the other handlers don't
505 # grok the proxy's URL type
506 return self.parent.open(req)
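# Illustrative only (not part of the module): ProxyHandler takes a mapping of
# URL scheme to proxy URL; a user:password in the proxy URL is turned into a
# Proxy-Authorization header.  The proxy address below is just an example.
#
#   proxy_support = ProxyHandler({'http': 'http://user:pass@proxy.example.com:3128'})
#   opener = build_opener(proxy_support)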
508 # feature suggested by Duncan Booth
509 # XXX custom is not a good name
510 class CustomProxy:
511 # either pass a function to the constructor or override handle
512 def __init__(self, proto, func=None, proxy_addr=None):
513 self.proto = proto
514 self.func = func
515 self.addr = proxy_addr
517 def handle(self, req):
518 if self.func and self.func(req):
519 return 1
521 def get_proxy(self):
522 return self.addr
524 class CustomProxyHandler(BaseHandler):
    def __init__(self, *proxies):
        self.proxies = {}
        for proxy in proxies:
            self.add_proxy(proxy)
528 def proxy_open(self, req):
529 proto = req.get_type()
530 try:
531 proxies = self.proxies[proto]
532 except KeyError:
533 return None
534 for p in proxies:
535 if p.handle(req):
                req.set_proxy(p.get_proxy(), proto)
537 return self.parent.open(req)
538 return None
540 def do_proxy(self, p, req):
541 return self.parent.open(req)
543 def add_proxy(self, cpo):
544 if cpo.proto in self.proxies:
545 self.proxies[cpo.proto].append(cpo)
546 else:
547 self.proxies[cpo.proto] = [cpo]
549 class HTTPPasswordMgr:
550 def __init__(self):
551 self.passwd = {}
553 def add_password(self, realm, uri, user, passwd):
554 # uri could be a single URI or a sequence
555 if isinstance(uri, basestring):
556 uri = [uri]
557 uri = tuple(map(self.reduce_uri, uri))
558 if not realm in self.passwd:
559 self.passwd[realm] = {}
560 self.passwd[realm][uri] = (user, passwd)
562 def find_user_password(self, realm, authuri):
563 domains = self.passwd.get(realm, {})
564 authuri = self.reduce_uri(authuri)
565 for uris, authinfo in domains.items():
566 for uri in uris:
567 if self.is_suburi(uri, authuri):
568 return authinfo
569 return None, None
571 def reduce_uri(self, uri):
572 """Accept netloc or URI and extract only the netloc and path"""
573 parts = urlparse.urlparse(uri)
574 if parts[1]:
575 return parts[1], parts[2] or '/'
576 else:
577 return parts[2], '/'
579 def is_suburi(self, base, test):
580 """Check if test is below base in a URI tree
        Both args must be URIs in reduced form.
        """
584 if base == test:
585 return True
586 if base[0] != test[0]:
587 return False
588 common = posixpath.commonprefix((base[1], test[1]))
589 if len(common) == len(base[1]):
590 return True
591 return False
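# Illustrative only (not part of the module): passwords are stored per realm
# and per reduced URI (netloc, path), and find_user_password matches any URI
# at or below a stored one.  The names below are just examples.
#
#   mgr = HTTPPasswordMgr()
#   mgr.add_password('realm', 'http://www.example.com/private/', 'joe', 'secret')
#   mgr.find_user_password('realm', 'http://www.example.com/private/doc.html')
#   # -> ('joe', 'secret'); an unrelated URI returns (None, None)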
594 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
596 def find_user_password(self, realm, authuri):
597 user, password = HTTPPasswordMgr.find_user_password(self,realm,authuri)
598 if user is not None:
599 return user, password
600 return HTTPPasswordMgr.find_user_password(self, None, authuri)
603 class AbstractBasicAuthHandler:
605 rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)
607 # XXX there can actually be multiple auth-schemes in a
608 # www-authenticate header. should probably be a lot more careful
609 # in parsing them to extract multiple alternatives
611 def __init__(self, password_mgr=None):
612 if password_mgr is None:
613 password_mgr = HTTPPasswordMgr()
614 self.passwd = password_mgr
615 self.add_password = self.passwd.add_password
617 def http_error_auth_reqed(self, authreq, host, req, headers):
618 # XXX could be multiple headers
619 authreq = headers.get(authreq, None)
620 if authreq:
621 mo = AbstractBasicAuthHandler.rx.match(authreq)
622 if mo:
623 scheme, realm = mo.groups()
624 if scheme.lower() == 'basic':
625 return self.retry_http_basic_auth(host, req, realm)
627 def retry_http_basic_auth(self, host, req, realm):
628 user,pw = self.passwd.find_user_password(realm, host)
629 if pw:
630 raw = "%s:%s" % (user, pw)
631 auth = 'Basic %s' % base64.encodestring(raw).strip()
632 if req.headers.get(self.auth_header, None) == auth:
633 return None
634 req.add_header(self.auth_header, auth)
635 return self.parent.open(req)
636 else:
637 return None
639 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
641 auth_header = 'Authorization'
643 def http_error_401(self, req, fp, code, msg, headers):
644 host = urlparse.urlparse(req.get_full_url())[1]
645 return self.http_error_auth_reqed('www-authenticate',
646 host, req, headers)
649 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
651 auth_header = 'Proxy-Authorization'
653 def http_error_407(self, req, fp, code, msg, headers):
654 host = req.get_host()
655 return self.http_error_auth_reqed('proxy-authenticate',
656 host, req, headers)
659 class AbstractDigestAuthHandler:
661 def __init__(self, passwd=None):
662 if passwd is None:
663 passwd = HTTPPasswordMgr()
664 self.passwd = passwd
665 self.add_password = self.passwd.add_password
667 def http_error_auth_reqed(self, authreq, host, req, headers):
        authreq = headers.get(authreq, None)
669 if authreq:
670 kind = authreq.split()[0]
671 if kind == 'Digest':
672 return self.retry_http_digest_auth(req, authreq)
674 def retry_http_digest_auth(self, req, auth):
675 token, challenge = auth.split(' ', 1)
676 chal = parse_keqv_list(parse_http_list(challenge))
677 auth = self.get_authorization(req, chal)
678 if auth:
679 auth_val = 'Digest %s' % auth
680 if req.headers.get(self.auth_header, None) == auth_val:
681 return None
682 req.add_header(self.auth_header, auth_val)
683 resp = self.parent.open(req)
684 return resp
686 def get_authorization(self, req, chal):
687 try:
688 realm = chal['realm']
689 nonce = chal['nonce']
690 algorithm = chal.get('algorithm', 'MD5')
691 # mod_digest doesn't send an opaque, even though it isn't
692 # supposed to be optional
693 opaque = chal.get('opaque', None)
694 except KeyError:
695 return None
697 H, KD = self.get_algorithm_impls(algorithm)
698 if H is None:
699 return None
701 user, pw = self.passwd.find_user_password(realm,
702 req.get_full_url())
703 if user is None:
704 return None
706 # XXX not implemented yet
707 if req.has_data():
708 entdig = self.get_entity_digest(req.get_data(), chal)
709 else:
710 entdig = None
712 A1 = "%s:%s:%s" % (user, realm, pw)
713 A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
714 # XXX selector: what about proxies and full urls
715 req.get_selector())
716 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
717 # XXX should the partial digests be encoded too?
719 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
720 'response="%s"' % (user, realm, nonce, req.get_selector(),
721 respdig)
722 if opaque:
723 base = base + ', opaque="%s"' % opaque
724 if entdig:
725 base = base + ', digest="%s"' % entdig
726 if algorithm != 'MD5':
727 base = base + ', algorithm="%s"' % algorithm
728 return base
730 def get_algorithm_impls(self, algorithm):
731 # lambdas assume digest modules are imported at the top level
732 if algorithm == 'MD5':
733 H = lambda x, e=encode_digest:e(md5.new(x).digest())
734 elif algorithm == 'SHA':
            H = lambda x, e=encode_digest:e(sha.new(x).digest())
        else:
            # unknown algorithm; the caller checks for H is None and gives up
            H = None
        # XXX MD5-sess
        KD = lambda s, d, H=H: H("%s:%s" % (s, d))
738 return H, KD
740 def get_entity_digest(self, data, chal):
741 # XXX not implemented yet
742 return None
745 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
746 """An authentication protocol defined by RFC 2069
748 Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """
    auth_header = 'Authorization'
754 def http_error_401(self, req, fp, code, msg, headers):
755 host = urlparse.urlparse(req.get_full_url())[1]
        return self.http_error_auth_reqed('www-authenticate',
                                          host, req, headers)
759 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    auth_header = 'Proxy-Authorization'
763 def http_error_407(self, req, fp, code, msg, headers):
764 host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          host, req, headers)
768 def encode_digest(digest):
769 hexrep = []
770 for c in digest:
771 n = (ord(c) >> 4) & 0xf
772 hexrep.append(hex(n)[-1])
773 n = ord(c) & 0xf
774 hexrep.append(hex(n)[-1])
775 return ''.join(hexrep)
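# Illustrative: encode_digest turns a binary digest string into lowercase
# hex, e.g. encode_digest('\xab\x01') -> 'ab01'.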
778 class AbstractHTTPHandler(BaseHandler):
780 def do_open(self, http_class, req):
781 host = req.get_host()
782 if not host:
783 raise URLError('no host given')
785 try:
786 h = http_class(host) # will parse host:port
787 if req.has_data():
788 data = req.get_data()
789 h.putrequest('POST', req.get_selector())
790 if not 'Content-type' in req.headers:
791 h.putheader('Content-type',
792 'application/x-www-form-urlencoded')
793 if not 'Content-length' in req.headers:
794 h.putheader('Content-length', '%d' % len(data))
795 else:
796 h.putrequest('GET', req.get_selector())
797 except socket.error, err:
798 raise URLError(err)
800 scheme, sel = splittype(req.get_selector())
801 sel_host, sel_path = splithost(sel)
802 h.putheader('Host', sel_host or host)
803 for args in self.parent.addheaders:
804 name, value = args
805 if name not in req.headers:
806 h.putheader(*args)
807 for k, v in req.headers.items():
808 h.putheader(k, v)
809 h.endheaders()
810 if req.has_data():
811 h.send(data)
813 code, msg, hdrs = h.getreply()
814 fp = h.getfile()
815 if code == 200:
816 return addinfourl(fp, hdrs, req.get_full_url())
817 else:
818 return self.parent.error('http', req, fp, code, msg, hdrs)
821 class HTTPHandler(AbstractHTTPHandler):
823 def http_open(self, req):
824 return self.do_open(httplib.HTTP, req)
827 if hasattr(httplib, 'HTTPS'):
828 class HTTPSHandler(AbstractHTTPHandler):
830 def https_open(self, req):
831 return self.do_open(httplib.HTTPS, req)
834 class UnknownHandler(BaseHandler):
835 def unknown_open(self, req):
836 type = req.get_type()
837 raise URLError('unknown url type: %s' % type)
839 def parse_keqv_list(l):
840 """Parse list of key=value strings where keys are not duplicated."""
841 parsed = {}
842 for elt in l:
843 k, v = elt.split('=', 1)
844 if v[0] == '"' and v[-1] == '"':
845 v = v[1:-1]
846 parsed[k] = v
847 return parsed
849 def parse_http_list(s):
850 """Parse lists as described by RFC 2068 Section 2.
    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.
    """
856 # XXX this function could probably use more testing
858 list = []
859 end = len(s)
860 i = 0
861 inquote = 0
862 start = 0
863 while i < end:
864 cur = s[i:]
865 c = cur.find(',')
866 q = cur.find('"')
867 if c == -1:
868 list.append(s[start:])
869 break
870 if q == -1:
871 if inquote:
872 raise ValueError, "unbalanced quotes"
873 else:
874 list.append(s[start:i+c])
875 i = i + c + 1
876 continue
877 if inquote:
878 if q < c:
879 list.append(s[start:i+c])
880 i = i + c + 1
881 start = i
882 inquote = 0
883 else:
884 i = i + q
885 else:
886 if c < q:
887 list.append(s[start:i+c])
888 i = i + c + 1
889 start = i
890 else:
891 inquote = 1
892 i = i + q + 1
893 return map(lambda x: x.strip(), list)
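# Illustrative: parse_http_list and parse_keqv_list are used together to
# pick apart a Digest challenge, e.g.
#
#   parse_http_list('a, b, "c, d"')             # -> ['a', 'b', '"c, d"']
#   parse_keqv_list(['realm="x"', 'nonce="y"'])  # -> {'realm': 'x', 'nonce': 'y'}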
895 class FileHandler(BaseHandler):
896 # Use local file or FTP depending on form of URL
897 def file_open(self, req):
898 url = req.get_selector()
899 if url[:2] == '//' and url[2:3] != '/':
900 req.type = 'ftp'
901 return self.parent.open(req)
902 else:
903 return self.open_local_file(req)
905 # names for the localhost
906 names = None
907 def get_names(self):
908 if FileHandler.names is None:
909 FileHandler.names = (socket.gethostbyname('localhost'),
910 socket.gethostbyname(socket.gethostname()))
911 return FileHandler.names
913 # not entirely sure what the rules are here
914 def open_local_file(self, req):
915 host = req.get_host()
916 file = req.get_selector()
917 localfile = url2pathname(file)
918 stats = os.stat(localfile)
919 size = stats.st_size
920 modified = rfc822.formatdate(stats.st_mtime)
921 mtype = mimetypes.guess_type(file)[0]
922 headers = mimetools.Message(StringIO(
923 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
924 (mtype or 'text/plain', size, modified)))
925 if host:
926 host, port = splitport(host)
927 if not host or \
928 (not port and socket.gethostbyname(host) in self.get_names()):
929 return addinfourl(open(localfile, 'rb'),
930 headers, 'file:'+file)
931 raise URLError('file not on local host')
933 class FTPHandler(BaseHandler):
934 def ftp_open(self, req):
935 host = req.get_host()
936 if not host:
937 raise IOError, ('ftp error', 'no host given')
938 # XXX handle custom username & password
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        try:
            host = socket.gethostbyname(host)
        except socket.error, msg:
            raise URLError(msg)
946 path, attrs = splitattr(req.get_selector())
947 path = unquote(path)
948 dirs = path.split('/')
949 dirs, file = dirs[:-1], dirs[-1]
950 if dirs and not dirs[0]:
951 dirs = dirs[1:]
952 user = passwd = '' # XXX
953 try:
954 fw = self.connect_ftp(user, passwd, host, port, dirs)
955 type = file and 'I' or 'D'
956 for attr in attrs:
957 attr, value = splitattr(attr)
958 if attr.lower() == 'type' and \
959 value in ('a', 'A', 'i', 'I', 'd', 'D'):
960 type = value.upper()
961 fp, retrlen = fw.retrfile(file, type)
962 headers = ""
963 mtype = mimetypes.guess_type(req.get_full_url())[0]
964 if mtype:
965 headers += "Content-Type: %s\n" % mtype
966 if retrlen is not None and retrlen >= 0:
967 headers += "Content-Length: %d\n" % retrlen
968 sf = StringIO(headers)
969 headers = mimetools.Message(sf)
970 return addinfourl(fp, headers, req.get_full_url())
971 except ftplib.all_errors, msg:
972 raise IOError, ('ftp error', msg), sys.exc_info()[2]
974 def connect_ftp(self, user, passwd, host, port, dirs):
975 fw = ftpwrapper(user, passwd, host, port, dirs)
976 ## fw.ftp.set_debuglevel(1)
977 return fw
979 class CacheFTPHandler(FTPHandler):
980 # XXX would be nice to have pluggable cache strategies
981 # XXX this stuff is definitely not thread safe
982 def __init__(self):
983 self.cache = {}
984 self.timeout = {}
985 self.soonest = 0
986 self.delay = 60
987 self.max_conns = 16
989 def setTimeout(self, t):
990 self.delay = t
992 def setMaxConns(self, m):
993 self.max_conns = m
995 def connect_ftp(self, user, passwd, host, port, dirs):
996 key = user, passwd, host, port
997 if key in self.cache:
998 self.timeout[key] = time.time() + self.delay
999 else:
1000 self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
1001 self.timeout[key] = time.time() + self.delay
1002 self.check_cache()
1003 return self.cache[key]
1005 def check_cache(self):
1006 # first check for old ones
1007 t = time.time()
1008 if self.soonest <= t:
1009 for k, v in self.timeout.items():
1010 if v < t:
1011 self.cache[k].close()
1012 del self.cache[k]
1013 del self.timeout[k]
1014 self.soonest = min(self.timeout.values())
1016 # then check the size
1017 if len(self.cache) == self.max_conns:
1018 for k, v in self.timeout.items():
1019 if v == self.soonest:
1020 del self.cache[k]
1021 del self.timeout[k]
1022 break
1023 self.soonest = min(self.timeout.values())
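# Illustrative only (not part of the module): CacheFTPHandler keeps FTP
# connections alive between requests; the timeout and connection limit are
# tunable.  The values below are just examples.
#
#   cfh = CacheFTPHandler()
#   cfh.setTimeout(30)      # seconds to keep an idle connection cached
#   cfh.setMaxConns(4)
#   opener = build_opener(cfh)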
1025 class GopherHandler(BaseHandler):
1026 def gopher_open(self, req):
1027 host = req.get_host()
1028 if not host:
1029 raise GopherError('no host given')
1030 host = unquote(host)
1031 selector = req.get_selector()
1032 type, selector = splitgophertype(selector)
1033 selector, query = splitquery(selector)
1034 selector = unquote(selector)
1035 if query:
1036 query = unquote(query)
1037 fp = gopherlib.send_query(selector, query, host)
1038 else:
1039 fp = gopherlib.send_selector(selector, host)
1040 return addinfourl(fp, noheaders(), req.get_full_url())
1042 #bleck! don't use this yet
1043 class OpenerFactory:
1045 default_handlers = [UnknownHandler, HTTPHandler,
1046 HTTPDefaultErrorHandler, HTTPRedirectHandler,
1047 FTPHandler, FileHandler]
1048 proxy_handlers = [ProxyHandler]
1049 handlers = []
1050 replacement_handlers = []
1052 def add_proxy_handler(self, ph):
1053 self.proxy_handlers = self.proxy_handlers + [ph]
1055 def add_handler(self, h):
1056 self.handlers = self.handlers + [h]
1058 def replace_handler(self, h):
1059 pass
1061 def build_opener(self):
1062 opener = OpenerDirector()
1063 for ph in self.proxy_handlers:
1064 if inspect.isclass(ph):
1065 ph = ph()
            opener.add_handler(ph)
        return opener
1068 if __name__ == "__main__":
1069 # XXX some of the test code depends on machine configurations that
1070 # are internal to CNRI. Need to set up a public server with the
1071 # right authentication configuration for test purposes.
1072 if socket.gethostname() == 'bitdiddle':
1073 localhost = 'bitdiddle.cnri.reston.va.us'
1074 elif socket.gethostname() == 'bitdiddle.concentric.net':
1075 localhost = 'localhost'
1076 else:
1077 localhost = None
1078 urls = [
1079 # Thanks to Fred for finding these!
1080 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1081 'gopher://gopher.vt.edu:10010/10/33',
1083 'file:/etc/passwd',
1084 'file://nonsensename/etc/passwd',
1085 'ftp://www.python.org/pub/python/misc/sousa.au',
1086 'ftp://www.python.org/pub/tmp/blat',
1087 'http://www.espn.com/', # redirect
1088 'http://www.python.org/Spanish/Inquistion/',
1089 ('http://www.python.org/cgi-bin/faqw.py',
1090 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1091 'http://www.python.org/',
        'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
        ]
1095 ## if localhost is not None:
1096 ## urls = urls + [
1097 ## 'file://%s/etc/passwd' % localhost,
1098 ## 'http://%s/simple/' % localhost,
1099 ## 'http://%s/digest/' % localhost,
1100 ## 'http://%s/not/found.h' % localhost,
1101 ## ]
1103 ## bauth = HTTPBasicAuthHandler()
1104 ## bauth.add_password('basic_test_realm', localhost, 'jhylton',
1105 ## 'password')
1106 ## dauth = HTTPDigestAuthHandler()
1107 ## dauth.add_password('digest_test_realm', localhost, 'jhylton',
1108 ## 'password')
1111 cfh = CacheFTPHandler()
1112 cfh.setTimeout(1)
1114 ## # XXX try out some custom proxy objects too!
1115 ## def at_cnri(req):
1116 ## host = req.get_host()
1117 ## print host
1118 ## if host[-18:] == '.cnri.reston.va.us':
1119 ## return 1
1120 ## p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1121 ## ph = CustomProxyHandler(p)
1123 ## install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1124 install_opener(build_opener(cfh, GopherHandler))
1126 for url in urls:
1127 if isinstance(url, tuple):
1128 url, req = url
1129 else:
1130 req = None
1131 print url
1132 try:
1133 f = urlopen(url, req)
1134 except IOError, err:
1135 print "IOError:", err
1136 except socket.error, err:
1137 print "socket.error:", err
1138 else:
1139 buf = f.read()
1140 f.close()
1141 print "read %d bytes" % len(buf)
1142 print
1143 time.sleep(0.1)