Results of a rewrite pass
[python/dscho.git] / Lib / urllib2.py
blobf189b390298fca8ee7b583d749ad1970b0bb461f
1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
5 below). It opens the URL and returns the results as file-like
6 object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301 & 302 redirect errors, and the HTTPDigestAuthHandler deals
15 with digest authentication.
17 urlopen(url, data=None) -- basic usage is the same as the original
18 urllib. pass the url and optionally data to post to an HTTP URL, and
19 get a file-like object back. One difference is that you can also pass
20 a Request instance instead of URL. Raises a URLError (subclass of
21 IOError); for HTTP errors, raises an HTTPError, which can also be
22 treated as a valid response.
24 build_opener -- function that creates a new OpenerDirector instance.
25 will install the default handlers. accepts one or more Handlers as
26 arguments, either instances or Handler classes that it will
27 instantiate. if one of the arguments is a subclass of the default
28 handler, the argument will be installed instead of the default.
30 install_opener -- installs a new opener as the default opener.
32 objects of interest:
33 OpenerDirector --
35 Request -- an object that encapsulates the state of a request. the
36 state can be as simple as the URL. it can also include extra HTTP
37 headers, e.g. a User-Agent.
39 BaseHandler --
41 exceptions:
42 URLError-- a subclass of IOError, individual protocols have their own
43 specific subclass
45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46 as an exceptional event or valid response
48 internals:
49 BaseHandler and parent
50 _call_chain conventions
52 Example usage:
54 import urllib2
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password('realm', 'host', 'username', 'password')
60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
62 # build a new opener that adds authentication and caching FTP handlers
63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
65 # install it
66 urllib2.install_opener(opener)
68 f = urllib2.urlopen('http://www.python.org/')
71 """
73 # XXX issues:
74 # If an authentication error handler tries to perform
75 # authentication but fails for some reason, how should the error be
76 # signalled? The client needs to know the HTTP error code. But if
77 # the handler knows that the problem was, e.g., that it didn't know
78 # the hash algo that was requested in the challenge, it would be good to
79 # pass that information along to the client, too.
81 # XXX to do:
82 # name!
83 # documentation (getting there)
84 # complex proxies
85 # abstract factory for opener
86 # ftp errors aren't handled cleanly
87 # gopher can return a socket.error
88 # check digest against correct (i.e. non-apache) implementation
90 import socket
91 import httplib
92 import inspect
93 import re
94 import base64
95 import urlparse
96 import md5
97 import mimetypes
98 import mimetools
99 import rfc822
100 import ftplib
101 import sys
102 import time
103 import os
104 import gopherlib
105 import posixpath
107 try:
108 from cStringIO import StringIO
109 except ImportError:
110 from StringIO import StringIO
112 try:
113 import sha
114 except ImportError:
115 # need 1.5.2 final
116 sha = None
118 # not sure how many of these need to be gotten rid of
119 from urllib import unwrap, unquote, splittype, splithost, \
120 addinfourl, splitport, splitgophertype, splitquery, \
121 splitattr, ftpwrapper, noheaders
123 # support for proxies via environment variables
124 from urllib import getproxies
126 # support for FileHandler
127 from urllib import localhost, url2pathname
__version__ = "2.0a1"

# Module-wide default opener.  It stays None until urlopen() builds one
# on demand or install_opener() supplies one.
_opener = None


def urlopen(url, data=None):
    """Open url with the module's default opener and return the result.

    url and data are handed unchanged to the opener's open() method.
    If no default opener exists yet, one is created with build_opener()
    and remembered for later calls.
    """
    global _opener
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data)
def install_opener(opener):
    """Make opener the default opener used by later urlopen() calls.

    The previous default, if any, is simply replaced.
    """
    global _opener
    _opener = opener
142 # do these error classes make sense?
143 # make sure all of the IOError stuff is overridden. we just want to be
144 # subtypes.
class URLError(IOError):
    """Error raised when a URL cannot be opened.

    URLError is a sub-type of IOError, but it doesn't share any of the
    implementation: __init__ and __str__ are overridden and the IOError
    constructor is deliberately not called.

    reason -- whatever describes the failure; stored as self.reason.
    """

    def __init__(self, reason):
        self.reason = reason

    def __str__(self):
        # Wrap reason in a 1-tuple: a bare tuple on the right of % would be
        # unpacked as several format arguments and raise TypeError.
        return '<urlopen error %s>' % (self.reason,)
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        """Record the details of the HTTP response.

        url is stored as self.filename; code, msg, hdrs and fp are stored
        under their own names.
        """
        self.code, self.msg = code, msg
        self.hdrs, self.fp = hdrs, fp
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  When the caller has none to offer, the simplest
        # workaround is to leave the base classes uninitialised.
        if fp is None:
            return
        self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def __del__(self):
        # XXX is this safe?  what if user catches exception, then
        # extracts fp and discards exception?
        fp = self.fp
        if fp:
            fp.close()
class GopherError(URLError):
    """URLError subclass used for gopher failures; adds no behaviour."""
class Request:
    """Encapsulate the state of one URL request.

    The URL is split lazily: get_type() and get_host() parse it on first
    use and cache the pieces on the instance.  Optional POST data and
    extra headers travel with the request.
    """

    def __init__(self, url, data=None, headers={}):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        # self.__r_type is what's left after doing the splittype
        self.type = self.host = self.port = None
        self.data = data
        # Copy the caller's mapping; the shared default dict is only read.
        self.headers = {}
        self.headers.update(headers)

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        #
        # Only the name-mangled "__r_<name>" attributes are handled: the
        # matching get_<name>() method is run so that it fills the
        # attribute in, then the lookup is repeated.
        prefix = '_Request__r_'
        if attr.startswith(prefix):
            getter = 'get_' + attr[len(prefix):]
            if hasattr(Request, getter):
                getattr(self, getter)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def add_data(self, data):
        """Attach data to the request, replacing any previous data."""
        self.data = data

    def has_data(self):
        """Return true if data has been attached to the request."""
        return self.data is not None

    def get_data(self):
        """Return the attached data (None when there is none)."""
        return self.data

    def get_full_url(self):
        """Return the unwrapped URL given to the constructor."""
        return self.__original

    def get_type(self):
        """Return the URL type (scheme), parsing the URL on first use.

        Raises ValueError when the URL carries no type.
        """
        if self.type is not None:
            return self.type
        self.type, self.__r_type = splittype(self.__original)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host part, parsing it on first use."""
        if self.host is not None:
            return self.host
        self.host, self.__r_host = splithost(self.__r_type)
        if self.host:
            self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what follows the host, or the full URL after set_proxy()."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Route the request through a proxy at host speaking type.

        The selector becomes the complete original URL.
        """
        self.host = host
        self.type = type
        self.__r_host = self.__original

    def add_header(self, key, val):
        """Set request header key to val; useful e.g. for authentication."""
        self.headers[key] = val
class OpenerDirector:
    """Dispatch requests and errors to a collection of handler objects.

    Handlers are registered with add_handler().  Their capabilities are
    discovered from method names: "<protocol>_open" methods go into
    self.handle_open, "<protocol>_error_<kind>" methods into
    self.handle_error.
    """

    def __init__(self):
        server_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-Agent', server_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}    # protocol -> [handler, ...]
        self.handle_error = {}   # protocol -> {kind: [handler, ...]}

    def add_handler(self, handler):
        """Register handler for every *_open / *_error_* method it has.

        A handler offering neither kind of method is ignored.  Otherwise
        it is appended to self.handlers and told about this opener via
        handler.add_parent(self).
        """
        added = 0
        for meth in dir(handler):
            if meth[-5:] == '_open':
                protocol = meth[:-5]
                self.handle_open.setdefault(protocol, []).append(handler)
                added = 1
                continue
            # Look for "<proto>_error_<kind>".  The second underscore is
            # located relative to the first; test that find() result
            # itself, because once offset by i it can no longer signal
            # "not found".
            i = meth.find('_')
            rel = meth[i+1:].find('_')
            if rel == -1:
                continue
            j = rel + i + 1
            if meth[i+1:j] != 'error':
                continue
            proto = meth[:i]
            kind = meth[j+1:]
            try:
                kind = int(kind)        # numeric kinds, e.g. HTTP codes
            except ValueError:
                pass                    # symbolic kinds such as 'default'
            by_kind = self.handle_error.setdefault(proto, {})
            by_kind.setdefault(kind, []).append(handler)
            added = 1
        if added:
            self.handlers.append(handler)
            handler.add_parent(self)

    def __del__(self):
        self.close()

    def close(self):
        """Close every registered handler and forget them."""
        for handler in self.handlers:
            handler.close()
        self.handlers = []

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call meth_name(*args) on each handler in chain[kind] in turn.

        Returns the first result that is not None, or None when every
        handler declines (or none is registered for kind).
        """
        # XXX raise an exception if no one else should try to handle
        # this url.  return None if you can't but someone else could.
        for handler in chain.get(kind, ()):
            result = getattr(handler, meth_name)(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None):
        """Open fullurl (a URL string or a Request), optionally with data.

        The 'default' handlers are tried first, then those for the
        request's type, then the 'unknown' handlers.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)
        assert isinstance(req, Request) # really only care about interface

        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        type_ = req.get_type()
        result = self._call_chain(self.handle_open, type_,
                                  type_ + '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error for protocol proto to the error handlers.

        For 'http' and 'https' the third element of args is the status
        code; handlers for that code are tried first, then the
        http_error_default handlers.  Returns the first true result, or
        whatever the last chain returned.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            # https is not different than http.  Use .get(): an opener
            # without any http error handler must fall through and
            # return None instead of raising KeyError.
            lookup = self.handle_error.get('http', {})
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%d' % proto
            http_err = 1
            orig_args = args
        else:
            # NOTE(review): handle_error[proto] is a {kind: handlers}
            # dict, so this branch makes _call_chain iterate over kinds
            # rather than handlers -- confirm the intended lookup.
            lookup = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        result = self._call_chain(lookup, proto, meth_name, *args)
        if result:
            return result

        if http_err:
            return self._call_chain(lookup, 'default',
                                    'http_error_default', *orig_args)
352 # XXX probably also want an abstract factory that knows things like
353 # the fact that a ProxyHandler needs to get inserted first.
354 # would also know when it makes sense to skip a superclass in favor of
355 # a subclass and when it might make sense to include both
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.  If there is a ProxyHandler, it must be at the
    front of the list of handlers.  (Yuck.)

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.

    handlers -- handler instances and/or handler classes; classes are
    instantiated with no arguments.  Returns the new OpenerDirector.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)

    def overridden(klass):
        # True if any caller-supplied handler replaces default klass.
        for check in handlers:
            if inspect.isclass(check):
                if issubclass(check, klass):
                    return True
            elif isinstance(check, klass):
                return True
        return False

    # Decide per default class instead of collecting a list to remove():
    # two supplied handlers deriving from the same default would list
    # that default twice and make the second remove() raise ValueError.
    for klass in [k for k in default_classes if not overridden(k)]:
        opener.add_handler(klass())

    for h in handlers:
        if inspect.isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
class BaseHandler:
    """Base class for handlers; tracks the opener that owns them."""

    def add_parent(self, parent):
        """Store parent as self.parent."""
        self.parent = parent

    def close(self):
        """Reset self.parent to None."""
        self.parent = None
class HTTPDefaultErrorHandler(BaseHandler):
    """Default http error handler: convert the response to an HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Always raise HTTPError built from the request URL and response."""
        url = req.get_full_url()
        raise HTTPError(url, code, msg, hdrs, fp)
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301 and 302 redirects by re-opening the new location."""

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.

    # Prefix for the HTTPError message raised when a loop is detected.
    # The first fragment ends in a space: adjacent string literals are
    # joined verbatim, so without it the text reads "wouldlead".
    inf_msg = ("The HTTP server returned a redirect error that would "
               "lead to an infinite loop.\n"
               "The last 302 error message was:\n")

    def http_error_302(self, req, fp, code, msg, headers):
        """Re-issue the request at the URL named by the response.

        The target comes from the 'location' header, else 'uri'; with
        neither present the method returns None.  Raises HTTPError when
        the target was already visited in this chain or the chain holds
        more than 10 URLs.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = Request(newurl, req.get_data(), req.headers)
        new.error_302_dict = {}
        if hasattr(req, 'error_302_dict'):
            if len(req.error_302_dict) > 10 or \
               newurl in req.error_302_dict:
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
            new.error_302_dict.update(req.error_302_dict)
        new.error_302_dict[newurl] = newurl

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_302
class ProxyHandler(BaseHandler):
    """Redirect requests through the proxies configured per URL type.

    proxies -- mapping of URL type to proxy URL; defaults to the result
    of getproxies().  For every entry a "<type>_open" method is attached
    to the instance.
    """

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, url in proxies.items():
            # Bind the current url/scheme as lambda defaults so each
            # generated method keeps its own values.
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy=url, type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point req at proxy, adding Proxy-Authorization if needed.

        Returns None when the proxy speaks the request's own URL type so
        later handlers carry on, otherwise re-opens the request through
        the parent opener.  The type argument is ignored; the type is
        taken from the proxy URL.
        """
        orig_type = req.get_type()
        type, r_type = splittype(proxy)
        host, XXX = splithost(r_type)
        if '@' in host:
            user_pass, host = host.split('@', 1)
            if ':' in user_pass:
                user, password = user_pass.split(':', 1)
                raw = '%s:%s' % (unquote(user), unquote(password))
                # encodestring() emits newlines, which must not end up
                # inside a header value.
                user_pass = base64.encodestring(raw).replace('\n', '')
            req.add_header('Proxy-Authorization', 'Basic ' + user_pass)
        host = unquote(host)
        req.set_proxy(host, type)
        if orig_type == type:
            # let other handlers take care of it
            # XXX this only makes sense if the proxy is before the
            # other handlers
            return None
        # need to start over, because the other handlers don't
        # grok the proxy's URL type
        return self.parent.open(req)
478 # feature suggested by Duncan Booth
479 # XXX custom is not a good name
class CustomProxy:
    """Describe one proxy and the test that decides when to use it.

    proto      -- stored as self.proto
    func       -- optional callable taking a request; a true result
                  means this proxy wants the request
    proxy_addr -- stored as self.addr, returned by get_proxy()
    """

    # either pass a function to the constructor or override handle
    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto = proto
        self.func = func
        self.addr = proxy_addr

    def handle(self, req):
        """Return 1 if func is set and accepts req, otherwise None."""
        if not self.func:
            return None
        if self.func(req):
            return 1
        return None

    def get_proxy(self):
        """Return the proxy address given to the constructor."""
        return self.addr
class CustomProxyHandler(BaseHandler):
    """Choose a proxy per request by consulting CustomProxy-like objects.

    Each object must provide proto, handle(req) and get_proxy().  They
    are kept in self.proxies, a dict mapping proto to a list of objects.
    """

    def __init__(self, *proxies):
        self.proxies = {}
        # Register the objects passed in; the constructor used to
        # accept them and then drop them.
        for cpo in proxies:
            self.add_proxy(cpo)

    def proxy_open(self, req):
        """Re-open req through the first registered proxy that accepts it.

        Returns None when no proxy is registered for the request's type
        or none of them accepts the request.
        """
        proto = req.get_type()
        try:
            proxies = self.proxies[proto]
        except KeyError:
            return None
        for p in proxies:
            if p.handle(req):
                # set_proxy() needs the type as well as the host.
                # NOTE(review): the request's own type is passed because
                # the proxy object carries no separate scheme -- confirm.
                req.set_proxy(p.get_proxy(), proto)
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        return self.parent.open(req)

    def add_proxy(self, cpo):
        """Append cpo to the proxies registered for cpo.proto."""
        self.proxies.setdefault(cpo.proto, []).append(cpo)
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI prefix.

    self.passwd maps realm -> {tuple of reduced URIs: (user, passwd)}.
    """

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register user/passwd for realm at uri.

        uri could be a single URI or a sequence of URIs; each one is
        stored in reduced form (see reduce_uri).
        """
        if isinstance(uri, basestring):
            uri = [uri]
        reduced = tuple([self.reduce_uri(u) for u in uri])
        self.passwd.setdefault(realm, {})[reduced] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, passwd) registered for realm covering authuri.

        Returns (None, None) when nothing matches.
        """
        target = self.reduce_uri(authuri)
        for uris, authinfo in self.passwd.get(realm, {}).items():
            for uri in uris:
                if self.is_suburi(uri, target):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        netloc, path = parts[1], parts[2]
        if netloc:
            return netloc, path or '/'
        # No netloc was parsed: treat the path component as the netloc.
        return path, '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        return len(common) == len(base[1])
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to entries stored for realm None."""

    def find_user_password(self, realm, authuri):
        """Look up realm first; if no user is found, look up realm None."""
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            return lookup(self, None, authuri)
        return user, password
class AbstractBasicAuthHandler:
    """Shared logic for basic authentication handlers.

    The methods read self.auth_header (the request header that carries
    the credentials) and self.parent (the opener used for the retry);
    neither is set by this class.
    """

    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with basic credentials if the server asks for them.

        authreq names the response header holding the challenge.
        Returns None unless that header is present, matches rx and names
        the 'basic' scheme (case-insensitively).
        """
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.match(challenge)
        if not mo:
            return None
        scheme, realm = mo.groups()
        if scheme.lower() == 'basic':
            return self.retry_http_basic_auth(host, req, realm)
        return None

    def retry_http_basic_auth(self, host, req, realm):
        """Re-open req with an auth header built for realm at host.

        Returns None when no password is known, or when req already
        carries exactly the header that would be sent.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if not pw:
            return None
        raw = "%s:%s" % (user, pw)
        auth = 'Basic %s' % base64.encodestring(raw).strip()
        if req.headers.get(self.auth_header, None) == auth:
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses using basic authentication."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry with credentials for the network location of the URL."""
        netloc = urlparse.urlparse(req.get_full_url())[1]
        return self.http_error_auth_reqed('www-authenticate',
                                          netloc, req, headers)
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses using basic authentication."""

    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Retry with credentials looked up for the request's host."""
        proxy_host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          proxy_host, req, headers)
class AbstractDigestAuthHandler:
    """Shared logic for digest authentication handlers.

    Concrete handlers must name the request header that carries the
    credentials in the class attribute auth_header (the spelling header
    is accepted too) and must provide self.parent, the opener used to
    retry the request.
    """

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password

    def _credentials_header(self):
        # Name of the request header to fill in.  Both attribute
        # spellings are accepted so the mixin works with handlers that
        # define either one.
        name = getattr(self, 'auth_header', None)
        if name is None:
            name = self.header
        return name

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with digest credentials if the server asks for them.

        authreq names the response header holding the challenge (e.g.
        'www-authenticate').  Returns None unless that header is present
        and its first word is 'digest' in any letter case.
        """
        # Read the challenge from the header the caller names, not from
        # the header the credentials are sent in.
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        words = challenge.split()
        if words and words[0].lower() == 'digest':
            return self.retry_http_digest_auth(req, challenge)
        return None

    def retry_http_digest_auth(self, req, auth):
        """Re-open req with an answer to the challenge auth.

        Returns None when the challenge has no parameters, no
        authorization can be computed, or req already carries the same
        header value.
        """
        pieces = auth.split(None, 1)
        if len(pieces) != 2:
            return None
        chal = parse_keqv_list(parse_http_list(pieces[1]))
        auth = self.get_authorization(req, chal)
        if not auth:
            return None
        header = self._credentials_header()
        auth_val = 'Digest %s' % auth
        if req.headers.get(header, None) == auth_val:
            return None
        req.add_header(header, auth_val)
        return self.parent.open(req)

    def get_authorization(self, req, chal):
        """Build the value of the digest credentials for challenge chal.

        chal is the challenge parsed into a dict.  Returns None when
        realm or nonce is missing, the algorithm is unsupported, or no
        user is known for the realm and URL.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
        except KeyError:
            return None
        algorithm = chal.get('algorithm', 'MD5')
        # mod_digest doesn't send an opaque, even though it isn't
        # supposed to be optional
        opaque = chal.get('opaque', None)

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm,
                                                  req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base = base + ', opaque="%s"' % opaque
        if entdig:
            base = base + ', digest="%s"' % entdig
        if algorithm != 'MD5':
            base = base + ', algorithm="%s"' % algorithm
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for algorithm.

        H hashes a string and returns the hex digest; KD(secret, data)
        is H of "secret:data".  Returns (None, None) for an algorithm
        that is not supported or whose module is unavailable, which
        callers test with "H is None".
        """
        if algorithm == 'MD5':
            digest_mod = md5
        elif algorithm == 'SHA':
            digest_mod = sha     # None when the sha import failed
        else:
            # XXX MD5-sess
            digest_mod = None
        if digest_mod is None:
            return None, None
        H = lambda x, e=encode_digest, new=digest_mod.new: e(new(x).digest())
        KD = lambda s, d, H=H: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    # Request header that carries the credentials.  The digest base class
    # reads it as self.auth_header; header is the older spelling, kept so
    # existing references keep working.
    auth_header = 'Authorization'
    header = auth_header

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer a 401 response; return the retried response or None."""
        host = urlparse.urlparse(req.get_full_url())[1]
        # The result must be returned: the opener treats None as "not
        # handled" and moves on to the next handler.
        return self.http_error_auth_reqed('www-authenticate',
                                          host, req, headers)
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses using digest authentication."""

    # Request header that carries the credentials.  The digest base class
    # reads it as self.auth_header; header is the older spelling, kept so
    # existing references keep working.
    auth_header = 'Proxy-Authorization'
    header = auth_header

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer a 407 response; return the retried response or None."""
        host = req.get_host()
        # The result must be returned: the opener treats None as "not
        # handled" and moves on to the next handler.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          host, req, headers)
def encode_digest(digest):
    """Return digest as lowercase hex, two digits per character.

    Only the low eight bits of each character's ordinal are encoded.
    """
    return ''.join(['%02x' % (ord(c) & 0xff) for c in digest])
748 class AbstractHTTPHandler(BaseHandler):
750 def do_open(self, http_class, req):
751 host = req.get_host()
752 if not host:
753 raise URLError('no host given')
755 try:
756 h = http_class(host) # will parse host:port
757 if req.has_data():
758 data = req.get_data()
759 h.putrequest('POST', req.get_selector())
760 if not 'Content-type' in req.headers:
761 h.putheader('Content-type',
762 'application/x-www-form-urlencoded')
763 if not 'Content-length' in req.headers:
764 h.putheader('Content-length', '%d' % len(data))
765 else:
766 h.putrequest('GET', req.get_selector())
767 except socket.error, err:
768 raise URLError(err)
770 scheme, sel = splittype(req.get_selector())
771 sel_host, sel_path = splithost(sel)
772 h.putheader('Host', sel_host or host)
773 for args in self.parent.addheaders:
774 name, value = args
775 if name not in req.headers:
776 h.putheader(*args)
777 for k, v in req.headers.items():
778 h.putheader(k, v)
779 h.endheaders()
780 if req.has_data():
781 h.send(data)
783 code, msg, hdrs = h.getreply()
784 fp = h.getfile()
785 if code == 200:
786 return addinfourl(fp, hdrs, req.get_full_url())
787 else:
788 return self.parent.error('http', req, fp, code, msg, hdrs)
class HTTPHandler(AbstractHTTPHandler):
    """Open http URLs with httplib.HTTP."""

    def http_open(self, req):
        """Delegate to do_open() with the httplib.HTTP class."""
        connection_class = httplib.HTTP
        return self.do_open(connection_class, req)
# Defined only when this httplib build offers an HTTPS class.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https URLs with httplib.HTTPS."""

        def https_open(self, req):
            """Delegate to do_open() with the httplib.HTTPS class."""
            connection_class = httplib.HTTPS
            return self.do_open(connection_class, req)
class UnknownHandler(BaseHandler):
    """Last-resort handler that rejects the request's URL type."""

    def unknown_open(self, req):
        """Always raise URLError naming the request's URL type."""
        raise URLError('unknown url type: %s' % req.get_type())
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict.  A value wrapped in double quotes has the quotes
    removed; an empty value ("key=") is stored as ''.  An element
    without '=' raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of v[0] / v[-1]: indexing an empty value would
        # raise IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  Inside a quoted-string a backslash escapes the
    next character, so an escaped quote does not end the string.

    Returns the elements as a list, each stripped of surrounding
    whitespace; quotes and backslashes are left in place.  Raises
    ValueError("unbalanced quotes") when an unterminated quoted-string
    is followed by a comma.
    """
    items = []
    end = len(s)
    start = 0       # index where the current element begins
    quote_at = -1   # index of the opening quote while inside a string
    i = 0
    while i < end:
        ch = s[i]
        if quote_at >= 0:
            if ch == '\\':
                i += 2          # quoted-pair: skip the escaped character
                continue
            if ch == '"':
                quote_at = -1
        elif ch == '"':
            quote_at = i
        elif ch == ',':
            items.append(s[start:i])
            # Every element starts right after the previous comma; not
            # advancing start would repeat earlier text in later items.
            start = i + 1
        i += 1
    if quote_at >= 0 and s.find(',', quote_at) != -1:
        raise ValueError("unbalanced quotes")
    if start < end:
        items.append(s[start:])
    return [item.strip() for item in items]
class FileHandler(BaseHandler):
    """Open file URLs, from the local disk or by handing over to FTP."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req locally, or re-dispatch it as ftp.

        A selector starting with exactly two slashes has the request's
        type switched to 'ftp' and is re-opened through the parent.
        """
        selector = req.get_selector()
        names_a_host = selector[:2] == '//' and selector[2:3] != '/'
        if not names_a_host:
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost; looked up once and cached on the class
    names = None

    def get_names(self):
        """Return the cached tuple of addresses that mean "this host"."""
        if FileHandler.names is None:
            loopback = socket.gethostbyname('localhost')
            own = socket.gethostbyname(socket.gethostname())
            FileHandler.names = (loopback, own)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by req.

        Content-Type, Content-Length and Last-modified headers are
        synthesised from the file.  Raises URLError when the URL names
        a port or a host that does not resolve to a local address.
        """
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        stats = os.stat(localfile)
        modified = rfc822.formatdate(stats.st_mtime)
        mtype = mimetypes.guess_type(file)[0] or 'text/plain'
        header_text = (
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype, stats.st_size, modified))
        headers = mimetools.Message(StringIO(header_text))
        if host:
            host, port = splitport(host)
        # port is only examined when host is non-empty (short circuit).
        is_local = not host or \
            (not port and socket.gethostbyname(host) in self.get_names())
        if not is_local:
            raise URLError('file not on local host')
        return addinfourl(open(localfile, 'rb'),
                          headers, 'file:'+file)
903 class FTPHandler(BaseHandler):
904 def ftp_open(self, req):
905 host = req.get_host()
906 if not host:
907 raise IOError, ('ftp error', 'no host given')
908 # XXX handle custom username & password
909 try:
910 host = socket.gethostbyname(host)
911 except socket.error, msg:
912 raise URLError(msg)
913 host, port = splitport(host)
914 if port is None:
915 port = ftplib.FTP_PORT
916 path, attrs = splitattr(req.get_selector())
917 path = unquote(path)
918 dirs = path.split('/')
919 dirs, file = dirs[:-1], dirs[-1]
920 if dirs and not dirs[0]:
921 dirs = dirs[1:]
922 user = passwd = '' # XXX
923 try:
924 fw = self.connect_ftp(user, passwd, host, port, dirs)
925 type = file and 'I' or 'D'
926 for attr in attrs:
927 attr, value = splitattr(attr)
928 if attr.lower() == 'type' and \
929 value in ('a', 'A', 'i', 'I', 'd', 'D'):
930 type = value.upper()
931 fp, retrlen = fw.retrfile(file, type)
932 headers = ""
933 mtype = mimetypes.guess_type(req.get_full_url())[0]
934 if mtype:
935 headers += "Content-Type: %s\n" % mtype
936 if retrlen is not None and retrlen >= 0:
937 headers += "Content-Length: %d\n" % retrlen
938 sf = StringIO(headers)
939 headers = mimetools.Message(sf)
940 return addinfourl(fp, headers, req.get_full_url())
941 except ftplib.all_errors, msg:
942 raise IOError, ('ftp error', msg), sys.exc_info()[2]
944 def connect_ftp(self, user, passwd, host, port, dirs):
945 fw = ftpwrapper(user, passwd, host, port, dirs)
946 ## fw.ftp.set_debuglevel(1)
947 return fw
class CacheFTPHandler(FTPHandler):
    """FTPHandler that reuses connections kept in a timed cache.

    self.cache maps (user, passwd, host, port) to a connection and
    self.timeout maps the same key to its expiry time.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}
        self.timeout = {}
        self.soonest = 0      # earliest expiry time among the entries
        self.delay = 60       # seconds a connection stays cached
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the number of seconds a connection stays cached."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which the oldest entry is evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return the cached connection for the key, creating it if needed.

        The key's expiry time is refreshed on every call.
        """
        key = user, passwd, host, port
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        self.timeout[key] = time.time() + self.delay
        # Protect the entry about to be returned from the clean-up.
        self.check_cache(key)
        return self.cache[key]

    def check_cache(self, keep=None):
        """Drop expired entries, then enforce the size limit.

        keep -- optional key that must survive this pass.

        Every connection removed from the cache is closed.
        """
        now = time.time()
        # first check for old ones
        if self.soonest <= now:
            for k, v in list(self.timeout.items()):
                if v < now and k != keep:
                    self._drop(k)

        # then check the size
        if len(self.cache) >= self.max_conns:
            oldest = None
            for k, v in self.timeout.items():
                if k == keep:
                    continue
                if oldest is None or v < self.timeout[oldest]:
                    oldest = k
            if oldest is not None:
                self._drop(oldest)

        # The cache may be empty here; min() would raise on no values.
        if self.timeout:
            self.soonest = min(self.timeout.values())
        else:
            self.soonest = 0

    def _drop(self, key):
        # Close the connection for key and forget it.
        self.cache[key].close()
        del self.cache[key]
        del self.timeout[key]
class GopherHandler(BaseHandler):
    """Open gopher URLs through gopherlib."""

    def gopher_open(self, req):
        """Send the selector (and query, if any) and wrap the reply.

        Raises GopherError when the request has no host.
        """
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        gtype, selector = splitgophertype(req.get_selector())
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if not query:
            fp = gopherlib.send_selector(selector, host)
        else:
            fp = gopherlib.send_query(selector, unquote(query), host)
        return addinfourl(fp, noheaders(), req.get_full_url())
1012 #bleck! don't use this yet
class OpenerFactory:
    """Unfinished factory for openers; only proxy handlers are wired up."""

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    proxy_handlers = [ProxyHandler]
    handlers = []
    replacement_handlers = []

    def add_proxy_handler(self, ph):
        """Record ph after the proxy handlers already known."""
        # A new list is built, so the class-level list is never mutated.
        self.proxy_handlers = self.proxy_handlers + [ph]

    def add_handler(self, h):
        """Record h after the handlers already known."""
        # A new list is built, so the class-level list is never mutated.
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        """Placeholder; does nothing."""
        pass

    def build_opener(self):
        """Add the proxy handlers to a fresh OpenerDirector.

        Classes in the list are instantiated first.  The opener is not
        returned, and the other handler lists are not used.
        """
        opener = OpenerDirector()
        for ph in self.proxy_handlers:
            handler = ph
            if inspect.isclass(handler):
                handler = handler()
            opener.add_handler(handler)
1038 if __name__ == "__main__":
1039 # XXX some of the test code depends on machine configurations that
1040 # are internal to CNRI. Need to set up a public server with the
1041 # right authentication configuration for test purposes.
1042 if socket.gethostname() == 'bitdiddle':
1043 localhost = 'bitdiddle.cnri.reston.va.us'
1044 elif socket.gethostname() == 'bitdiddle.concentric.net':
1045 localhost = 'localhost'
1046 else:
1047 localhost = None
1048 urls = [
1049 # Thanks to Fred for finding these!
1050 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1051 'gopher://gopher.vt.edu:10010/10/33',
1053 'file:/etc/passwd',
1054 'file://nonsensename/etc/passwd',
1055 'ftp://www.python.org/pub/python/misc/sousa.au',
1056 'ftp://www.python.org/pub/tmp/blat',
1057 'http://www.espn.com/', # redirect
1058 'http://www.python.org/Spanish/Inquistion/',
1059 ('http://www.python.org/cgi-bin/faqw.py',
1060 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1061 'http://www.python.org/',
1062 'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
1065 ## if localhost is not None:
1066 ## urls = urls + [
1067 ## 'file://%s/etc/passwd' % localhost,
1068 ## 'http://%s/simple/' % localhost,
1069 ## 'http://%s/digest/' % localhost,
1070 ## 'http://%s/not/found.h' % localhost,
1071 ## ]
1073 ## bauth = HTTPBasicAuthHandler()
1074 ## bauth.add_password('basic_test_realm', localhost, 'jhylton',
1075 ## 'password')
1076 ## dauth = HTTPDigestAuthHandler()
1077 ## dauth.add_password('digest_test_realm', localhost, 'jhylton',
1078 ## 'password')
1081 cfh = CacheFTPHandler()
1082 cfh.setTimeout(1)
1084 ## # XXX try out some custom proxy objects too!
1085 ## def at_cnri(req):
1086 ## host = req.get_host()
1087 ## print host
1088 ## if host[-18:] == '.cnri.reston.va.us':
1089 ## return 1
1090 ## p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1091 ## ph = CustomProxyHandler(p)
1093 ## install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1094 install_opener(build_opener(cfh, GopherHandler))
1096 for url in urls:
1097 if isinstance(url, tuple):
1098 url, req = url
1099 else:
1100 req = None
1101 print url
1102 try:
1103 f = urlopen(url, req)
1104 except IOError, err:
1105 print "IOError:", err
1106 except socket.error, err:
1107 print "socket.error:", err
1108 else:
1109 buf = f.read()
1110 f.close()
1111 print "read %d bytes" % len(buf)
1112 print
1113 time.sleep(0.1)