append(): Fixing the test for convertability after consultation with
[python/dscho.git] / Lib / urllib2.py
blobf2e854a2210f0e40c2133a5daea2f605c842ffcd
1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
5 below). It opens the URL and returns the results as file-like
6 object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301 & 302 redirect errors, and the HTTPDigestAuthHandler deals
15 with digest authentication.
17 urlopen(url, data=None) -- basic usage is the same as in the original
18 urllib: pass the url and optionally data to post to an HTTP URL, and
19 get a file-like object back. One difference is that you can also pass
20 a Request instance instead of a URL. Raises a URLError (subclass of
21 IOError); for HTTP errors, raises an HTTPError, which can also be
22 treated as a valid response.
24 build_opener -- function that creates a new OpenerDirector instance.
25 It will install the default handlers. It accepts one or more Handlers
26 as arguments, either instances or Handler classes that it will
27 instantiate. If one of the arguments is a subclass of a default
28 handler, the argument will be installed instead of the default.
30 install_opener -- installs a new opener as the default opener.
32 objects of interest:
33 OpenerDirector --
35 Request -- an object that encapsulates the state of a request. The
36 state can be as simple as the URL. It can also include extra HTTP
37 headers, e.g. a User-Agent.
39 BaseHandler --
41 exceptions:
42 URLError-- a subclass of IOError, individual protocols have their own
43 specific subclass
45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46 as an exceptional event or valid response
48 internals:
49 BaseHandler and parent
50 _call_chain conventions
52 Example usage:
54 import urllib2
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password('realm', 'host', 'username', 'password')
60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
62 # build a new opener that adds authentication and caching FTP handlers
63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
65 # install it
66 urllib2.install_opener(opener)
68 f = urllib2.urlopen('http://www.python.org/')
71 """
73 # XXX issues:
74 # If an authentication error handler tries to perform
75 # authentication but fails for some reason, how should the error be
76 # signalled? The client needs to know the HTTP error code. But if
77 # the handler knows what the problem was, e.g., that it didn't know
78 # the hash algo that was requested in the challenge, it would be good
79 # to pass that information along to the client, too.
81 # XXX to do:
82 # name!
83 # documentation (getting there)
84 # complex proxies
85 # abstract factory for opener
86 # ftp errors aren't handled cleanly
87 # gopher can return a socket.error
88 # check digest against correct (i.e. non-apache) implementation
90 import socket
91 import httplib
92 import inspect
93 import re
94 import base64
95 import urlparse
96 import md5
97 import mimetypes
98 import mimetools
99 import rfc822
100 import ftplib
101 import sys
102 import time
103 import os
104 import gopherlib
105 import posixpath
107 try:
108 from cStringIO import StringIO
109 except ImportError:
110 from StringIO import StringIO
112 try:
113 import sha
114 except ImportError:
115 # need 1.5.2 final
116 sha = None
118 # not sure how many of these need to be gotten rid of
119 from urllib import unwrap, unquote, splittype, splithost, \
120 addinfourl, splitport, splitgophertype, splitquery, \
121 splitattr, ftpwrapper, noheaders
123 # support for proxies via environment variables
124 from urllib import getproxies
126 # support for FileHandler
127 from urllib import localhost, url2pathname
129 __version__ = "2.0a1"
# Module-wide default opener.  It is created lazily by urlopen() or set
# explicitly through install_opener().
_opener = None


def urlopen(url, data=None):
    """Open url with the module-wide default opener.

    url and data are handed unchanged to the opener's open() method.
    If no opener has been installed yet, one is created with
    build_opener() and remembered for later calls.
    """
    global _opener
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data)
def install_opener(opener):
    """Make opener the module-wide default used by urlopen()."""
    globals()['_opener'] = opener
142 # do these error classes make sense?
143 # make sure all of the IOError stuff is overridden. we just want to be
144 # subtypes.
class URLError(IOError):
    """Raised when a URL cannot be opened.

    URLError is a sub-type of IOError, but it doesn't share any of
    the implementation, so __init__ and __str__ are both overridden.
    The cause passed to the constructor is kept in the `reason`
    attribute.
    """

    def __init__(self, reason):
        # The IOError initializer is deliberately not called, so fill
        # in args by hand; otherwise the exception carries an empty
        # args tuple, which misleads repr() and code inspecting e.args.
        self.args = (reason,)
        self.reason = reason

    def __str__(self):
        return '<urlopen error %s>' % self.reason
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""

    # Private handle on the addinfourl initializer, so that it can be
    # skipped when there is no file object to wrap.
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.filename = url
        self.code, self.msg, self.hdrs, self.fp = code, msg, hdrs, fp
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is None:
            return
        self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def __del__(self):
        # XXX is this safe? what if user catches exception, then
        # extracts fp and discards exception?
        fp = self.fp
        if fp:
            fp.close()
class GopherError(URLError):
    """URLError subclass for gopher requests; adds no behaviour."""
class Request:
    """State of a single request: the URL, optional data and headers.

    The URL is only taken apart on demand.  get_type() and get_host()
    store the scheme and host in the public `type` and `host`
    attributes and keep the unparsed remainder in the private
    __r_type and __r_host attributes.
    """

    def __init__(self, url, data=None, headers={}):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        # self.__r_type is what's left after doing the splittype
        self.type = self.host = self.port = None
        self.data = data
        # Work on a private copy; the caller's mapping (and the shared
        # default argument) is never modified.
        self.headers = {}
        self.headers.update(headers)

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        prefix = '_Request__r_'
        if attr[:len(prefix)] == prefix:
            getter = 'get_' + attr[len(prefix):]
            if hasattr(Request, getter):
                # The getter fills in the missing attribute as a side
                # effect, so the second lookup succeeds normally.
                getattr(self, getter)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.__original

    def get_type(self):
        """Return the URL scheme; raise ValueError if there is none."""
        if self.type is not None:
            return self.type
        self.type, self.__r_type = splittype(self.__original)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host part of the URL."""
        if self.host is not None:
            return self.host
        self.host, self.__r_host = splithost(self.__r_type)
        if self.host:
            self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        return self.__r_host

    def set_proxy(self, host, type):
        """Route the request through a proxy at host using scheme type.

        The selector becomes the full original URL.
        """
        self.host = host
        self.type = type
        self.__r_host = self.__original

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key] = val
class OpenerDirector:
    """Dispatch requests to the handlers registered with add_handler().

    Handlers are indexed by the names of their methods: a method
    called <protocol>_open is recorded in handle_open, and a method
    called <protocol>_error_<kind> is recorded in handle_error.
    """

    def __init__(self):
        self.addheaders = [('User-Agent', "Python-urllib/%s" % __version__)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}

    def add_handler(self, handler):
        """Register handler under every protocol its method names cover.

        A handler without any *_open or *_error_* method is ignored.
        """
        registered = 0
        for meth in dir(handler):
            if meth[-5:] == '_open':
                self.handle_open.setdefault(meth[:-5], []).append(handler)
                registered = 1
                continue
            first = meth.find('_')
            second = meth[first+1:].find('_') + first + 1
            if second == -1 or meth[first+1:second] != 'error':
                continue
            proto, kind = meth[:first], meth[second+1:]
            try:
                # numeric kinds (HTTP status codes) are keyed as ints
                kind = int(kind)
            except ValueError:
                pass
            by_kind = self.handle_error.get(proto, {})
            by_kind.setdefault(kind, []).append(handler)
            self.handle_error[proto] = by_kind
            registered = 1
        if registered:
            self.handlers.append(handler)
            handler.add_parent(self)

    def __del__(self):
        self.close()

    def close(self):
        for handler in self.handlers:
            handler.close()
        self.handlers = []

    def _call_chain(self, chain, kind, meth_name, *args):
        # XXX raise an exception if no one else should try to handle
        # this url.  return None if you can't but someone else could.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is not None:
                return outcome
        return None

    def open(self, fullurl, data=None):
        """Open fullurl (a URL string or a Request) and return the result.

        The 'default' handlers are tried first, then those for the
        request's own protocol, and finally the 'unknown' handlers.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)
        assert isinstance(req, Request) # really only care about interface

        response = self._call_chain(self.handle_open, 'default',
                                    'default_open', req)
        if not response:
            scheme = req.get_type()
            response = self._call_chain(self.handle_open, scheme,
                                        scheme + '_open', req)
        if not response:
            response = self._call_chain(self.handle_open, 'unknown',
                                        'unknown_open', req)
        return response

    def error(self, proto, *args):
        """Hand an error to the handlers registered for it.

        For http and https the chain is selected by the third
        positional argument (args[2], formatted with %d), and
        http_error_default is the fallback when no specific handler
        returns a result.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            chain = self.handle_error['http'] # https is not different than http
            kind = args[2] # YUCK!
            meth_name = 'http_error_%d' % kind
        else:
            chain = self.handle_error
            kind = proto
            meth_name = proto + '_error'
        result = self._call_chain(chain, kind, meth_name, *args)
        if result:
            return result
        if is_http:
            return self._call_chain(chain, 'default', 'http_error_default',
                                    *args)
        return None
352 # XXX probably also want an abstract factory that knows things like
353 # the fact that a ProxyHandler needs to get inserted first.
354 # would also know when it makes sense to skip a superclass in favor of
355 # a subclass and when it might make sense to include both
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.  If there is a ProxyHandler, it must be at the
    front of the list of handlers.  (Yuck.)

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.

    Each argument may be a handler instance or a handler class; classes
    are instantiated without arguments.  Returns the OpenerDirector.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)

    for klass in default_classes:
        # A default is dropped as soon as ONE argument overrides it.
        # Stopping at the first match matters: collecting every match
        # and removing them afterwards fails when two arguments
        # override the same default (list.remove() raises ValueError
        # the second time).
        for check in handlers:
            if inspect.isclass(check):
                overridden = issubclass(check, klass)
            else:
                overridden = isinstance(check, klass)
            if overridden:
                break
        else:
            opener.add_handler(klass())

    for h in handlers:
        if inspect.isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
class BaseHandler:
    """Base class of the handlers; it only tracks the parent opener."""

    def add_parent(self, parent):
        """Remember the object this handler has been added to."""
        self.parent = parent

    def close(self):
        """Drop the reference to the parent."""
        self.parent = None
class HTTPDefaultErrorHandler(BaseHandler):
    """Turn an HTTP error response into a raised HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        url = req.get_full_url()
        raise HTTPError(url, code, msg, hdrs, fp)
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301 and 302 responses by opening the new location.

    Implementation note: To avoid the server sending us into an
    infinite loop, the request object needs to track what URLs we
    have already seen.  Do this by adding a handler-specific
    attribute (error_302_dict) to the Request object.
    """

    # Prefix of the HTTPError message raised when a redirect loop is
    # detected; the server's own message is appended to it.  Note the
    # trailing space before the line break ("would lead").
    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 302 error message was:\n"

    def http_error_302(self, req, fp, code, msg, headers):
        """Re-issue the request at the URL named by the response.

        Returns None when the response carries neither a location nor
        a uri header.  Raises HTTPError when the target was already
        visited or more than ten redirects have been recorded.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        # the new location may be relative to the current URL
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = Request(newurl, req.get_data(), req.headers)
        new.error_302_dict = {}
        if hasattr(req, 'error_302_dict'):
            if len(req.error_302_dict)>10 or \
               newurl in req.error_302_dict:
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
            new.error_302_dict.update(req.error_302_dict)
        new.error_302_dict[newurl] = newurl

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_302
class ProxyHandler(BaseHandler):
    """Redirect requests to the proxies configured per URL scheme."""

    def __init__(self, proxies=None):
        """proxies maps a URL scheme to a proxy URL.

        When omitted, the mapping returned by getproxies() is used.
        For every scheme a <scheme>_open method is attached to the
        instance.
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, url in proxies.items():
            # Default arguments freeze the per-scheme values; a plain
            # closure would see only the last loop iteration.
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy=url, type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point req at proxy.

        Returns None when the proxy uses the same scheme as the
        request, otherwise the result of re-opening the request.
        """
        orig_type = req.get_type()
        type, r_type = splittype(proxy)
        host, XXX = splithost(r_type)
        if '@' in host:
            user_pass, host = host.split('@', 1)
            if ':' in user_pass:
                user, password = user_pass.split(':', 1)
                # encodestring() appends a newline, which must not end
                # up inside the header value
                user_pass = base64.encodestring(
                    '%s:%s' % (unquote(user), unquote(password))).strip()
            req.add_header('Proxy-Authorization', 'Basic ' + user_pass)
        host = unquote(host)
        req.set_proxy(host, type)
        if orig_type == type:
            # let other handlers take care of it
            # XXX this only makes sense if the proxy is before the
            # other handlers
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            return self.parent.open(req)
478 # feature suggested by Duncan Booth
479 # XXX custom is not a good name
class CustomProxy:
    """A proxy address together with a test that selects requests.

    either pass a function to the constructor or override handle
    """

    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto, self.func, self.addr = proto, func, proxy_addr

    def handle(self, req):
        """Return 1 if func is set and accepts req, otherwise None."""
        predicate = self.func
        if not predicate:
            return None
        if predicate(req):
            return 1
        return None

    def get_proxy(self):
        return self.addr
class CustomProxyHandler(BaseHandler):
    """Choose a proxy per request by consulting CustomProxy-like objects."""

    def __init__(self, *proxies):
        """Register every object in proxies (see add_proxy)."""
        # protocol name -> list of proxy objects, in registration order
        self.proxies = {}
        for cpo in proxies:
            self.add_proxy(cpo)

    def proxy_open(self, req):
        """Re-open req through the first registered proxy that accepts it.

        Returns None when no proxy registered for the request's
        protocol accepts the request.
        """
        proto = req.get_type()
        for p in self.proxies.get(proto, ()):
            if p.handle(req):
                # set_proxy() needs the scheme as well as the address;
                # the request keeps its own scheme.
                req.set_proxy(p.get_proxy(), proto)
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        return self.parent.open(req)

    def add_proxy(self, cpo):
        """Register cpo under its `proto` attribute."""
        self.proxies.setdefault(cpo.proto, []).append(cpo)
class HTTPPasswordMgr:
    """Store (user, password) pairs by realm and URI prefix."""

    def __init__(self):
        # realm -> {tuple of reduced URIs: (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Remember user/passwd for realm at uri and everything below it."""
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        uri = tuple(map(self.reduce_uri, uri))
        if not realm in self.passwd:
            self.passwd[realm] = {}
        self.passwd[realm][uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) stored for authuri in realm.

        Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        authuri = self.reduce_uri(authuri)
        for uris, authinfo in domains.items():
            for uri in uris:
                if self.is_suburi(uri, authuri):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        if parts[1]:
            return parts[1], parts[2] or '/'
        else:
            # a bare netloc is parsed as a path
            return parts[2], '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments.  A plain character prefix would
        # treat '/foobar' as lying below '/foo' and hand the stored
        # credentials to an unrelated sibling path.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix = prefix + '/'
        return test[1].startswith(prefix)
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the entries of realm None."""

    def find_user_password(self, realm, authuri):
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            # nothing stored for this realm: try the default realm
            return lookup(self, None, authuri)
        return user, password
class AbstractBasicAuthHandler:
    """Shared implementation of the Basic authentication handlers.

    Subclasses provide `auth_header`, the name of the request header
    that carries the credentials.
    """

    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with credentials if the challenge asks for Basic.

        authreq is the name of the header holding the challenge.
        Returns None when there is no challenge, it cannot be parsed,
        or it names another scheme.
        """
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.match(challenge)
        if not mo:
            return None
        scheme, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-open req with the credentials known for realm at host.

        Returns None when no password is known, or when req already
        carries exactly these credentials.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if not pw:
            return None
        credentials = base64.encodestring("%s:%s" % (user, pw)).strip()
        auth = 'Basic %s' % credentials
        if req.headers.get(self.auth_header, None) == auth:
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # credentials are looked up by the network location of the URL
        netloc = urlparse.urlparse(req.get_full_url())[1]
        return self.http_error_auth_reqed('www-authenticate',
                                          netloc, req, headers)
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses with Basic credentials."""

    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # credentials are looked up by the host of the request
        proxy_host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          proxy_host, req, headers)
class AbstractDigestAuthHandler:
    """Shared implementation of the Digest authentication handlers.

    Subclasses name the request header that carries the credentials in
    a `header` (or `auth_header`) class attribute.
    """

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password

    def _credentials_header(self):
        # Name of the request header to send the credentials in.  The
        # subclasses in this module call the attribute `header`;
        # `auth_header` is honoured first when it is defined.
        name = getattr(self, 'auth_header', None)
        if name is None:
            name = self.header
        return name

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with credentials if the challenge asks for Digest.

        authreq is the name of the response header holding the
        challenge.  Returns None when there is no Digest challenge.
        """
        challenge = headers.get(authreq, None)
        if challenge:
            words = challenge.split()
            # scheme names are compared case-insensitively, as the
            # Basic handler does
            if words and words[0].lower() == 'digest':
                return self.retry_http_digest_auth(req, challenge)
        return None

    def retry_http_digest_auth(self, req, auth):
        """Re-open req with an answer to the challenge in auth.

        Returns None when the challenge has no parameters, cannot be
        answered, or req already carries the same answer.
        """
        pieces = auth.split(' ', 1)
        if len(pieces) != 2:
            return None
        chal = parse_keqv_list(parse_http_list(pieces[1]))
        auth = self.get_authorization(req, chal)
        if auth:
            name = self._credentials_header()
            auth_val = 'Digest %s' % auth
            if req.headers.get(name, None) == auth_val:
                return None
            req.add_header(name, auth_val)
            resp = self.parent.open(req)
            return resp
        return None

    def get_authorization(self, req, chal):
        """Build the credentials string for the parsed challenge chal.

        Returns None when the challenge lacks realm or nonce, names an
        unsupported algorithm, or no user is known for the realm.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm,
                                                  req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
            method = 'POST'
        else:
            entdig = None
            method = 'GET'

        # XXX selector: what about proxies and full urls
        selector = req.get_selector()
        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (method, selector)
        respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, selector, respdig)
        if opaque:
            base = base + ', opaque="%s"' % opaque
        if entdig:
            base = base + ', digest="%s"' % entdig
        if algorithm != 'MD5':
            base = base + ', algorithm="%s"' % algorithm
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for algorithm.

        Returns (None, None) for an algorithm that is not supported,
        including 'SHA' when the sha module could not be imported.
        """
        # lambdas assume digest modules are imported at the top level
        H = None
        if algorithm == 'MD5':
            H = lambda x, e=encode_digest:e(md5.new(x).digest())
        elif algorithm == 'SHA' and sha is not None:
            H = lambda x, e=encode_digest:e(sha.new(x).digest())
        # XXX MD5-sess
        if H is None:
            return None, None
        KD = lambda s, d, H=H: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    header = 'Authorization'
    # The same value under the attribute name that
    # AbstractDigestAuthHandler looks up (self.auth_header).
    auth_header = header

    def http_error_401(self, req, fp, code, msg, headers):
        host = urlparse.urlparse(req.get_full_url())[1]
        # The result must be returned: it is the response to the
        # retried request, and dropping it makes the opener treat the
        # 401 as unhandled.
        return self.http_error_auth_reqed('www-authenticate',
                                          host, req, headers)
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses with Digest credentials."""

    header = 'Proxy-Authorization'
    # The same value under the attribute name that
    # AbstractDigestAuthHandler looks up (self.auth_header).
    auth_header = header

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.get_host()
        # The result must be returned: it is the response to the
        # retried request, and dropping it makes the opener treat the
        # 407 as unhandled.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          host, req, headers)
def encode_digest(digest):
    """Return digest as lowercase hex, two digits per character."""
    return ''.join(['%02x' % ord(c) for c in digest])
748 class AbstractHTTPHandler(BaseHandler):
750 def do_open(self, http_class, req):
751 host = req.get_host()
752 if not host:
753 raise URLError('no host given')
755 try:
756 h = http_class(host) # will parse host:port
757 if req.has_data():
758 data = req.get_data()
759 h.putrequest('POST', req.get_selector())
760 if not 'Content-type' in req.headers:
761 h.putheader('Content-type',
762 'application/x-www-form-urlencoded')
763 if not 'Content-length' in req.headers:
764 h.putheader('Content-length', '%d' % len(data))
765 else:
766 h.putrequest('GET', req.get_selector())
767 except socket.error, err:
768 raise URLError(err)
770 scheme, sel = splittype(req.get_selector())
771 sel_host, sel_path = splithost(sel)
772 h.putheader('Host', sel_host or host)
773 for args in self.parent.addheaders:
774 if name not in req.headers:
775 h.putheader(*args)
776 for k, v in req.headers.items():
777 h.putheader(k, v)
778 h.endheaders()
779 if req.has_data():
780 h.send(data)
782 code, msg, hdrs = h.getreply()
783 fp = h.getfile()
784 if code == 200:
785 return addinfourl(fp, hdrs, req.get_full_url())
786 else:
787 return self.parent.error('http', req, fp, code, msg, hdrs)
class HTTPHandler(AbstractHTTPHandler):
    """Open http URLs with httplib.HTTP."""

    def http_open(self, req):
        connection_class = httplib.HTTP
        return self.do_open(connection_class, req)
# HTTPSHandler is only defined when httplib provides an HTTPS class.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https URLs with httplib.HTTPS."""

        def https_open(self, req):
            connection_class = httplib.HTTPS
            return self.do_open(connection_class, req)
class UnknownHandler(BaseHandler):
    """Reject requests whose URL type has no handler."""

    def unknown_open(self, req):
        scheme = req.get_type()
        raise URLError('unknown url type: %s' % scheme)
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dictionary.  A value enclosed in double quotes is stored
    without the quotes; an empty value is stored as ''.  An element
    without '=' raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Require two characters, so that an empty value does not
        # raise IndexError and a lone '"' is not mistaken for an
        # empty quoted string.
        if len(v) >= 2 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.

    Returns the list of elements with surrounding whitespace removed
    and empty elements dropped.  Quotes and backslash escapes are left
    in the elements as they were.  Raises ValueError when a
    quoted-string is not terminated.
    """
    elements = []
    start = 0       # index at which the current element begins
    in_quote = 0    # inside a quoted-string?
    escaped = 0     # previous character was a backslash inside quotes?
    for i in range(len(s)):
        c = s[i]
        if in_quote:
            if escaped:
                escaped = 0
            elif c == '\\':
                escaped = 1
            elif c == '"':
                in_quote = 0
        elif c == '"':
            in_quote = 1
        elif c == ',':
            # only a comma outside quotes separates elements
            elements.append(s[start:i])
            start = i + 1
    if in_quote:
        raise ValueError("unbalanced quotes")
    elements.append(s[start:])
    stripped = [elt.strip() for elt in elements]
    return [elt for elt in stripped if elt]
class FileHandler(BaseHandler):
    """Open file: URLs from the local file system.

    Use local file or FTP depending on form of URL
    """

    def file_open(self, req):
        selector = req.get_selector()
        if selector[:2] != '//' or selector[2:3] == '/':
            return self.open_local_file(req)
        # '//host/...' names another machine: retry the request as ftp
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None

    def get_names(self):
        """Return the addresses of this machine, computed once per class."""
        if FileHandler.names is None:
            loopback = socket.gethostbyname('localhost')
            own = socket.gethostbyname(socket.gethostname())
            FileHandler.names = (loopback, own)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by req.

        The headers carry a guessed Content-Type, the file size and
        the modification time.  Raises URLError when the URL names a
        host that is not this machine.
        """
        host = req.get_host()
        selector = req.get_selector()
        localfile = url2pathname(selector)
        stats = os.stat(localfile)
        content_type = mimetypes.guess_type(selector)[0] or 'text/plain'
        header_text = (
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (content_type, stats.st_size, rfc822.formatdate(stats.st_mtime)))
        headers = mimetools.Message(StringIO(header_text))
        if host:
            host, port = splitport(host)
        if not host or \
           (not port and socket.gethostbyname(host) in self.get_names()):
            return addinfourl(open(localfile, 'rb'),
                              headers, 'file:'+selector)
        raise URLError('file not on local host')
902 class FTPHandler(BaseHandler):
903 def ftp_open(self, req):
904 host = req.get_host()
905 if not host:
906 raise IOError, ('ftp error', 'no host given')
907 # XXX handle custom username & password
908 try:
909 host = socket.gethostbyname(host)
910 except socket.error, msg:
911 raise URLError(msg)
912 host, port = splitport(host)
913 if port is None:
914 port = ftplib.FTP_PORT
915 path, attrs = splitattr(req.get_selector())
916 path = unquote(path)
917 dirs = path.split('/')
918 dirs, file = dirs[:-1], dirs[-1]
919 if dirs and not dirs[0]:
920 dirs = dirs[1:]
921 user = passwd = '' # XXX
922 try:
923 fw = self.connect_ftp(user, passwd, host, port, dirs)
924 type = file and 'I' or 'D'
925 for attr in attrs:
926 attr, value = splitattr(attr)
927 if attr.lower() == 'type' and \
928 value in ('a', 'A', 'i', 'I', 'd', 'D'):
929 type = value.upper()
930 fp, retrlen = fw.retrfile(file, type)
931 headers = ""
932 mtype = mimetypes.guess_type(req.get_full_url())[0]
933 if mtype:
934 headers += "Content-Type: %s\n" % mtype
935 if retrlen is not None and retrlen >= 0:
936 headers += "Content-Length: %d\n" % retrlen
937 sf = StringIO(headers)
938 headers = mimetools.Message(sf)
939 return addinfourl(fp, headers, req.get_full_url())
940 except ftplib.all_errors, msg:
941 raise IOError, ('ftp error', msg), sys.exc_info()[2]
943 def connect_ftp(self, user, passwd, host, port, dirs):
944 fw = ftpwrapper(user, passwd, host, port, dirs)
945 ## fw.ftp.set_debuglevel(1)
946 return fw
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps connections open for reuse."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}      # (user, passwd, host, port) -> ftpwrapper
        self.timeout = {}    # same key -> time at which the entry expires
        self.soonest = 0     # earliest expiry time in self.timeout
        self.delay = 60      # seconds an entry stays valid after use
        self.max_conns = 16

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return the cached ftpwrapper for this login, creating it if needed."""
        key = user, passwd, host, port
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        # Hold on to the wrapper: with a very short delay check_cache()
        # may evict the entry again before we return.
        fw = self.cache[key]
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return fw

    def _update_soonest(self):
        # min() raises ValueError on an empty sequence, so an empty
        # cache falls back to the initial value.
        if self.timeout:
            self.soonest = min(self.timeout.values())
        else:
            self.soonest = 0

    def check_cache(self):
        """Close expired connections and enforce the size limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            # items() returns a list here, so deleting while looping
            # is safe
            for k, v in self.timeout.items():
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self._update_soonest()

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in self.timeout.items():
                if v == self.soonest:
                    # close the evicted connection, as is done for
                    # expired ones
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self._update_soonest()
class GopherHandler(BaseHandler):
    """Open gopher: URLs through gopherlib."""

    def gopher_open(self, req):
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        # the gopher type is split off the selector but not used further
        gtype, rest = splitgophertype(req.get_selector())
        rest, query = splitquery(rest)
        selector = unquote(rest)
        if not query:
            fp = gopherlib.send_selector(selector, host)
        else:
            fp = gopherlib.send_query(selector, unquote(query), host)
        return addinfourl(fp, noheaders(), req.get_full_url())
1011 #bleck! don't use this yet
class OpenerFactory:
    """Experimental builder of OpenerDirector objects.

    Handlers may be given as classes or as instances; classes are
    instantiated when the opener is built.
    """

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    proxy_handlers = [ProxyHandler]
    handlers = []
    replacement_handlers = []

    def add_proxy_handler(self, ph):
        # build a new list so the class-level default stays untouched
        self.proxy_handlers = self.proxy_handlers + [ph]

    def add_handler(self, h):
        # build a new list so the class-level default stays untouched
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        # XXX not implemented
        pass

    def build_opener(self):
        """Return a new OpenerDirector carrying the configured handlers.

        Proxy handlers are added first, then the default handlers,
        then the handlers registered with add_handler().
        """
        opener = OpenerDirector()
        for group in (self.proxy_handlers, self.default_handlers,
                      self.handlers):
            for h in group:
                if inspect.isclass(h):
                    h = h()
                opener.add_handler(h)
        return opener
1037 if __name__ == "__main__":
1038 # XXX some of the test code depends on machine configurations that
1039 # are internal to CNRI. Need to set up a public server with the
1040 # right authentication configuration for test purposes.
1041 if socket.gethostname() == 'bitdiddle':
1042 localhost = 'bitdiddle.cnri.reston.va.us'
1043 elif socket.gethostname() == 'bitdiddle.concentric.net':
1044 localhost = 'localhost'
1045 else:
1046 localhost = None
1047 urls = [
1048 # Thanks to Fred for finding these!
1049 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1050 'gopher://gopher.vt.edu:10010/10/33',
1052 'file:/etc/passwd',
1053 'file://nonsensename/etc/passwd',
1054 'ftp://www.python.org/pub/python/misc/sousa.au',
1055 'ftp://www.python.org/pub/tmp/blat',
1056 'http://www.espn.com/', # redirect
1057 'http://www.python.org/Spanish/Inquistion/',
1058 ('http://www.python.org/cgi-bin/faqw.py',
1059 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1060 'http://www.python.org/',
1061 'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
1064 ## if localhost is not None:
1065 ## urls = urls + [
1066 ## 'file://%s/etc/passwd' % localhost,
1067 ## 'http://%s/simple/' % localhost,
1068 ## 'http://%s/digest/' % localhost,
1069 ## 'http://%s/not/found.h' % localhost,
1070 ## ]
1072 ## bauth = HTTPBasicAuthHandler()
1073 ## bauth.add_password('basic_test_realm', localhost, 'jhylton',
1074 ## 'password')
1075 ## dauth = HTTPDigestAuthHandler()
1076 ## dauth.add_password('digest_test_realm', localhost, 'jhylton',
1077 ## 'password')
1080 cfh = CacheFTPHandler()
1081 cfh.setTimeout(1)
1083 ## # XXX try out some custom proxy objects too!
1084 ## def at_cnri(req):
1085 ## host = req.get_host()
1086 ## print host
1087 ## if host[-18:] == '.cnri.reston.va.us':
1088 ## return 1
1089 ## p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1090 ## ph = CustomProxyHandler(p)
1092 ## install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1093 install_opener(build_opener(cfh, GopherHandler))
1095 for url in urls:
1096 if isinstance(url, tuple):
1097 url, req = url
1098 else:
1099 req = None
1100 print url
1101 try:
1102 f = urlopen(url, req)
1103 except IOError, err:
1104 print "IOError:", err
1105 except socket.error, err:
1106 print "socket.error:", err
1107 else:
1108 buf = f.read()
1109 f.close()
1110 print "read %d bytes" % len(buf)
1111 print
1112 time.sleep(0.1)