Fix the availability statement for the spawn*() functions to reflect the
[python/dscho.git] / Lib / urllib2.py
blobffd73b82ad844087bad7e5f7e37ca54dd266eafc
1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
5 below). It opens the URL and returns the results as file-like
6 object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301 & 302 redirect errors, and the HTTPDigestAuthHandler deals
15 with digest authentication.
17 urlopen(url, data=None) -- basic usage is the same as the original
18 urllib. pass the url and optionally data to post to an HTTP URL, and
19 get a file-like object back. One difference is that you can also pass
20 a Request instance instead of URL. Raises a URLError (subclass of
21 IOError); for HTTP errors, raises an HTTPError, which can also be
22 treated as a valid response.
24 build_opener -- function that creates a new OpenerDirector instance.
25 will install the default handlers. accepts one or more Handlers as
26 arguments, either instances or Handler classes that it will
27 instantiate. if one of the arguments is a subclass of the default
28 handler, the argument will be installed instead of the default.
30 install_opener -- installs a new opener as the default opener.
32 objects of interest:
33 OpenerDirector --
35 Request -- an object that encapsulates the state of a request. the
36 state can be as simple as the URL. it can also include extra HTTP
37 headers, e.g. a User-Agent.
39 BaseHandler --
41 exceptions:
42 URLError-- a subclass of IOError, individual protocols have their own
43 specific subclass
45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46 as an exceptional event or valid response
48 internals:
49 BaseHandler and parent
50 _call_chain conventions
52 Example usage:
54 import urllib2
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password('realm', 'host', 'username', 'password')
60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
62 # build a new opener that adds authentication and caching FTP handlers
63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
65 # install it
66 urllib2.install_opener(opener)
68 f = urllib2.urlopen('http://www.python.org/')
71 """
73 # XXX issues:
74 # If an authentication error handler tries to perform
75 # authentication but fails for some reason, how should the error be
76 # signalled? The client needs to know the HTTP error code. But if
77 # the handler knows what the problem was, e.g., that it didn't know
78 # the hash algo requested in the challenge, it would be good to
79 # pass that information along to the client, too.
81 # XXX to do:
82 # name!
83 # documentation (getting there)
84 # complex proxies
85 # abstract factory for opener
86 # ftp errors aren't handled cleanly
87 # gopher can return a socket.error
88 # check digest against correct (i.e. non-apache) implementation
90 import socket
91 import httplib
92 import inspect
93 import re
94 import base64
95 import types
96 import urlparse
97 import md5
98 import mimetypes
99 import mimetools
100 import rfc822
101 import ftplib
102 import sys
103 import time
104 import os
105 import stat
106 import gopherlib
107 import posixpath
109 try:
110 from cStringIO import StringIO
111 except ImportError:
112 from StringIO import StringIO
114 try:
115 import sha
116 except ImportError:
117 # need 1.5.2 final
118 sha = None
120 # not sure how many of these need to be gotten rid of
121 from urllib import unwrap, unquote, splittype, splithost, \
122 addinfourl, splitport, splitgophertype, splitquery, \
123 splitattr, ftpwrapper, noheaders
125 # support for proxies via environment variables
126 from urllib import getproxies
128 # support for FileHandler
129 from urllib import localhost, url2pathname
131 __version__ = "2.0a1"
_opener = None  # module-wide default opener; created lazily by urlopen()

def urlopen(url, data=None):
    """Open url using the module-wide default opener.

    url and data are passed unchanged to the opener's open() method.
    If no default opener exists yet, one is created with
    build_opener() and remembered for subsequent calls.
    """
    global _opener
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data)
def install_opener(opener):
    """Make opener the module-wide default opener (the _opener global)."""
    global _opener
    _opener = opener
144 # do these error classes make sense?
145 # make sure all of the IOError stuff is overridden. we just want to be
146 # subtypes.
class URLError(IOError):
    """Error raised by the URL-opening machinery.

    URLError is a sub-type of IOError, but it doesn't share any of the
    implementation: __init__ and __str__ are both overridden and the
    only state kept is the reason passed to the constructor.
    """

    def __init__(self, reason):
        # deliberately does not call IOError.__init__
        self.reason = reason

    def __str__(self):
        return '<urlopen error %s>' % self.reason
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return.

    The instance is initialised through addinfourl with the response
    file object, headers and URL, and additionally records the status
    code, message, headers and file object as attributes.
    """
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.__super_init(fp, hdrs, url)
        self.code, self.msg = code, msg
        self.hdrs, self.fp = hdrs, fp
        # XXX
        self.filename = url

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def __del__(self):
        # XXX is this safe? what if user catches exception, then
        # extracts fp and discards exception?
        fp = self.fp
        if fp:
            fp.close()
class GopherError(URLError):
    """URLError subclass raised by the gopher handler."""
class Request:
    """Encapsulates the state of a single URL request.

    A request carries the (unwrapped) URL, optional data to send and
    a dictionary of extra headers.  The URL is split lazily: type,
    host and selector are only computed when first asked for.
    """

    def __init__(self, url, data=None, headers={}):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        # copy, so the caller's mapping (and the shared default) is
        # never mutated by add_header()
        self.headers = {}
        self.headers.update(headers)

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order. this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        prefix = '_Request__r_'
        if attr[:len(prefix)] == prefix:
            getter = 'get_' + attr[len(prefix):]
            if hasattr(Request, getter):
                getattr(self, getter)()
                # Look in the instance dict directly: going through
                # getattr() again would recurse forever if the getter
                # did not actually set the attribute.
                if attr in self.__dict__:
                    return self.__dict__[attr]
        raise AttributeError(attr)

    def add_data(self, data):
        """Set the data to be sent with the request."""
        self.data = data

    def has_data(self):
        """Return true if data has been set (i.e. is not None)."""
        return self.data is not None

    def get_data(self):
        """Return the data set for the request, or None."""
        return self.data

    def get_full_url(self):
        """Return the unwrapped URL the request was created with."""
        return self.__original

    def get_type(self):
        """Return the URL type (scheme); raise ValueError if there is none."""
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host part of the URL."""
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what is left of the URL after the host was split off."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Redirect the request to a proxy.

        host and type replace the request's own; the selector becomes
        the full original URL.
        """
        self.host, self.type = host, type
        self.__r_host = self.__original

    def add_header(self, key, val):
        """Set header key to val, replacing any previous value."""
        # useful for something like authentication
        self.headers[key] = val
class OpenerDirector:
    """Dispatches requests to a collection of registered handlers.

    Handlers are indexed by the names of their methods: <proto>_open
    methods go into handle_open, <proto>_error_<kind> methods into
    handle_error.
    """

    def __init__(self):
        server_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', server_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}

    def add_handler(self, handler):
        """Register handler under every protocol/error method it defines.

        A handler that defines no such method is ignored.
        """
        registered = 0
        for meth in dir(handler):
            if meth[-5:] == '_open':
                self.handle_open.setdefault(meth[:-5], []).append(handler)
                registered = 1
                continue
            # <proto>_error_<kind>: the part between the first two
            # underscores must be exactly 'error'
            parts = meth.split('_', 2)
            if len(parts) == 3 and parts[1] == 'error':
                proto, kind = parts[0], parts[2]
                try:
                    kind = int(kind)
                except ValueError:
                    # non-numeric kinds (e.g. 'default') stay strings
                    pass
                by_kind = self.handle_error.setdefault(proto, {})
                by_kind.setdefault(kind, []).append(handler)
                registered = 1
        if registered:
            self.handlers.append(handler)
            handler.add_parent(self)

    def __del__(self):
        self.close()

    def close(self):
        """Close every registered handler and forget them."""
        for handler in self.handlers:
            handler.close()
        self.handlers = []

    def _call_chain(self, chain, kind, meth_name, *args):
        # XXX raise an exception if no one else should try to handle
        # this url. return None if you can't but someone else could.
        for handler in chain.get(kind, ()):
            result = getattr(handler, meth_name)(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None):
        """Open fullurl, which may be a URL string or a Request object.

        The 'default' handlers are tried first, then the handlers for
        the request's own type, and finally the 'unknown' handlers.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, (types.StringType, types.UnicodeType)):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)
        assert isinstance(req, Request) # really only care about interface

        opened = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if opened:
            return opened

        scheme = req.get_type()
        opened = self._call_chain(self.handle_open, scheme,
                                  scheme + '_open', req)
        if opened:
            return opened

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered error handlers.

        For http and https the third extra argument is taken as the
        status code and selects the http_error_<code> method; if that
        chain yields nothing, http_error_default is tried.
        """
        if proto in ['http', 'https']:
            # XXX http[s] protocols are special-cased
            chain = self.handle_error['http'] # https is not different than http
            code = args[2] # YUCK!
            meth_name = 'http_error_%d' % code
            result = self._call_chain(chain, code, meth_name, *args)
            if result:
                return result
            return self._call_chain(chain, 'default',
                                    'http_error_default', *args)

        result = self._call_chain(self.handle_error, proto,
                                  proto + '_error', *args)
        if result:
            return result
350 # XXX probably also want an abstract factory that knows things like
351 # the fact that a ProxyHandler needs to get inserted first.
352 # would also know when it makes sense to skip a superclass in favor of
353 # a subclass and when it might make sense to include both
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.  If there is a ProxyHandler, it must be at the
    front of the list of handlers.  (Yuck.)

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.

    handlers may be handler instances or handler classes; classes are
    instantiated without arguments.
    """

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)

    skip = []
    for klass in default_classes:
        for check in handlers:
            if inspect.isclass(check):
                overridden = issubclass(check, klass)
            else:
                overridden = isinstance(check, klass)
            if overridden:
                # Record each default at most once: a second entry
                # would make the remove() below raise ValueError.
                skip.append(klass)
                break
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if inspect.isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
class BaseHandler:
    """Base class for handlers; keeps a reference to the parent opener."""

    def add_parent(self, parent):
        """Remember the opener this handler was added to."""
        self.parent = parent

    def close(self):
        """Drop the reference to the parent opener."""
        self.parent = None
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback HTTP error handler: turns the response into an HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        url = req.get_full_url()
        raise HTTPError(url, code, msg, hdrs, fp)
class HTTPRedirectHandler(BaseHandler):
    """Follows HTTP 301 and 302 redirects."""

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Re-issue the request against the redirect target.

        The target is taken from the 'location' header, falling back
        to 'uri'; without either, None is returned so other handlers
        can try.  Raises HTTPError when the target was already visited
        or more than ten redirects have accumulated on the request.
        """
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = Request(newurl, req.get_data())
        new.error_302_dict = {}
        if hasattr(req, 'error_302_dict'):
            if len(req.error_302_dict)>10 or \
               req.error_302_dict.has_key(newurl):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
            new.error_302_dict.update(req.error_302_dict)
        new.error_302_dict[newurl] = newurl

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_302

    # Note the trailing space after "would": the adjacent literals are
    # concatenated and used to run together as "wouldlead".
    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 302 error message was:\n"
class ProxyHandler(BaseHandler):
    """Routes requests through the proxies given as a type -> URL mapping."""

    def __init__(self, proxies=None):
        """Install one <type>_open method per entry of proxies.

        When proxies is None the mapping comes from getproxies().
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            # bind the loop values as lambda defaults so each generated
            # method keeps its own proxy
            opener_meth = lambda r, proxy=proxy_url, type=scheme, \
                          meth=self.proxy_open: meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, opener_meth)

    def proxy_open(self, req, proxy, type):
        """Point req at proxy and decide who continues with it.

        Credentials embedded in the proxy URL (user:pass@host) are
        sent as a Basic Proxy-Authorization header.
        """
        orig_type = req.get_type()
        type, r_type = splittype(proxy)
        host, XXX = splithost(r_type)
        if '@' in host:
            user_pass, host = host.split('@', 1)
            user_pass = base64.encodestring(unquote(user_pass)).strip()
            req.add_header('Proxy-Authorization', 'Basic '+user_pass)
        host = unquote(host)
        req.set_proxy(host, type)
        if orig_type != type:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            return self.parent.open(req)
        # let other handlers take care of it
        # XXX this only makes sense if the proxy is before the
        # other handlers
        return None
473 # feature suggested by Duncan Booth
474 # XXX custom is not a good name
class CustomProxy:
    """Pairs a protocol with a predicate and a proxy address."""

    # either pass a function to the constructor or override handle
    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto = proto
        self.func = func
        self.addr = proxy_addr

    def handle(self, req):
        """Return 1 if a predicate is set and accepts req, else None."""
        predicate = self.func
        if not predicate:
            return None
        if predicate(req):
            return 1
        return None

    def get_proxy(self):
        """Return the proxy address given to the constructor."""
        return self.addr
class CustomProxyHandler(BaseHandler):
    """Chooses a proxy per request by asking CustomProxy-like objects."""

    def __init__(self, *proxies):
        """Register every proxy object passed to the constructor."""
        # maps protocol -> list of proxy objects, in registration order
        self.proxies = {}
        # the arguments used to be silently dropped
        for cpo in proxies:
            self.add_proxy(cpo)

    def proxy_open(self, req):
        """Send req through the first registered proxy that accepts it.

        Returns None when no proxy is registered for the request's
        type or none of them accepts the request.
        """
        proto = req.get_type()
        try:
            proxies = self.proxies[proto]
        except KeyError:
            return None
        for p in proxies:
            if p.handle(req):
                # Request.set_proxy() needs a type as well; the proxy
                # object only supplies an address, so the request's own
                # type is kept.
                # NOTE(review): confirm this is the intended type.
                req.set_proxy(p.get_proxy(), proto)
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        return self.parent.open(req)

    def add_proxy(self, cpo):
        """Register cpo under its proto attribute."""
        if self.proxies.has_key(cpo.proto):
            self.proxies[cpo.proto].append(cpo)
        else:
            self.proxies[cpo.proto] = [cpo]
class HTTPPasswordMgr:
    """Stores (user, password) pairs keyed by realm and URI."""

    def __init__(self):
        # realm -> {tuple of reduced URIs -> (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for realm at one URI or a sequence of URIs."""
        # uri could be a single URI or a sequence
        if isinstance(uri, (types.StringType, types.UnicodeType)):
            uri = [uri]
        reduced = tuple([self.reduce_uri(u) for u in uri])
        self.passwd.setdefault(realm, {})[reduced] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, passwd) for authuri in realm, or (None, None)."""
        candidates = self.passwd.get(realm, {})
        target = self.reduce_uri(authuri)
        for uris, authinfo in candidates.items():
            for uri in uris:
                if self.is_suburi(uri, target):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        netloc, path = parts[1], parts[2]
        if netloc:
            return netloc, path or '/'
        # no netloc was parsed: the path component is the netloc
        return path, '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return 1
        if base[0] != test[0]:
            return 0
        base_path = base[1]
        shared = posixpath.commonprefix((base_path, test[1]))
        if len(shared) != len(base_path):
            return 0
        return 1
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm None."""

    def find_user_password(self, realm, authuri):
        """Look up realm first; if no user is found, retry with realm None."""
        found = HTTPPasswordMgr.find_user_password(self, realm, authuri)
        user, password = found
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
class AbstractBasicAuthHandler:
    """Shared logic for Basic authentication against servers and proxies.

    Subclasses supply auth_header, the name of the request header that
    carries the credentials.
    """

    # matches '<scheme> realm="<realm>"' at the start of a challenge
    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"')

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with Basic credentials if the challenge asks for them.

        authreq names the response header holding the challenge.
        Returns None when the header is missing, does not parse, or
        names a scheme other than Basic.
        """
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.match(challenge)
        if not mo:
            return None
        scheme, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-open req with credentials for realm and host.

        Returns None when no password is known, or when the request
        already carries exactly these credentials.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if not pw:
            return None
        raw = "%s:%s" % (user, pw)
        auth = 'Basic %s' % base64.encodestring(raw).strip()
        if req.headers.get(self.auth_header, None) == auth:
            # same credentials were already sent: don't loop
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answers 401 responses using Basic authentication."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # the netloc of the requested URL selects the credentials
        netloc = urlparse.urlparse(req.get_full_url())[1]
        return self.http_error_auth_reqed('www-authenticate',
                                          netloc, req, headers)
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answers 407 responses using Basic authentication."""

    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # the request's host selects the credentials
        proxy_host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          proxy_host, req, headers)
class AbstractDigestAuthHandler:
    """Shared logic for Digest authentication against servers and proxies.

    Subclasses name the request header that carries the credentials in
    an auth_header attribute (the older attribute name, header, is
    accepted as well).
    """

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password

    def _request_header(self):
        # Name of the request header that carries the credentials.
        try:
            return self.auth_header
        except AttributeError:
            return self.header

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with Digest credentials if the challenge asks for them.

        authreq names the response header holding the challenge (it
        used to be ignored in favour of the request header name).
        Returns None when there is no Digest challenge.
        """
        challenge = headers.get(authreq, None)
        if challenge:
            words = challenge.split()
            # scheme names compare case-insensitively, as in the Basic
            # handler
            if words and words[0].lower() == 'digest':
                return self.retry_http_digest_auth(req, challenge)

    def retry_http_digest_auth(self, req, auth):
        """Re-open req with an authorization computed from challenge auth.

        Returns None when no authorization can be computed or when the
        request already carries exactly that authorization.
        """
        parts = auth.split(' ', 1)
        if len(parts) != 2:
            # scheme name without any challenge parameters
            return None
        chal = parse_keqv_list(parse_http_list(parts[1]))
        auth = self.get_authorization(req, chal)
        if auth:
            header_name = self._request_header()
            auth_val = 'Digest %s' % auth
            if req.headers.get(header_name, None) == auth_val:
                # same credentials were already sent: don't loop
                return None
            req.add_header(header_name, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_authorization(self, req, chal):
        """Return the Digest credentials string for challenge chal.

        chal is a mapping of challenge parameters.  Returns None when
        realm or nonce is missing, the algorithm is not supported, or
        no user is known for the realm and URL.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm,
                                                  req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base = base + ', opaque="%s"' % opaque
        if entdig:
            base = base + ', digest="%s"' % entdig
        if algorithm != 'MD5':
            base = base + ', algorithm="%s"' % algorithm
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for algorithm.

        Returns (None, None) for an algorithm other than 'MD5' or
        'SHA', or when the module implementing it is not available.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            digest_mod = md5
        elif algorithm == 'SHA':
            # sha is None when the module could not be imported
            digest_mod = sha
        else:
            # XXX MD5-sess
            digest_mod = None
        if digest_mod is None:
            return None, None
        H = lambda x, e=encode_digest, m=digest_mod: e(m.new(x).digest())
        KD = lambda s, d, H=H: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    # AbstractDigestAuthHandler reads auth_header; header is kept for
    # code that still refers to the old attribute name.
    auth_header = 'Authorization'
    header = auth_header

    def http_error_401(self, req, fp, code, msg, headers):
        host = urlparse.urlparse(req.get_full_url())[1]
        # hand the retried response back to the opener (it used to be
        # dropped here)
        return self.http_error_auth_reqed('www-authenticate',
                                          host, req, headers)
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answers 407 responses using Digest authentication."""

    # AbstractDigestAuthHandler reads auth_header; header is kept for
    # code that still refers to the old attribute name.
    auth_header = 'Proxy-Authorization'
    header = auth_header

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.get_host()
        # hand the retried response back to the opener (it used to be
        # dropped here)
        return self.http_error_auth_reqed('proxy-authenticate',
                                          host, req, headers)
def encode_digest(digest):
    """Return digest as a lowercase hex string, two digits per character."""
    pairs = []
    for ch in digest:
        value = ord(ch)
        # high nibble first, then low nibble
        pairs.append('%x%x' % ((value >> 4) & 0xf, value & 0xf))
    return ''.join(pairs)
743 class AbstractHTTPHandler(BaseHandler):
745 def do_open(self, http_class, req):
746 host = req.get_host()
747 if not host:
748 raise URLError('no host given')
750 try:
751 h = http_class(host) # will parse host:port
752 if req.has_data():
753 data = req.get_data()
754 h.putrequest('POST', req.get_selector())
755 if not req.headers.has_key('Content-type'):
756 h.putheader('Content-type',
757 'application/x-www-form-urlencoded')
758 if not req.headers.has_key('Content-length'):
759 h.putheader('Content-length', '%d' % len(data))
760 else:
761 h.putrequest('GET', req.get_selector())
762 except socket.error, err:
763 raise URLError(err)
765 h.putheader('Host', host)
766 for args in self.parent.addheaders:
767 h.putheader(*args)
768 for k, v in req.headers.items():
769 h.putheader(k, v)
770 h.endheaders()
771 if req.has_data():
772 h.send(data)
774 code, msg, hdrs = h.getreply()
775 fp = h.getfile()
776 if code == 200:
777 return addinfourl(fp, hdrs, req.get_full_url())
778 else:
779 return self.parent.error('http', req, fp, code, msg, hdrs)
class HTTPHandler(AbstractHTTPHandler):
    """Opens http: URLs through httplib.HTTP."""

    def http_open(self, req):
        connection_class = httplib.HTTP
        return self.do_open(connection_class, req)
# HTTPSHandler only exists when httplib provides an HTTPS class
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Opens https: URLs through httplib.HTTPS."""

        def https_open(self, req):
            connection_class = httplib.HTTPS
            return self.do_open(connection_class, req)
class UnknownHandler(BaseHandler):
    """Handler whose unknown_open always fails with a URLError."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.get_type())
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dictionary.  A value enclosed in double quotes has the
    quotes removed.  An element without '=' raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # slices instead of indexing, so an empty value ('k=') is
        # accepted rather than raising IndexError
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.

    Returns the list of elements with surrounding whitespace stripped.
    Raises ValueError("unbalanced quotes") when a comma follows an
    opening quote that is never closed.
    """
    # XXX this function could probably use more testing

    elements = []
    end = len(s)
    i = 0          # current scan position
    inquote = 0    # true while inside a quoted-string
    start = 0      # where the element being collected begins
    while i < end:
        cur = s[i:]
        c = cur.find(',')
        q = cur.find('"')
        if c == -1:
            # no separator left: the remainder is the last element
            elements.append(s[start:])
            break
        if q == -1:
            if inquote:
                raise ValueError("unbalanced quotes")
            elements.append(s[start:i+c])
            i = i + c + 1
            # the next element begins after the comma; without this
            # every later element also contained all earlier ones
            start = i
            continue
        if inquote:
            if q < c:
                # closing quote comes before the comma: element ends
                elements.append(s[start:i+c])
                i = i + c + 1
                start = i
                inquote = 0
            else:
                # the comma is inside the quotes: skip to the quote
                i = i + q
        else:
            if c < q:
                elements.append(s[start:i+c])
                i = i + c + 1
                start = i
            else:
                inquote = 1
                i = i + q + 1
    return [elt.strip() for elt in elements]
class FileHandler(BaseHandler):
    """Opens file: URLs, either locally or by handing them on as ftp."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.get_selector()
        if url[:2] == '//' and url[2:3] != '/':
            # selector still carries a host part: retry as ftp
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the addresses regarded as local, resolving them once.

        The result is cached on the class.
        """
        if FileHandler.names is None:
            FileHandler.names = (socket.gethostbyname('localhost'),
                                 socket.gethostbyname(socket.gethostname()))
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl object for the local file named by req.

        Content-Type, Content-Length and Last-modified headers are
        synthesized from the file name and a single os.stat() call.
        Raises URLError when the URL names a host that is not local.
        """
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        # one stat is enough: the former second call's result was unused
        stats = os.stat(localfile)
        size = stats[stat.ST_SIZE]
        modified = rfc822.formatdate(stats[stat.ST_MTIME])
        mtype = mimetypes.guess_type(file)[0]
        headers = mimetools.Message(StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if host:
            host, port = splitport(host)
        # port is only examined when host is non-empty, i.e. when the
        # branch above has bound it
        if not host or \
           (not port and socket.gethostbyname(host) in self.get_names()):
            return addinfourl(open(localfile, 'rb'),
                              headers, 'file:'+file)
        raise URLError('file not on local host')
895 class FTPHandler(BaseHandler):
896 def ftp_open(self, req):
897 host = req.get_host()
898 if not host:
899 raise IOError, ('ftp error', 'no host given')
900 # XXX handle custom username & password
901 try:
902 host = socket.gethostbyname(host)
903 except socket.error, msg:
904 raise URLError(msg)
905 host, port = splitport(host)
906 if port is None:
907 port = ftplib.FTP_PORT
908 path, attrs = splitattr(req.get_selector())
909 path = unquote(path)
910 dirs = path.split('/')
911 dirs, file = dirs[:-1], dirs[-1]
912 if dirs and not dirs[0]:
913 dirs = dirs[1:]
914 user = passwd = '' # XXX
915 try:
916 fw = self.connect_ftp(user, passwd, host, port, dirs)
917 type = file and 'I' or 'D'
918 for attr in attrs:
919 attr, value = splitattr(attr)
920 if attr.lower() == 'type' and \
921 value in ('a', 'A', 'i', 'I', 'd', 'D'):
922 type = value.upper()
923 fp, retrlen = fw.retrfile(file, type)
924 headers = ""
925 mtype = mimetypes.guess_type(req.get_full_url())[0]
926 if mtype:
927 headers += "Content-Type: %s\n" % mtype
928 if retrlen is not None and retrlen >= 0:
929 headers += "Content-Length: %d\n" % retrlen
930 sf = StringIO(headers)
931 headers = mimetools.Message(sf)
932 return addinfourl(fp, headers, req.get_full_url())
933 except ftplib.all_errors, msg:
934 raise IOError, ('ftp error', msg), sys.exc_info()[2]
936 def connect_ftp(self, user, passwd, host, port, dirs):
937 fw = ftpwrapper(user, passwd, host, port, dirs)
938 ## fw.ftp.set_debuglevel(1)
939 return fw
class CacheFTPHandler(FTPHandler):
    """FTPHandler that reuses connections for a limited time."""

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}       # (user, passwd, host, port) -> ftpwrapper
        self.timeout = {}     # same keys -> expiry time
        self.soonest = 0      # earliest expiry time in self.timeout
        self.delay = 60       # added to time.time() to form an expiry time
        self.max_conns = 16   # cache size at which one entry is dropped

    def setTimeout(self, t):
        """Set the delay added to the current time when an entry is used."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which an entry gets dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return the cached ftpwrapper for this key, creating it if needed."""
        key = user, passwd, host, port
        if not self.cache.has_key(key):
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        self.timeout[key] = time.time() + self.delay
        fw = self.cache[key]
        # never let the cleanup throw away the entry being handed out
        self.check_cache(key)
        return fw

    def _reset_soonest(self):
        # min() of an empty sequence raises ValueError; fall back to
        # the value __init__ starts with
        if self.timeout:
            self.soonest = min(self.timeout.values())
        else:
            self.soonest = 0

    def check_cache(self, keep=None):
        """Expire old entries, then drop one if the cache is full.

        keep, when given, is a cache key that is neither expired nor
        dropped by this call.
        """
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            # iterate over a copy: entries are deleted while looping
            for k, v in list(self.timeout.items()):
                if v < t and k != keep:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self._reset_soonest()

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest and k != keep:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self._reset_soonest()
class GopherHandler(BaseHandler):
    """Opens gopher: URLs through gopherlib."""

    def gopher_open(self, req):
        """Return an addinfourl object for the gopher resource named by req.

        Raises GopherError when the request has no host.
        """
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        type, selector = splitgophertype(req.get_selector())
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if not query:
            fp = gopherlib.send_selector(selector, host)
        else:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        return addinfourl(fp, noheaders(), req.get_full_url())
1004 #bleck! don't use this yet
class OpenerFactory:
    """Collects handlers and builds an OpenerDirector from them.

    The lists below are class-level defaults; the add_* methods rebind
    them on the instance instead of mutating the shared lists.
    """

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    proxy_handlers = [ProxyHandler]
    handlers = []
    replacement_handlers = []

    def add_proxy_handler(self, ph):
        """Append ph to this factory's proxy handlers."""
        self.proxy_handlers = self.proxy_handlers + [ph]

    def add_handler(self, h):
        """Append h to this factory's extra handlers."""
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        # not implemented
        pass

    def build_opener(self):
        """Return an OpenerDirector with this factory's handlers added.

        Proxy handlers are added first, then the default handlers,
        then the extra handlers.  Entries that are classes are
        instantiated without arguments.
        """
        opener = OpenerDirector()
        # NOTE(review): installing default_handlers and handlers is the
        # presumed intent of these otherwise unused lists -- confirm.
        for group in (self.proxy_handlers, self.default_handlers,
                      self.handlers):
            for h in group:
                if inspect.isclass(h):
                    h = h()
                opener.add_handler(h)
        # the opener used to be built and then thrown away
        return opener
1030 if __name__ == "__main__":
1031 # XXX some of the test code depends on machine configurations that
1032 # are internal to CNRI. Need to set up a public server with the
1033 # right authentication configuration for test purposes.
1034 if socket.gethostname() == 'bitdiddle':
1035 localhost = 'bitdiddle.cnri.reston.va.us'
1036 elif socket.gethostname() == 'bitdiddle.concentric.net':
1037 localhost = 'localhost'
1038 else:
1039 localhost = None
1040 urls = [
1041 # Thanks to Fred for finding these!
1042 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1043 'gopher://gopher.vt.edu:10010/10/33',
1045 'file:/etc/passwd',
1046 'file://nonsensename/etc/passwd',
1047 'ftp://www.python.org/pub/python/misc/sousa.au',
1048 'ftp://www.python.org/pub/tmp/blat',
1049 'http://www.espn.com/', # redirect
1050 'http://www.python.org/Spanish/Inquistion/',
1051 ('http://www.python.org/cgi-bin/faqw.py',
1052 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1053 'http://www.python.org/',
1054 'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
1057 ## if localhost is not None:
1058 ## urls = urls + [
1059 ## 'file://%s/etc/passwd' % localhost,
1060 ## 'http://%s/simple/' % localhost,
1061 ## 'http://%s/digest/' % localhost,
1062 ## 'http://%s/not/found.h' % localhost,
1063 ## ]
1065 ## bauth = HTTPBasicAuthHandler()
1066 ## bauth.add_password('basic_test_realm', localhost, 'jhylton',
1067 ## 'password')
1068 ## dauth = HTTPDigestAuthHandler()
1069 ## dauth.add_password('digest_test_realm', localhost, 'jhylton',
1070 ## 'password')
1073 cfh = CacheFTPHandler()
1074 cfh.setTimeout(1)
1076 ## # XXX try out some custom proxy objects too!
1077 ## def at_cnri(req):
1078 ## host = req.get_host()
1079 ## print host
1080 ## if host[-18:] == '.cnri.reston.va.us':
1081 ## return 1
1082 ## p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1083 ## ph = CustomProxyHandler(p)
1085 ## install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1086 install_opener(build_opener(cfh, GopherHandler))
1088 for url in urls:
1089 if isinstance(url, types.TupleType):
1090 url, req = url
1091 else:
1092 req = None
1093 print url
1094 try:
1095 f = urlopen(url, req)
1096 except IOError, err:
1097 print "IOError:", err
1098 except socket.error, err:
1099 print "socket.error:", err
1100 else:
1101 buf = f.read()
1102 f.close()
1103 print "read %d bytes" % len(buf)
1104 print
1105 time.sleep(0.1)