New \grammartoken markup, similar to \token but allowed everywhere.
[python/dscho.git] / Lib / urllib2.py
blobf9e6bf254f4332a39697105800f2eebc7b0a52f9
1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
5 below). It opens the URL and returns the results as file-like
6 object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301 & 302 redirect errors, and the HTTPDigestAuthHandler deals
15 with digest authentication.
17 urlopen(url, data=None) -- basic usage is the same as original
18 urllib. Pass the url and optionally data to post to an HTTP URL, and
19 get a file-like object back. One difference is that you can also pass
20 a Request instance instead of URL. Raises a URLError (subclass of
21 IOError); for HTTP errors, raises an HTTPError, which can also be
22 treated as a valid response.
24 build_opener -- function that creates a new OpenerDirector instance.
25 will install the default handlers. accepts one or more Handlers as
26 arguments, either instances or Handler classes that it will
27 instantiate. If one of the arguments is a subclass of the default
28 handler, the argument will be installed instead of the default.
30 install_opener -- installs a new opener as the default opener.
32 objects of interest:
33 OpenerDirector --
35 Request -- an object that encapsulates the state of a request. the
36 state can be as simple as the URL. It can also include extra HTTP
37 headers, e.g. a User-Agent.
39 BaseHandler --
41 exceptions:
42 URLError-- a subclass of IOError, individual protocols have their own
43 specific subclass
45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46 as an exceptional event or valid response
48 internals:
49 BaseHandler and parent
50 _call_chain conventions
52 Example usage:
54 import urllib2
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password('realm', 'host', 'username', 'password')
60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
62 # build a new opener that adds authentication and caching FTP handlers
63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
65 # install it
66 urllib2.install_opener(opener)
68 f = urllib2.urlopen('http://www.python.org/')
71 """
73 # XXX issues:
74 # If an authentication error handler tries to perform
75 # authentication but fails for some reason, how should the error be
76 # signalled? The client needs to know the HTTP error code. But if
77 # the handler knows that the problem was, e.g., that it didn't know
78 # the hash algorithm requested in the challenge, it would be good to
79 # pass that information along to the client, too.
81 # XXX to do:
82 # name!
83 # documentation (getting there)
84 # complex proxies
85 # abstract factory for opener
86 # ftp errors aren't handled cleanly
87 # gopher can return a socket.error
88 # check digest against correct (i.e. non-apache) implementation
90 import socket
91 import httplib
92 import re
93 import base64
94 import types
95 import urlparse
96 import md5
97 import mimetypes
98 import mimetools
99 import ftplib
100 import sys
101 import time
102 import gopherlib
103 import posixpath
105 try:
106 from cStringIO import StringIO
107 except ImportError:
108 from StringIO import StringIO
110 try:
111 import sha
112 except ImportError:
113 # need 1.5.2 final
114 sha = None
116 # not sure how many of these need to be gotten rid of
117 from urllib import unwrap, unquote, splittype, splithost, \
118 addinfourl, splitport, splitgophertype, splitquery, \
119 splitattr, ftpwrapper, noheaders
121 # support for proxies via environment variables
122 from urllib import getproxies
124 # support for FileHandler
125 from urllib import localhost, url2pathname
__version__ = "2.0a1"

# Module-wide default opener.  It stays None until urlopen() builds one
# on demand or install_opener() supplies one.
_opener = None

def urlopen(url, data=None):
    """Open url through the module-wide default opener.

    url and data are handed unchanged to the opener's open() method.
    When no default opener exists yet, one is created with
    build_opener() and remembered for later calls.
    """
    global _opener
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data)
def install_opener(opener):
    """Make opener the module-wide default used by urlopen()."""
    global _opener
    _opener = opener
140 # do these error classes make sense?
141 # make sure all of the IOError stuff is overridden. we just want to be
142 # subtypes.
class URLError(IOError):
    """Error raised while opening a URL; the cause is kept in `reason`.

    URLError is a sub-type of IOError, but it doesn't share any of the
    implementation, so both __init__ and __str__ are overridden.
    """

    def __init__(self, reason):
        # IOError.__init__ is intentionally not invoked.
        self.reason = reason

    def __str__(self):
        template = '<urlopen error %s>'
        return template % self.reason
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""

    def __init__(self, url, code, msg, hdrs, fp):
        # Initialise the addinfourl half so the exception can be used
        # like a response object; URLError.__init__ is not called.
        addinfourl.__init__(self, fp, hdrs, url)
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        # XXX
        self.filename = url

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def __del__(self):
        # XXX is this safe?  what if user catches exception, then
        # extracts fp and discards exception?
        fp = self.fp
        if fp:
            fp.close()
class GopherError(URLError):
    """URLError subclass raised by GopherHandler."""
class Request:
    """Encapsulates the state of one URL request.

    Holds the (unwrapped) URL, optional data and a private copy of the
    extra headers.  The URL is split lazily: get_type() and get_host()
    parse it on first use and cache the leftovers in the private
    __r_type / __r_host attributes.
    """

    def __init__(self, url, data=None, headers={}):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        # Copy the mapping so the caller's dict is never shared.
        self.headers = {}
        self.headers.update(headers)

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        prefix = '_Request__r_'
        if attr[:len(prefix)] == prefix:
            getter = 'get_' + attr[len(prefix):]
            if hasattr(Request, getter):
                # Running the getter is expected to set the attribute.
                getattr(self, getter)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def add_data(self, data):
        """Attach data to the request."""
        self.data = data

    def has_data(self):
        """Return true when data has been attached."""
        return self.data is not None

    def get_data(self):
        """Return the attached data (None when there is none)."""
        return self.data

    def get_full_url(self):
        """Return the unwrapped URL given to the constructor."""
        return self.__original

    def get_type(self):
        """Return the URL type; raise ValueError when it has none."""
        if self.type is not None:
            return self.type
        self.type, self.__r_type = splittype(self.__original)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host part of the URL."""
        if self.host is not None:
            return self.host
        self.host, self.__r_host = splithost(self.__r_type)
        if self.host:
            self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what follows the host part of the URL."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Route the request through host; the full URL becomes the selector."""
        self.host = host
        self.type = type
        self.__r_host = self.__original

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key] = val
class OpenerDirector:
    """Composite object that dispatches a request to registered handlers.

    Handlers are indexed by method name: "<protocol>_open" methods go
    into handle_open, "<protocol>_error_<kind>" methods into
    handle_error.
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}

    def add_handler(self, handler):
        """Register handler under every open/error method it provides."""
        registered = 0
        for meth in get_methods(handler):
            if meth[-5:] == '_open':
                self.handle_open.setdefault(meth[:-5], []).append(handler)
                registered = 1
                continue
            # "<proto>_error_<kind>"; kind may itself contain underscores.
            pieces = meth.split('_', 2)
            if len(pieces) == 3 and pieces[1] == 'error':
                proto, kind = pieces[0], pieces[2]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                by_kind = self.handle_error.setdefault(proto, {})
                by_kind.setdefault(kind, []).append(handler)
                registered = 1
        if registered:
            self.handlers.append(handler)
            handler.add_parent(self)

    def __del__(self):
        self.close()

    def close(self):
        """Close every registered handler and forget them."""
        for handler in self.handlers:
            handler.close()
        self.handlers = []

    def _call_chain(self, chain, kind, meth_name, *args):
        # XXX raise an exception if no one else should try to handle
        # this url.  return None if you can't but someone else could.
        for handler in chain.get(kind, ()):
            result = getattr(handler, meth_name)(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None):
        """Open fullurl (a string or Request) and return the result.

        Handlers are tried in three stages: 'default' openers, openers
        for the request's own type, then 'unknown' openers.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, types.StringType):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)
        assert isinstance(req, Request) # really only care about interface

        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if not result:
            scheme = req.get_type()
            result = self._call_chain(self.handle_open, scheme,
                                      scheme + '_open', req)
        if not result:
            result = self._call_chain(self.handle_open, 'unknown',
                                      'unknown_open', req)
        return result

    def error(self, proto, *args):
        """Dispatch an error to the handlers registered for proto."""
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special cased
            table = self.handle_error['http'] # https is not different then http
            code = args[2] # YUCK!
            kind = code
            meth_name = 'http_error_%d' % code
        else:
            table = self.handle_error
            kind = proto
            meth_name = proto + '_error'
        result = self._call_chain(table, kind, meth_name, *args)
        if result:
            return result

        if is_http:
            return self._call_chain(table, 'default', 'http_error_default',
                                    *args)
def is_callable(obj):
    """Return a true value when obj is one of the callable kinds checked.

    Builtin functions and methods, lambdas/functions and methods yield 1;
    an instance yields whether it has __call__; anything else yields 0.
    """
    # not quite like builtin callable (which I didn't know existed),
    # not entirely sure it needs to be different
    callable_types = (types.BuiltinFunctionType, types.BuiltinMethodType,
                      types.LambdaType, types.MethodType)
    if type(obj) in callable_types:
        return 1
    if isinstance(obj, types.InstanceType):
        return hasattr(obj, '__call__')
    return 0
def get_methods(inst):
    """Return the names of the methods and callable attributes of inst.

    The class of inst and all of its base classes are searched for
    unbound methods; callable attributes found on the instance itself
    are added afterwards.
    """
    found = {}
    pending = [inst.__class__]
    while pending:
        klass = pending.pop(0)
        pending.extend(klass.__bases__)
        for name in dir(klass):
            if isinstance(getattr(klass, name), types.UnboundMethodType):
                found[name] = 1
    for name in dir(inst):
        if is_callable(getattr(inst, name)):
            found[name] = 1
    return found.keys()
374 # XXX probably also want an abstract factory that knows things like
375 # the fact that a ProxyHandler needs to get inserted first.
376 # would also know when it makes sense to skip a superclass in favor of
377 # a subclass and when it might make sense to include both
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.  If there is a ProxyHandler, it must be at the
    front of the list of handlers.  (Yuck.)

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.

    Each argument may be a handler instance or a handler class; classes
    are instantiated without arguments.  Returns the OpenerDirector.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)

    skip = []
    for klass in default_classes:
        for check in handlers:
            if isinstance(check, types.ClassType):
                replaced = issubclass(check, klass)
            elif isinstance(check, types.InstanceType):
                replaced = isinstance(check, klass)
            else:
                replaced = 0
            if replaced:
                # Record each default class once only: listing it twice
                # made the second remove() below raise ValueError when
                # several arguments replaced the same default.
                skip.append(klass)
                break
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isinstance(h, types.ClassType):
            h = h()
        opener.add_handler(h)
    return opener
class BaseHandler:
    """Base class for handlers; keeps a reference to the owning opener."""

    def add_parent(self, parent):
        """Remember parent, the opener this handler was added to."""
        self.parent = parent

    def close(self):
        """Drop the reference to the parent opener."""
        self.parent = None
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback HTTP error handler: turns the response into an HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        url = req.get_full_url()
        raise HTTPError(url, code, msg, hdrs, fp)
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301 and 302 redirects."""

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Re-issue the request at the URL named by the response.

        The target is read from the 'location' header, falling back to
        'uri'; None is returned when neither is present.  HTTPError is
        raised when the target was already visited or more than ten
        redirects have been followed.
        """
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        # The new location may be relative to the current URL.
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = Request(newurl, req.get_data())
        new.error_302_dict = {}
        if hasattr(req, 'error_302_dict'):
            seen = req.error_302_dict
            if len(seen) > 10 or seen.has_key(newurl):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
            new.error_302_dict.update(seen)
        new.error_302_dict[newurl] = newurl

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_302

    # The first fragment needs its trailing space; without it the
    # message read "wouldlead".
    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 302 error message was:\n"
class ProxyHandler(BaseHandler):
    """Redirect requests through the proxies given in a mapping.

    For every (type, url) pair in the mapping an instance attribute
    "<type>_open" is created that forwards to proxy_open().
    """

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            # Defaults bind the current values; a plain closure would
            # see only the last pair of the loop.
            opener = lambda r, proxy=proxy_url, type=scheme, \
                     meth=self.proxy_open: meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, opener)

    def proxy_open(self, req, proxy, type):
        """Point req at proxy; reopen it if the proxy uses another type."""
        orig_type = req.get_type()
        proxy_type, rest = splittype(proxy)
        host, XXX = splithost(rest)
        if '@' in host:
            # "user:password@host" -- send the credentials as basic auth.
            user_pass, host = host.split('@', 1)
            user_pass = base64.encodestring(unquote(user_pass)).strip()
            req.add_header('Proxy-Authorization', 'Basic '+user_pass)
        host = unquote(host)
        req.set_proxy(host, proxy_type)
        if orig_type != proxy_type:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            return self.parent.open(req)
        # let other handlers take care of it
        # XXX this only makes sense if the proxy is before the
        # other handlers
        return None
# feature suggested by Duncan Booth
# XXX custom is not a good name
class CustomProxy:
    """Associates a protocol with a proxy address and a request predicate."""

    # either pass a function to the constructor or override handle
    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto = proto
        self.func = func
        self.addr = proxy_addr

    def handle(self, req):
        """Return 1 when func is set and accepts req, otherwise None."""
        predicate = self.func
        if not predicate:
            return None
        if predicate(req):
            return 1

    def get_proxy(self):
        """Return the proxy address given to the constructor."""
        return self.addr
class CustomProxyHandler(BaseHandler):
    """Routes requests through CustomProxy objects grouped by protocol."""

    def __init__(self, *proxies):
        # Maps protocol name -> list of CustomProxy objects.
        self.proxies = {}
        # The constructor arguments used to be dropped silently;
        # register each of them.
        for cpo in proxies:
            self.add_proxy(cpo)

    def proxy_open(self, req):
        """Reopen req through the first registered proxy that accepts it.

        Returns None when no proxy is registered for the request's type
        or none of them accepts the request.
        """
        proto = req.get_type()
        try:
            proxies = self.proxies[proto]
        except KeyError:
            return None
        for p in proxies:
            if p.handle(req):
                # Request.set_proxy() takes (host, type); calling it with
                # the address alone raised TypeError.
                # NOTE(review): the request's own type is passed as the
                # proxy type -- confirm that this is what is intended.
                req.set_proxy(p.get_proxy(), proto)
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        return self.parent.open(req)

    def add_proxy(self, cpo):
        """Register cpo under its protocol (cpo.proto)."""
        self.proxies.setdefault(cpo.proto, []).append(cpo)
class HTTPPasswordMgr:
    """Stores (user, password) pairs keyed by realm and URI."""

    def __init__(self):
        # realm -> {tuple of reduced URIs: (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Record user/passwd for realm at uri (one URI or a sequence)."""
        # uri could be a single URI or a sequence
        if isinstance(uri, types.StringType):
            uri = [uri]
        reduced = tuple(map(self.reduce_uri, uri))
        self.passwd.setdefault(realm, {})[reduced] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, passwd) for authuri in realm, or (None, None)."""
        target = self.reduce_uri(authuri)
        for uris, authinfo in self.passwd.get(realm, {}).items():
            for uri in uris:
                if self.is_suburi(uri, target):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        netloc, path = parts[1], parts[2]
        if netloc:
            return netloc, path or '/'
        # No netloc was parsed: the path component is used as the netloc.
        return path, '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return 1
        if base[0] == test[0]:
            # commonprefix compares character by character, so the test
            # is "test's path starts with base's path".
            shared = posixpath.commonprefix((base[1], test[1]))
            if len(shared) == len(base[1]):
                return 1
        return 0
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to entries stored under realm None."""

    def find_user_password(self, realm, authuri):
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            # Nothing for this realm: try the default (None) realm.
            return lookup(self, None, authuri)
        return user, password
class AbstractBasicAuthHandler:
    """Shared logic for answering HTTP "Basic" authentication challenges.

    Subclasses supply `header`, the name of the request header that
    carries the credentials, and a `parent` opener used for the retry.
    """

    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"')

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password
        self.__current_realm = None
        # if __current_realm is not None, then the server must have
        # refused our name/password and is asking for authorization
        # again.  must be careful to set it to None on successful
        # return.

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with credentials if the challenge is a Basic one.

        authreq names the response header holding the challenge.
        Returns the retried response, or None when the header is
        missing, unparsable or not for the Basic scheme.
        """
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.match(authreq)
            if mo:
                scheme, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-open req with credentials for realm, or return None."""
        if self.__current_realm is not None:
            # A retry is already in progress, so the server refused the
            # credentials just sent; give up on this challenge.
            self.__current_realm = realm
            return None
        self.__current_realm = realm
        try:
            user, pw = self.passwd.find_user_password(realm, host)
            if not pw:
                return None
            raw = "%s:%s" % (user, pw)
            auth = base64.encodestring(raw).strip()
            req.add_header(self.header, 'Basic %s' % auth)
            return self.parent.open(req)
        finally:
            # Always clear the in-progress marker.  It used to be left
            # set when the retried open raised (e.g. the HTTPError for a
            # rejected password), after which every later challenge
            # seen by this handler was refused.
            self.__current_realm = None
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answers 401 responses using Basic authentication."""

    header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The network location of the request URL is what gets looked up
        # in the password manager.
        netloc = urlparse.urlparse(req.get_full_url())[1]
        return self.http_error_auth_reqed('www-authenticate',
                                          netloc, req, headers)
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answers 407 responses using Basic authentication."""

    header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        proxy_host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          proxy_host, req, headers)
class AbstractDigestAuthHandler:
    """Shared logic for answering HTTP "Digest" authentication challenges.

    Subclasses supply `header`, the name of the request header that
    carries the credentials, and a `parent` opener used for the retry.
    """

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        # Not None while a retry is in progress (see get_authorization).
        self.__current_realm = None

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with credentials if the challenge is a Digest one.

        authreq names the response header holding the challenge.
        """
        # Read the challenge from the response header named by authreq.
        # self.header is the *request* header the answer is sent in, so
        # looking that one up never found the challenge.
        authreq = headers.get(authreq, None)
        if authreq:
            kind = authreq.split()[0]
            if kind == 'Digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-open req answering the challenge auth, or return None."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        # Only the outermost attempt clears the marker; a nested call is
        # the server refusing the credentials that were just sent.
        nested = self.__current_realm is not None
        try:
            auth = self.get_authorization(req, chal)
            if auth:
                req.add_header(self.header, 'Digest %s' % auth)
                return self.parent.open(req)
        finally:
            # Previously the marker was cleared only after a successful
            # retry, so one failure disabled the handler for good.
            if not nested:
                self.__current_realm = None

    def get_authorization(self, req, chal):
        """Build the value of the Digest credentials header.

        chal is the parsed challenge (a dict).  Returns None when the
        challenge lacks realm or nonce, a retry is already in progress,
        the algorithm is unsupported or no user is known for the realm.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
        except KeyError:
            return None
        algorithm = chal.get('algorithm', 'MD5')
        # mod_digest doesn't send an opaque, even though it isn't
        # supposed to be optional
        opaque = chal.get('opaque', None)

        already_tried = self.__current_realm is not None
        self.__current_realm = realm
        if already_tried:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm,
                                                  req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base = base + ', opaque="%s"' % opaque
        if entdig:
            base = base + ', digest="%s"' % entdig
        if algorithm != 'MD5':
            base = base + ', algorithm="%s"' % algorithm
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for algorithm.

        (None, None) is returned for an algorithm that is not supported,
        which callers test with "H is None".
        """
        # lambdas assume digest modules are imported at the top level
        H = None
        if algorithm == 'MD5':
            H = lambda x, e=encode_digest:e(md5.new(x).digest())
        elif algorithm == 'SHA' and sha is not None:
            # sha is None when the module could not be imported.
            H = lambda x, e=encode_digest:e(sha.new(x).digest())
        # XXX MD5-sess
        if H is None:
            # H used to be left unbound here, so building KD raised
            # NameError before the caller's "H is None" check could run.
            return None, None
        KD = lambda s, d, H=H: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        host = urlparse.urlparse(req.get_full_url())[1]
        # Return the retried response.  Without the return the result
        # was discarded and the opener treated the 401 as unhandled.
        return self.http_error_auth_reqed('www-authenticate', host, req,
                                          headers)
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answers 407 responses using Digest authentication."""

    header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.get_host()
        # Return the retried response.  Without the return the result
        # was discarded and the opener treated the 407 as unhandled.
        return self.http_error_auth_reqed('proxy-authenticate', host, req,
                                          headers)
def encode_digest(digest):
    """Return digest as lowercase hex, two digits per character.

    Only the low eight bits of each character's ordinal are encoded.
    """
    pairs = []
    for ch in digest:
        value = ord(ch)
        pairs.append('%x%x' % ((value >> 4) & 0xf, value & 0xf))
    return ''.join(pairs)
class AbstractHTTPHandler(BaseHandler):
    """Common implementation for the HTTP and HTTPS handlers."""

    def do_open(self, http_class, req):
        """Send req using a connection of http_class.

        A reply with code 200 is returned as an addinfourl object; any
        other code is passed to the parent opener's error().  URLError
        is raised when the request has no host or when starting the
        request raises socket.error.
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        try:
            conn = http_class(host) # will parse host:port
            if req.has_data():
                data = req.get_data()
                conn.putrequest('POST', req.get_selector())
                # Supply defaults only for headers the caller left out.
                if not req.headers.has_key('Content-type'):
                    conn.putheader('Content-type',
                                   'application/x-www-form-urlencoded')
                if not req.headers.has_key('Content-length'):
                    conn.putheader('Content-length', '%d' % len(data))
            else:
                conn.putrequest('GET', req.get_selector())
        except socket.error:
            raise URLError(sys.exc_info()[1])

        conn.putheader('Host', host)
        # Opener-wide headers first, then the request's own.
        for pair in self.parent.addheaders:
            conn.putheader(*pair)
        for name, value in req.headers.items():
            conn.putheader(name, value)
        conn.endheaders()
        if req.has_data():
            conn.send(data)

        code, msg, hdrs = conn.getreply()
        fp = conn.getfile()
        if code != 200:
            return self.parent.error('http', req, fp, code, msg, hdrs)
        return addinfourl(fp, hdrs, req.get_full_url())
class HTTPHandler(AbstractHTTPHandler):
    """Opens http URLs with httplib.HTTP."""

    def http_open(self, req):
        connection_class = httplib.HTTP
        return self.do_open(connection_class, req)
# HTTPSHandler exists only when httplib provides HTTPS.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Opens https URLs with httplib.HTTPS."""

        def https_open(self, req):
            connection_class = httplib.HTTPS
            return self.do_open(connection_class, req)
class UnknownHandler(BaseHandler):
    """Last-resort opener: always raises URLError."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.get_type())
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value with one pair of
    surrounding double quotes removed.  An element without '=' raises
    ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of indexing: an empty value ("key=") used to
        # raise IndexError on v[0].
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.

    Returns the list of elements with surrounding whitespace stripped.
    Raises ValueError when a quote is opened, never closed, and a comma
    follows.
    """
    # XXX this function could probably use more testing

    parts = []
    end = len(s)
    i = 0          # scan position
    start = 0      # where the current element begins
    inquote = 0
    while i < end:
        cur = s[i:]
        c = cur.find(',')
        q = cur.find('"')
        if c == -1:
            # No more separators: the rest is the last element.
            parts.append(s[start:])
            break
        if q == -1:
            if inquote:
                raise ValueError("unbalanced quotes")
            parts.append(s[start:i+c])
            i = i + c + 1
            # Advance start as well; it used to stay behind here, so
            # the following element repeated the text of this one.
            start = i
            continue
        if inquote:
            if q < c:
                # The closing quote comes before the comma, so the
                # comma ends the element.
                parts.append(s[start:i+c])
                i = i + c + 1
                start = i
                inquote = 0
            else:
                # The comma is inside the quotes; skip to the closing quote.
                i = i + q
        else:
            if c < q:
                parts.append(s[start:i+c])
                i = i + c + 1
                start = i
            else:
                inquote = 1
                i = i + q + 1
    return [part.strip() for part in parts]
class FileHandler(BaseHandler):
    """Opens file URLs from the local disk, or hands them to FTP."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        selector = req.get_selector()
        is_remote = selector[:2] == '//' and selector[2:3] != '/'
        if not is_remote:
            return self.open_local_file(req)
        # "//host/..." names another machine: retry the request as ftp.
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the addresses of this machine (computed once, cached on the class)."""
        if FileHandler.names is None:
            this_host = socket.gethostname()
            FileHandler.names = (socket.gethostbyname('localhost'),
                                 socket.gethostbyname(this_host))
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Open the file named by req; raise URLError if it is not local."""
        mtype = mimetypes.guess_type(req.get_selector())[0]
        header_text = 'Content-Type: %s\n' % (mtype or 'text/plain')
        headers = mimetools.Message(StringIO(header_text))
        host = req.get_host()
        file = req.get_selector()
        is_local = not host
        if host:
            host, port = splitport(host)
            # A named host is local only without a port and when it
            # resolves to one of this machine's addresses.
            is_local = not host or \
                       (not port and
                        socket.gethostbyname(host) in self.get_names())
        if not is_local:
            raise URLError('file not on local host')
        return addinfourl(open(url2pathname(file), 'rb'),
                          headers, 'file:'+file)
930 class FTPHandler(BaseHandler):
931 def ftp_open(self, req):
932 host = req.get_host()
933 if not host:
934 raise IOError, ('ftp error', 'no host given')
935 # XXX handle custom username & password
936 try:
937 host = socket.gethostbyname(host)
938 except socket.error, msg:
939 raise URLError(msg)
940 host, port = splitport(host)
941 if port is None:
942 port = ftplib.FTP_PORT
943 path, attrs = splitattr(req.get_selector())
944 path = unquote(path)
945 dirs = path.split('/')
946 dirs, file = dirs[:-1], dirs[-1]
947 if dirs and not dirs[0]:
948 dirs = dirs[1:]
949 user = passwd = '' # XXX
950 try:
951 fw = self.connect_ftp(user, passwd, host, port, dirs)
952 type = file and 'I' or 'D'
953 for attr in attrs:
954 attr, value = splitattr(attr)
955 if attr.lower() == 'type' and \
956 value in ('a', 'A', 'i', 'I', 'd', 'D'):
957 type = value.upper()
958 fp, retrlen = fw.retrfile(file, type)
959 if retrlen is not None and retrlen >= 0:
960 sf = StringIO('Content-Length: %d\n' % retrlen)
961 headers = mimetools.Message(sf)
962 else:
963 headers = noheaders()
964 return addinfourl(fp, headers, req.get_full_url())
965 except ftplib.all_errors, msg:
966 raise IOError, ('ftp error', msg), sys.exc_info()[2]
968 def connect_ftp(self, user, passwd, host, port, dirs):
969 fw = ftpwrapper(user, passwd, host, port, dirs)
970 ## fw.ftp.set_debuglevel(1)
971 return fw
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps connections open for reuse.

    Connections are cached by (user, passwd, host, port).  Each entry
    expires `delay` seconds after its last use, and one entry is
    dropped whenever the cache reaches `max_conns` entries.
    """

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> ftpwrapper
        self.timeout = {}    # key -> expiry time (time.time() based)
        self.soonest = 0     # earliest expiry time in self.timeout
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set how long, in seconds, an unused connection is kept."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which an entry gets dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return a cached connection for the key, creating one if needed."""
        key = user, passwd, host, port
        if not self.cache.has_key(key):
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        self.timeout[key] = time.time() + self.delay
        # Keep a reference: check_cache() may drop this very entry, and
        # looking it up again afterwards raised KeyError.
        # NOTE(review): if check_cache() expires this entry (possible
        # with a zero or negative delay) the wrapper returned here has
        # already been closed -- confirm how callers should handle that.
        fw = self.cache[key]
        self.check_cache()
        return fw

    def _update_soonest(self):
        # min() of an empty sequence raises ValueError; fall back to the
        # initial value used by __init__ when nothing is cached.
        if self.timeout:
            self.soonest = min(self.timeout.values())
        else:
            self.soonest = 0

    def check_cache(self):
        """Close expired connections, then enforce the size limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            # Iterate over a snapshot because entries are deleted.
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self._update_soonest()

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self._update_soonest()
class GopherHandler(BaseHandler):
    """Opens gopher URLs through gopherlib."""

    def gopher_open(self, req):
        """Send the selector (and query, if any) and wrap the reply."""
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        # The gopher type is split off the selector and not used further.
        gtype, rest = splitgophertype(req.get_selector())
        rest, query = splitquery(rest)
        selector = unquote(rest)
        if not query:
            fp = gopherlib.send_selector(selector, host)
        else:
            fp = gopherlib.send_query(selector, unquote(query), host)
        return addinfourl(fp, noheaders(), req.get_full_url())
#bleck! don't use this yet
class OpenerFactory:
    """Unfinished factory for OpenerDirector objects.

    Only the proxy handlers are installed by build_opener(); the other
    handler lists are collected but not used yet.
    """

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    proxy_handlers = [ProxyHandler]
    handlers = []
    replacement_handlers = []

    def add_proxy_handler(self, ph):
        # Build a new list so the class-level default is not mutated.
        self.proxy_handlers = self.proxy_handlers + [ph]

    def add_handler(self, h):
        # Build a new list so the class-level default is not mutated.
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        pass

    def build_opener(self):
        """Return an OpenerDirector with the proxy handlers installed."""
        opener = OpenerDirector()
        for ph in self.proxy_handlers:
            if isinstance(ph, types.ClassType):
                ph = ph()
            opener.add_handler(ph)
        # The opener used to be built and then dropped (implicit None).
        return opener
1062 if __name__ == "__main__":
1063 # XXX some of the test code depends on machine configurations that
1064 # are internal to CNRI. Need to set up a public server with the
1065 # right authentication configuration for test purposes.
1066 if socket.gethostname() == 'bitdiddle':
1067 localhost = 'bitdiddle.cnri.reston.va.us'
1068 elif socket.gethostname() == 'bitdiddle.concentric.net':
1069 localhost = 'localhost'
1070 else:
1071 localhost = None
1072 urls = [
1073 # Thanks to Fred for finding these!
1074 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1075 'gopher://gopher.vt.edu:10010/10/33',
1077 'file:/etc/passwd',
1078 'file://nonsensename/etc/passwd',
1079 'ftp://www.python.org/pub/tmp/httplib.py',
1080 'ftp://www.python.org/pub/tmp/imageop.c',
1081 'ftp://www.python.org/pub/tmp/blat',
1082 'http://www.espn.com/', # redirect
1083 'http://www.python.org/Spanish/Inquistion/',
1084 ('http://grail.cnri.reston.va.us/cgi-bin/faqw.py',
1085 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1086 'http://www.python.org/',
1087 'ftp://prep.ai.mit.edu/welcome.msg',
1088 'ftp://www.python.org/pub/tmp/figure.prn',
1089 'ftp://www.python.org/pub/tmp/interp.pl',
1090 'http://checkproxy.cnri.reston.va.us/test/test.html',
1093 if localhost is not None:
1094 urls = urls + [
1095 'file://%s/etc/passwd' % localhost,
1096 'http://%s/simple/' % localhost,
1097 'http://%s/digest/' % localhost,
1098 'http://%s/not/found.h' % localhost,
1101 bauth = HTTPBasicAuthHandler()
1102 bauth.add_password('basic_test_realm', localhost, 'jhylton',
1103 'password')
1104 dauth = HTTPDigestAuthHandler()
1105 dauth.add_password('digest_test_realm', localhost, 'jhylton',
1106 'password')
1109 cfh = CacheFTPHandler()
1110 cfh.setTimeout(1)
1112 # XXX try out some custom proxy objects too!
1113 def at_cnri(req):
1114 host = req.get_host()
1115 print host
1116 if host[-18:] == '.cnri.reston.va.us':
1117 return 1
1118 p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1119 ph = CustomProxyHandler(p)
1121 #install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1123 for url in urls:
1124 if isinstance(url, types.TupleType):
1125 url, req = url
1126 else:
1127 req = None
1128 print url
1129 try:
1130 f = urlopen(url, req)
1131 except IOError, err:
1132 print "IOError:", err
1133 except socket.error, err:
1134 print "socket.error:", err
1135 else:
1136 buf = f.read()
1137 f.close()
1138 print "read %d bytes" % len(buf)
1139 print
1140 time.sleep(0.1)