Bump version to 0.9.1.
[python/dscho.git] / Lib / urllib2.py
blob86fdc2008afd269feec4d92c82b4fa2f3e8992ad
1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
5 below). It opens the URL and returns the results as file-like
6 object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301 & 302 redirect errors, and the HTTPDigestAuthHandler deals
15 with digest authentication.
17 urlopen(url, data=None) -- basic usage is the same as the original
18 urllib. pass the url and optionally data to post to an HTTP URL, and
19 get a file-like object back. One difference is that you can also pass
20 a Request instance instead of URL. Raises a URLError (subclass of
21 IOError); for HTTP errors, raises an HTTPError, which can also be
22 treated as a valid response.
24 build_opener -- function that creates a new OpenerDirector instance.
25 will install the default handlers. accepts one or more Handlers as
26 arguments, either instances or Handler classes that it will
27 instantiate. if one of the arguments is a subclass of the default
28 handler, the argument will be installed instead of the default.
30 install_opener -- installs a new opener as the default opener.
32 objects of interest:
33 OpenerDirector --
35 Request -- an object that encapsulates the state of a request. the
36 state can be as simple as the URL. it can also include extra HTTP
37 headers, e.g. a User-Agent.
39 BaseHandler --
41 exceptions:
42 URLError-- a subclass of IOError, individual protocols have their own
43 specific subclass
45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46 as an exceptional event or valid response
48 internals:
49 BaseHandler and parent
50 _call_chain conventions
52 Example usage:
54 import urllib2
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password('realm', 'host', 'username', 'password')
60 # build a new opener that adds authentication and caching FTP handlers
61 opener = urllib2.build_opener(authinfo, urllib2.CacheFTPHandler)
63 # install it
64 urllib2.install_opener(opener)
66 f = urllib2.urlopen('http://www.python.org/')
69 """
71 # XXX issues:
72 # If an authentication error handler tries to perform
73 # authentication for some reason but fails, how should the error be
74 # signalled? The client needs to know the HTTP error code. But if
75 # the handler knows that the problem was, e.g., that it didn't know
76 # the hash algo that was requested in the challenge, it would be good to
77 # pass that information along to the client, too.
79 # XXX to do:
80 # name!
81 # documentation (getting there)
82 # complex proxies
83 # abstract factory for opener
84 # ftp errors aren't handled cleanly
85 # gopher can return a socket.error
86 # check digest against correct (i.e. non-apache) implementation
88 import string
89 import socket
90 import UserDict
91 import httplib
92 import re
93 import base64
94 import types
95 import urlparse
96 import os
97 import md5
98 import mimetypes
99 import mimetools
100 import ftplib
101 import sys
102 import time
103 import gopherlib
105 try:
106 from cStringIO import StringIO
107 except ImportError:
108 from StringIO import StringIO
110 try:
111 import sha
112 except ImportError:
113 # need 1.5.2 final
114 sha = None
116 # not sure how many of these need to be gotten rid of
117 from urllib import unwrap, unquote, splittype, splithost, \
118 addinfourl, splitport, splitgophertype, splitquery, \
119 splitattr, ftpwrapper, noheaders
121 # support for proxies via environment variables
122 from urllib import getproxies
124 # support for FileHandler
125 from urllib import localhost, thishost, url2pathname, pathname2url
127 # support for GopherHandler
128 from urllib import splitgophertype, splitquery
130 __version__ = "2.0a1"
132 _opener = None
def urlopen(url, data=None):
    """Open url with the module-wide default opener and return the result.

    url may be a URL string or a Request object; data, when given, is
    passed through to the opener's open() method.  The default opener is
    created with build_opener() the first time it is needed.
    """
    global _opener
    opener = _opener
    if opener is None:
        # lazily build (and remember) the default opener
        opener = _opener = build_opener()
    return opener.open(url, data)
def install_opener(opener):
    """Make opener the module-wide default used by urlopen()."""
    globals()['_opener'] = opener
143 # do these error classes make sense?
144 # make sure all of the IOError stuff is overridden. we just want to be
145 # subtypes.
class URLError(IOError):
    """Base class for the errors raised by this module.

    URLError is a sub-type of IOError, but it doesn't share any of the
    implementation, so __init__ and __str__ are overridden.

    reason -- the object (usually a string) describing the failure; it
              is available afterwards as the `reason` attribute.
    """

    def __init__(self, reason):
        self.reason = reason
        # The base initializer is deliberately not called, so populate
        # `args` by hand; generic exception handling expects it to carry
        # the constructor arguments.
        self.args = (reason,)

    def __str__(self):
        return '<urlopen error %s>' % self.reason
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return

    url  -- the URL that was requested (also stored as `filename`)
    code -- the HTTP status code
    msg  -- the reason phrase sent by the server
    hdrs -- the response headers
    fp   -- file-like object for the response body, or None when no
            body is available
    """

    def __init__(self, url, code, msg, hdrs, fp):
        # Store the plain attributes first so that __del__ can always
        # find `fp`, even if the addinfourl initializer below fails.
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        # XXX
        self.filename = url
        # addinfourl binds fp's read methods, which is impossible without
        # a file object; only behave as a response when one was supplied.
        if fp is not None:
            addinfourl.__init__(self, fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def __del__(self):
        # XXX is this safe?  what if user catches exception, then
        # extracts fp and discards exception?
        fp = self.__dict__.get('fp')
        if fp is not None:
            fp.close()
class GopherError(URLError):
    """URLError subclass used for gopher-specific failures."""
class Request:
    """Encapsulates the state of one request: URL, data and headers.

    url     -- the URL to open; a '<URL:...>' wrapper is stripped
    data    -- optional data to send with the request
    headers -- optional mapping of extra headers; it is copied, never
               kept by reference
    """

    def __init__(self, url, data=None, headers=None):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        if headers:
            self.headers.update(headers)

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                # Calling the getter is expected to create the attribute
                # as a side effect.
                getattr(self, 'get_' + name)()
                # Look in the instance dict directly: going through
                # getattr() again would re-enter this method and recurse
                # without bound if the getter did not set the attribute.
                try:
                    return self.__dict__[attr]
                except KeyError:
                    pass
        raise AttributeError(attr)

    def add_data(self, data):
        """Attach data to the request."""
        self.data = data

    def has_data(self):
        """Return true if the request carries data."""
        return self.data is not None

    def get_data(self):
        """Return the data attached to the request (or None)."""
        return self.data

    def get_full_url(self):
        """Return the URL the request was created with (unwrapped)."""
        return self.__original

    def get_type(self):
        """Return the URL scheme, splitting it off on first use."""
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host part, splitting it off on first use."""
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what follows the host part of the URL."""
        return self.__r_host

    def set_proxy(self, proxy):
        """Redirect the request through the proxy URL `proxy`.

        The type and host become those of the proxy and the selector
        becomes the full original URL.
        """
        self.__proxy = proxy
        # XXX this code is based on urllib, but it doesn't seem
        # correct.  specifically, if the proxy has a port number then
        # splittype will return the hostname as the type and the port
        # will be include with everything else
        self.type, self.__r_type = splittype(self.__proxy)
        self.host, XXX = splithost(self.__r_type)
        self.host = unquote(self.host)
        self.__r_host = self.__original

    def add_header(self, key, val):
        """Set header `key` to `val`, replacing any earlier value."""
        # useful for something like authentication
        self.headers[key] = val
class OpenerDirector:
    """Dispatches requests to the handlers registered with add_handler().

    Handlers are indexed by method name: 'PROTO_open' methods go into
    handle_open under PROTO, 'PROTO_error_KIND' methods into
    handle_error[PROTO] under KIND.
    """

    def __init__(self):
        # headers sent with every request
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}

    def add_handler(self, handler):
        """Register handler under every protocol its method names announce.

        A handler exposing neither an '*_open' nor a '*_error_*' method
        is ignored.
        """
        registered = 0
        for meth in get_methods(handler):
            if meth[-5:] == '_open':
                scheme = meth[:-5]
                if not self.handle_open.has_key(scheme):
                    self.handle_open[scheme] = []
                self.handle_open[scheme].append(handler)
                registered = 1
                continue

            # look for names shaped like PROTO_error_KIND
            first = string.find(meth, '_')
            second = string.find(meth[first+1:], '_') + first + 1
            if second == -1 or meth[first+1:second] != 'error':
                continue
            proto = meth[:first]
            kind = meth[second+1:]
            try:
                # numeric kinds (HTTP status codes) are stored as ints
                kind = string.atoi(kind)
            except ValueError:
                pass
            by_kind = self.handle_error.get(proto, {})
            if not by_kind.has_key(kind):
                by_kind[kind] = []
            by_kind[kind].append(handler)
            self.handle_error[proto] = by_kind
            registered = 1

        if registered:
            self.handlers.append(handler)
            handler.add_parent(self)

    def __del__(self):
        self.close()

    def close(self):
        """Close every registered handler and forget them."""
        for handler in self.handlers:
            handler.close()
        self.handlers = []

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call meth_name(*args) on each handler in chain[kind] in turn.

        Returns the first result that is not None, or None when every
        handler declined.
        """
        # XXX raise an exception if no one else should try to handle
        # this url.  return None if you can't but someone else could.
        for handler in chain.get(kind, ()):
            outcome = apply(getattr(handler, meth_name), args)
            if outcome is not None:
                return outcome
        return None

    def open(self, fullurl, data=None):
        """Open fullurl (a URL string or a Request) and return the result.

        The 'default' handlers are tried first, then those for the URL's
        own type, then the 'unknown' handlers.
        """
        # accept a URL or a Request object
        if type(fullurl) == types.StringType:
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)
        assert isinstance(req, Request) # really only care about interface

        response = self._call_chain(self.handle_open, 'default',
                                    'default_open', req)
        if not response:
            scheme = req.get_type()
            response = self._call_chain(self.handle_open, scheme,
                                        scheme + '_open', req)
        if not response:
            return self._call_chain(self.handle_open, 'unknown',
                                    'unknown_open', req)
        return response

    def error(self, proto, *args):
        """Hand an error for protocol proto to the registered handlers.

        For 'http' the status code (args[2]) selects the
        'http_error_<code>' handlers, with 'http_error_default' as the
        fallback; other protocols use their '<proto>_error' handlers.
        """
        is_http = (proto == 'http')
        if is_http:
            # XXX http protocol is special cased
            table = self.handle_error[proto]
            kind = args[2] # YUCK!
            meth_name = 'http_error_%d' % kind
        else:
            table = self.handle_error
            kind = proto
            meth_name = proto + '_error'

        result = apply(self._call_chain, (table, kind, meth_name) + args)
        if result:
            return result

        if is_http:
            fallback = (table, 'default', 'http_error_default') + args
            return apply(self._call_chain, fallback)
def is_callable(obj):
    """Return true if obj is a function, method, builtin or callable instance.

    Builtin functions and methods, Python functions and lambdas, and
    methods yield 1; class instances yield the result of looking for a
    __call__ attribute; everything else yields 0.
    """
    # not quite like builtin callable (which I didn't know existed),
    # not entirely sure it needs to be different
    kind = type(obj)
    if kind in (types.BuiltinFunctionType, types.BuiltinMethodType,
                types.LambdaType, types.MethodType):
        return 1
    if kind == types.InstanceType:
        return hasattr(obj, '__call__')
    return 0
def get_methods(inst):
    """Return the names of the methods and callable attributes of inst.

    The class of inst and all of its base classes are searched for
    unbound methods; attributes stored on the instance itself are
    included when is_callable() accepts them.
    """
    found = {}
    pending = [inst.__class__]
    while pending:
        klass = pending[0]
        # breadth-first: bases are queued behind the classes still waiting
        pending = pending[1:] + list(klass.__bases__)
        for name in dir(klass):
            if type(getattr(klass, name)) == types.UnboundMethodType:
                found[name] = 1
    for name in dir(inst):
        if is_callable(getattr(inst, name)):
            found[name] = 1
    return found.keys()
377 # XXX probably also want an abstract factory that knows things like
378 # the fact that a ProxyHandler needs to get inserted first.
379 # would also know when it makes sense to skip a superclass in favor of
380 # a subclass and when it might make sense to include both
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.  If there is a ProxyHandler, it must be at the
    front of the list of handlers.  (Yuck.)

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.

    handlers -- handler instances, or handler classes which are
                instantiated without arguments
    Returns the configured OpenerDirector.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler]
    skip = []
    for klass in default_classes:
        for check in handlers:
            if type(check) == types.ClassType:
                overridden = issubclass(check, klass)
            elif type(check) == types.InstanceType:
                overridden = isinstance(check, klass)
            else:
                overridden = 0
            if overridden:
                # Record each default at most once: a second entry would
                # make the remove() below raise ValueError when two of
                # the supplied handlers derive from the same default.
                skip.append(klass)
                break
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if type(h) == types.ClassType:
            h = h()
        opener.add_handler(h)
    return opener
class BaseHandler:
    """Common base of all handlers; tracks the owning opener as `parent`."""

    def add_parent(self, parent):
        """Remember the opener this handler has been registered with."""
        self.parent = parent

    def close(self):
        """Drop the reference to the owning opener."""
        self.parent = None
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback HTTP error handler: raises HTTPError for the response."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        failed_url = req.get_full_url()
        raise HTTPError(failed_url, code, msg, hdrs, fp)
class HTTPRedirectHandler(BaseHandler):
    """Follows HTTP 301 and 302 redirects."""

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 302 error message was:\n"

    def http_error_302(self, req, fp, code, msg, headers):
        """Open the URL named by the Location (or URI) header.

        Returns None when the response names no target, so that other
        handlers may deal with it.  Raises HTTPError when the target has
        already been visited while following this chain of redirects.
        """
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        # The target may be given relative to the URL that was requested.
        newurl = urlparse.urljoin(req.get_full_url(), newurl)
        # drain and release the body of the redirect response
        fp.read()
        fp.close()

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = Request(newurl, req.get_data())
        new.error_302_dict = {}
        if hasattr(req, 'error_302_dict'):
            if req.error_302_dict.has_key(newurl):
                # HTTPError requires the response file object as its
                # fifth argument.
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
            new.error_302_dict.update(req.error_302_dict)
        new.error_302_dict[newurl] = newurl
        return self.parent.open(new)

    http_error_301 = http_error_302
class ProxyHandler(BaseHandler):
    """Routes requests through the proxies given as a {type: url} mapping.

    When no mapping is supplied, getproxies() provides one.  For every
    entry a '<type>_open' attribute is created on the instance that
    forwards to proxy_open().
    """

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, proxy_url in proxies.items():
            # bind the per-proxy values as lambda defaults
            forward = lambda r, proxy=proxy_url, type=scheme, \
                      meth=self.proxy_open: meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, forward)

    def proxy_open(self, req, proxy, type):
        """Point req at proxy; re-dispatch it if its URL type changed."""
        orig_type = req.get_type()
        req.set_proxy(proxy)
        if orig_type != type:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            return self.parent.open(req)
        # let other handlers take care of it
        # XXX this only makes sense if the proxy is before the
        # other handlers
        return None
486 # feature suggested by Duncan Booth
487 # XXX custom is not a good name
class CustomProxy:
    """A proxy address plus a predicate deciding which requests use it."""

    # either pass a function to the constructor or override handle
    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto = proto
        self.func = func
        self.addr = proxy_addr

    def handle(self, req):
        """Return 1 if the predicate accepts req, otherwise None."""
        predicate = self.func
        if not predicate:
            return None
        if predicate(req):
            return 1
        return None

    def get_proxy(self):
        """Return the proxy address given to the constructor."""
        return self.addr
class CustomProxyHandler(BaseHandler):
    """Chooses a proxy per request from a set of CustomProxy objects.

    proxies -- CustomProxy instances; each is registered under its
               `proto` attribute, exactly as add_proxy() does
    """

    def __init__(self, *proxies):
        self.proxies = {}
        # Register the proxies handed to the constructor; previously
        # they were silently dropped.
        for cpo in proxies:
            self.add_proxy(cpo)

    def proxy_open(self, req):
        """Open req through the first registered proxy that accepts it.

        Returns None when no proxy is registered for the request's type
        or none of them accepts the request.
        """
        proto = req.get_type()
        try:
            proxies = self.proxies[proto]
        except KeyError:
            return None
        for p in proxies:
            if p.handle(req):
                req.set_proxy(p.get_proxy())
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        return self.parent.open(req)

    def add_proxy(self, cpo):
        """Register the CustomProxy cpo under its protocol."""
        if self.proxies.has_key(cpo.proto):
            self.proxies[cpo.proto].append(cpo)
        else:
            self.proxies[cpo.proto] = [cpo]
class HTTPPasswordMgr:
    """Stores (user, password) pairs keyed by realm and URI."""

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register user/passwd for realm at one URI or a sequence of URIs."""
        # uri could be a single URI or a sequence
        if type(uri) == types.StringType:
            uri = [uri]
        uri = tuple(map(self.reduce_uri, uri))
        if not self.passwd.has_key(realm):
            self.passwd[realm] = {}
        self.passwd[realm][uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) registered for authuri in realm.

        Returns (None, None) when nothing matches.
        """
        domains = self.passwd.get(realm, {})
        authuri = self.reduce_uri(authuri)
        for uris, authinfo in domains.items():
            for uri in uris:
                if self.is_suburi(uri, authuri):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        if parts[1]:
            return parts[1], parts[2] or '/'
        else:
            return parts[2], '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return 1
        if base[0] != test[0]:
            return 0
        # Compare whole path segments: a plain character-wise common
        # prefix would wrongly place '/foobar' below '/foo'.
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix = prefix + '/'
        if test[1][:len(prefix)] == prefix:
            return 1
        return 0
class HTTPBasicAuthHandler(BaseHandler):
    """Answers HTTP 401 responses that request Basic authentication."""

    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"')

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self):
        self.passwd = HTTPPasswordMgr()
        self.add_password = self.passwd.add_password
        self.__current_realm = None
        # if __current_realm is not None, then the server must have
        # refused our name/password and is asking for authorization
        # again.  must be careful to set it to None once the retry is
        # over, however it ends.

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry req with credentials if the challenge is for Basic auth.

        Returns the result of the retry, or None to let other handlers
        deal with the response.
        """
        # XXX could be mult. headers
        authreq = headers.get('www-authenticate', None)
        if authreq:
            mo = HTTPBasicAuthHandler.rx.match(authreq)
            if mo:
                scheme, realm = mo.groups()
                if string.lower(scheme) == 'basic':
                    return self.retry_http_basic_auth(req, realm)

    def retry_http_basic_auth(self, req, realm):
        """Re-open req with an Authorization header for realm.

        Returns None when a retry is already in progress (the server
        refused the credentials) or when no password is known.
        """
        if self.__current_realm is not None:
            # the credentials sent by the enclosing call were refused
            self.__current_realm = realm
            return None
        self.__current_realm = realm
        try:
            # XXX host isn't really the correct URI?
            host = req.get_host()
            user, pw = self.passwd.find_user_password(realm, host)
            if not pw:
                return None
            raw = "%s:%s" % (user, pw)
            auth = string.strip(base64.encodestring(raw))
            req.add_header('Authorization', 'Basic %s' % auth)
            return self.parent.open(req)
        finally:
            # Reset even when the retry raises (a refused password ends
            # in an HTTPError); otherwise the handler would decline
            # every later request.
            self.__current_realm = None
class HTTPDigestAuthHandler(BaseHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    def __init__(self):
        self.passwd = HTTPPasswordMgr()
        self.add_password = self.passwd.add_password
        # not None while a retry with credentials is in progress
        self.__current_realm = None

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry req with credentials if the challenge is for Digest auth."""
        # XXX could be mult. headers
        authreq = headers.get('www-authenticate', None)
        if authreq:
            kind = string.split(authreq)[0]
            if kind == 'Digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-open req with an Authorization header answering `auth`.

        auth is the complete www-authenticate header value.  Returns
        None when no authorization could be computed.
        """
        token, challenge = string.split(auth, ' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        # Only the outermost call owns the retry state; a nested call
        # means the server refused the credentials just sent.
        nested = self.__current_realm is not None
        try:
            auth = self.get_authorization(req, chal)
            if auth:
                req.add_header('Authorization', 'Digest %s' % auth)
                return self.parent.open(req)
            return None
        finally:
            # Reset on every exit path, including an exception from the
            # retry, so that one failure does not disable the handler.
            if not nested:
                self.__current_realm = None

    def get_authorization(self, req, chal):
        """Return the Authorization header value for challenge chal.

        chal is the parsed challenge as a dictionary.  Returns None when
        the challenge is incomplete, a retry is already in progress, the
        algorithm is unsupported or no credentials are known.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        if self.__current_realm is None:
            self.__current_realm = realm
        else:
            self.__current_realm = realm
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm,
                                                  req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base = base + ', opaque="%s"' % opaque
        if entdig:
            base = base + ', digest="%s"' % entdig
        if algorithm != 'MD5':
            base = base + ', algorithm="%s"' % algorithm
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for algorithm.

        Returns (None, None) when the algorithm is not supported or the
        module implementing it is unavailable.
        """
        # lambdas assume digest modules are imported at the top level
        H = None
        if algorithm == 'MD5':
            H = lambda x, e=encode_digest:e(md5.new(x).digest())
        elif algorithm == 'SHA' and sha is not None:
            H = lambda x, e=encode_digest:e(sha.new(x).digest())
        # XXX MD5-sess
        if H is None:
            return None, None
        KD = lambda s, d, H=H: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
def encode_digest(digest):
    """Return the lowercase hexadecimal representation of digest.

    digest is a string of characters; every character contributes two
    hex digits, high nibble first.
    """
    codes = tuple(map(ord, digest))
    # one '%02x' conversion per character of the digest
    return ('%02x' * len(codes)) % codes
class HTTPHandler(BaseHandler):
    """Performs HTTP GET and POST requests."""

    def http_open(self, req):
        """Send req over HTTP and return the response.

        A request carrying data is sent as a POST, otherwise as a GET.
        A 200 response is returned as an addinfourl object; any other
        status is passed to the parent opener's error() method.  Raises
        URLError when the request names no host.
        """
        # XXX devise a new mechanism to specify user/password
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = httplib.HTTP(host) # will parse host:port
        ## h.set_debuglevel(1)
        if req.has_data():
            data = req.get_data()
            h.putrequest('POST', req.get_selector())
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', req.get_selector())
        # XXX proxies would have different host here
        h.putheader('Host', host)
        for args in self.parent.addheaders:
            apply(h.putheader, args)
        for k, v in req.headers.items():
            h.putheader(k, v)
        h.endheaders()
        if req.has_data():
            # Send exactly the bytes announced in Content-length; a
            # trailing CRLF would be stray data after the request body.
            h.send(data)

        code, msg, hdrs = h.getreply()
        fp = h.getfile()
        if code == 200:
            return addinfourl(fp, hdrs, req.get_full_url())
        else:
            # want to make sure the socket is closed, even if error
            # handling doesn't return immediately.  the socket won't
            # actually be closed until fp is also closed.
            if h.sock:
                h.sock.close()
                h.sock = None
            return self.parent.error('http', req, fp, code, msg, hdrs)
class UnknownHandler(BaseHandler):
    """Handler of last resort: rejects URL types nobody else opened."""

    def unknown_open(self, req):
        raise URLError('unknown url type: %s' % req.get_type())
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    l is a sequence of 'key=value' strings.  A value enclosed in double
    quotes has the quotes removed; an empty value ('key=') is kept as
    the empty string.  Returns a dictionary mapping keys to values.
    Raises ValueError for an element that contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # slices instead of indexing so that an empty value is accepted
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.

    Returns the list of elements with surrounding whitespace removed.
    Raises ValueError when a quoted-string is left open before a later
    comma.
    """
    # XXX this function could probably use more testing

    items = []
    end = len(s)
    i = 0          # position the scan has reached
    start = 0      # where the element being collected begins
    inquote = 0
    while i < end:
        comma = s.find(',', i)
        quote = s.find('"', i)
        if comma == -1:
            # no separator left: the remainder is the last element
            items.append(s[start:])
            break
        if quote == -1:
            if inquote:
                raise ValueError("unbalanced quotes")
            items.append(s[start:comma])
            # the next element starts after the comma; leaving `start`
            # behind would repeat earlier elements in later ones
            i = start = comma + 1
            continue
        if inquote:
            if quote < comma:
                # the quoted-string closes before the comma, so the
                # comma ends the element
                items.append(s[start:comma])
                i = start = comma + 1
                inquote = 0
            else:
                # the comma lies inside the quoted-string; move up to
                # the closing quote
                i = quote
        else:
            if comma < quote:
                items.append(s[start:comma])
                i = start = comma + 1
            else:
                inquote = 1
                i = quote + 1
    return [item.strip() for item in items]
class FileHandler(BaseHandler):
    """Opens file: URLs, either locally or by delegating to FTP."""

    # names for the localhost
    names = None

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        selector = req.get_selector()
        if selector[:2] != '//' or selector[2:3] == '/':
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    def get_names(self):
        """Return the addresses of this machine, resolving them once."""
        if FileHandler.names is None:
            loopback = socket.gethostbyname('localhost')
            own = socket.gethostbyname(socket.gethostname())
            FileHandler.names = (loopback, own)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Open the file named by req if it lives on this machine.

        Raises URLError when the URL names a host (or a port) that is
        not the local machine.
        """
        path = req.get_selector()
        mtype = mimetypes.guess_type(path)[0]
        content_type = 'Content-Type: %s\n' % (mtype or 'text/plain')
        headers = mimetools.Message(StringIO(content_type))
        host = req.get_host()
        is_local = not host
        if host:
            host, port = splitport(host)
            is_local = not host or \
                (not port and socket.gethostbyname(host) in self.get_names())
        if is_local:
            return addinfourl(open(url2pathname(path), 'rb'),
                              headers, 'file:'+path)
        raise URLError('file not on local host')
856 class FTPHandler(BaseHandler):
857 def ftp_open(self, req):
858 host = req.get_host()
859 if not host:
860 raise IOError, ('ftp error', 'no host given')
861 # XXX handle custom username & password
862 host = socket.gethostbyname(host)
863 host, port = splitport(host)
864 if port is None:
865 port = ftplib.FTP_PORT
866 path, attrs = splitattr(req.get_selector())
867 path = unquote(path)
868 dirs = string.splitfields(path, '/')
869 dirs, file = dirs[:-1], dirs[-1]
870 if dirs and not dirs[0]:
871 dirs = dirs[1:]
872 user = passwd = '' # XXX
873 try:
874 fw = self.connect_ftp(user, passwd, host, port, dirs)
875 type = file and 'I' or 'D'
876 for attr in attrs:
877 attr, value = splitattr(attr)
878 if string.lower(attr) == 'type' and \
879 value in ('a', 'A', 'i', 'I', 'd', 'D'):
880 type = string.upper(value)
881 fp, retrlen = fw.retrfile(file, type)
882 if retrlen is not None and retrlen >= 0:
883 sf = StringIO('Content-Length: %d\n' % retrlen)
884 headers = mimetools.Message(sf)
885 else:
886 headers = noheaders()
887 return addinfourl(fp, headers, req.get_full_url())
888 except ftplib.all_errors, msg:
889 raise IOError, ('ftp error', msg), sys.exc_info()[2]
891 def connect_ftp(self, user, passwd, host, port, dirs):
892 fw = ftpwrapper(user, passwd, host, port, dirs)
893 ## fw.ftp.set_debuglevel(1)
894 return fw
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps connections open for reuse.

    Connections are cached per (user, passwd, host, port) and dropped
    `delay` seconds after their last use, or when the cache holds
    `max_conns` entries.
    """

    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}
        self.timeout = {}
        self.soonest = 0
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the number of seconds an unused connection is kept."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which the oldest entry is dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return the cached connection for the key, creating it if needed."""
        key = user, passwd, host, port
        if not self.cache.has_key(key):
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        self.timeout[key] = time.time() + self.delay
        # Take the connection before pruning: check_cache() may drop
        # this very entry (for example when max_conns is 1).
        conn = self.cache[key]
        self.check_cache()
        return conn

    def check_cache(self):
        """Close expired connections and enforce the size limit."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in self.timeout.items():
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self.soonest = self._soonest_expiry()

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in self.timeout.items():
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = self._soonest_expiry()

    def _soonest_expiry(self):
        """Return the earliest expiry time, or 0 when the cache is empty."""
        # min() raises ValueError on an empty sequence
        if self.timeout:
            return min(self.timeout.values())
        return 0
class GopherHandler(BaseHandler):
    """Opens gopher: URLs through gopherlib."""

    def gopher_open(self, req):
        """Send the selector (and query, if any) to the gopher host.

        Raises GopherError when the URL names no host.
        """
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        gtype, remainder = splitgophertype(req.get_selector())
        selector, query = splitquery(remainder)
        selector = unquote(selector)
        if not query:
            fp = gopherlib.send_selector(selector, host)
        else:
            fp = gopherlib.send_query(selector, unquote(query), host)
        return addinfourl(fp, noheaders(), req.get_full_url())
959 #bleck! don't use this yet
class OpenerFactory:
    """Unfinished factory for openers -- don't use this yet.

    Only the proxy handlers are installed by build_opener(); the other
    handler lists are collected but not used yet.
    """

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    proxy_handlers = [ProxyHandler]
    handlers = []
    replacement_handlers = []

    def add_proxy_handler(self, ph):
        """Append ph to this factory's proxy handlers."""
        # build a new list so the class-level default is not modified
        self.proxy_handlers = self.proxy_handlers + [ph]

    def add_handler(self, h):
        """Append h to this factory's handlers."""
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        pass

    def build_opener(self):
        """Return an OpenerDirector with the proxy handlers installed."""
        # the class is called OpenerDirector; the former spelling
        # 'OpenerDirectory' raised NameError
        opener = OpenerDirector()
        for ph in self.proxy_handlers:
            if type(ph) == types.ClassType:
                ph = ph()
            opener.add_handler(ph)
        return opener
985 if __name__ == "__main__":
986 # XXX some of the test code depends on machine configurations that
987 # are internal to CNRI. Need to set up a public server with the
988 # right authentication configuration for test purposes.
989 if socket.gethostname() == 'bitdiddle':
990 localhost = 'bitdiddle.cnri.reston.va.us'
991 elif socket.gethostname() == 'walden':
992 localhost = 'localhost'
993 else:
994 localhost = None
995 urls = [
996 # Thanks to Fred for finding these!
997 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
998 'gopher://gopher.vt.edu:10010/10/33',
1000 'file:/etc/passwd',
1001 'file://nonsensename/etc/passwd',
1002 'ftp://www.python.org/pub/tmp/httplib.py',
1003 'ftp://www.python.org/pub/tmp/imageop.c',
1004 'ftp://www.python.org/pub/tmp/blat',
1005 'http://www.espn.com/', # redirect
1006 'http://www.python.org/Spanish/Inquistion/',
1007 ('http://grail.cnri.reston.va.us/cgi-bin/faqw.py',
1008 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1009 'http://www.python.org/',
1010 'ftp://prep.ai.mit.edu/welcome.msg',
1011 'ftp://www.python.org/pub/tmp/figure.prn',
1012 'ftp://www.python.org/pub/tmp/interp.pl',
1013 'http://checkproxy.cnri.reston.va.us/test/test.html',
1016 if localhost is not None:
1017 urls = urls + [
1018 'file://%s/etc/passwd' % localhost,
1019 'http://%s/simple/' % localhost,
1020 'http://%s/digest/' % localhost,
1021 'http://%s/not/found.h' % localhost,
1024 bauth = HTTPBasicAuthHandler()
1025 bauth.add_password('basic_test_realm', localhost, 'jhylton',
1026 'password')
1027 dauth = HTTPDigestAuthHandler()
1028 dauth.add_password('digest_test_realm', localhost, 'jhylton',
1029 'password')
1032 cfh = CacheFTPHandler()
1033 cfh.setTimeout(1)
1035 # XXX try out some custom proxy objects too!
1036 def at_cnri(req):
1037 host = req.get_host()
1038 print host
1039 if host[-18:] == '.cnri.reston.va.us':
1040 return 1
1041 p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1042 ph = CustomProxyHandler(p)
1044 install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1046 for url in urls:
1047 if type(url) == types.TupleType:
1048 url, req = url
1049 else:
1050 req = None
1051 print url
1052 try:
1053 f = urlopen(url, req)
1054 except IOError, err:
1055 print "IOError:", err
1056 except socket.error, err:
1057 print "socket.error:", err
1058 else:
1059 buf = f.read()
1060 f.close()
1061 print "read %d bytes" % len(buf)
1062 print
1063 time.sleep(0.1)