1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15 deals with digest authentication.
urlopen(url, data=None) -- basic usage is the same as the original
urllib.  Pass the URL and optionally the data to POST to an HTTP URL,
and get a file-like object back.  One difference is that you can also
pass a Request instance instead of a URL.  Raises a URLError (subclass
of IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.
build_opener -- function that creates a new OpenerDirector instance and
installs the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of a default
handler, the argument will be installed instead of the default.
30 install_opener -- installs a new opener as the default opener.
32 objects of interest:
OpenerDirector -- manages a collection of Handlers and invokes the ones
needed to open the requested URL.
Request -- an object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.
BaseHandler -- the base class that individual protocol and error
Handlers derive from.
41 exceptions:
URLError -- a subclass of IOError; individual protocols have their own
specific subclasses.

HTTPError -- also a valid HTTP response, so you can treat an HTTP error
as an exceptional event or as a valid response.
48 internals:
49 BaseHandler and parent
50 _call_chain conventions
52 Example usage:
54 import urllib2
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password('realm', 'host', 'username', 'password')
60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
62 # build a new opener that adds authentication and caching FTP handlers
63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
65 # install it
66 urllib2.install_opener(opener)
68 f = urllib2.urlopen('http://www.python.org/')
71 """
73 # XXX issues:
# If an authentication error handler tries to perform authentication
# but fails, how should the error be signalled?  The client needs to
# know the HTTP error code.  But if the handler knows what the problem
# was, e.g., that it didn't know the hash algorithm requested in the
# challenge, it would be good to pass that information along to the
# client, too.
81 # XXX to do:
82 # name!
83 # documentation (getting there)
84 # complex proxies
85 # abstract factory for opener
86 # ftp errors aren't handled cleanly
87 # gopher can return a socket.error
88 # check digest against correct (i.e. non-apache) implementation
90 import socket
91 import httplib
92 import inspect
93 import re
94 import base64
95 import urlparse
96 import md5
97 import mimetypes
98 import mimetools
99 import rfc822
100 import ftplib
101 import sys
102 import time
103 import os
104 import gopherlib
105 import posixpath
107 try:
108 from cStringIO import StringIO
109 except ImportError:
110 from StringIO import StringIO
112 try:
113 import sha
114 except ImportError:
115 # need 1.5.2 final
116 sha = None
118 # not sure how many of these need to be gotten rid of
119 from urllib import unwrap, unquote, splittype, splithost, \
120 addinfourl, splitport, splitgophertype, splitquery, \
121 splitattr, ftpwrapper, noheaders
123 # support for proxies via environment variables
124 from urllib import getproxies
126 # support for FileHandler
127 from urllib import localhost, url2pathname
129 __version__ = "2.0a1"
131 _opener = None
132 def urlopen(url, data=None):
133 global _opener
134 if _opener is None:
135 _opener = build_opener()
136 return _opener.open(url, data)
138 def install_opener(opener):
139 global _opener
140 _opener = opener
142 # do these error classes make sense?
143 # make sure all of the IOError stuff is overridden. we just want to be
144 # subtypes.
146 class URLError(IOError):
147 # URLError is a sub-type of IOError, but it doesn't share any of
148 # the implementation. need to override __init__ and __str__
149 def __init__(self, reason):
150 self.reason = reason
152 def __str__(self):
153 return '<urlopen error %s>' % self.reason
155 class HTTPError(URLError, addinfourl):
    """Raised when an HTTP error occurs, but also acts like a non-error return"""
157 __super_init = addinfourl.__init__
159 def __init__(self, url, code, msg, hdrs, fp):
160 self.code = code
161 self.msg = msg
162 self.hdrs = hdrs
163 self.fp = fp
164 self.filename = url
165 # The addinfourl classes depend on fp being a valid file
166 # object. In some cases, the HTTPError may not have a valid
167 # file object. If this happens, the simplest workaround is to
168 # not initialize the base classes.
169 if fp is not None:
170 self.__super_init(fp, hdrs, url)
172 def __str__(self):
173 return 'HTTP Error %s: %s' % (self.code, self.msg)
175 def __del__(self):
176 # XXX is this safe? what if user catches exception, then
177 # extracts fp and discards exception?
178 if self.fp:
179 self.fp.close()
181 class GopherError(URLError):
182 pass
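# Illustrative only (not part of the module): because HTTPError is also a
# valid response object, callers can either treat it as an exception or
# read it.  A minimal sketch; the URL below is just an example:
#
#   try:
#       f = urlopen('http://www.python.org/this-may-404')
#   except HTTPError, e:
#       print e.code, e.msg          # e can still be read like a response
#       body = e.fp and e.fp.read()
#   except URLError, e:
#       print 'failed to reach server:', e.reason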
185 class Request:
187 def __init__(self, url, data=None, headers={}):
188 # unwrap('<URL:type://host/path>') --> 'type://host/path'
189 self.__original = unwrap(url)
190 self.type = None
191 # self.__r_type is what's left after doing the splittype
192 self.host = None
193 self.port = None
194 self.data = data
195 self.headers = {}
196 self.headers.update(headers)
198 def __getattr__(self, attr):
199 # XXX this is a fallback mechanism to guard against these
200 # methods getting called in a non-standard order. this may be
201 # too complicated and/or unnecessary.
202 # XXX should the __r_XXX attributes be public?
203 if attr[:12] == '_Request__r_':
204 name = attr[12:]
205 if hasattr(Request, 'get_' + name):
206 getattr(self, 'get_' + name)()
207 return getattr(self, attr)
208 raise AttributeError, attr
210 def get_method(self):
211 if self.has_data():
212 return "POST"
213 else:
214 return "GET"
216 def add_data(self, data):
217 self.data = data
219 def has_data(self):
220 return self.data is not None
222 def get_data(self):
223 return self.data
225 def get_full_url(self):
226 return self.__original
228 def get_type(self):
229 if self.type is None:
230 self.type, self.__r_type = splittype(self.__original)
231 if self.type is None:
232 raise ValueError, "unknown url type: %s" % self.__original
233 return self.type
235 def get_host(self):
236 if self.host is None:
237 self.host, self.__r_host = splithost(self.__r_type)
238 if self.host:
239 self.host = unquote(self.host)
240 return self.host
242 def get_selector(self):
243 return self.__r_host
245 def set_proxy(self, host, type):
246 self.host, self.type = host, type
247 self.__r_host = self.__original
249 def add_header(self, key, val):
250 # useful for something like authentication
251 self.headers[key] = val
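# Illustrative only (not part of the module): a Request can carry the URL,
# data to POST, and extra headers.  The URL and header values below are
# just examples.
#
#   req = Request('http://www.example.com/cgi-bin/query',
#                 data='q=spam', headers={'User-Agent': 'example/0.1'})
#   req.add_header('Accept', 'text/html')
#   # req.get_method() -> 'POST', because data was supplied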
253 class OpenerDirector:
254 def __init__(self):
255 server_version = "Python-urllib/%s" % __version__
256 self.addheaders = [('User-Agent', server_version)]
257 # manage the individual handlers
258 self.handlers = []
259 self.handle_open = {}
260 self.handle_error = {}
262 def add_handler(self, handler):
263 added = 0
264 for meth in dir(handler):
265 if meth[-5:] == '_open':
266 protocol = meth[:-5]
267 if protocol in self.handle_open:
268 self.handle_open[protocol].append(handler)
269 else:
270 self.handle_open[protocol] = [handler]
271 added = 1
272 continue
273 i = meth.find('_')
274 j = meth[i+1:].find('_') + i + 1
275 if j != -1 and meth[i+1:j] == 'error':
276 proto = meth[:i]
277 kind = meth[j+1:]
278 try:
279 kind = int(kind)
280 except ValueError:
281 pass
282 dict = self.handle_error.get(proto, {})
283 if kind in dict:
284 dict[kind].append(handler)
285 else:
286 dict[kind] = [handler]
287 self.handle_error[proto] = dict
288 added = 1
289 continue
290 if added:
291 self.handlers.append(handler)
292 handler.add_parent(self)
294 def __del__(self):
295 self.close()
297 def close(self):
298 for handler in self.handlers:
299 handler.close()
300 self.handlers = []
302 def _call_chain(self, chain, kind, meth_name, *args):
303 # XXX raise an exception if no one else should try to handle
304 # this url. return None if you can't but someone else could.
305 handlers = chain.get(kind, ())
306 for handler in handlers:
307 func = getattr(handler, meth_name)
309 result = func(*args)
310 if result is not None:
311 return result
313 def open(self, fullurl, data=None):
314 # accept a URL or a Request object
315 if isinstance(fullurl, basestring):
316 req = Request(fullurl, data)
317 else:
318 req = fullurl
319 if data is not None:
320 req.add_data(data)
321 assert isinstance(req, Request) # really only care about interface
323 result = self._call_chain(self.handle_open, 'default',
324 'default_open', req)
325 if result:
326 return result
328 type_ = req.get_type()
329 result = self._call_chain(self.handle_open, type_, type_ + \
330 '_open', req)
331 if result:
332 return result
334 return self._call_chain(self.handle_open, 'unknown',
335 'unknown_open', req)
337 def error(self, proto, *args):
338 if proto in ['http', 'https']:
339 # XXX http[s] protocols are special-cased
340 dict = self.handle_error['http'] # https is not different than http
341 proto = args[2] # YUCK!
342 meth_name = 'http_error_%d' % proto
343 http_err = 1
344 orig_args = args
345 else:
346 dict = self.handle_error
347 meth_name = proto + '_error'
348 http_err = 0
349 args = (dict, proto, meth_name) + args
350 result = self._call_chain(*args)
351 if result:
352 return result
354 if http_err:
355 args = (dict, 'default', 'http_error_default') + orig_args
356 return self._call_chain(*args)
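# Note on handler method names (see _call_chain above): add_handler wires a
# handler in by inspecting its method names.  A method called <proto>_open
# registers the handler for that protocol, and <proto>_error_<code> registers
# it for that protocol's error codes.  A hypothetical sketch:
#
#   class ExampleHandler(BaseHandler):
#       def http_open(self, req):            # called for http URLs
#           return None                      # None means "let someone else try"
#       def http_error_404(self, req, fp, code, msg, hdrs):
#           return None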
358 # XXX probably also want an abstract factory that knows things like
359 # the fact that a ProxyHandler needs to get inserted first.
360 # would also know when it makes sense to skip a superclass in favor of
361 # a subclass and when it might make sense to include both
363 def build_opener(*handlers):
364 """Create an opener object from a list of handlers.
366 The opener will use several default handlers, including support
367 for HTTP and FTP. If there is a ProxyHandler, it must be at the
368 front of the list of handlers. (Yuck.)
370 If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
374 opener = OpenerDirector()
375 default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
376 HTTPDefaultErrorHandler, HTTPRedirectHandler,
377 FTPHandler, FileHandler]
378 if hasattr(httplib, 'HTTPS'):
379 default_classes.append(HTTPSHandler)
380 skip = []
381 for klass in default_classes:
382 for check in handlers:
383 if inspect.isclass(check):
384 if issubclass(check, klass):
385 skip.append(klass)
386 elif isinstance(check, klass):
387 skip.append(klass)
388 for klass in skip:
389 default_classes.remove(klass)
391 for klass in default_classes:
392 opener.add_handler(klass())
394 for h in handlers:
395 if inspect.isclass(h):
396 h = h()
397 opener.add_handler(h)
398 return opener
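# Illustrative only (not part of the module): passing a Handler subclass to
# build_opener replaces the corresponding default.  For example, a
# hypothetical HTTPHandler subclass is installed instead of HTTPHandler:
#
#   class LoggingHTTPHandler(HTTPHandler):
#       def http_open(self, req):
#           print 'opening', req.get_full_url()
#           return HTTPHandler.http_open(self, req)
#
#   opener = build_opener(LoggingHTTPHandler)  # replaces the default HTTPHandler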
400 class BaseHandler:
401 def add_parent(self, parent):
402 self.parent = parent
403 def close(self):
404 self.parent = None
406 class HTTPDefaultErrorHandler(BaseHandler):
407 def http_error_default(self, req, fp, code, msg, hdrs):
408 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
410 class HTTPRedirectHandler(BaseHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a redirection
        response is received.  If a redirection should take place, return a new
        Request to allow http_error_30x to perform the redirect.  Otherwise,
        raise HTTPError if no one else should try to handle this url.  Return
        None if you can't but another Handler might.
        """
        if (code in (301, 302, 303, 307) and req.get_method() in ("GET", "HEAD")
            or code in (302, 303) and req.get_method() == "POST"):
            # Strictly (according to RFC 2616), 302 in response to a POST
            # MUST NOT cause a redirection without confirmation from the user
            # (of urllib2, in this case).  In practice, essentially all clients
            # do redirect in this case, so we do the same.
            return Request(newurl, headers=req.headers)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)
431 # Implementation note: To avoid the server sending us into an
432 # infinite loop, the request object needs to track what URLs we
433 # have already seen. Do this by adding a handler-specific
434 # attribute to the Request object.
435 def http_error_302(self, req, fp, code, msg, headers):
436 if 'location' in headers:
437 newurl = headers['location']
438 elif 'uri' in headers:
439 newurl = headers['uri']
440 else:
441 return
442 newurl = urlparse.urljoin(req.get_full_url(), newurl)
444 # XXX Probably want to forget about the state of the current
445 # request, although that might interact poorly with other
446 # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
448 if new is None:
449 return
451 # loop detection
452 new.error_302_dict = {}
453 if hasattr(req, 'error_302_dict'):
454 if len(req.error_302_dict)>10 or \
455 newurl in req.error_302_dict:
456 raise HTTPError(req.get_full_url(), code,
457 self.inf_msg + msg, headers, fp)
458 new.error_302_dict.update(req.error_302_dict)
459 new.error_302_dict[newurl] = newurl
461 # Don't close the fp until we are sure that we won't use it
462 # with HTTPError.
463 fp.read()
464 fp.close()
466 return self.parent.open(new)
468 http_error_301 = http_error_303 = http_error_307 = http_error_302
470 inf_msg = "The HTTP server returned a redirect error that would" \
471 "lead to an infinite loop.\n" \
472 "The last 302 error message was:\n"
474 class ProxyHandler(BaseHandler):
475 def __init__(self, proxies=None):
476 if proxies is None:
477 proxies = getproxies()
478 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
479 self.proxies = proxies
480 for type, url in proxies.items():
481 setattr(self, '%s_open' % type,
482 lambda r, proxy=url, type=type, meth=self.proxy_open: \
483 meth(r, proxy, type))
485 def proxy_open(self, req, proxy, type):
486 orig_type = req.get_type()
487 type, r_type = splittype(proxy)
488 host, XXX = splithost(r_type)
489 if '@' in host:
490 user_pass, host = host.split('@', 1)
491 if ':' in user_pass:
492 user, password = user_pass.split(':', 1)
493 user_pass = base64.encodestring('%s:%s' % (unquote(user),
494 unquote(password)))
495 req.add_header('Proxy-Authorization', 'Basic ' + user_pass)
496 host = unquote(host)
497 req.set_proxy(host, type)
498 if orig_type == type:
499 # let other handlers take care of it
500 # XXX this only makes sense if the proxy is before the
501 # other handlers
502 return None
503 else:
504 # need to start over, because the other handlers don't
505 # grok the proxy's URL type
506 return self.parent.open(req)
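# Illustrative only (not part of the module): ProxyHandler takes a mapping of
# URL scheme to proxy URL; a user:password in the proxy URL is turned into a
# Proxy-Authorization header.  The proxy address below is just an example.
#
#   proxy_support = ProxyHandler({'http': 'http://user:pass@proxy.example.com:3128'})
#   opener = build_opener(proxy_support)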
508 # feature suggested by Duncan Booth
509 # XXX custom is not a good name
510 class CustomProxy:
511 # either pass a function to the constructor or override handle
512 def __init__(self, proto, func=None, proxy_addr=None):
513 self.proto = proto
514 self.func = func
515 self.addr = proxy_addr
517 def handle(self, req):
518 if self.func and self.func(req):
519 return 1
521 def get_proxy(self):
522 return self.addr
524 class CustomProxyHandler(BaseHandler):
    def __init__(self, *proxies):
        self.proxies = {}
        for proxy in proxies:
            self.add_proxy(proxy)
528 def proxy_open(self, req):
529 proto = req.get_type()
530 try:
531 proxies = self.proxies[proto]
532 except KeyError:
533 return None
534 for p in proxies:
535 if p.handle(req):
                req.set_proxy(p.get_proxy(), proto)
537 return self.parent.open(req)
538 return None
540 def do_proxy(self, p, req):
541 return self.parent.open(req)
543 def add_proxy(self, cpo):
544 if cpo.proto in self.proxies:
545 self.proxies[cpo.proto].append(cpo)
546 else:
547 self.proxies[cpo.proto] = [cpo]
549 class HTTPPasswordMgr:
550 def __init__(self):
551 self.passwd = {}
553 def add_password(self, realm, uri, user, passwd):
554 # uri could be a single URI or a sequence
555 if isinstance(uri, basestring):
556 uri = [uri]
557 uri = tuple(map(self.reduce_uri, uri))
558 if not realm in self.passwd:
559 self.passwd[realm] = {}
560 self.passwd[realm][uri] = (user, passwd)
562 def find_user_password(self, realm, authuri):
563 domains = self.passwd.get(realm, {})
564 authuri = self.reduce_uri(authuri)
565 for uris, authinfo in domains.items():
566 for uri in uris:
567 if self.is_suburi(uri, authuri):
568 return authinfo
569 return None, None
571 def reduce_uri(self, uri):
572 """Accept netloc or URI and extract only the netloc and path"""
573 parts = urlparse.urlparse(uri)
574 if parts[1]:
575 return parts[1], parts[2] or '/'
576 else:
577 return parts[2], '/'
579 def is_suburi(self, base, test):
580 """Check if test is below base in a URI tree
        Both args must be URIs in reduced form.
        """
584 if base == test:
585 return True
586 if base[0] != test[0]:
587 return False
588 common = posixpath.commonprefix((base[1], test[1]))
589 if len(common) == len(base[1]):
590 return True
591 return False
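# Illustrative only (not part of the module): passwords are stored per realm
# and per reduced URI (netloc, path), and find_user_password matches any URI
# at or below a stored one.  The names below are just examples.
#
#   mgr = HTTPPasswordMgr()
#   mgr.add_password('realm', 'http://www.example.com/private/', 'joe', 'secret')
#   mgr.find_user_password('realm', 'http://www.example.com/private/doc.html')
#   # -> ('joe', 'secret'); an unrelated URI returns (None, None)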
594 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
596 def find_user_password(self, realm, authuri):
597 user, password = HTTPPasswordMgr.find_user_password(self,realm,authuri)
598 if user is not None:
599 return user, password
600 return HTTPPasswordMgr.find_user_password(self, None, authuri)
603 class AbstractBasicAuthHandler:
605 rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)
607 # XXX there can actually be multiple auth-schemes in a
608 # www-authenticate header. should probably be a lot more careful
609 # in parsing them to extract multiple alternatives
611 def __init__(self, password_mgr=None):
612 if password_mgr is None:
613 password_mgr = HTTPPasswordMgr()
614 self.passwd = password_mgr
615 self.add_password = self.passwd.add_password
617 def http_error_auth_reqed(self, authreq, host, req, headers):
618 # XXX could be multiple headers
619 authreq = headers.get(authreq, None)
620 if authreq:
621 mo = AbstractBasicAuthHandler.rx.match(authreq)
622 if mo:
623 scheme, realm = mo.groups()
624 if scheme.lower() == 'basic':
625 return self.retry_http_basic_auth(host, req, realm)
627 def retry_http_basic_auth(self, host, req, realm):
628 user,pw = self.passwd.find_user_password(realm, host)
629 if pw:
630 raw = "%s:%s" % (user, pw)
631 auth = 'Basic %s' % base64.encodestring(raw).strip()
632 if req.headers.get(self.auth_header, None) == auth:
633 return None
634 req.add_header(self.auth_header, auth)
635 return self.parent.open(req)
636 else:
637 return None
639 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
641 auth_header = 'Authorization'
643 def http_error_401(self, req, fp, code, msg, headers):
644 host = urlparse.urlparse(req.get_full_url())[1]
645 return self.http_error_auth_reqed('www-authenticate',
646 host, req, headers)
649 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
651 auth_header = 'Proxy-Authorization'
653 def http_error_407(self, req, fp, code, msg, headers):
654 host = req.get_host()
655 return self.http_error_auth_reqed('proxy-authenticate',
656 host, req, headers)
659 class AbstractDigestAuthHandler:
661 def __init__(self, passwd=None):
662 if passwd is None:
663 passwd = HTTPPasswordMgr()
664 self.passwd = passwd
665 self.add_password = self.passwd.add_password
667 def http_error_auth_reqed(self, authreq, host, req, headers):
        authreq = headers.get(authreq, None)
669 if authreq:
670 kind = authreq.split()[0]
671 if kind == 'Digest':
672 return self.retry_http_digest_auth(req, authreq)
674 def retry_http_digest_auth(self, req, auth):
675 token, challenge = auth.split(' ', 1)
676 chal = parse_keqv_list(parse_http_list(challenge))
677 auth = self.get_authorization(req, chal)
678 if auth:
679 auth_val = 'Digest %s' % auth
680 if req.headers.get(self.auth_header, None) == auth_val:
681 return None
682 req.add_header(self.auth_header, auth_val)
683 resp = self.parent.open(req)
684 return resp
686 def get_authorization(self, req, chal):
687 try:
688 realm = chal['realm']
689 nonce = chal['nonce']
690 algorithm = chal.get('algorithm', 'MD5')
691 # mod_digest doesn't send an opaque, even though it isn't
692 # supposed to be optional
693 opaque = chal.get('opaque', None)
694 except KeyError:
695 return None
697 H, KD = self.get_algorithm_impls(algorithm)
698 if H is None:
699 return None
701 user, pw = self.passwd.find_user_password(realm,
702 req.get_full_url())
703 if user is None:
704 return None
706 # XXX not implemented yet
707 if req.has_data():
708 entdig = self.get_entity_digest(req.get_data(), chal)
709 else:
710 entdig = None
712 A1 = "%s:%s:%s" % (user, realm, pw)
713 A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
714 # XXX selector: what about proxies and full urls
715 req.get_selector())
716 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
717 # XXX should the partial digests be encoded too?
719 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
720 'response="%s"' % (user, realm, nonce, req.get_selector(),
721 respdig)
722 if opaque:
723 base = base + ', opaque="%s"' % opaque
724 if entdig:
725 base = base + ', digest="%s"' % entdig
726 if algorithm != 'MD5':
727 base = base + ', algorithm="%s"' % algorithm
728 return base
730 def get_algorithm_impls(self, algorithm):
731 # lambdas assume digest modules are imported at the top level
732 if algorithm == 'MD5':
733 H = lambda x, e=encode_digest:e(md5.new(x).digest())
734 elif algorithm == 'SHA':
            H = lambda x, e=encode_digest:e(sha.new(x).digest())
        else:
            # unknown algorithm; the caller checks for H is None and gives up
            H = None
        # XXX MD5-sess
        KD = lambda s, d, H=H: H("%s:%s" % (s, d))
738 return H, KD
740 def get_entity_digest(self, data, chal):
741 # XXX not implemented yet
742 return None
745 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
746 """An authentication protocol defined by RFC 2069
748 Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """
    auth_header = 'Authorization'
754 def http_error_401(self, req, fp, code, msg, headers):
755 host = urlparse.urlparse(req.get_full_url())[1]
        return self.http_error_auth_reqed('www-authenticate',
                                          host, req, headers)
759 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    auth_header = 'Proxy-Authorization'
763 def http_error_407(self, req, fp, code, msg, headers):
764 host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          host, req, headers)
768 def encode_digest(digest):
769 hexrep = []
770 for c in digest:
771 n = (ord(c) >> 4) & 0xf
772 hexrep.append(hex(n)[-1])
773 n = ord(c) & 0xf
774 hexrep.append(hex(n)[-1])
775 return ''.join(hexrep)
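# Illustrative: encode_digest turns a binary digest string into lowercase
# hex, e.g. encode_digest('\xab\x01') -> 'ab01'.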
778 class AbstractHTTPHandler(BaseHandler):
780 def do_open(self, http_class, req):
781 host = req.get_host()
782 if not host:
783 raise URLError('no host given')
785 try:
786 h = http_class(host) # will parse host:port
787 if req.has_data():
788 data = req.get_data()
789 h.putrequest('POST', req.get_selector())
790 if not 'Content-type' in req.headers:
791 h.putheader('Content-type',
792 'application/x-www-form-urlencoded')
793 if not 'Content-length' in req.headers:
794 h.putheader('Content-length', '%d' % len(data))
795 else:
796 h.putrequest('GET', req.get_selector())
797 except socket.error, err:
798 raise URLError(err)
800 scheme, sel = splittype(req.get_selector())
801 sel_host, sel_path = splithost(sel)
802 h.putheader('Host', sel_host or host)
803 for args in self.parent.addheaders:
804 name, value = args
805 if name not in req.headers:
806 h.putheader(*args)
807 for k, v in req.headers.items():
808 h.putheader(k, v)
809 h.endheaders()
810 if req.has_data():
811 h.send(data)
813 code, msg, hdrs = h.getreply()
814 fp = h.getfile()
815 if code == 200:
816 return addinfourl(fp, hdrs, req.get_full_url())
817 else:
818 return self.parent.error('http', req, fp, code, msg, hdrs)
821 class HTTPHandler(AbstractHTTPHandler):
823 def http_open(self, req):
824 return self.do_open(httplib.HTTP, req)
827 if hasattr(httplib, 'HTTPS'):
828 class HTTPSHandler(AbstractHTTPHandler):
830 def https_open(self, req):
831 return self.do_open(httplib.HTTPS, req)
834 class UnknownHandler(BaseHandler):
835 def unknown_open(self, req):
836 type = req.get_type()
837 raise URLError('unknown url type: %s' % type)
839 def parse_keqv_list(l):
840 """Parse list of key=value strings where keys are not duplicated."""
841 parsed = {}
842 for elt in l:
843 k, v = elt.split('=', 1)
844 if v[0] == '"' and v[-1] == '"':
845 v = v[1:-1]
846 parsed[k] = v
847 return parsed
849 def parse_http_list(s):
850 """Parse lists as described by RFC 2068 Section 2.
    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.
    """
856 # XXX this function could probably use more testing
858 list = []
859 end = len(s)
860 i = 0
861 inquote = 0
862 start = 0
863 while i < end:
864 cur = s[i:]
865 c = cur.find(',')
866 q = cur.find('"')
867 if c == -1:
868 list.append(s[start:])
869 break
870 if q == -1:
871 if inquote:
872 raise ValueError, "unbalanced quotes"
873 else:
874 list.append(s[start:i+c])
875 i = i + c + 1
876 continue
877 if inquote:
878 if q < c:
879 list.append(s[start:i+c])
880 i = i + c + 1
881 start = i
882 inquote = 0
883 else:
884 i = i + q
885 else:
886 if c < q:
887 list.append(s[start:i+c])
888 i = i + c + 1
889 start = i
890 else:
891 inquote = 1
892 i = i + q + 1
893 return map(lambda x: x.strip(), list)
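# Illustrative: parse_http_list and parse_keqv_list are used together to
# pick apart a Digest challenge, e.g.
#
#   parse_http_list('a, b, "c, d"')             # -> ['a', 'b', '"c, d"']
#   parse_keqv_list(['realm="x"', 'nonce="y"'])  # -> {'realm': 'x', 'nonce': 'y'}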
895 class FileHandler(BaseHandler):
896 # Use local file or FTP depending on form of URL
897 def file_open(self, req):
898 url = req.get_selector()
899 if url[:2] == '//' and url[2:3] != '/':
900 req.type = 'ftp'
901 return self.parent.open(req)
902 else:
903 return self.open_local_file(req)
905 # names for the localhost
906 names = None
907 def get_names(self):
908 if FileHandler.names is None:
909 FileHandler.names = (socket.gethostbyname('localhost'),
910 socket.gethostbyname(socket.gethostname()))
911 return FileHandler.names
913 # not entirely sure what the rules are here
914 def open_local_file(self, req):
915 host = req.get_host()
916 file = req.get_selector()
917 localfile = url2pathname(file)
918 stats = os.stat(localfile)
919 size = stats.st_size
920 modified = rfc822.formatdate(stats.st_mtime)
921 mtype = mimetypes.guess_type(file)[0]
922 headers = mimetools.Message(StringIO(
923 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
924 (mtype or 'text/plain', size, modified)))
925 if host:
926 host, port = splitport(host)
927 if not host or \
928 (not port and socket.gethostbyname(host) in self.get_names()):
929 return addinfourl(open(localfile, 'rb'),
930 headers, 'file:'+file)
931 raise URLError('file not on local host')
933 class FTPHandler(BaseHandler):
934 def ftp_open(self, req):
935 host = req.get_host()
936 if not host:
937 raise IOError, ('ftp error', 'no host given')
938 # XXX handle custom username & password
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        try:
            host = socket.gethostbyname(host)
        except socket.error, msg:
            raise URLError(msg)
946 path, attrs = splitattr(req.get_selector())
947 path = unquote(path)
948 dirs = path.split('/')
949 dirs, file = dirs[:-1], dirs[-1]
950 if dirs and not dirs[0]:
951 dirs = dirs[1:]
952 user = passwd = '' # XXX
953 try:
954 fw = self.connect_ftp(user, passwd, host, port, dirs)
955 type = file and 'I' or 'D'
956 for attr in attrs:
957 attr, value = splitattr(attr)
958 if attr.lower() == 'type' and \
959 value in ('a', 'A', 'i', 'I', 'd', 'D'):
960 type = value.upper()
961 fp, retrlen = fw.retrfile(file, type)
962 headers = ""
963 mtype = mimetypes.guess_type(req.get_full_url())[0]
964 if mtype:
965 headers += "Content-Type: %s\n" % mtype
966 if retrlen is not None and retrlen >= 0:
967 headers += "Content-Length: %d\n" % retrlen
968 sf = StringIO(headers)
969 headers = mimetools.Message(sf)
970 return addinfourl(fp, headers, req.get_full_url())
971 except ftplib.all_errors, msg:
972 raise IOError, ('ftp error', msg), sys.exc_info()[2]
974 def connect_ftp(self, user, passwd, host, port, dirs):
975 fw = ftpwrapper(user, passwd, host, port, dirs)
976 ## fw.ftp.set_debuglevel(1)
977 return fw
979 class CacheFTPHandler(FTPHandler):
980 # XXX would be nice to have pluggable cache strategies
981 # XXX this stuff is definitely not thread safe
982 def __init__(self):
983 self.cache = {}
984 self.timeout = {}
985 self.soonest = 0
986 self.delay = 60
987 self.max_conns = 16
989 def setTimeout(self, t):
990 self.delay = t
992 def setMaxConns(self, m):
993 self.max_conns = m
995 def connect_ftp(self, user, passwd, host, port, dirs):
996 key = user, passwd, host, port
997 if key in self.cache:
998 self.timeout[key] = time.time() + self.delay
999 else:
1000 self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
1001 self.timeout[key] = time.time() + self.delay
1002 self.check_cache()
1003 return self.cache[key]
1005 def check_cache(self):
1006 # first check for old ones
1007 t = time.time()
1008 if self.soonest <= t:
1009 for k, v in self.timeout.items():
1010 if v < t:
1011 self.cache[k].close()
1012 del self.cache[k]
1013 del self.timeout[k]
1014 self.soonest = min(self.timeout.values())
1016 # then check the size
1017 if len(self.cache) == self.max_conns:
1018 for k, v in self.timeout.items():
1019 if v == self.soonest:
1020 del self.cache[k]
1021 del self.timeout[k]
1022 break
1023 self.soonest = min(self.timeout.values())
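# Illustrative only (not part of the module): CacheFTPHandler keeps FTP
# connections alive between requests; the timeout and connection limit are
# tunable.  The values below are just examples.
#
#   cfh = CacheFTPHandler()
#   cfh.setTimeout(30)      # seconds to keep an idle connection cached
#   cfh.setMaxConns(4)
#   opener = build_opener(cfh)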
1025 class GopherHandler(BaseHandler):
1026 def gopher_open(self, req):
1027 host = req.get_host()
1028 if not host:
1029 raise GopherError('no host given')
1030 host = unquote(host)
1031 selector = req.get_selector()
1032 type, selector = splitgophertype(selector)
1033 selector, query = splitquery(selector)
1034 selector = unquote(selector)
1035 if query:
1036 query = unquote(query)
1037 fp = gopherlib.send_query(selector, query, host)
1038 else:
1039 fp = gopherlib.send_selector(selector, host)
1040 return addinfourl(fp, noheaders(), req.get_full_url())
1042 #bleck! don't use this yet
1043 class OpenerFactory:
1045 default_handlers = [UnknownHandler, HTTPHandler,
1046 HTTPDefaultErrorHandler, HTTPRedirectHandler,
1047 FTPHandler, FileHandler]
1048 proxy_handlers = [ProxyHandler]
1049 handlers = []
1050 replacement_handlers = []
1052 def add_proxy_handler(self, ph):
1053 self.proxy_handlers = self.proxy_handlers + [ph]
1055 def add_handler(self, h):
1056 self.handlers = self.handlers + [h]
1058 def replace_handler(self, h):
1059 pass
1061 def build_opener(self):
1062 opener = OpenerDirector()
1063 for ph in self.proxy_handlers:
1064 if inspect.isclass(ph):
1065 ph = ph()
            opener.add_handler(ph)
        return opener
1068 if __name__ == "__main__":
1069 # XXX some of the test code depends on machine configurations that
1070 # are internal to CNRI. Need to set up a public server with the
1071 # right authentication configuration for test purposes.
1072 if socket.gethostname() == 'bitdiddle':
1073 localhost = 'bitdiddle.cnri.reston.va.us'
1074 elif socket.gethostname() == 'bitdiddle.concentric.net':
1075 localhost = 'localhost'
1076 else:
1077 localhost = None
1078 urls = [
1079 # Thanks to Fred for finding these!
1080 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1081 'gopher://gopher.vt.edu:10010/10/33',
1083 'file:/etc/passwd',
1084 'file://nonsensename/etc/passwd',
1085 'ftp://www.python.org/pub/python/misc/sousa.au',
1086 'ftp://www.python.org/pub/tmp/blat',
1087 'http://www.espn.com/', # redirect
1088 'http://www.python.org/Spanish/Inquistion/',
1089 ('http://www.python.org/cgi-bin/faqw.py',
1090 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1091 'http://www.python.org/',
        'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
        ]
1095 ## if localhost is not None:
1096 ## urls = urls + [
1097 ## 'file://%s/etc/passwd' % localhost,
1098 ## 'http://%s/simple/' % localhost,
1099 ## 'http://%s/digest/' % localhost,
1100 ## 'http://%s/not/found.h' % localhost,
1101 ## ]
1103 ## bauth = HTTPBasicAuthHandler()
1104 ## bauth.add_password('basic_test_realm', localhost, 'jhylton',
1105 ## 'password')
1106 ## dauth = HTTPDigestAuthHandler()
1107 ## dauth.add_password('digest_test_realm', localhost, 'jhylton',
1108 ## 'password')
1111 cfh = CacheFTPHandler()
1112 cfh.setTimeout(1)
1114 ## # XXX try out some custom proxy objects too!
1115 ## def at_cnri(req):
1116 ## host = req.get_host()
1117 ## print host
1118 ## if host[-18:] == '.cnri.reston.va.us':
1119 ## return 1
1120 ## p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1121 ## ph = CustomProxyHandler(p)
1123 ## install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1124 install_opener(build_opener(cfh, GopherHandler))
1126 for url in urls:
1127 if isinstance(url, tuple):
1128 url, req = url
1129 else:
1130 req = None
1131 print url
1132 try:
1133 f = urlopen(url, req)
1134 except IOError, err:
1135 print "IOError:", err
1136 except socket.error, err:
1137 print "socket.error:", err
1138 else:
1139 buf = f.read()
1140 f.close()
1141 print "read %d bytes" % len(buf)
1142 print
1143 time.sleep(0.1)