Lib/urllib2.py

   1 """An extensible library for opening URLs using a variety of protocols
   2
   3 The simplest way to use this module is to call the urlopen function,
   4 which accepts a string containing a URL or a Request object (described
   5 below).  It opens the URL and returns the results as file-like
   6 object; the returned object has some extra methods described below.
   7
   8 The OpenerDirector manages a collection of Handler objects that do
   9 all the actual work.  Each Handler implements a particular protocol or
  10 option.  The OpenerDirector is a composite object that invokes the
  11 Handlers needed to open the requested URL.  For example, the
  12 HTTPHandler performs HTTP GET and POST requests and deals with
  13 non-error returns.  The HTTPRedirectHandler automatically deals with
  14 HTTP 301 & 302 redirect errors, and the HTTPDigestAuthHandler deals
  15 with digest authentication.
  16
   17 urlopen(url, data=None) -- basic usage is the same as the original
  18 urllib.  pass the url and optionally data to post to an HTTP URL, and
  19 get a file-like object back.  One difference is that you can also pass
  20 a Request instance instead of URL.  Raises a URLError (subclass of
  21 IOError); for HTTP errors, raises an HTTPError, which can also be
  22 treated as a valid response.
  23
  24 build_opener -- function that creates a new OpenerDirector instance.
  25 will install the default handlers.  accepts one or more Handlers as
  26 arguments, either instances or Handler classes that it will
   27 instantiate.  if one of the arguments is a subclass of the default
  28 handler, the argument will be installed instead of the default.
  29
  30 install_opener -- installs a new opener as the default opener.
  31
  32 objects of interest:
  33 OpenerDirector --
  34
  35 Request -- an object that encapsulates the state of a request.  the
   36 state can be as simple as the URL.  it can also include extra HTTP
  37 headers, e.g. a User-Agent.
  38
  39 BaseHandler --
  40
  41 exceptions:
  42 URLError-- a subclass of IOError, individual protocols have their own
  43 specific subclass
  44
  45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
  46 as an exceptional event or valid response
  47
  48 internals:
  49 BaseHandler and parent
  50 _call_chain conventions
  51
  52 Example usage:
  53
  54 import urllib2
  55
  56 # set up authentication info
  57 authinfo = urllib2.HTTPBasicAuthHandler()
  58 authinfo.add_password('realm', 'host', 'username', 'password')
  59
  60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
  61
  62 # build a new opener that adds authentication and caching FTP handlers
  63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
  64
  65 # install it
  66 urllib2.install_opener(opener)
  67
  68 f = urllib2.urlopen('http://www.python.org/')
  69
  70
  71 """
  72
  73 # XXX issues:
  74 # If an authentication error handler that tries to perform
  75 # authentication for some reason but fails, how should the error be
  76 # signalled?  The client needs to know the HTTP error code.  But if
   77 # the handler knows what the problem was, e.g., that it didn't know
   78 # the hash algo that was requested in the challenge, it would be good to
  79 # pass that information along to the client, too.
  80
  81 # XXX to do:
  82 # name!
  83 # documentation (getting there)
  84 # complex proxies
  85 # abstract factory for opener
  86 # ftp errors aren't handled cleanly
  87 # gopher can return a socket.error
  88 # check digest against correct (i.e. non-apache) implementation
  89
  90 import socket
  91 import httplib
  92 import inspect
  93 import re
  94 import base64
  95 import urlparse
  96 import md5
  97 import mimetypes
  98 import mimetools
  99 import rfc822
 100 import ftplib
 101 import sys
 102 import time
 103 import os
 104 import gopherlib
 105 import posixpath
 106
 107 try:
 108     from cStringIO import StringIO
 109 except ImportError:
 110     from StringIO import StringIO
 111
 112 try:
 113     import sha
 114 except ImportError:
 115     # need 1.5.2 final
 116     sha = None
 117
 118 # not sure how many of these need to be gotten rid of
 119 from urllib import unwrap, unquote, splittype, splithost, \
 120      addinfourl, splitport, splitgophertype, splitquery, \
 121      splitattr, ftpwrapper, noheaders
 122
 123 # support for proxies via environment variables
 124 from urllib import getproxies
 125
 126 # support for FileHandler
 127 from urllib import localhost, url2pathname
 128
# Module version string; OpenerDirector interpolates it into the default
# User-Agent header ("Python-urllib/<version>").
__version__ = "2.0a1"
 130
 131 _opener = None
 132 def urlopen(url, data=None):
 133     global _opener
 134     if _opener is None:
 135         _opener = build_opener()
 136     return _opener.open(url, data)
 137
 138 def install_opener(opener):
 139     global _opener
 140     _opener = opener
 141
 142 # do these error classes make sense?
 143 # make sure all of the IOError stuff is overridden.  we just want to be
 144 # subtypes.
 145
 146 class URLError(IOError):
 147     # URLError is a sub-type of IOError, but it doesn't share any of
 148     # the implementation.  need to override __init__ and __str__
 149     def __init__(self, reason):
 150         self.reason = reason
 151
 152     def __str__(self):
 153         return '<urlopen error %s>' % self.reason
 154
 155 class HTTPError(URLError, addinfourl):
 156     """Raised when HTTP error occurs, but also acts like non-error return"""
 157     __super_init = addinfourl.__init__
 158
 159     def __init__(self, url, code, msg, hdrs, fp):
 160         self.code = code
 161         self.msg = msg
 162         self.hdrs = hdrs
 163         self.fp = fp
 164         self.filename = url
 165         # The addinfourl classes depend on fp being a valid file
 166         # object.  In some cases, the HTTPError may not have a valid
 167         # file object.  If this happens, the simplest workaround is to
 168         # not initialize the base classes.
 169         if fp is not None:
 170             self.__super_init(fp, hdrs, url)
 171
 172     def __str__(self):
 173         return 'HTTP Error %s: %s' % (self.code, self.msg)
 174
 175     def __del__(self):
 176         # XXX is this safe? what if user catches exception, then
 177         # extracts fp and discards exception?
 178         if self.fp:
 179             self.fp.close()
 180
 181 class GopherError(URLError):
 182     pass
 183
 184
class Request:
    """Holds the state of one request: URL, optional data and headers.

    The URL is split lazily.  get_type() and get_host() each cache
    their result and store the unparsed remainder in a private
    __r_<name> attribute; __getattr__ fills those in on demand.
    """

    def __init__(self, url, data=None, headers={}):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        # Copy the caller's mapping so the (shared) default dict and
        # the caller's own dict are never mutated by add_header().
        self.headers = {}
        self.headers.update(headers)

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        # Only the name-mangled private attributes (_Request__r_<name>)
        # are handled: the matching get_<name>() is called, which is
        # expected to set the attribute, and the lookup is retried.
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError, attr

    def add_data(self, data):
        """Replace the request data."""
        self.data = data

    def has_data(self):
        """Return true if data is set (anything other than None)."""
        return self.data is not None

    def get_data(self):
        """Return the request data (None if there is none)."""
        return self.data

    def get_full_url(self):
        """Return the URL as given to the constructor, after unwrap()."""
        return self.__original

    def get_type(self):
        """Return the URL type, splitting it off the URL on first use.

        Raises ValueError if splittype() finds no type.
        """
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError, "unknown url type: %s" % self.__original
        return self.type

    def get_host(self):
        """Return the unquoted host, splitting it off on first use."""
        if self.host is None:
            # Reading __r_type may go through __getattr__, which calls
            # get_type() first if the URL has not been split yet.
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what is left of the URL after the host was split off.

        After set_proxy() this is the full original URL instead.
        """
        return self.__r_host

    def set_proxy(self, host, type):
        """Point the request at a proxy given by host and type.

        The selector becomes the complete original URL.
        """
        self.host, self.type = host, type
        self.__r_host = self.__original

    def add_header(self, key, val):
        """Set header key to val, replacing any existing value."""
        # useful for something like authentication
        self.headers[key] = val
 246
 247 class OpenerDirector:
 248     def __init__(self):
 249         server_version = "Python-urllib/%s" % __version__
 250         self.addheaders = [('User-Agent', server_version)]
 251         # manage the individual handlers
 252         self.handlers = []
 253         self.handle_open = {}
 254         self.handle_error = {}
 255
 256     def add_handler(self, handler):
 257         added = 0
 258         for meth in dir(handler):
 259             if meth[-5:] == '_open':
 260                 protocol = meth[:-5]
 261                 if protocol in self.handle_open:
 262                     self.handle_open[protocol].append(handler)
 263                 else:
 264                     self.handle_open[protocol] = [handler]
 265                 added = 1
 266                 continue
 267             i = meth.find('_')
 268             j = meth[i+1:].find('_') + i + 1
 269             if j != -1 and meth[i+1:j] == 'error':
 270                 proto = meth[:i]
 271                 kind = meth[j+1:]
 272                 try:
 273                     kind = int(kind)
 274                 except ValueError:
 275                     pass
 276                 dict = self.handle_error.get(proto, {})
 277                 if kind in dict:
 278                     dict[kind].append(handler)
 279                 else:
 280                     dict[kind] = [handler]
 281                 self.handle_error[proto] = dict
 282                 added = 1
 283                 continue
 284         if added:
 285             self.handlers.append(handler)
 286             handler.add_parent(self)
 287
 288     def __del__(self):
 289         self.close()
 290
 291     def close(self):
 292         for handler in self.handlers:
 293             handler.close()
 294         self.handlers = []
 295
 296     def _call_chain(self, chain, kind, meth_name, *args):
 297         # XXX raise an exception if no one else should try to handle
 298         # this url.  return None if you can't but someone else could.
 299         handlers = chain.get(kind, ())
 300         for handler in handlers:
 301             func = getattr(handler, meth_name)
 302
 303             result = func(*args)
 304             if result is not None:
 305                 return result
 306
 307     def open(self, fullurl, data=None):
 308         # accept a URL or a Request object
 309         if isinstance(fullurl, basestring):
 310             req = Request(fullurl, data)
 311         else:
 312             req = fullurl
 313             if data is not None:
 314                 req.add_data(data)
 315         assert isinstance(req, Request) # really only care about interface
 316
 317         result = self._call_chain(self.handle_open, 'default',
 318                                   'default_open', req)
 319         if result:
 320             return result
 321
 322         type_ = req.get_type()
 323         result = self._call_chain(self.handle_open, type_, type_ + \
 324                                   '_open', req)
 325         if result:
 326             return result
 327
 328         return self._call_chain(self.handle_open, 'unknown',
 329                                 'unknown_open', req)
 330
 331     def error(self, proto, *args):
 332         if proto in ['http', 'https']:
 333             # XXX http[s] protocols are special-cased
 334             dict = self.handle_error['http'] # https is not different than http
 335             proto = args[2]  # YUCK!
 336             meth_name = 'http_error_%d' % proto
 337             http_err = 1
 338             orig_args = args
 339         else:
 340             dict = self.handle_error
 341             meth_name = proto + '_error'
 342             http_err = 0
 343         args = (dict, proto, meth_name) + args
 344         result = self._call_chain(*args)
 345         if result:
 346             return result
 347
 348         if http_err:
 349             args = (dict, 'default', 'http_error_default') + orig_args
 350             return self._call_chain(*args)
 351
 352 # XXX probably also want an abstract factory that knows things like
 353 # the fact that a ProxyHandler needs to get inserted first.
 354 # would also know when it makes sense to skip a superclass in favor of
 355 # a subclass and when it might make sense to include both
 356
 357 def build_opener(*handlers):
 358     """Create an opener object from a list of handlers.
 359
 360     The opener will use several default handlers, including support
 361     for HTTP and FTP.  If there is a ProxyHandler, it must be at the
 362     front of the list of handlers.  (Yuck.)
 363
 364     If any of the handlers passed as arguments are subclasses of the
 365     default handlers, the default handlers will not be used.
 366     """
 367
 368     opener = OpenerDirector()
 369     default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
 370                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
 371                        FTPHandler, FileHandler]
 372     if hasattr(httplib, 'HTTPS'):
 373         default_classes.append(HTTPSHandler)
 374     skip = []
 375     for klass in default_classes:
 376         for check in handlers:
 377             if inspect.isclass(check):
 378                 if issubclass(check, klass):
 379                     skip.append(klass)
 380             elif isinstance(check, klass):
 381                 skip.append(klass)
 382     for klass in skip:
 383         default_classes.remove(klass)
 384
 385     for klass in default_classes:
 386         opener.add_handler(klass())
 387
 388     for h in handlers:
 389         if inspect.isclass(h):
 390             h = h()
 391         opener.add_handler(h)
 392     return opener
 393
 394 class BaseHandler:
 395     def add_parent(self, parent):
 396         self.parent = parent
 397     def close(self):
 398         self.parent = None
 399
 400 class HTTPDefaultErrorHandler(BaseHandler):
 401     def http_error_default(self, req, fp, code, msg, hdrs):
 402         raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
 403
 404 class HTTPRedirectHandler(BaseHandler):
 405     # Implementation note: To avoid the server sending us into an
 406     # infinite loop, the request object needs to track what URLs we
 407     # have already seen.  Do this by adding a handler-specific
 408     # attribute to the Request object.
 409     def http_error_302(self, req, fp, code, msg, headers):
 410         if 'location' in headers:
 411             newurl = headers['location']
 412         elif 'uri' in headers:
 413             newurl = headers['uri']
 414         else:
 415             return
 416         newurl = urlparse.urljoin(req.get_full_url(), newurl)
 417
 418         # XXX Probably want to forget about the state of the current
 419         # request, although that might interact poorly with other
 420         # handlers that also use handler-specific request attributes
 421         new = Request(newurl, req.get_data(), req.headers)
 422         new.error_302_dict = {}
 423         if hasattr(req, 'error_302_dict'):
 424             if len(req.error_302_dict)>10 or \
 425                newurl in req.error_302_dict:
 426                 raise HTTPError(req.get_full_url(), code,
 427                                 self.inf_msg + msg, headers, fp)
 428             new.error_302_dict.update(req.error_302_dict)
 429         new.error_302_dict[newurl] = newurl
 430
 431         # Don't close the fp until we are sure that we won't use it
 432         # with HTTPError.
 433         fp.read()
 434         fp.close()
 435
 436         return self.parent.open(new)
 437
 438     http_error_301 = http_error_302
 439
 440     inf_msg = "The HTTP server returned a redirect error that would" \
 441               "lead to an infinite loop.\n" \
 442               "The last 302 error message was:\n"
 443
 444 class ProxyHandler(BaseHandler):
 445     def __init__(self, proxies=None):
 446         if proxies is None:
 447             proxies = getproxies()
 448         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
 449         self.proxies = proxies
 450         for type, url in proxies.items():
 451             setattr(self, '%s_open' % type,
 452                     lambda r, proxy=url, type=type, meth=self.proxy_open: \
 453                     meth(r, proxy, type))
 454
 455     def proxy_open(self, req, proxy, type):
 456         orig_type = req.get_type()
 457         type, r_type = splittype(proxy)
 458         host, XXX = splithost(r_type)
 459         if '@' in host:
 460             user_pass, host = host.split('@', 1)
 461             if ':' in user_pass:
 462                 user, password = user_pass.split(':', 1)
 463                 user_pass = base64.encodestring('%s:%s' % (unquote(user),
 464                                                            unquote(password)))
 465                 req.add_header('Proxy-Authorization', 'Basic ' + user_pass)
 466         host = unquote(host)
 467         req.set_proxy(host, type)
 468         if orig_type == type:
 469             # let other handlers take care of it
 470             # XXX this only makes sense if the proxy is before the
 471             # other handlers
 472             return None
 473         else:
 474             # need to start over, because the other handlers don't
 475             # grok the proxy's URL type
 476             return self.parent.open(req)
 477
 478 # feature suggested by Duncan Booth
 479 # XXX custom is not a good name
 480 class CustomProxy:
 481     # either pass a function to the constructor or override handle
 482     def __init__(self, proto, func=None, proxy_addr=None):
 483         self.proto = proto
 484         self.func = func
 485         self.addr = proxy_addr
 486
 487     def handle(self, req):
 488         if self.func and self.func(req):
 489             return 1
 490
 491     def get_proxy(self):
 492         return self.addr
 493
 494 class CustomProxyHandler(BaseHandler):
 495     def __init__(self, *proxies):
 496         self.proxies = {}
 497
 498     def proxy_open(self, req):
 499         proto = req.get_type()
 500         try:
 501             proxies = self.proxies[proto]
 502         except KeyError:
 503             return None
 504         for p in proxies:
 505             if p.handle(req):
 506                 req.set_proxy(p.get_proxy())
 507                 return self.parent.open(req)
 508         return None
 509
 510     def do_proxy(self, p, req):
 511         return self.parent.open(req)
 512
 513     def add_proxy(self, cpo):
 514         if cpo.proto in self.proxies:
 515             self.proxies[cpo.proto].append(cpo)
 516         else:
 517             self.proxies[cpo.proto] = [cpo]
 518
 519 class HTTPPasswordMgr:
 520     def __init__(self):
 521         self.passwd = {}
 522
 523     def add_password(self, realm, uri, user, passwd):
 524         # uri could be a single URI or a sequence
 525         if isinstance(uri, basestring):
 526             uri = [uri]
 527         uri = tuple(map(self.reduce_uri, uri))
 528         if not realm in self.passwd:
 529             self.passwd[realm] = {}
 530         self.passwd[realm][uri] = (user, passwd)
 531
 532     def find_user_password(self, realm, authuri):
 533         domains = self.passwd.get(realm, {})
 534         authuri = self.reduce_uri(authuri)
 535         for uris, authinfo in domains.items():
 536             for uri in uris:
 537                 if self.is_suburi(uri, authuri):
 538                     return authinfo
 539         return None, None
 540
 541     def reduce_uri(self, uri):
 542         """Accept netloc or URI and extract only the netloc and path"""
 543         parts = urlparse.urlparse(uri)
 544         if parts[1]:
 545             return parts[1], parts[2] or '/'
 546         else:
 547             return parts[2], '/'
 548
 549     def is_suburi(self, base, test):
 550         """Check if test is below base in a URI tree
 551
 552         Both args must be URIs in reduced form.
 553         """
 554         if base == test:
 555             return True
 556         if base[0] != test[0]:
 557             return False
 558         common = posixpath.commonprefix((base[1], test[1]))
 559         if len(common) == len(base[1]):
 560             return True
 561         return False
 562
 563
 564 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
 565
 566     def find_user_password(self, realm, authuri):
 567         user, password = HTTPPasswordMgr.find_user_password(self,realm,authuri)
 568         if user is not None:
 569             return user, password
 570         return HTTPPasswordMgr.find_user_password(self, None, authuri)
 571
 572
 573 class AbstractBasicAuthHandler:
 574
 575     rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)
 576
 577     # XXX there can actually be multiple auth-schemes in a
 578     # www-authenticate header.  should probably be a lot more careful
 579     # in parsing them to extract multiple alternatives
 580
 581     def __init__(self, password_mgr=None):
 582         if password_mgr is None:
 583             password_mgr = HTTPPasswordMgr()
 584         self.passwd = password_mgr
 585         self.add_password = self.passwd.add_password
 586
 587     def http_error_auth_reqed(self, authreq, host, req, headers):
 588         # XXX could be multiple headers
 589         authreq = headers.get(authreq, None)
 590         if authreq:
 591             mo = AbstractBasicAuthHandler.rx.match(authreq)
 592             if mo:
 593                 scheme, realm = mo.groups()
 594                 if scheme.lower() == 'basic':
 595                     return self.retry_http_basic_auth(host, req, realm)
 596
 597     def retry_http_basic_auth(self, host, req, realm):
 598         user,pw = self.passwd.find_user_password(realm, host)
 599         if pw:
 600             raw = "%s:%s" % (user, pw)
 601             auth = 'Basic %s' % base64.encodestring(raw).strip()
 602             if req.headers.get(self.auth_header, None) == auth:
 603                 return None
 604             req.add_header(self.auth_header, auth)
 605             return self.parent.open(req)
 606         else:
 607             return None
 608
 609 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
 610
 611     auth_header = 'Authorization'
 612
 613     def http_error_401(self, req, fp, code, msg, headers):
 614         host = urlparse.urlparse(req.get_full_url())[1]
 615         return self.http_error_auth_reqed('www-authenticate',
 616                                           host, req, headers)
 617
 618
 619 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
 620
 621     auth_header = 'Proxy-Authorization'
 622
 623     def http_error_407(self, req, fp, code, msg, headers):
 624         host = req.get_host()
 625         return self.http_error_auth_reqed('proxy-authenticate',
 626                                           host, req, headers)
 627
 628
 629 class AbstractDigestAuthHandler:
 630
 631     def __init__(self, passwd=None):
 632         if passwd is None:
 633             passwd = HTTPPasswordMgr()
 634         self.passwd = passwd
 635         self.add_password = self.passwd.add_password
 636
 637     def http_error_auth_reqed(self, authreq, host, req, headers):
 638         authreq = headers.get(self.auth_header, None)
 639         if authreq:
 640             kind = authreq.split()[0]
 641             if kind == 'Digest':
 642                 return self.retry_http_digest_auth(req, authreq)
 643
 644     def retry_http_digest_auth(self, req, auth):
 645         token, challenge = auth.split(' ', 1)
 646         chal = parse_keqv_list(parse_http_list(challenge))
 647         auth = self.get_authorization(req, chal)
 648         if auth:
 649             auth_val = 'Digest %s' % auth
 650             if req.headers.get(self.auth_header, None) == auth_val:
 651                 return None
 652             req.add_header(self.auth_header, auth_val)
 653             resp = self.parent.open(req)
 654             return resp
 655
 656     def get_authorization(self, req, chal):
 657         try:
 658             realm = chal['realm']
 659             nonce = chal['nonce']
 660             algorithm = chal.get('algorithm', 'MD5')
 661             # mod_digest doesn't send an opaque, even though it isn't
 662             # supposed to be optional
 663             opaque = chal.get('opaque', None)
 664         except KeyError:
 665             return None
 666
 667         H, KD = self.get_algorithm_impls(algorithm)
 668         if H is None:
 669             return None
 670
 671         user, pw = self.passwd.find_user_password(realm,
 672                                                   req.get_full_url())
 673         if user is None:
 674             return None
 675
 676         # XXX not implemented yet
 677         if req.has_data():
 678             entdig = self.get_entity_digest(req.get_data(), chal)
 679         else:
 680             entdig = None
 681
 682         A1 = "%s:%s:%s" % (user, realm, pw)
 683         A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
 684                         # XXX selector: what about proxies and full urls
 685                         req.get_selector())
 686         respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
 687         # XXX should the partial digests be encoded too?
 688
 689         base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
 690                'response="%s"' % (user, realm, nonce, req.get_selector(),
 691                                   respdig)
 692         if opaque:
 693             base = base + ', opaque="%s"' % opaque
 694         if entdig:
 695             base = base + ', digest="%s"' % entdig
 696         if algorithm != 'MD5':
 697             base = base + ', algorithm="%s"' % algorithm
 698         return base
 699
 700     def get_algorithm_impls(self, algorithm):
 701         # lambdas assume digest modules are imported at the top level
 702         if algorithm == 'MD5':
 703             H = lambda x, e=encode_digest:e(md5.new(x).digest())
 704         elif algorithm == 'SHA':
 705             H = lambda x, e=encode_digest:e(sha.new(x).digest())
 706         # XXX MD5-sess
 707         KD = lambda s, d, H=H: H("%s:%s" % (s, d))
 708         return H, KD
 709
 710     def get_entity_digest(self, data, chal):
 711         # XXX not implemented yet
 712         return None
 713
 714
 715 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
 716     """An authentication protocol defined by RFC 2069
 717
 718     Digest authentication improves on basic authentication because it
 719     does not transmit passwords in the clear.
 720     """
 721
 722     header = 'Authorization'
 723
 724     def http_error_401(self, req, fp, code, msg, headers):
 725         host = urlparse.urlparse(req.get_full_url())[1]
 726         self.http_error_auth_reqed('www-authenticate', host, req, headers)
 727
 728
 729 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
 730
 731     header = 'Proxy-Authorization'
 732
 733     def http_error_407(self, req, fp, code, msg, headers):
 734         host = req.get_host()
 735         self.http_error_auth_reqed('proxy-authenticate', host, req, headers)
 736
 737
 738 def encode_digest(digest):
 739     hexrep = []
 740     for c in digest:
 741         n = (ord(c) >> 4) & 0xf
 742         hexrep.append(hex(n)[-1])
 743         n = ord(c) & 0xf
 744         hexrep.append(hex(n)[-1])
 745     return ''.join(hexrep)
 746
 747
 748 class AbstractHTTPHandler(BaseHandler):
 749
 750     def do_open(self, http_class, req):
 751         host = req.get_host()
 752         if not host:
 753             raise URLError('no host given')
 754
 755         try:
 756             h = http_class(host) # will parse host:port
 757             if req.has_data():
 758                 data = req.get_data()
 759                 h.putrequest('POST', req.get_selector())
 760                 if not 'Content-type' in req.headers:
 761                     h.putheader('Content-type',
 762                                 'application/x-www-form-urlencoded')
 763                 if not 'Content-length' in req.headers:
 764                     h.putheader('Content-length', '%d' % len(data))
 765             else:
 766                 h.putrequest('GET', req.get_selector())
 767         except socket.error, err:
 768             raise URLError(err)
 769
 770         scheme, sel = splittype(req.get_selector())
 771         sel_host, sel_path = splithost(sel)
 772         h.putheader('Host', sel_host or host)
 773         for args in self.parent.addheaders:
 774             if name not in req.headers:
 775                 h.putheader(*args)
 776         for k, v in req.headers.items():
 777             h.putheader(k, v)
 778         h.endheaders()
 779         if req.has_data():
 780             h.send(data)
 781
 782         code, msg, hdrs = h.getreply()
 783         fp = h.getfile()
 784         if code == 200:
 785             return addinfourl(fp, hdrs, req.get_full_url())
 786         else:
 787             return self.parent.error('http', req, fp, code, msg, hdrs)
 788
 789
 790 class HTTPHandler(AbstractHTTPHandler):
 791
 792     def http_open(self, req):
 793         return self.do_open(httplib.HTTP, req)
 794
 795
 796 if hasattr(httplib, 'HTTPS'):
 797     class HTTPSHandler(AbstractHTTPHandler):
 798
 799         def https_open(self, req):
 800             return self.do_open(httplib.HTTPS, req)
 801
 802
 803 class UnknownHandler(BaseHandler):
 804     def unknown_open(self, req):
 805         type = req.get_type()
 806         raise URLError('unknown url type: %s' % type)
 807
 808 def parse_keqv_list(l):
 809     """Parse list of key=value strings where keys are not duplicated."""
 810     parsed = {}
 811     for elt in l:
 812         k, v = elt.split('=', 1)
 813         if v[0] == '"' and v[-1] == '"':
 814             v = v[1:-1]
 815         parsed[k] = v
 816     return parsed
 817
 818 def parse_http_list(s):
 819     """Parse lists as described by RFC 2068 Section 2.
 820
 821     In particular, parse comman-separated lists where the elements of
 822     the list may include quoted-strings.  A quoted-string could
 823     contain a comma.
 824     """
 825     # XXX this function could probably use more testing
 826
 827     list = []
 828     end = len(s)
 829     i = 0
 830     inquote = 0
 831     start = 0
 832     while i < end:
 833         cur = s[i:]
 834         c = cur.find(',')
 835         q = cur.find('"')
 836         if c == -1:
 837             list.append(s[start:])
 838             break
 839         if q == -1:
 840             if inquote:
 841                 raise ValueError, "unbalanced quotes"
 842             else:
 843                 list.append(s[start:i+c])
 844                 i = i + c + 1
 845                 continue
 846         if inquote:
 847             if q < c:
 848                 list.append(s[start:i+c])
 849                 i = i + c + 1
 850                 start = i
 851                 inquote = 0
 852             else:
 853                 i = i + q
 854         else:
 855             if c < q:
 856                 list.append(s[start:i+c])
 857                 i = i + c + 1
 858                 start = i
 859             else:
 860                 inquote = 1
 861                 i = i + q + 1
 862     return map(lambda x: x.strip(), list)
 863
 864 class FileHandler(BaseHandler):
 865     # Use local file or FTP depending on form of URL
 866     def file_open(self, req):
 867         url = req.get_selector()
 868         if url[:2] == '//' and url[2:3] != '/':
 869             req.type = 'ftp'
 870             return self.parent.open(req)
 871         else:
 872             return self.open_local_file(req)
 873
 874     # names for the localhost
 875     names = None
 876     def get_names(self):
 877         if FileHandler.names is None:
 878             FileHandler.names = (socket.gethostbyname('localhost'),
 879                                  socket.gethostbyname(socket.gethostname()))
 880         return FileHandler.names
 881
 882     # not entirely sure what the rules are here
 883     def open_local_file(self, req):
 884         host = req.get_host()
 885         file = req.get_selector()
 886         localfile = url2pathname(file)
 887         stats = os.stat(localfile)
 888         size = stats.st_size
 889         modified = rfc822.formatdate(stats.st_mtime)
 890         mtype = mimetypes.guess_type(file)[0]
 891         headers = mimetools.Message(StringIO(
 892             'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
 893             (mtype or 'text/plain', size, modified)))
 894         if host:
 895             host, port = splitport(host)
 896         if not host or \
 897            (not port and socket.gethostbyname(host) in self.get_names()):
 898             return addinfourl(open(localfile, 'rb'),
 899                               headers, 'file:'+file)
 900         raise URLError('file not on local host')
 901
 902 class FTPHandler(BaseHandler):
 903     def ftp_open(self, req):
 904         host = req.get_host()
 905         if not host:
 906             raise IOError, ('ftp error', 'no host given')
 907         # XXX handle custom username & password
 908         try:
 909             host = socket.gethostbyname(host)
 910         except socket.error, msg:
 911             raise URLError(msg)
 912         host, port = splitport(host)
 913         if port is None:
 914             port = ftplib.FTP_PORT
 915         path, attrs = splitattr(req.get_selector())
 916         path = unquote(path)
 917         dirs = path.split('/')
 918         dirs, file = dirs[:-1], dirs[-1]
 919         if dirs and not dirs[0]:
 920             dirs = dirs[1:]
 921         user = passwd = '' # XXX
 922         try:
 923             fw = self.connect_ftp(user, passwd, host, port, dirs)
 924             type = file and 'I' or 'D'
 925             for attr in attrs:
 926                 attr, value = splitattr(attr)
 927                 if attr.lower() == 'type' and \
 928                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
 929                     type = value.upper()
 930             fp, retrlen = fw.retrfile(file, type)
 931             headers = ""
 932             mtype = mimetypes.guess_type(req.get_full_url())[0]
 933             if mtype:
 934                 headers += "Content-Type: %s\n" % mtype
 935             if retrlen is not None and retrlen >= 0:
 936                 headers += "Content-Length: %d\n" % retrlen
 937             sf = StringIO(headers)
 938             headers = mimetools.Message(sf)
 939             return addinfourl(fp, headers, req.get_full_url())
 940         except ftplib.all_errors, msg:
 941             raise IOError, ('ftp error', msg), sys.exc_info()[2]
 942
 943     def connect_ftp(self, user, passwd, host, port, dirs):
 944         fw = ftpwrapper(user, passwd, host, port, dirs)
 945 ##        fw.ftp.set_debuglevel(1)
 946         return fw
 947
 948 class CacheFTPHandler(FTPHandler):
 949     # XXX would be nice to have pluggable cache strategies
 950     # XXX this stuff is definitely not thread safe
 951     def __init__(self):
 952         self.cache = {}
 953         self.timeout = {}
 954         self.soonest = 0
 955         self.delay = 60
 956         self.max_conns = 16
 957
 958     def setTimeout(self, t):
 959         self.delay = t
 960
 961     def setMaxConns(self, m):
 962         self.max_conns = m
 963
 964     def connect_ftp(self, user, passwd, host, port, dirs):
 965         key = user, passwd, host, port
 966         if key in self.cache:
 967             self.timeout[key] = time.time() + self.delay
 968         else:
 969             self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
 970             self.timeout[key] = time.time() + self.delay
 971         self.check_cache()
 972         return self.cache[key]
 973
 974     def check_cache(self):
 975         # first check for old ones
 976         t = time.time()
 977         if self.soonest <= t:
 978             for k, v in self.timeout.items():
 979                 if v < t:
 980                     self.cache[k].close()
 981                     del self.cache[k]
 982                     del self.timeout[k]
 983         self.soonest = min(self.timeout.values())
 984
 985         # then check the size
 986         if len(self.cache) == self.max_conns:
 987             for k, v in self.timeout.items():
 988                 if v == self.soonest:
 989                     del self.cache[k]
 990                     del self.timeout[k]
 991                     break
 992             self.soonest = min(self.timeout.values())
 993
 994 class GopherHandler(BaseHandler):
 995     def gopher_open(self, req):
 996         host = req.get_host()
 997         if not host:
 998             raise GopherError('no host given')
 999         host = unquote(host)
1000         selector = req.get_selector()
1001         type, selector = splitgophertype(selector)
1002         selector, query = splitquery(selector)
1003         selector = unquote(selector)
1004         if query:
1005             query = unquote(query)
1006             fp = gopherlib.send_query(selector, query, host)
1007         else:
1008             fp = gopherlib.send_selector(selector, host)
1009         return addinfourl(fp, noheaders(), req.get_full_url())
1010
1011 #bleck! don't use this yet
class OpenerFactory:
    """Unfinished builder for OpenerDirector objects.

    Only the proxy handlers are installed by build_opener() so far;
    default_handlers, handlers and replacement_handlers are recorded
    but not yet used.
    """

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    proxy_handlers = [ProxyHandler]
    handlers = []
    replacement_handlers = []

    def add_proxy_handler(self, ph):
        """Append ph to this instance's proxy handlers."""
        # Rebinding (rather than appending in place) leaves the list
        # shared on the class untouched.
        self.proxy_handlers = self.proxy_handlers + [ph]

    def add_handler(self, h):
        """Append h to this instance's handlers."""
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        """Not implemented yet; does nothing."""
        pass

    def build_opener(self):
        """Return a new OpenerDirector with the proxy handlers installed.

        Entries of proxy_handlers that are classes are instantiated
        first; instances are added as they are.
        """
        opener = OpenerDirector()
        for ph in self.proxy_handlers:
            if inspect.isclass(ph):
                ph = ph()
            opener.add_handler(ph)
        # Hand the opener back; it used to be built and then discarded.
        return opener

1036
1037 if __name__ == "__main__":
1038     # XXX some of the test code depends on machine configurations that
1039     # are internal to CNRI.   Need to set up a public server with the
1040     # right authentication configuration for test purposes.
1041     if socket.gethostname() == 'bitdiddle':
1042         localhost = 'bitdiddle.cnri.reston.va.us'
1043     elif socket.gethostname() == 'bitdiddle.concentric.net':
1044         localhost = 'localhost'
1045     else:
1046         localhost = None
1047     urls = [
1048         # Thanks to Fred for finding these!
1049         'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1050         'gopher://gopher.vt.edu:10010/10/33',
1051
1052         'file:/etc/passwd',
1053         'file://nonsensename/etc/passwd',
1054         'ftp://www.python.org/pub/python/misc/sousa.au',
1055         'ftp://www.python.org/pub/tmp/blat',
1056         'http://www.espn.com/', # redirect
1057         'http://www.python.org/Spanish/Inquistion/',
1058         ('http://www.python.org/cgi-bin/faqw.py',
1059          'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1060         'http://www.python.org/',
1061         'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
1062             ]
1063
1064 ##    if localhost is not None:
1065 ##        urls = urls + [
1066 ##            'file://%s/etc/passwd' % localhost,
1067 ##            'http://%s/simple/' % localhost,
1068 ##            'http://%s/digest/' % localhost,
1069 ##            'http://%s/not/found.h' % localhost,
1070 ##            ]
1071
1072 ##        bauth = HTTPBasicAuthHandler()
1073 ##        bauth.add_password('basic_test_realm', localhost, 'jhylton',
1074 ##                           'password')
1075 ##        dauth = HTTPDigestAuthHandler()
1076 ##        dauth.add_password('digest_test_realm', localhost, 'jhylton',
1077 ##                           'password')
1078
1079
1080     cfh = CacheFTPHandler()
1081     cfh.setTimeout(1)
1082
1083 ##    # XXX try out some custom proxy objects too!
1084 ##    def at_cnri(req):
1085 ##        host = req.get_host()
1086 ##        print host
1087 ##        if host[-18:] == '.cnri.reston.va.us':
1088 ##            return 1
1089 ##    p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1090 ##    ph = CustomProxyHandler(p)
1091
1092 ##    install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1093     install_opener(build_opener(cfh, GopherHandler))
1094
1095     for url in urls:
1096         if isinstance(url, tuple):
1097             url, req = url
1098         else:
1099             req = None
1100         print url
1101         try:
1102             f = urlopen(url, req)
1103         except IOError, err:
1104             print "IOError:", err
1105         except socket.error, err:
1106             print "socket.error:", err
1107         else:
1108             buf = f.read()
1109             f.close()
1110             print "read %d bytes" % len(buf)
1111         print
1112         time.sleep(0.1)