Lib/urllib2.py

   1 """An extensible library for opening URLs using a variety of protocols
   2
   3 The simplest way to use this module is to call the urlopen function,
   4 which accepts a string containing a URL or a Request object (described
   5 below).  It opens the URL and returns the results as file-like
   6 object; the returned object has some extra methods described below.
   7
    8 The OpenerDirector manages a collection of Handler objects that do
   9 all the actual work.  Each Handler implements a particular protocol or
  10 option.  The OpenerDirector is a composite object that invokes the
  11 Handlers needed to open the requested URL.  For example, the
  12 HTTPHandler performs HTTP GET and POST requests and deals with
  13 non-error returns.  The HTTPRedirectHandler automatically deals with
  14 HTTP 301 & 302 redirect errors, and the HTTPDigestAuthHandler deals
  15 with digest authentication.
  16
   17 urlopen(url, data=None) -- basic usage is the same as original
  18 urllib.  pass the url and optionally data to post to an HTTP URL, and
  19 get a file-like object back.  One difference is that you can also pass
  20 a Request instance instead of URL.  Raises a URLError (subclass of
  21 IOError); for HTTP errors, raises an HTTPError, which can also be
  22 treated as a valid response.
  23
  24 build_opener -- function that creates a new OpenerDirector instance.
  25 will install the default handlers.  accepts one or more Handlers as
  26 arguments, either instances or Handler classes that it will
   27 instantiate.  if one of the arguments is a subclass of the default
  28 handler, the argument will be installed instead of the default.
  29
  30 install_opener -- installs a new opener as the default opener.
  31
  32 objects of interest:
  33 OpenerDirector --
  34
  35 Request -- an object that encapsulates the state of a request.  the
   36 state can be as simple as the URL.  it can also include extra HTTP
  37 headers, e.g. a User-Agent.
  38
  39 BaseHandler --
  40
  41 exceptions:
  42 URLError-- a subclass of IOError, individual protocols have their own
  43 specific subclass
  44
  45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
  46 as an exceptional event or valid response
  47
  48 internals:
  49 BaseHandler and parent
  50 _call_chain conventions
  51
  52 Example usage:
  53
  54 import urllib2
  55
  56 # set up authentication info
  57 authinfo = urllib2.HTTPBasicAuthHandler()
  58 authinfo.add_password('realm', 'host', 'username', 'password')
  59
  60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
  61
  62 # build a new opener that adds authentication and caching FTP handlers
  63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
  64
  65 # install it
  66 urllib2.install_opener(opener)
  67
  68 f = urllib2.urlopen('http://www.python.org/')
  69
  70
  71 """
  72
  73 # XXX issues:
   74 # If an authentication error handler tries to perform
  75 # authentication for some reason but fails, how should the error be
  76 # signalled?  The client needs to know the HTTP error code.  But if
  77 # the handler knows that the problem was, e.g., that it didn't know
   78 # the hash algo that was requested in the challenge, it would be good to
  79 # pass that information along to the client, too.
  80
  81 # XXX to do:
  82 # name!
  83 # documentation (getting there)
  84 # complex proxies
  85 # abstract factory for opener
  86 # ftp errors aren't handled cleanly
  87 # gopher can return a socket.error
  88 # check digest against correct (i.e. non-apache) implementation
  89
  90 import socket
  91 import httplib
  92 import inspect
  93 import re
  94 import base64
  95 import types
  96 import urlparse
  97 import md5
  98 import mimetypes
  99 import mimetools
 100 import rfc822
 101 import ftplib
 102 import sys
 103 import time
 104 import os
 105 import stat
 106 import gopherlib
 107 import posixpath
 108
 109 try:
 110     from cStringIO import StringIO
 111 except ImportError:
 112     from StringIO import StringIO
 113
 114 try:
 115     import sha
 116 except ImportError:
 117     # need 1.5.2 final
 118     sha = None
 119
 120 # not sure how many of these need to be gotten rid of
 121 from urllib import unwrap, unquote, splittype, splithost, \
 122      addinfourl, splitport, splitgophertype, splitquery, \
 123      splitattr, ftpwrapper, noheaders
 124
 125 # support for proxies via environment variables
 126 from urllib import getproxies
 127
 128 # support for FileHandler
 129 from urllib import localhost, url2pathname
 130
 131 __version__ = "2.0a1"
 132
 133 _opener = None
 134 def urlopen(url, data=None):
 135     global _opener
 136     if _opener is None:
 137         _opener = build_opener()
 138     return _opener.open(url, data)
 139
 140 def install_opener(opener):
 141     global _opener
 142     _opener = opener
 143
 144 # do these error classes make sense?
 145 # make sure all of the IOError stuff is overridden.  we just want to be
 146  # subtypes.
 147
 148 class URLError(IOError):
 149     # URLError is a sub-type of IOError, but it doesn't share any of
 150     # the implementation.  need to override __init__ and __str__
 151     def __init__(self, reason):
 152         self.reason = reason
 153
 154     def __str__(self):
 155         return '<urlopen error %s>' % self.reason
 156
 157 class HTTPError(URLError, addinfourl):
 158     """Raised when HTTP error occurs, but also acts like non-error return"""
 159     __super_init = addinfourl.__init__
 160
 161     def __init__(self, url, code, msg, hdrs, fp):
 162         self.__super_init(fp, hdrs, url)
 163         self.code = code
 164         self.msg = msg
 165         self.hdrs = hdrs
 166         self.fp = fp
 167         # XXX
 168         self.filename = url
 169
 170     def __str__(self):
 171         return 'HTTP Error %s: %s' % (self.code, self.msg)
 172
 173     def __del__(self):
 174         # XXX is this safe? what if user catches exception, then
 175         # extracts fp and discards exception?
 176         if self.fp:
 177             self.fp.close()
 178
 179 class GopherError(URLError):
 180     pass
 181
 182
 183 class Request:
 184
 185     def __init__(self, url, data=None, headers={}):
 186         # unwrap('<URL:type://host/path>') --> 'type://host/path'
 187         self.__original = unwrap(url)
 188         self.type = None
 189         # self.__r_type is what's left after doing the splittype
 190         self.host = None
 191         self.port = None
 192         self.data = data
 193         self.headers = {}
 194         self.headers.update(headers)
 195
 196     def __getattr__(self, attr):
 197         # XXX this is a fallback mechanism to guard against these
 198         # methods getting called in a non-standard order.  this may be
 199         # too complicated and/or unnecessary.
 200         # XXX should the __r_XXX attributes be public?
 201         if attr[:12] == '_Request__r_':
 202             name = attr[12:]
 203             if hasattr(Request, 'get_' + name):
 204                 getattr(self, 'get_' + name)()
 205                 return getattr(self, attr)
 206         raise AttributeError, attr
 207
 208     def add_data(self, data):
 209         self.data = data
 210
 211     def has_data(self):
 212         return self.data is not None
 213
 214     def get_data(self):
 215         return self.data
 216
 217     def get_full_url(self):
 218         return self.__original
 219
 220     def get_type(self):
 221         if self.type is None:
 222             self.type, self.__r_type = splittype(self.__original)
 223             if self.type is None:
 224                 raise ValueError, "unknown url type: %s" % self.__original
 225         return self.type
 226
 227     def get_host(self):
 228         if self.host is None:
 229             self.host, self.__r_host = splithost(self.__r_type)
 230             if self.host:
 231                 self.host = unquote(self.host)
 232         return self.host
 233
 234     def get_selector(self):
 235         return self.__r_host
 236
 237     def set_proxy(self, host, type):
 238         self.host, self.type = host, type
 239         self.__r_host = self.__original
 240
 241     def add_header(self, key, val):
 242         # useful for something like authentication
 243         self.headers[key] = val
 244
 245 class OpenerDirector:
 246     def __init__(self):
 247         server_version = "Python-urllib/%s" % __version__
 248         self.addheaders = [('User-agent', server_version)]
 249         # manage the individual handlers
 250         self.handlers = []
 251         self.handle_open = {}
 252         self.handle_error = {}
 253
 254     def add_handler(self, handler):
 255         added = 0
 256         for meth in dir(handler):
 257             if meth[-5:] == '_open':
 258                 protocol = meth[:-5]
 259                 if self.handle_open.has_key(protocol):
 260                     self.handle_open[protocol].append(handler)
 261                 else:
 262                     self.handle_open[protocol] = [handler]
 263                 added = 1
 264                 continue
 265             i = meth.find('_')
 266             j = meth[i+1:].find('_') + i + 1
 267             if j != -1 and meth[i+1:j] == 'error':
 268                 proto = meth[:i]
 269                 kind = meth[j+1:]
 270                 try:
 271                     kind = int(kind)
 272                 except ValueError:
 273                     pass
 274                 dict = self.handle_error.get(proto, {})
 275                 if dict.has_key(kind):
 276                     dict[kind].append(handler)
 277                 else:
 278                     dict[kind] = [handler]
 279                 self.handle_error[proto] = dict
 280                 added = 1
 281                 continue
 282         if added:
 283             self.handlers.append(handler)
 284             handler.add_parent(self)
 285
 286     def __del__(self):
 287         self.close()
 288
 289     def close(self):
 290         for handler in self.handlers:
 291             handler.close()
 292         self.handlers = []
 293
 294     def _call_chain(self, chain, kind, meth_name, *args):
 295         # XXX raise an exception if no one else should try to handle
 296         # this url.  return None if you can't but someone else could.
 297         handlers = chain.get(kind, ())
 298         for handler in handlers:
 299             func = getattr(handler, meth_name)
 300
 301             result = func(*args)
 302             if result is not None:
 303                 return result
 304
 305     def open(self, fullurl, data=None):
 306         # accept a URL or a Request object
 307         if isinstance(fullurl, (types.StringType, types.UnicodeType)):
 308             req = Request(fullurl, data)
 309         else:
 310             req = fullurl
 311             if data is not None:
 312                 req.add_data(data)
 313         assert isinstance(req, Request) # really only care about interface
 314
 315         result = self._call_chain(self.handle_open, 'default',
 316                                   'default_open', req)
 317         if result:
 318             return result
 319
 320         type_ = req.get_type()
 321         result = self._call_chain(self.handle_open, type_, type_ + \
 322                                   '_open', req)
 323         if result:
 324             return result
 325
 326         return self._call_chain(self.handle_open, 'unknown',
 327                                 'unknown_open', req)
 328
 329     def error(self, proto, *args):
 330         if proto in ['http', 'https']:
 331             # XXX http[s] protocols are special-cased
 332             dict = self.handle_error['http'] # https is not different than http
 333             proto = args[2]  # YUCK!
 334             meth_name = 'http_error_%d' % proto
 335             http_err = 1
 336             orig_args = args
 337         else:
 338             dict = self.handle_error
 339             meth_name = proto + '_error'
 340             http_err = 0
 341         args = (dict, proto, meth_name) + args
 342         result = self._call_chain(*args)
 343         if result:
 344             return result
 345
 346         if http_err:
 347             args = (dict, 'default', 'http_error_default') + orig_args
 348             return self._call_chain(*args)
 349
 350 # XXX probably also want an abstract factory that knows things like
 351  # the fact that a ProxyHandler needs to get inserted first.
 352 # would also know when it makes sense to skip a superclass in favor of
 353  # a subclass and when it might make sense to include both
 354
 355 def build_opener(*handlers):
 356     """Create an opener object from a list of handlers.
 357
 358     The opener will use several default handlers, including support
 359     for HTTP and FTP.  If there is a ProxyHandler, it must be at the
 360     front of the list of handlers.  (Yuck.)
 361
 362     If any of the handlers passed as arguments are subclasses of the
 363     default handlers, the default handlers will not be used.
 364     """
 365
 366     opener = OpenerDirector()
 367     default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
 368                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
 369                        FTPHandler, FileHandler]
 370     if hasattr(httplib, 'HTTPS'):
 371         default_classes.append(HTTPSHandler)
 372     skip = []
 373     for klass in default_classes:
 374         for check in handlers:
 375             if inspect.isclass(check):
 376                 if issubclass(check, klass):
 377                     skip.append(klass)
 378             elif isinstance(check, klass):
 379                 skip.append(klass)
 380     for klass in skip:
 381         default_classes.remove(klass)
 382
 383     for klass in default_classes:
 384         opener.add_handler(klass())
 385
 386     for h in handlers:
 387         if inspect.isclass(h):
 388             h = h()
 389         opener.add_handler(h)
 390     return opener
 391
 392 class BaseHandler:
 393     def add_parent(self, parent):
 394         self.parent = parent
 395     def close(self):
 396         self.parent = None
 397
 398 class HTTPDefaultErrorHandler(BaseHandler):
 399     def http_error_default(self, req, fp, code, msg, hdrs):
 400         raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
 401
 402 class HTTPRedirectHandler(BaseHandler):
 403     # Implementation note: To avoid the server sending us into an
 404     # infinite loop, the request object needs to track what URLs we
 405     # have already seen.  Do this by adding a handler-specific
 406     # attribute to the Request object.
 407     def http_error_302(self, req, fp, code, msg, headers):
 408         if headers.has_key('location'):
 409             newurl = headers['location']
 410         elif headers.has_key('uri'):
 411             newurl = headers['uri']
 412         else:
 413             return
 414         newurl = urlparse.urljoin(req.get_full_url(), newurl)
 415
 416         # XXX Probably want to forget about the state of the current
 417         # request, although that might interact poorly with other
 418         # handlers that also use handler-specific request attributes
 419         new = Request(newurl, req.get_data())
 420         new.error_302_dict = {}
 421         if hasattr(req, 'error_302_dict'):
 422             if len(req.error_302_dict)>10 or \
 423                req.error_302_dict.has_key(newurl):
 424                 raise HTTPError(req.get_full_url(), code,
 425                                 self.inf_msg + msg, headers, fp)
 426             new.error_302_dict.update(req.error_302_dict)
 427         new.error_302_dict[newurl] = newurl
 428
 429         # Don't close the fp until we are sure that we won't use it
 430         # with HTTPError.
 431         fp.read()
 432         fp.close()
 433
 434         return self.parent.open(new)
 435
 436     http_error_301 = http_error_302
 437
 438     inf_msg = "The HTTP server returned a redirect error that would" \
 439               "lead to an infinite loop.\n" \
 440               "The last 302 error message was:\n"
 441
 442 class ProxyHandler(BaseHandler):
 443     def __init__(self, proxies=None):
 444         if proxies is None:
 445             proxies = getproxies()
 446         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
 447         self.proxies = proxies
 448         for type, url in proxies.items():
 449             setattr(self, '%s_open' % type,
 450                     lambda r, proxy=url, type=type, meth=self.proxy_open: \
 451                     meth(r, proxy, type))
 452
 453     def proxy_open(self, req, proxy, type):
 454         orig_type = req.get_type()
 455         type, r_type = splittype(proxy)
 456         host, XXX = splithost(r_type)
 457         if '@' in host:
 458             user_pass, host = host.split('@', 1)
 459             user_pass = base64.encodestring(unquote(user_pass)).strip()
 460             req.add_header('Proxy-Authorization', 'Basic '+user_pass)
 461         host = unquote(host)
 462         req.set_proxy(host, type)
 463         if orig_type == type:
 464             # let other handlers take care of it
 465             # XXX this only makes sense if the proxy is before the
 466             # other handlers
 467             return None
 468         else:
 469             # need to start over, because the other handlers don't
 470             # grok the proxy's URL type
 471             return self.parent.open(req)
 472
 473 # feature suggested by Duncan Booth
 474 # XXX custom is not a good name
 475 class CustomProxy:
 476     # either pass a function to the constructor or override handle
 477     def __init__(self, proto, func=None, proxy_addr=None):
 478         self.proto = proto
 479         self.func = func
 480         self.addr = proxy_addr
 481
 482     def handle(self, req):
 483         if self.func and self.func(req):
 484             return 1
 485
 486     def get_proxy(self):
 487         return self.addr
 488
 489 class CustomProxyHandler(BaseHandler):
 490     def __init__(self, *proxies):
 491         self.proxies = {}
 492
 493     def proxy_open(self, req):
 494         proto = req.get_type()
 495         try:
 496             proxies = self.proxies[proto]
 497         except KeyError:
 498             return None
 499         for p in proxies:
 500             if p.handle(req):
 501                 req.set_proxy(p.get_proxy())
 502                 return self.parent.open(req)
 503         return None
 504
 505     def do_proxy(self, p, req):
 506         return self.parent.open(req)
 507
 508     def add_proxy(self, cpo):
 509         if self.proxies.has_key(cpo.proto):
 510             self.proxies[cpo.proto].append(cpo)
 511         else:
 512             self.proxies[cpo.proto] = [cpo]
 513
 514 class HTTPPasswordMgr:
 515     def __init__(self):
 516         self.passwd = {}
 517
 518     def add_password(self, realm, uri, user, passwd):
 519         # uri could be a single URI or a sequence
 520         if isinstance(uri, (types.StringType, types.UnicodeType)):
 521             uri = [uri]
 522         uri = tuple(map(self.reduce_uri, uri))
 523         if not self.passwd.has_key(realm):
 524             self.passwd[realm] = {}
 525         self.passwd[realm][uri] = (user, passwd)
 526
 527     def find_user_password(self, realm, authuri):
 528         domains = self.passwd.get(realm, {})
 529         authuri = self.reduce_uri(authuri)
 530         for uris, authinfo in domains.items():
 531             for uri in uris:
 532                 if self.is_suburi(uri, authuri):
 533                     return authinfo
 534         return None, None
 535
 536     def reduce_uri(self, uri):
 537         """Accept netloc or URI and extract only the netloc and path"""
 538         parts = urlparse.urlparse(uri)
 539         if parts[1]:
 540             return parts[1], parts[2] or '/'
 541         else:
 542             return parts[2], '/'
 543
 544     def is_suburi(self, base, test):
 545         """Check if test is below base in a URI tree
 546
 547         Both args must be URIs in reduced form.
 548         """
 549         if base == test:
 550             return 1
 551         if base[0] != test[0]:
 552             return 0
 553         common = posixpath.commonprefix((base[1], test[1]))
 554         if len(common) == len(base[1]):
 555             return 1
 556         return 0
 557
 558
 559 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
 560
 561     def find_user_password(self, realm, authuri):
 562         user, password = HTTPPasswordMgr.find_user_password(self,realm,authuri)
 563         if user is not None:
 564             return user, password
 565         return HTTPPasswordMgr.find_user_password(self, None, authuri)
 566
 567
 568 class AbstractBasicAuthHandler:
 569
 570     rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"')
 571
 572     # XXX there can actually be multiple auth-schemes in a
 573     # www-authenticate header.  should probably be a lot more careful
 574     # in parsing them to extract multiple alternatives
 575
 576     def __init__(self, password_mgr=None):
 577         if password_mgr is None:
 578             password_mgr = HTTPPasswordMgr()
 579         self.passwd = password_mgr
 580         self.add_password = self.passwd.add_password
 581
 582     def http_error_auth_reqed(self, authreq, host, req, headers):
 583         # XXX could be multiple headers
 584         authreq = headers.get(authreq, None)
 585         if authreq:
 586             mo = AbstractBasicAuthHandler.rx.match(authreq)
 587             if mo:
 588                 scheme, realm = mo.groups()
 589                 if scheme.lower() == 'basic':
 590                     return self.retry_http_basic_auth(host, req, realm)
 591
 592     def retry_http_basic_auth(self, host, req, realm):
 593         user,pw = self.passwd.find_user_password(realm, host)
 594         if pw:
 595             raw = "%s:%s" % (user, pw)
 596             auth = 'Basic %s' % base64.encodestring(raw).strip()
 597             if req.headers.get(self.auth_header, None) == auth:
 598                 return None
 599             req.add_header(self.auth_header, auth)
 600             return self.parent.open(req)
 601         else:
 602             return None
 603
 604 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
 605
 606     auth_header = 'Authorization'
 607
 608     def http_error_401(self, req, fp, code, msg, headers):
 609         host = urlparse.urlparse(req.get_full_url())[1]
 610         return self.http_error_auth_reqed('www-authenticate',
 611                                           host, req, headers)
 612
 613
 614 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
 615
 616     auth_header = 'Proxy-Authorization'
 617
 618     def http_error_407(self, req, fp, code, msg, headers):
 619         host = req.get_host()
 620         return self.http_error_auth_reqed('proxy-authenticate',
 621                                           host, req, headers)
 622
 623
 624 class AbstractDigestAuthHandler:
 625
 626     def __init__(self, passwd=None):
 627         if passwd is None:
 628             passwd = HTTPPasswordMgr()
 629         self.passwd = passwd
 630         self.add_password = self.passwd.add_password
 631
 632     def http_error_auth_reqed(self, authreq, host, req, headers):
 633         authreq = headers.get(self.auth_header, None)
 634         if authreq:
 635             kind = authreq.split()[0]
 636             if kind == 'Digest':
 637                 return self.retry_http_digest_auth(req, authreq)
 638
 639     def retry_http_digest_auth(self, req, auth):
 640         token, challenge = auth.split(' ', 1)
 641         chal = parse_keqv_list(parse_http_list(challenge))
 642         auth = self.get_authorization(req, chal)
 643         if auth:
 644             auth_val = 'Digest %s' % auth
 645             if req.headers.get(self.auth_header, None) == auth_val:
 646                 return None
 647             req.add_header(self.auth_header, auth_val)
 648             resp = self.parent.open(req)
 649             return resp
 650
 651     def get_authorization(self, req, chal):
 652         try:
 653             realm = chal['realm']
 654             nonce = chal['nonce']
 655             algorithm = chal.get('algorithm', 'MD5')
 656             # mod_digest doesn't send an opaque, even though it isn't
 657             # supposed to be optional
 658             opaque = chal.get('opaque', None)
 659         except KeyError:
 660             return None
 661
 662         H, KD = self.get_algorithm_impls(algorithm)
 663         if H is None:
 664             return None
 665
 666         user, pw = self.passwd.find_user_password(realm,
 667                                                   req.get_full_url())
 668         if user is None:
 669             return None
 670
 671         # XXX not implemented yet
 672         if req.has_data():
 673             entdig = self.get_entity_digest(req.get_data(), chal)
 674         else:
 675             entdig = None
 676
 677         A1 = "%s:%s:%s" % (user, realm, pw)
 678         A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
 679                         # XXX selector: what about proxies and full urls
 680                         req.get_selector())
 681         respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
 682         # XXX should the partial digests be encoded too?
 683
 684         base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
 685                'response="%s"' % (user, realm, nonce, req.get_selector(),
 686                                   respdig)
 687         if opaque:
 688             base = base + ', opaque="%s"' % opaque
 689         if entdig:
 690             base = base + ', digest="%s"' % entdig
 691         if algorithm != 'MD5':
 692             base = base + ', algorithm="%s"' % algorithm
 693         return base
 694
 695     def get_algorithm_impls(self, algorithm):
 696         # lambdas assume digest modules are imported at the top level
 697         if algorithm == 'MD5':
 698             H = lambda x, e=encode_digest:e(md5.new(x).digest())
 699         elif algorithm == 'SHA':
 700             H = lambda x, e=encode_digest:e(sha.new(x).digest())
 701         # XXX MD5-sess
 702         KD = lambda s, d, H=H: H("%s:%s" % (s, d))
 703         return H, KD
 704
 705     def get_entity_digest(self, data, chal):
 706         # XXX not implemented yet
 707         return None
 708
 709
 710 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
 711     """An authentication protocol defined by RFC 2069
 712
 713     Digest authentication improves on basic authentication because it
 714     does not transmit passwords in the clear.
 715     """
 716
 717     header = 'Authorization'
 718
 719     def http_error_401(self, req, fp, code, msg, headers):
 720         host = urlparse.urlparse(req.get_full_url())[1]
 721         self.http_error_auth_reqed('www-authenticate', host, req, headers)
 722
 723
 724 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
 725
 726     header = 'Proxy-Authorization'
 727
 728     def http_error_407(self, req, fp, code, msg, headers):
 729         host = req.get_host()
 730         self.http_error_auth_reqed('proxy-authenticate', host, req, headers)
 731
 732
 733 def encode_digest(digest):
 734     hexrep = []
 735     for c in digest:
 736         n = (ord(c) >> 4) & 0xf
 737         hexrep.append(hex(n)[-1])
 738         n = ord(c) & 0xf
 739         hexrep.append(hex(n)[-1])
 740     return ''.join(hexrep)
 741
 742
 743 class AbstractHTTPHandler(BaseHandler):
 744
 745     def do_open(self, http_class, req):
 746         host = req.get_host()
 747         if not host:
 748             raise URLError('no host given')
 749
 750         try:
 751             h = http_class(host) # will parse host:port
 752             if req.has_data():
 753                 data = req.get_data()
 754                 h.putrequest('POST', req.get_selector())
 755                 if not req.headers.has_key('Content-type'):
 756                     h.putheader('Content-type',
 757                                 'application/x-www-form-urlencoded')
 758                 if not req.headers.has_key('Content-length'):
 759                     h.putheader('Content-length', '%d' % len(data))
 760             else:
 761                 h.putrequest('GET', req.get_selector())
 762         except socket.error, err:
 763             raise URLError(err)
 764
 765         h.putheader('Host', host)
 766         for args in self.parent.addheaders:
 767             h.putheader(*args)
 768         for k, v in req.headers.items():
 769             h.putheader(k, v)
 770         h.endheaders()
 771         if req.has_data():
 772             h.send(data)
 773
 774         code, msg, hdrs = h.getreply()
 775         fp = h.getfile()
 776         if code == 200:
 777             return addinfourl(fp, hdrs, req.get_full_url())
 778         else:
 779             return self.parent.error('http', req, fp, code, msg, hdrs)
 780
 781
 782 class HTTPHandler(AbstractHTTPHandler):
 783
 784     def http_open(self, req):
 785         return self.do_open(httplib.HTTP, req)
 786
 787
 788 if hasattr(httplib, 'HTTPS'):
 789     class HTTPSHandler(AbstractHTTPHandler):
 790
 791         def https_open(self, req):
 792             return self.do_open(httplib.HTTPS, req)
 793
 794
 795 class UnknownHandler(BaseHandler):
 796     def unknown_open(self, req):
 797         type = req.get_type()
 798         raise URLError('unknown url type: %s' % type)
 799
 800 def parse_keqv_list(l):
 801     """Parse list of key=value strings where keys are not duplicated."""
 802     parsed = {}
 803     for elt in l:
 804         k, v = elt.split('=', 1)
 805         if v[0] == '"' and v[-1] == '"':
 806             v = v[1:-1]
 807         parsed[k] = v
 808     return parsed
 809
 810 def parse_http_list(s):
 811     """Parse lists as described by RFC 2068 Section 2.
 812
 813     In particular, parse comman-separated lists where the elements of
 814     the list may include quoted-strings.  A quoted-string could
 815     contain a comma.
 816     """
 817     # XXX this function could probably use more testing
 818
 819     list = []
 820     end = len(s)
 821     i = 0
 822     inquote = 0
 823     start = 0
 824     while i < end:
 825         cur = s[i:]
 826         c = cur.find(',')
 827         q = cur.find('"')
 828         if c == -1:
 829             list.append(s[start:])
 830             break
 831         if q == -1:
 832             if inquote:
 833                 raise ValueError, "unbalanced quotes"
 834             else:
 835                 list.append(s[start:i+c])
 836                 i = i + c + 1
 837                 continue
 838         if inquote:
 839             if q < c:
 840                 list.append(s[start:i+c])
 841                 i = i + c + 1
 842                 start = i
 843                 inquote = 0
 844             else:
 845                 i = i + q
 846         else:
 847             if c < q:
 848                 list.append(s[start:i+c])
 849                 i = i + c + 1
 850                 start = i
 851             else:
 852                 inquote = 1
 853                 i = i + q + 1
 854     return map(lambda x: x.strip(), list)
 855
 856 class FileHandler(BaseHandler):
 857     # Use local file or FTP depending on form of URL
 858     def file_open(self, req):
 859         url = req.get_selector()
 860         if url[:2] == '//' and url[2:3] != '/':
 861             req.type = 'ftp'
 862             return self.parent.open(req)
 863         else:
 864             return self.open_local_file(req)
 865
 866     # names for the localhost
 867     names = None
 868     def get_names(self):
 869         if FileHandler.names is None:
 870             FileHandler.names = (socket.gethostbyname('localhost'),
 871                                  socket.gethostbyname(socket.gethostname()))
 872         return FileHandler.names
 873
 874     # not entirely sure what the rules are here
 875     def open_local_file(self, req):
 876         host = req.get_host()
 877         file = req.get_selector()
 878         localfile = url2pathname(file)
 879         stats = os.stat(localfile)
 880         size = stats[stat.ST_SIZE]
 881         modified = rfc822.formatdate(stats[stat.ST_MTIME])
 882         mtype = mimetypes.guess_type(file)[0]
 883         stats = os.stat(localfile)
 884         headers = mimetools.Message(StringIO(
 885             'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
 886             (mtype or 'text/plain', size, modified)))
 887         if host:
 888             host, port = splitport(host)
 889         if not host or \
 890            (not port and socket.gethostbyname(host) in self.get_names()):
 891             return addinfourl(open(localfile, 'rb'),
 892                               headers, 'file:'+file)
 893         raise URLError('file not on local host')
 894
 895 class FTPHandler(BaseHandler):
 896     def ftp_open(self, req):
 897         host = req.get_host()
 898         if not host:
 899             raise IOError, ('ftp error', 'no host given')
 900         # XXX handle custom username & password
 901         try:
 902             host = socket.gethostbyname(host)
 903         except socket.error, msg:
 904             raise URLError(msg)
 905         host, port = splitport(host)
 906         if port is None:
 907             port = ftplib.FTP_PORT
 908         path, attrs = splitattr(req.get_selector())
 909         path = unquote(path)
 910         dirs = path.split('/')
 911         dirs, file = dirs[:-1], dirs[-1]
 912         if dirs and not dirs[0]:
 913             dirs = dirs[1:]
 914         user = passwd = '' # XXX
 915         try:
 916             fw = self.connect_ftp(user, passwd, host, port, dirs)
 917             type = file and 'I' or 'D'
 918             for attr in attrs:
 919                 attr, value = splitattr(attr)
 920                 if attr.lower() == 'type' and \
 921                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
 922                     type = value.upper()
 923             fp, retrlen = fw.retrfile(file, type)
 924             headers = ""
 925             mtype = mimetypes.guess_type(req.get_full_url())[0]
 926             if mtype:
 927                 headers += "Content-Type: %s\n" % mtype
 928             if retrlen is not None and retrlen >= 0:
 929                 headers += "Content-Length: %d\n" % retrlen
 930             sf = StringIO(headers)
 931             headers = mimetools.Message(sf)
 932             return addinfourl(fp, headers, req.get_full_url())
 933         except ftplib.all_errors, msg:
 934             raise IOError, ('ftp error', msg), sys.exc_info()[2]
 935
 936     def connect_ftp(self, user, passwd, host, port, dirs):
 937         fw = ftpwrapper(user, passwd, host, port, dirs)
 938 ##        fw.ftp.set_debuglevel(1)
 939         return fw
 940
 941 class CacheFTPHandler(FTPHandler):
 942     # XXX would be nice to have pluggable cache strategies
 943     # XXX this stuff is definitely not thread safe
 944     def __init__(self):
 945         self.cache = {}
 946         self.timeout = {}
 947         self.soonest = 0
 948         self.delay = 60
 949         self.max_conns = 16
 950
 951     def setTimeout(self, t):
 952         self.delay = t
 953
 954     def setMaxConns(self, m):
 955         self.max_conns = m
 956
 957     def connect_ftp(self, user, passwd, host, port, dirs):
 958         key = user, passwd, host, port
 959         if self.cache.has_key(key):
 960             self.timeout[key] = time.time() + self.delay
 961         else:
 962             self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
 963             self.timeout[key] = time.time() + self.delay
 964         self.check_cache()
 965         return self.cache[key]
 966
 967     def check_cache(self):
 968         # first check for old ones
 969         t = time.time()
 970         if self.soonest <= t:
 971             for k, v in self.timeout.items():
 972                 if v < t:
 973                     self.cache[k].close()
 974                     del self.cache[k]
 975                     del self.timeout[k]
 976         self.soonest = min(self.timeout.values())
 977
 978         # then check the size
 979         if len(self.cache) == self.max_conns:
 980             for k, v in self.timeout.items():
 981                 if v == self.soonest:
 982                     del self.cache[k]
 983                     del self.timeout[k]
 984                     break
 985             self.soonest = min(self.timeout.values())
 986
 987 class GopherHandler(BaseHandler):
 988     def gopher_open(self, req):
 989         host = req.get_host()
 990         if not host:
 991             raise GopherError('no host given')
 992         host = unquote(host)
 993         selector = req.get_selector()
 994         type, selector = splitgophertype(selector)
 995         selector, query = splitquery(selector)
 996         selector = unquote(selector)
 997         if query:
 998             query = unquote(query)
 999             fp = gopherlib.send_query(selector, query, host)
1000         else:
1001             fp = gopherlib.send_selector(selector, host)
1002         return addinfourl(fp, noheaders(), req.get_full_url())
1003
1004 #bleck! don't use this yet
class OpenerFactory:
    """Unfinished builder for OpenerDirector instances.

    Only the proxy handlers are installed by build_opener() so far;
    default_handlers, handlers and replacement_handlers are recorded
    but not yet used.
    """

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    proxy_handlers = [ProxyHandler]
    handlers = []
    replacement_handlers = []

    def add_proxy_handler(self, ph):
        """Append ph to this instance's proxy handlers."""
        # Build a new list rather than appending in place so the list
        # shared at class level is never mutated.
        self.proxy_handlers = self.proxy_handlers + [ph]

    def add_handler(self, h):
        """Append h to this instance's extra handlers."""
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        """Placeholder; currently does nothing."""
        pass

    def build_opener(self):
        """Return an OpenerDirector with the proxy handlers installed.

        Handler classes are instantiated; instances are added as given.
        """
        opener = OpenerDirector()
        for ph in self.proxy_handlers:
            if inspect.isclass(ph):
                ph = ph()
            opener.add_handler(ph)
        # Hand the opener back to the caller; it used to be built and
        # then discarded.
        return opener
1029
1030 if __name__ == "__main__":
1031     # XXX some of the test code depends on machine configurations that
1032     # are internal to CNRI.   Need to set up a public server with the
1033     # right authentication configuration for test purposes.
1034     if socket.gethostname() == 'bitdiddle':
1035         localhost = 'bitdiddle.cnri.reston.va.us'
1036     elif socket.gethostname() == 'bitdiddle.concentric.net':
1037         localhost = 'localhost'
1038     else:
1039         localhost = None
1040     urls = [
1041         # Thanks to Fred for finding these!
1042         'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1043         'gopher://gopher.vt.edu:10010/10/33',
1044
1045         'file:/etc/passwd',
1046         'file://nonsensename/etc/passwd',
1047         'ftp://www.python.org/pub/python/misc/sousa.au',
1048         'ftp://www.python.org/pub/tmp/blat',
1049         'http://www.espn.com/', # redirect
1050         'http://www.python.org/Spanish/Inquistion/',
1051         ('http://www.python.org/cgi-bin/faqw.py',
1052          'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1053         'http://www.python.org/',
1054         'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
1055             ]
1056
1057 ##    if localhost is not None:
1058 ##        urls = urls + [
1059 ##            'file://%s/etc/passwd' % localhost,
1060 ##            'http://%s/simple/' % localhost,
1061 ##            'http://%s/digest/' % localhost,
1062 ##            'http://%s/not/found.h' % localhost,
1063 ##            ]
1064
1065 ##        bauth = HTTPBasicAuthHandler()
1066 ##        bauth.add_password('basic_test_realm', localhost, 'jhylton',
1067 ##                           'password')
1068 ##        dauth = HTTPDigestAuthHandler()
1069 ##        dauth.add_password('digest_test_realm', localhost, 'jhylton',
1070 ##                           'password')
1071
1072
1073     cfh = CacheFTPHandler()
1074     cfh.setTimeout(1)
1075
1076 ##    # XXX try out some custom proxy objects too!
1077 ##    def at_cnri(req):
1078 ##        host = req.get_host()
1079 ##        print host
1080 ##        if host[-18:] == '.cnri.reston.va.us':
1081 ##            return 1
1082 ##    p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1083 ##    ph = CustomProxyHandler(p)
1084
1085 ##    install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1086     install_opener(build_opener(cfh, GopherHandler))
1087
1088     for url in urls:
1089         if isinstance(url, types.TupleType):
1090             url, req = url
1091         else:
1092             req = None
1093         print url
1094         try:
1095             f = urlopen(url, req)
1096         except IOError, err:
1097             print "IOError:", err
1098         except socket.error, err:
1099             print "socket.error:", err
1100         else:
1101             buf = f.read()
1102             f.close()
1103             print "read %d bytes" % len(buf)
1104         print
1105         time.sleep(0.1)