1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that is has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

__version__ = '1.15'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        return unquote(pathname)
    def pathname2url(pathname):
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object"""
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    if data is None:
        return _urlopener.open(url)
    else:
        return _urlopener.open(url, data)

def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()

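# Illustrative sketch, not part of the original module: a hypothetical
# helper showing how the shortcut functions above are typically used.
# It is never called here; the url argument is supplied by the caller.
def _example_fetch(url):
    # urlopen() returns a file-like object; info() gives a
    # mimetools.Message holding the response headers (for HTTP).
    f = urlopen(url)
    headers = f.info()
    content_type = headers.getheader('Content-Type')
    data = f.read()
    f.close()
    # urlretrieve() instead copies the resource to a (temporary) file
    # and returns its name together with the headers.
    filename, headers = urlretrieve(url)
    return content_type, data, filename
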
86 """Class to open URLs.
87 This is a class rather than just a subroutine because we may need
88 more than one set of global protocol-specific options.
89 Note -- this is a base class for those who don't want the
90 automatic handling of errors type 302 (relocated) and 401
91 (authorization needed)."""
95 version
= "Python-urllib/%s" % __version__
98 def __init__(self
, proxies
=None, **x509
):
100 proxies
= getproxies()
101 assert hasattr(proxies
, 'has_key'), "proxies must be a mapping"
102 self
.proxies
= proxies
103 self
.key_file
= x509
.get('key_file')
104 self
.cert_file
= x509
.get('cert_file')
105 self
.addheaders
= [('User-agent', self
.version
)]
106 self
.__tempfiles
= []
107 self
.__unlink
= os
.unlink
# See cleanup()
108 self
.tempcache
= None
109 # Undocumented feature: if you assign {} to tempcache,
110 # it is used to cache files retrieved with
111 # self.retrieve(). This is not enabled by default
112 # since it does not work for changing documents (and I
113 # haven't got the logic to check expiration headers
115 self
.ftpcache
= ftpcache
116 # Undocumented feature: you can use a different
117 # ftp cache by assigning to the .ftpcache member;
118 # in case you want logically independent URL openers
119 # XXX This is not threadsafe. Bah.
128 # This code sometimes runs when the rest of this module
129 # has already been deleted, so it can't use any globals
130 # or import anything.
132 for file in self
.__tempfiles
:
137 del self
.__tempfiles
[:]
139 self
.tempcache
.clear()
141 def addheader(self
, *args
):
142 """Add a header to be used by the HTTP interface only
143 e.g. u.addheader('Accept', 'sound/basic')"""
144 self
.addheaders
.append(args
)
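    # Illustrative sketch, not part of the original module: addheader()
    # stores (name, value) tuples in self.addheaders, which the HTTP
    # handlers replay on every request.  A hypothetical session:
    #
    #     opener = FancyURLopener()
    #     opener.addheader('Accept', 'text/html')
    #     opener.addheader('User-agent', 'MyApp/0.1')  # appended, not replaced
    #     f = opener.open('http://www.python.org/')
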
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        if self.tempcache and self.tempcache.has_key(fullurl):
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if self.proxies.has_key(urltype):
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        if '-' in name:
            # replace - with _
            name = '_'.join(name.split('-'))
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)

    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, None) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(toBytes(url))
        if self.tempcache and self.tempcache.has_key(url):
            return self.tempcache[url]
        type, url1 = splittype(url)
        if not filename and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url, data)
        headers = fp.info()
        if not filename:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            filename = tempfile.mktemp(suffix)
            self.__tempfiles.append(filename)
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        tfp = open(filename, 'wb')
        bs = 1024*8
        size = -1
        blocknum = 1
        if reporthook:
            if headers.has_key("content-length"):
                size = int(headers["Content-Length"])
            reporthook(0, bs, size)
        block = fp.read(bs)
        if reporthook:
            reporthook(1, bs, size)
        while block:
            tfp.write(block)
            block = fp.read(bs)
            blocknum = blocknum + 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp
        return result

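    # Illustrative sketch, not part of the original module: the reporthook
    # passed to retrieve()/urlretrieve() is called as
    # reporthook(blocknum, blocksize, totalsize), with totalsize -1 when
    # the server sent no Content-Length.  A hypothetical hook:
    #
    #     def show_progress(blocknum, blocksize, totalsize):
    #         if totalsize > 0:
    #             done = min(blocknum * blocksize, totalsize)
    #             print "%d of %d bytes" % (done, totalsize)
    #         else:
    #             print "%d blocks read" % blocknum
    #
    #     urlretrieve('http://www.python.org/', 'python-home.html',
    #                 show_progress)
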
    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        if type(url) is types.StringType:
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')
        if user_passwd:
            import base64
            auth = base64.encodestring(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: apply(h.putheader, args)
        h.endheaders()
        if data is not None:
            h.send(data + '\r\n')
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            return addinfourl(fp, headers, "http:" + url)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)

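    # Illustrative sketch, not part of the original module: a subclass can
    # hook a specific status code simply by defining http_error_DDD, e.g.
    #
    #     class StrictOpener(FancyURLopener):
    #         def http_error_404(self, url, fp, errcode, errmsg,
    #                            headers, data=None):
    #             raise IOError, ('http error', errcode, errmsg, headers)
    #
    # Returning a false value from such a handler makes http_error() fall
    # back to http_error_default(); returning an object makes that object
    # the result of open().
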
    if hasattr(socket, "ssl"):
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            import httplib
            user_passwd = None
            if type(url) is types.StringType:
                host, selector = splithost(url)
                if host:
                    user_passwd, host = splituser(host)
                    host = unquote(host)
                realhost = host
            else:
                host, selector = url
                urltype, rest = splittype(selector)
                url = rest
                user_passwd = None
                if urltype.lower() != 'https':
                    realhost = None
                else:
                    realhost, rest = splithost(rest)
                    if realhost:
                        user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
            if user_passwd:
                import base64
                auth = base64.encodestring(user_passwd).strip()
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            if realhost: h.putheader('Host', realhost)
            for args in self.addheaders: apply(h.putheader, args)
            h.endheaders()
            if data is not None:
                h.send(data + '\r\n')
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == 200:
                return addinfourl(fp, headers, url)
            else:
                if data is None:
                    return self.http_error(url, fp, errcode, errmsg, headers)
                else:
                    return self.http_error(url, fp, errcode, errmsg, headers,
                                           data)

    def open_gopher(self, url):
        """Use Gopher protocol."""
        import gopherlib
        host, selector = splithost(url)
        if not host: raise IOError, ('gopher error', 'no host given')
        host = unquote(host)
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), "gopher:" + url)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if url[:2] == '//' and url[2:3] != '/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, StringIO
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO.StringIO(
            'Content-Type: %s\n' % (mtype or 'text/plain')))
        host, file = splithost(url)
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(url2pathname(file), 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(url2pathname(file), 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not self.ftpcache.has_key(key):
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            if retrlen is not None and retrlen >= 0:
                import mimetools, StringIO
                headers = mimetools.Message(StringIO.StringIO(
                    'Content-Length: %d\n' % retrlen))
            else:
                headers = noheaders()
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def open_data(self, url, data=None):
        """Use "data" URL."""
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import StringIO, mimetools, time
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        f = StringIO.StringIO(msg)
        headers = mimetools.Message(f, 0)
        f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)

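# Illustrative sketch, not part of the original module: a hypothetical helper
# showing the "data" URL form handled by open_data() above.  quote() is used
# so the payload survives URL syntax; the opener hands back the decoded bytes.
def _example_data_url(text):
    url = 'data:text/plain;charset=US-ASCII,' + quote(text)
    f = urlopen(url)
    headers = f.info()          # carries Content-type and Content-length
    body = f.read()             # equals text for plain ASCII input
    f.close()
    return headers, body
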
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args):
        apply(URLopener.__init__, (self,) + args)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries = self.tries + 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        if not headers.has_key('www-authenticate'):
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        key = realm + '@' + host.lower()
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None

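# Illustrative sketch, not part of the original module: a GUI or batch
# program would subclass FancyURLopener and override prompt_user_passwd()
# so that basic-auth retries do not block on the terminal.  The class name
# and stored credentials below are hypothetical placeholders.
class _ExampleNonInteractiveOpener(FancyURLopener):
    def __init__(self, user, passwd, *args):
        FancyURLopener.__init__(self, *args)
        self.__user = user
        self.__passwd = passwd
    def prompt_user_passwd(self, host, realm):
        # Called by retry_*_basic_auth() via get_user_passwd()
        return self.__user, self.__passwd
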
644 """Return the IP address of the magic hostname 'localhost'."""
647 _localhost
= socket
.gethostbyname('localhost')
652 """Return the IP address of the current host."""
655 _thishost
= socket
.gethostbyname(socket
.gethostname())
660 """Return the set of errors raised by the FTP class."""
664 _ftperrors
= ftplib
.all_errors
669 """Return an empty mimetools.Message object."""
674 _noheaders
= mimetools
.Message(StringIO
.StringIO(), 0)
675 _noheaders
.fp
.close() # Recycle file descriptor
682 """Class used by open_ftp() for cache of open FTP connections."""
684 def __init__(self
, user
, passwd
, host
, port
, dirs
):
695 self
.ftp
= ftplib
.FTP()
696 self
.ftp
.connect(self
.host
, self
.port
)
697 self
.ftp
.login(self
.user
, self
.passwd
)
698 for dir in self
.dirs
:
701 def retrfile(self
, file, type):
704 if type in ('d', 'D'): cmd
= 'TYPE A'; isdir
= 1
705 else: cmd
= 'TYPE ' + type; isdir
= 0
707 self
.ftp
.voidcmd(cmd
)
708 except ftplib
.all_errors
:
710 self
.ftp
.voidcmd(cmd
)
712 if file and not isdir
:
713 # Use nlst to see if the file exists at all
716 except ftplib
.error_perm
, reason
:
717 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
718 # Restore the transfer mode!
719 self
.ftp
.voidcmd(cmd
)
720 # Try to retrieve as a file
723 conn
= self
.ftp
.ntransfercmd(cmd
)
724 except ftplib
.error_perm
, reason
:
725 if str(reason
)[:3] != '550':
726 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
728 # Set transfer mode to ASCII!
729 self
.ftp
.voidcmd('TYPE A')
730 # Try a directory listing
731 if file: cmd
= 'LIST ' + file
733 conn
= self
.ftp
.ntransfercmd(cmd
)
735 # Pass back both a suitably decorated object and a retrieval length
736 return (addclosehook(conn
[0].makefile('rb'),
737 self
.endtransfer
), conn
[1])
738 def endtransfer(self
):
755 """Base class for addinfo and addclosehook."""
757 def __init__(self
, fp
):
759 self
.read
= self
.fp
.read
760 self
.readline
= self
.fp
.readline
761 if hasattr(self
.fp
, "readlines"): self
.readlines
= self
.fp
.readlines
762 if hasattr(self
.fp
, "fileno"): self
.fileno
= self
.fp
.fileno
765 return '<%s at %s whose fp = %s>' % (self
.__class
__.__name
__,
766 `
id(self
)`
, `self
.fp`
)
771 self
.readlines
= None
773 if self
.fp
: self
.fp
.close()
776 class addclosehook(addbase
):
777 """Class to add a close hook to an open file."""
779 def __init__(self
, fp
, closehook
, *hookargs
):
780 addbase
.__init
__(self
, fp
)
781 self
.closehook
= closehook
782 self
.hookargs
= hookargs
787 apply(self
.closehook
, self
.hookargs
)
788 self
.closehook
= None
791 class addinfo(addbase
):
792 """class to add an info() method to an open file."""
794 def __init__(self
, fp
, headers
):
795 addbase
.__init
__(self
, fp
)
796 self
.headers
= headers
801 class addinfourl(addbase
):
802 """class to add info() and geturl() methods to an open file."""
804 def __init__(self
, fp
, headers
, url
):
805 addbase
.__init
__(self
, fp
)
806 self
.headers
= headers
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL."""
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = basepath.rfind('/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            i = basepath[:-1].rfind('/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                basepath = '/'
                break
            else:
                basepath = ''
        path = basepath + path
    if host and path and path[0] != '/':
        path = '/' + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path

# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'

def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    if type(url) is types.UnicodeType:
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

912 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
914 if _typeprog
is None:
916 _typeprog
= re
.compile('^([^/:]+):')
918 match
= _typeprog
.match(url
)
920 scheme
= match
.group(1)
921 return scheme
.lower(), url
[len(scheme
) + 1:]
926 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
928 if _hostprog
is None:
930 _hostprog
= re
.compile('^//([^/]*)(.*)$')
932 match
= _hostprog
.match(url
)
933 if match
: return match
.group(1, 2)
938 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
940 if _userprog
is None:
942 _userprog
= re
.compile('^([^@]*)@(.*)$')
944 match
= _userprog
.match(host
)
945 if match
: return map(unquote
, match
.group(1, 2))
949 def splitpasswd(user
):
950 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
952 if _passwdprog
is None:
954 _passwdprog
= re
.compile('^([^:]*):(.*)$')
956 match
= _passwdprog
.match(user
)
957 if match
: return match
.group(1, 2)
960 # splittag('/path#tag') --> '/path', 'tag'
963 """splitport('host:port') --> 'host', 'port'."""
965 if _portprog
is None:
967 _portprog
= re
.compile('^(.*):([0-9]+)$')
969 match
= _portprog
.match(host
)
970 if match
: return match
.group(1, 2)
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError, "no digits"
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector

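# Illustrative examples, not part of the original module, of the parsing
# helpers above applied to a typical URL (results worked out by hand):
#
#     splittype('http://www.python.org:80/doc?x=1#top')
#         -> ('http', '//www.python.org:80/doc?x=1#top')
#     splithost('//www.python.org:80/doc?x=1#top')
#         -> ('www.python.org:80', '/doc?x=1#top')
#     splitport('www.python.org:80')   -> ('www.python.org', '80')
#     splitquery('/doc?x=1')           -> ('/doc', 'x=1')
#     splittag('/doc#top')             -> ('/doc', 'top')
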
1044 """unquote('abc%20def') -> 'abc def'."""
1049 myappend
= res
.append
1054 myappend(mychr(myatoi(item
[:2], 16))
1057 myappend('%' + item
)
1059 myappend('%' + item
)
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    if '+' in s:
        # replace '+' with ' '
        s = ' '.join(s.split('+'))
    return unquote(s)

always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for c in _fast_safe_test:
            _fast_safe[c] = c
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if not _fast_safe.has_key(c):
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        return _fast_quote(s)
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if c not in safe:
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        l = s.split(' ')
        for i in range(len(l)):
            l[i] = quote(l[i], safe)
        return '+'.join(l)
    else:
        return quote(s, safe)

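# Illustrative examples, not part of the original module (values follow
# directly from the definitions above):
#
#     quote('/~user/file name')      -> '/%7Euser/file%20name'
#     quote_plus('a&b c')            -> 'a%26b+c'
#     unquote('abc%20def')           -> 'abc def'
#     unquote_plus('abc+def%21')     -> 'abc def!'
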
def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            x = len(query)
            # non-empty strings will fail this
            if len(query) and type(query[0]) != types.TupleType:
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if type(v) == types.StringType:
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif type(v) == types.UnicodeType:
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII", "replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)

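# Illustrative examples, not part of the original module:
#
#     urlencode([('q', 'python urllib'), ('page', 1)])
#         -> 'q=python+urllib&page=1'
#     urlencode({'tag': ['a', 'b']}, doseq=1)
#         -> 'tag=a&tag=b'
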
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies

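# Illustrative sketch, not part of the original module: with the environment
# variable http_proxy set to e.g. 'http://proxy.example.com:3128/' (a
# placeholder host), getproxies_environment() returns
# {'http': 'http://proxy.example.com:3128/'} and URLopener routes http
# requests through that host.  An explicit mapping can also be passed in:
#
#     opener = FancyURLopener({'http': 'http://proxy.example.com:3128/'})
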
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HTTPProxyHost entry.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        proxies[protocol] = '%s://%s' % (protocol, address)
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

else:
    # By default use environment variables
    getproxies = getproxies_environment

# Test and time quote() and unquote()
def test1():
    import time
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print `s`
    print `qs`
    print `uqs`
    print round(t1 - t0, 3), 'sec'


def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)

# Test program
def test(args=[]):
    if not args:
        args = [
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
        print '-'*40
    finally:
        urlcleanup()

def main():
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
            return
        for url in args:
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()