1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that is has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
23 """

import string
import socket
import os
import sys


__version__ = '1.12'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        return unquote(pathname)
    def pathname2url(pathname):
        return quote(pathname)


# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?

# Shortcut for basic usage
_urlopener = None

def urlopen(url, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    if data is None:
        return _urlopener.open(url)
    else:
        return _urlopener.open(url, data)

def urlretrieve(url, filename=None, reporthook=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
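
# Illustrative sketch (not part of the module): typical use of the shortcut
# functions above from an interactive session.  The URL is only a placeholder;
# any reachable HTTP URL would do.
#
#   >>> import urllib
#   >>> f = urllib.urlopen('http://www.python.org/')
#   >>> data = f.read()                         # read the whole document
#   >>> f.info().gettype()                      # e.g. 'text/html'
#   >>> filename, headers = urllib.urlretrieve('http://www.python.org/')
#   >>> urllib.urlcleanup()                     # remove any temporary files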


ftpcache = {}

class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        server_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', server_version)]
        self.__tempfiles = []
        self.__unlink = os.unlink       # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(fullurl)
        if self.tempcache and self.tempcache.has_key(fullurl):
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        type, url = splittype(fullurl)
        if not type: type = 'file'
        if self.proxies.has_key(type):
            proxy = self.proxies[type]
            type, proxy = splittype(proxy)
            host, selector = splithost(proxy)
            url = (host, fullurl)   # Signal special case to open_*()
        name = 'open_' + type
        self.type = type
        if '-' in name:
            # replace - with _
            name = string.join(string.split(name, '-'), '_')
        if not hasattr(self, name):
            if data is None:
                return self.open_unknown(fullurl)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None):
        """retrieve(url) returns (filename, None) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(url)
        if self.tempcache and self.tempcache.has_key(url):
            return self.tempcache[url]
        type, url1 = splittype(url)
        if not filename and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url)
        headers = fp.info()
        if not filename:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            filename = tempfile.mktemp(suffix)
            self.__tempfiles.append(filename)
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        tfp = open(filename, 'wb')
        bs = 1024*8
        size = -1
        blocknum = 1
        if reporthook:
            if headers.has_key("content-length"):
                size = int(headers["Content-Length"])
            reporthook(0, bs, size)
        block = fp.read(bs)
        if reporthook:
            reporthook(1, bs, size)
        while block:
            tfp.write(block)
            block = fp.read(bs)
            blocknum = blocknum + 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp
        return result
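
    # Illustrative sketch (not part of the class): a minimal progress callback
    # for retrieve()/urlretrieve().  The hook receives the number of blocks
    # transferred so far, the block size in bytes, and the total size taken
    # from Content-Length (or -1 if unknown).  The URL and file name below
    # are placeholders.
    #
    #   >>> def hook(blocknum, blocksize, totalsize):
    #   ...     print 'got %d blocks of %d bytes (total %d)' % (
    #   ...         blocknum, blocksize, totalsize)
    #   >>> urlretrieve('http://www.python.org/', 'index.html', hook)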

    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        if type(url) is type(""):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if string.lower(urltype) != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')
        if user_passwd:
            import base64
            auth = string.strip(base64.encodestring(user_passwd))
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: apply(h.putheader, args)
        h.endheaders()
        if data is not None:
            h.send(data + '\r\n')
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            return addinfourl(fp, headers, "http:" + url)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)

    if hasattr(socket, "ssl"):
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            import httplib
            if type(url) is type(""):
                host, selector = splithost(url)
                user_passwd, host = splituser(host)
            else:
                host, selector = url
                urltype, rest = splittype(selector)
                if string.lower(urltype) == 'https':
                    realhost, rest = splithost(rest)
                    user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
            if user_passwd:
                import base64
                auth = string.strip(base64.encodestring(user_passwd))
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            # Pass the header name and value separately, as open_http() does
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            for args in self.addheaders: apply(h.putheader, args)
            h.endheaders()
            if data is not None:
                h.send(data + '\r\n')
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == 200:
                return addinfourl(fp, headers, url)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers)

    def open_gopher(self, url):
        """Use Gopher protocol."""
        import gopherlib
        host, selector = splithost(url)
        if not host: raise IOError, ('gopher error', 'no host given')
        host = unquote(host)
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), "gopher:" + url)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if url[:2] == '//' and url[2:3] != '/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, StringIO
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO.StringIO(
            'Content-Type: %s\n' % (mtype or 'text/plain')))
        host, file = splithost(url)
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(url2pathname(file), 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(url2pathname(file), 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = string.splitfields(path, '/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = (user, host, port, string.joinfields(dirs, '/'))
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not self.ftpcache.has_key(key):
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if string.lower(attr) == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = string.upper(value)
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            if retrlen is not None and retrlen >= 0:
                import mimetools, StringIO
                headers = mimetools.Message(StringIO.StringIO(
                    'Content-Length: %d\n' % retrlen))
            else:
                headers = noheaders()
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def open_data(self, url, data=None):
        """Use "data" URL."""
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import StringIO, mimetools, time
        try:
            [type, data] = string.split(url, ',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = string.rfind(type, ';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s' % time.strftime('%a, %d %b %Y %T GMT',
                                              time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = string.join(msg, '\n')
        f = StringIO.StringIO(msg)
        headers = mimetools.Message(f, 0)
        f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
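
# Illustrative sketch (not part of the module): what open_data() produces for
# a simple data: URL.  The values in the comments are what the code above
# computes, not captured output.
#
#   >>> u = URLopener()
#   >>> f = u.open("data:,Hello%20World")
#   >>> f.info().gettype()      # 'text/plain' (the default mediatype)
#   >>> f.read()                # 'Hello World'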


class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args):
        apply(URLopener.__init__, (self,) + args)
        self.auth_cache = {}

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        # XXX The server can force infinite recursion here!
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin("http:" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        if headers.has_key('www-authenticate'):
            stuff = headers['www-authenticate']
            import re
            match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
            if match:
                scheme, realm = match.groups()
                if string.lower(scheme) == 'basic':
                    name = 'retry_' + self.type + '_basic_auth'
                    if data is None:
                        return getattr(self, name)(url, realm)
                    else:
                        return getattr(self, name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl)

    def get_user_passwd(self, host, realm, clear_cache=0):
        key = realm + '@' + string.lower(host)
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                                     (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
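
# Illustrative sketch (not part of the module): a subclass that supplies
# credentials programmatically instead of prompting on the terminal.  The
# username, password and URL below are placeholders.
#
#   >>> class MyOpener(FancyURLopener):
#   ...     def prompt_user_passwd(self, host, realm):
#   ...         return 'user', 'secret'
#   >>> opener = MyOpener()
#   >>> f = opener.open('http://www.example.com/protected/')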


# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if not _localhost:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if not _thishost:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if not _ftperrors:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object."""
    global _noheaders
    if not _noheaders:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.init()

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                if reason[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass


class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
                                             `id(self)`, `self.fp`)

    def close(self):
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        if self.fp: self.fp.close()
        self.fp = None

class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        if self.closehook:
            apply(self.closehook, self.hookargs)
            self.closehook = None
            self.hookargs = None

class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        return self.headers

class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        return self.headers

    def geturl(self):
        return self.url


def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL."""
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base)    # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath)        # inherit host
    basepath, basetag = splittag(basepath)      # remove extraneous cruft
    basepath, basequery = splitquery(basepath)  # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = string.rfind(basepath, '/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            i = string.rfind(basepath[:-1], '/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path    # don't know what this means
    else: return path
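
# Illustrative sketch (not part of the module): how basejoin() resolves a
# relative reference against a base URL (this is what http_error_302 relies
# on for relative Location headers).
#
#   >>> basejoin('http://www.python.org/doc/index.html', 'lib/lib.html')
#   'http://www.python.org/doc/lib/lib.html'
#   >>> basejoin('http://www.python.org/doc/index.html', '#section2')
#   'http://www.python.org/doc/index.html#section2'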


# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = string.strip(url)
    if url[:1] == '<' and url[-1:] == '>':
        url = string.strip(url[1:-1])
    if url[:4] == 'URL:': url = string.strip(url[4:])
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    match = _userprog.match(host)
    if match: return match.group(1, 2)
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise string.atoi_error, "no digits"
            nport = string.atoi(port)
        except string.atoi_error:
            nport = None
        return host, nport
    return host, defport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
    '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = string.splitfields(url, ';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    mychr = chr
    myatoi = string.atoi
    list = string.split(s, '%')
    res = [list[0]]
    myappend = res.append
    del list[0]
    for item in list:
        if item[1:2]:
            try:
                myappend(mychr(myatoi(item[:2], 16))
                         + item[2:])
            except:
                myappend('%' + item)
        else:
            myappend('%' + item)
    return string.join(res, "")

def unquote_plus(s):
    if '+' in s:
        # replace '+' with ' '
        s = string.join(string.split(s, '+'), ' ')
    return unquote(s)

always_safe = string.letters + string.digits + '_,.-'
def quote(s, safe='/'):
    """quote('abc def') -> 'abc%20def'."""
    # XXX Can speed this up an order of magnitude
    safe = always_safe + safe
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if c not in safe:
            res[i] = '%%%02x' % ord(c)
    return string.joinfields(res, '')

def quote_plus(s, safe='/'):
    # XXX Can speed this up an order of magnitude
    if ' ' in s:
        # replace ' ' with '+'
        l = string.split(s, ' ')
        for i in range(len(l)):
            l[i] = quote(l[i], safe)
        return string.join(l, '+')
    else:
        return quote(s, safe)

def urlencode(dict):
    """Encode a dictionary of form entries into a URL query string."""
    l = []
    for k, v in dict.items():
        k = quote_plus(str(k))
        v = quote_plus(str(v))
        l.append(k + '=' + v)
    return string.join(l, '&')
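
# Illustrative sketch (not part of the module): quoting and form encoding
# with the functions above.
#
#   >>> quote('/~user/file name.txt')
#   '/%7euser/file%20name.txt'
#   >>> unquote('/%7euser/file%20name.txt')
#   '/~user/file name.txt'
#   >>> quote_plus('a few words')
#   'a+few+words'
#   >>> urlencode({'q': 'spam & eggs'})
#   'q=spam+%26+eggs'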


# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = string.lower(name)
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies

if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                'Software\\Microsoft\\Windows\\CurrentVersion\\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if ';' in proxyServer:  # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=')
                        proxies[protocol] = '%s://%s' % (protocol, address)
                else:   # Use one setting for all protocols
                    proxies['http'] = 'http://%s' % proxyServer
                    proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

else:
    # By default use environment variables
    getproxies = getproxies_environment
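
# Illustrative sketch (not part of the module): the two ways proxies reach a
# URLopener.  The proxy host and port below are placeholders.
#
#   Environment convention picked up by getproxies_environment():
#       http_proxy=http://proxy.example.com:8080
#
#   Explicit dictionary, bypassing the environment/registry lookup:
#   >>> opener = FancyURLopener({'http': 'http://proxy.example.com:8080'})
#   >>> f = opener.open('http://www.python.org/')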


# Test and time quote() and unquote()
def test1():
    import time
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print `s`
    print `qs`
    print `uqs`
    print round(t1 - t0, 3), 'sec'

def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)

# Test program
def test(args=[]):
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn, h
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                table = string.maketrans("", "")
                data = string.translate(data, table, "\r")
            print data
            fn, h = None, None
            print '-'*40
    finally:
        urlcleanup()

def main():
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()