Lib/urllib.py

   1 """Open an arbitrary URL.
   2
   3 See the following document for more info on URLs:
   4 "Names and Addresses, URIs, URLs, URNs, URCs", at
   5 http://www.w3.org/pub/WWW/Addressing/Overview.html
   6
   7 See also the HTTP spec (from which the error codes are derived):
   8 "HTTP - Hypertext Transfer Protocol", at
   9 http://www.w3.org/pub/WWW/Protocols/
  10
  11 Related standards and specs:
  12 - RFC1808: the "relative URL" spec. (authoritative status)
  13 - RFC1738 - the "URL standard". (authoritative status)
  14 - RFC1630 - the "URI spec". (informational status)
  15
  16 The object returned by URLopener().open(file) will differ per
  17 protocol.  All you know is that it has methods read(), readline(),
  18 readlines(), fileno(), close() and info().  The read*(), fileno()
  19 and close() methods work like those of open files.
  20 The info() method returns a mimetools.Message object which can be
  21 used to query various info about the object, if available.
  22 (mimetools.Message objects are queried with the getheader() method.)
  23 """
  24
  25 import string
  26 import socket
  27 import os
  28 import sys
  29
  30
  31 __version__ = '1.12'    # XXX This version is not always updated :-(
  32
  33 MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
  34
  35 # Helper for non-unix systems
  36 if os.name == 'mac':
  37     from macurl2path import url2pathname, pathname2url
  38 elif os.name == 'nt':
  39     from nturl2path import url2pathname, pathname2url
  40 else:
  41     def url2pathname(pathname):
  42         return unquote(pathname)
  43     def pathname2url(pathname):
  44         return quote(pathname)
  45
  46 # This really consists of two pieces:
  47 # (1) a class which handles opening of all sorts of URLs
  48 #     (plus assorted utilities etc.)
  49 # (2) a set of functions for parsing URLs
  50 # XXX Should these be separated out into different modules?
  51
  52
  53 # Shortcut for basic usage
  54 _urlopener = None
  55 def urlopen(url, data=None):
  56     global _urlopener
  57     if not _urlopener:
  58         _urlopener = FancyURLopener()
  59     if data is None:
  60         return _urlopener.open(url)
  61     else:
  62         return _urlopener.open(url, data)
  63 def urlretrieve(url, filename=None, reporthook=None):
  64     global _urlopener
  65     if not _urlopener:
  66         _urlopener = FancyURLopener()
  67     return _urlopener.retrieve(url, filename, reporthook)
  68 def urlcleanup():
  69     if _urlopener:
  70         _urlopener.cleanup()
  71
  72
  73 ftpcache = {}
  74 class URLopener:
  75     """Class to open URLs.
  76     This is a class rather than just a subroutine because we may need
  77     more than one set of global protocol-specific options.
  78     Note -- this is a base class for those who don't want the
  79     automatic handling of errors type 302 (relocated) and 401
  80     (authorization needed)."""
  81
  82     __tempfiles = None
  83
  84     # Constructor
  85     def __init__(self, proxies=None, **x509):
  86         if proxies is None:
  87             proxies = getproxies()
  88         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
  89         self.proxies = proxies
  90         self.key_file = x509.get('key_file')
  91         self.cert_file = x509.get('cert_file')
  92         server_version = "Python-urllib/%s" % __version__
  93         self.addheaders = [('User-agent', server_version)]
  94         self.__tempfiles = []
  95         self.__unlink = os.unlink # See cleanup()
  96         self.tempcache = None
  97         # Undocumented feature: if you assign {} to tempcache,
  98         # it is used to cache files retrieved with
  99         # self.retrieve().  This is not enabled by default
 100         # since it does not work for changing documents (and I
 101         # haven't got the logic to check expiration headers
 102         # yet).
 103         self.ftpcache = ftpcache
 104         # Undocumented feature: you can use a different
 105         # ftp cache by assigning to the .ftpcache member;
 106         # in case you want logically independent URL openers
 107         # XXX This is not threadsafe.  Bah.
 108
    def __del__(self):
        """Release resources when the opener is garbage collected."""
        # close() delegates to cleanup(), which removes temporary files.
        self.close()
 111
    def close(self):
        """Close the opener; currently the same as cleanup()."""
        self.cleanup()
 114
 115     def cleanup(self):
 116         # This code sometimes runs when the rest of this module
 117         # has already been deleted, so it can't use any globals
 118         # or import anything.
 119         if self.__tempfiles:
 120             for file in self.__tempfiles:
 121                 try:
 122                     self.__unlink(file)
 123                 except:
 124                     pass
 125             del self.__tempfiles[:]
 126         if self.tempcache:
 127             self.tempcache.clear()
 128
    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        # The argument tuple is stored as-is; open_http() and open_https()
        # later pass each stored tuple to putheader() via apply().
        self.addheaders.append(args)
 133
 134     # External interface
 135     def open(self, fullurl, data=None):
 136         """Use URLopener().open(file) instead of open(file, 'r')."""
 137         fullurl = unwrap(fullurl)
 138         if self.tempcache and self.tempcache.has_key(fullurl):
 139             filename, headers = self.tempcache[fullurl]
 140             fp = open(filename, 'rb')
 141             return addinfourl(fp, headers, fullurl)
 142         type, url = splittype(fullurl)
 143         if not type: type = 'file'
 144         if self.proxies.has_key(type):
 145             proxy = self.proxies[type]
 146             type, proxy = splittype(proxy)
 147             host, selector = splithost(proxy)
 148             url = (host, fullurl) # Signal special case to open_*()
 149         name = 'open_' + type
 150         self.type = type
 151         if '-' in name:
 152             # replace - with _
 153             name = string.join(string.split(name, '-'), '_')
 154         if not hasattr(self, name):
 155             if data is None:
 156                 return self.open_unknown(fullurl)
 157             else:
 158                 return self.open_unknown(fullurl, data)
 159         try:
 160             if data is None:
 161                 return getattr(self, name)(url)
 162             else:
 163                 return getattr(self, name)(url, data)
 164         except socket.error, msg:
 165             raise IOError, ('socket error', msg), sys.exc_info()[2]
 166
 167     def open_unknown(self, fullurl, data=None):
 168         """Overridable interface to open unknown URL type."""
 169         type, url = splittype(fullurl)
 170         raise IOError, ('url error', 'unknown url type', type)
 171
 172     # External interface
 173     def retrieve(self, url, filename=None, reporthook=None):
 174         """retrieve(url) returns (filename, None) for a local object
 175         or (tempfilename, headers) for a remote object."""
 176         url = unwrap(url)
 177         if self.tempcache and self.tempcache.has_key(url):
 178             return self.tempcache[url]
 179         type, url1 = splittype(url)
 180         if not filename and (not type or type == 'file'):
 181             try:
 182                 fp = self.open_local_file(url1)
 183                 hdrs = fp.info()
 184                 del fp
 185                 return url2pathname(splithost(url1)[1]), hdrs
 186             except IOError, msg:
 187                 pass
 188         fp = self.open(url)
 189         headers = fp.info()
 190         if not filename:
 191             import tempfile
 192             garbage, path = splittype(url)
 193             garbage, path = splithost(path or "")
 194             path, garbage = splitquery(path or "")
 195             path, garbage = splitattr(path or "")
 196             suffix = os.path.splitext(path)[1]
 197             filename = tempfile.mktemp(suffix)
 198             self.__tempfiles.append(filename)
 199         result = filename, headers
 200         if self.tempcache is not None:
 201             self.tempcache[url] = result
 202         tfp = open(filename, 'wb')
 203         bs = 1024*8
 204         size = -1
 205         blocknum = 1
 206         if reporthook:
 207             if headers.has_key("content-length"):
 208                 size = int(headers["Content-Length"])
 209             reporthook(0, bs, size)
 210         block = fp.read(bs)
 211         if reporthook:
 212             reporthook(1, bs, size)
 213         while block:
 214             tfp.write(block)
 215             block = fp.read(bs)
 216             blocknum = blocknum + 1
 217             if reporthook:
 218                 reporthook(blocknum, bs, size)
 219         fp.close()
 220         tfp.close()
 221         del fp
 222         del tfp
 223         return result
 224
 225     # Each method named open_<type> knows how to open that type of URL
 226
 227     def open_http(self, url, data=None):
 228         """Use HTTP protocol."""
 229         import httplib
 230         user_passwd = None
 231         if type(url) is type(""):
 232             host, selector = splithost(url)
 233             if host:
 234                 user_passwd, host = splituser(host)
 235                 host = unquote(host)
 236             realhost = host
 237         else:
 238             host, selector = url
 239             urltype, rest = splittype(selector)
 240             url = rest
 241             user_passwd = None
 242             if string.lower(urltype) != 'http':
 243                 realhost = None
 244             else:
 245                 realhost, rest = splithost(rest)
 246                 if realhost:
 247                     user_passwd, realhost = splituser(realhost)
 248                 if user_passwd:
 249                     selector = "%s://%s%s" % (urltype, realhost, rest)
 250             #print "proxy via http:", host, selector
 251         if not host: raise IOError, ('http error', 'no host given')
 252         if user_passwd:
 253             import base64
 254             auth = string.strip(base64.encodestring(user_passwd))
 255         else:
 256             auth = None
 257         h = httplib.HTTP(host)
 258         if data is not None:
 259             h.putrequest('POST', selector)
 260             h.putheader('Content-type', 'application/x-www-form-urlencoded')
 261             h.putheader('Content-length', '%d' % len(data))
 262         else:
 263             h.putrequest('GET', selector)
 264         if auth: h.putheader('Authorization', 'Basic %s' % auth)
 265         if realhost: h.putheader('Host', realhost)
 266         for args in self.addheaders: apply(h.putheader, args)
 267         h.endheaders()
 268         if data is not None:
 269             h.send(data + '\r\n')
 270         errcode, errmsg, headers = h.getreply()
 271         fp = h.getfile()
 272         if errcode == 200:
 273             return addinfourl(fp, headers, "http:" + url)
 274         else:
 275             if data is None:
 276                 return self.http_error(url, fp, errcode, errmsg, headers)
 277             else:
 278                 return self.http_error(url, fp, errcode, errmsg, headers, data)
 279
 280     def http_error(self, url, fp, errcode, errmsg, headers, data=None):
 281         """Handle http errors.
 282         Derived class can override this, or provide specific handlers
 283         named http_error_DDD where DDD is the 3-digit error code."""
 284         # First check if there's a specific handler for this error
 285         name = 'http_error_%d' % errcode
 286         if hasattr(self, name):
 287             method = getattr(self, name)
 288             if data is None:
 289                 result = method(url, fp, errcode, errmsg, headers)
 290             else:
 291                 result = method(url, fp, errcode, errmsg, headers, data)
 292             if result: return result
 293         return self.http_error_default(url, fp, errcode, errmsg, headers)
 294
 295     def http_error_default(self, url, fp, errcode, errmsg, headers):
 296         """Default error handler: close the connection and raise IOError."""
 297         void = fp.read()
 298         fp.close()
 299         raise IOError, ('http error', errcode, errmsg, headers)
 300
 301     if hasattr(socket, "ssl"):
 302         def open_https(self, url):
 303             """Use HTTPS protocol."""
 304             import httplib
 305             if type(url) is type(""):
 306                 host, selector = splithost(url)
 307                 user_passwd, host = splituser(host)
 308             else:
 309                 host, selector = url
 310                 urltype, rest = splittype(selector)
 311                 if string.lower(urltype) == 'https':
 312                     realhost, rest = splithost(rest)
 313                     user_passwd, realhost = splituser(realhost)
 314                     if user_passwd:
 315                         selector = "%s://%s%s" % (urltype, realhost, rest)
 316                 print "proxy via https:", host, selector
 317             if not host: raise IOError, ('https error', 'no host given')
 318             if user_passwd:
 319                 import base64
 320                 auth = string.strip(base64.encodestring(user_passwd))
 321             else:
 322                 auth = None
 323             h = httplib.HTTPS(host, 0,
 324                               key_file=self.key_file,
 325                               cert_file=self.cert_file)
 326             h.putrequest('GET', selector)
 327             if auth: h.putheader('Authorization: Basic %s' % auth)
 328             for args in self.addheaders: apply(h.putheader, args)
 329             h.endheaders()
 330             errcode, errmsg, headers = h.getreply()
 331             fp = h.getfile()
 332             if errcode == 200:
 333                 return addinfourl(fp, headers, url)
 334             else:
 335                 return self.http_error(url, fp, errcode, errmsg, headers)
 336
 337     def open_gopher(self, url):
 338         """Use Gopher protocol."""
 339         import gopherlib
 340         host, selector = splithost(url)
 341         if not host: raise IOError, ('gopher error', 'no host given')
 342         host = unquote(host)
 343         type, selector = splitgophertype(selector)
 344         selector, query = splitquery(selector)
 345         selector = unquote(selector)
 346         if query:
 347             query = unquote(query)
 348             fp = gopherlib.send_query(selector, query, host)
 349         else:
 350             fp = gopherlib.send_selector(selector, host)
 351         return addinfourl(fp, noheaders(), "gopher:" + url)
 352
 353     def open_file(self, url):
 354         """Use local file or FTP depending on form of URL."""
 355         if url[:2] == '//' and url[2:3] != '/':
 356             return self.open_ftp(url)
 357         else:
 358             return self.open_local_file(url)
 359
 360     def open_local_file(self, url):
 361         """Use local file."""
 362         import mimetypes, mimetools, StringIO
 363         mtype = mimetypes.guess_type(url)[0]
 364         headers = mimetools.Message(StringIO.StringIO(
 365             'Content-Type: %s\n' % (mtype or 'text/plain')))
 366         host, file = splithost(url)
 367         if not host:
 368             urlfile = file
 369             if file[:1] == '/':
 370                 urlfile = 'file://' + file
 371             return addinfourl(open(url2pathname(file), 'rb'),
 372                               headers, urlfile)
 373         host, port = splitport(host)
 374         if not port \
 375            and socket.gethostbyname(host) in (localhost(), thishost()):
 376             urlfile = file
 377             if file[:1] == '/':
 378                 urlfile = 'file://' + file
 379             return addinfourl(open(url2pathname(file), 'rb'),
 380                               headers, urlfile)
 381         raise IOError, ('local file error', 'not on local host')
 382
 383     def open_ftp(self, url):
 384         """Use FTP protocol."""
 385         host, path = splithost(url)
 386         if not host: raise IOError, ('ftp error', 'no host given')
 387         host, port = splitport(host)
 388         user, host = splituser(host)
 389         if user: user, passwd = splitpasswd(user)
 390         else: passwd = None
 391         host = unquote(host)
 392         user = unquote(user or '')
 393         passwd = unquote(passwd or '')
 394         host = socket.gethostbyname(host)
 395         if not port:
 396             import ftplib
 397             port = ftplib.FTP_PORT
 398         else:
 399             port = int(port)
 400         path, attrs = splitattr(path)
 401         path = unquote(path)
 402         dirs = string.splitfields(path, '/')
 403         dirs, file = dirs[:-1], dirs[-1]
 404         if dirs and not dirs[0]: dirs = dirs[1:]
 405         if dirs and not dirs[0]: dirs[0] = '/'
 406         key = (user, host, port, string.joinfields(dirs, '/'))
 407         # XXX thread unsafe!
 408         if len(self.ftpcache) > MAXFTPCACHE:
 409             # Prune the cache, rather arbitrarily
 410             for k in self.ftpcache.keys():
 411                 if k != key:
 412                     v = self.ftpcache[k]
 413                     del self.ftpcache[k]
 414                     v.close()
 415         try:
 416             if not self.ftpcache.has_key(key):
 417                 self.ftpcache[key] = \
 418                     ftpwrapper(user, passwd, host, port, dirs)
 419             if not file: type = 'D'
 420             else: type = 'I'
 421             for attr in attrs:
 422                 attr, value = splitvalue(attr)
 423                 if string.lower(attr) == 'type' and \
 424                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
 425                     type = string.upper(value)
 426             (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
 427             if retrlen is not None and retrlen >= 0:
 428                 import mimetools, StringIO
 429                 headers = mimetools.Message(StringIO.StringIO(
 430                     'Content-Length: %d\n' % retrlen))
 431             else:
 432                 headers = noheaders()
 433             return addinfourl(fp, headers, "ftp:" + url)
 434         except ftperrors(), msg:
 435             raise IOError, ('ftp error', msg), sys.exc_info()[2]
 436
 437     def open_data(self, url, data=None):
 438         """Use "data" URL."""
 439         # ignore POSTed data
 440         #
 441         # syntax of data URLs:
 442         # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
 443         # mediatype := [ type "/" subtype ] *( ";" parameter )
 444         # data      := *urlchar
 445         # parameter := attribute "=" value
 446         import StringIO, mimetools, time
 447         try:
 448             [type, data] = string.split(url, ',', 1)
 449         except ValueError:
 450             raise IOError, ('data error', 'bad data URL')
 451         if not type:
 452             type = 'text/plain;charset=US-ASCII'
 453         semi = string.rfind(type, ';')
 454         if semi >= 0 and '=' not in type[semi:]:
 455             encoding = type[semi+1:]
 456             type = type[:semi]
 457         else:
 458             encoding = ''
 459         msg = []
 460         msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
 461                                             time.gmtime(time.time())))
 462         msg.append('Content-type: %s' % type)
 463         if encoding == 'base64':
 464             import base64
 465             data = base64.decodestring(data)
 466         else:
 467             data = unquote(data)
 468         msg.append('Content-length: %d' % len(data))
 469         msg.append('')
 470         msg.append(data)
 471         msg = string.join(msg, '\n')
 472         f = StringIO.StringIO(msg)
 473         headers = mimetools.Message(f, 0)
 474         f.fileno = None     # needed for addinfourl
 475         return addinfourl(f, headers, url)
 476
 477
 478 class FancyURLopener(URLopener):
 479     """Derived class with handlers for errors we can handle (perhaps)."""
 480
 481     def __init__(self, *args):
 482         apply(URLopener.__init__, (self,) + args)
 483         self.auth_cache = {}
 484
 485     def http_error_default(self, url, fp, errcode, errmsg, headers):
 486         """Default error handling -- don't raise an exception."""
 487         return addinfourl(fp, headers, "http:" + url)
 488
 489     def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
 490         """Error 302 -- relocated (temporarily)."""
 491         # XXX The server can force infinite recursion here!
 492         if headers.has_key('location'):
 493             newurl = headers['location']
 494         elif headers.has_key('uri'):
 495             newurl = headers['uri']
 496         else:
 497             return
 498         void = fp.read()
 499         fp.close()
 500         # In case the server sent a relative URL, join with original:
 501         newurl = basejoin("http:" + url, newurl)
 502         if data is None:
 503             return self.open(newurl)
 504         else:
 505             return self.open(newurl, data)
 506
    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        # Handled exactly like a temporary redirect.
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 510
 511     def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
 512         """Error 401 -- authentication required.
 513         See this URL for a description of the basic authentication scheme:
 514         http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
 515         if headers.has_key('www-authenticate'):
 516             stuff = headers['www-authenticate']
 517             import re
 518             match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
 519             if match:
 520                 scheme, realm = match.groups()
 521                 if string.lower(scheme) == 'basic':
 522                    name = 'retry_' + self.type + '_basic_auth'
 523                    if data is None:
 524                        return getattr(self,name)(url, realm)
 525                    else:
 526                        return getattr(self,name)(url, realm, data)
 527
 528     def retry_http_basic_auth(self, url, realm, data=None):
 529         host, selector = splithost(url)
 530         i = string.find(host, '@') + 1
 531         host = host[i:]
 532         user, passwd = self.get_user_passwd(host, realm, i)
 533         if not (user or passwd): return None
 534         host = user + ':' + passwd + '@' + host
 535         newurl = 'http://' + host + selector
 536         if data is None:
 537             return self.open(newurl)
 538         else:
 539             return self.open(newurl, data)
 540
 541     def retry_https_basic_auth(self, url, realm, data=None):
 542             host, selector = splithost(url)
 543             i = string.find(host, '@') + 1
 544             host = host[i:]
 545             user, passwd = self.get_user_passwd(host, realm, i)
 546             if not (user or passwd): return None
 547             host = user + ':' + passwd + '@' + host
 548             newurl = '//' + host + selector
 549             return self.open_https(newurl)
 550
 551     def get_user_passwd(self, host, realm, clear_cache = 0):
 552         key = realm + '@' + string.lower(host)
 553         if self.auth_cache.has_key(key):
 554             if clear_cache:
 555                 del self.auth_cache[key]
 556             else:
 557                 return self.auth_cache[key]
 558         user, passwd = self.prompt_user_passwd(host, realm)
 559         if user or passwd: self.auth_cache[key] = (user, passwd)
 560         return user, passwd
 561
 562     def prompt_user_passwd(self, host, realm):
 563         """Override this in a GUI environment!"""
 564         import getpass
 565         try:
 566             user = raw_input("Enter username for %s at %s: " % (realm,
 567                                                                 host))
 568             passwd = getpass.getpass("Enter password for %s in %s at %s: " %
 569                 (user, realm, host))
 570             return user, passwd
 571         except KeyboardInterrupt:
 572             print
 573             return None, None
 574
 575
 576 # Utility functions
 577
 578 _localhost = None
 579 def localhost():
 580     """Return the IP address of the magic hostname 'localhost'."""
 581     global _localhost
 582     if not _localhost:
 583         _localhost = socket.gethostbyname('localhost')
 584     return _localhost
 585
 586 _thishost = None
 587 def thishost():
 588     """Return the IP address of the current host."""
 589     global _thishost
 590     if not _thishost:
 591         _thishost = socket.gethostbyname(socket.gethostname())
 592     return _thishost
 593
 594 _ftperrors = None
 595 def ftperrors():
 596     """Return the set of errors raised by the FTP class."""
 597     global _ftperrors
 598     if not _ftperrors:
 599         import ftplib
 600         _ftperrors = ftplib.all_errors
 601     return _ftperrors
 602
 603 _noheaders = None
 604 def noheaders():
 605     """Return an empty mimetools.Message object."""
 606     global _noheaders
 607     if not _noheaders:
 608         import mimetools
 609         import StringIO
 610         _noheaders = mimetools.Message(StringIO.StringIO(), 0)
 611         _noheaders.fp.close()   # Recycle file descriptor
 612     return _noheaders
 613
 614
 615 # Utility classes
 616
 617 class ftpwrapper:
 618     """Class used by open_ftp() for cache of open FTP connections."""
 619
 620     def __init__(self, user, passwd, host, port, dirs):
 621         self.user = user
 622         self.passwd = passwd
 623         self.host = host
 624         self.port = port
 625         self.dirs = dirs
 626         self.init()
 627
 628     def init(self):
 629         import ftplib
 630         self.busy = 0
 631         self.ftp = ftplib.FTP()
 632         self.ftp.connect(self.host, self.port)
 633         self.ftp.login(self.user, self.passwd)
 634         for dir in self.dirs:
 635             self.ftp.cwd(dir)
 636
 637     def retrfile(self, file, type):
 638         import ftplib
 639         self.endtransfer()
 640         if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
 641         else: cmd = 'TYPE ' + type; isdir = 0
 642         try:
 643             self.ftp.voidcmd(cmd)
 644         except ftplib.all_errors:
 645             self.init()
 646             self.ftp.voidcmd(cmd)
 647         conn = None
 648         if file and not isdir:
 649             # Use nlst to see if the file exists at all
 650             try:
 651                 self.ftp.nlst(file)
 652             except ftplib.error_perm, reason:
 653                 raise IOError, ('ftp error', reason), sys.exc_info()[2]
 654             # Restore the transfer mode!
 655             self.ftp.voidcmd(cmd)
 656             # Try to retrieve as a file
 657             try:
 658                 cmd = 'RETR ' + file
 659                 conn = self.ftp.ntransfercmd(cmd)
 660             except ftplib.error_perm, reason:
 661                 if reason[:3] != '550':
 662                     raise IOError, ('ftp error', reason), sys.exc_info()[2]
 663         if not conn:
 664             # Set transfer mode to ASCII!
 665             self.ftp.voidcmd('TYPE A')
 666             # Try a directory listing
 667             if file: cmd = 'LIST ' + file
 668             else: cmd = 'LIST'
 669             conn = self.ftp.ntransfercmd(cmd)
 670         self.busy = 1
 671         # Pass back both a suitably decorated object and a retrieval length
 672         return (addclosehook(conn[0].makefile('rb'),
 673                              self.endtransfer), conn[1])
 674     def endtransfer(self):
 675         if not self.busy:
 676             return
 677         self.busy = 0
 678         try:
 679             self.ftp.voidresp()
 680         except ftperrors():
 681             pass
 682
 683     def close(self):
 684         self.endtransfer()
 685         try:
 686             self.ftp.close()
 687         except ftperrors():
 688             pass
 689
 690 class addbase:
 691     """Base class for addinfo and addclosehook."""
 692
 693     def __init__(self, fp):
 694         self.fp = fp
 695         self.read = self.fp.read
 696         self.readline = self.fp.readline
 697         if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
 698         if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno
 699
 700     def __repr__(self):
 701         return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
 702                                              `id(self)`, `self.fp`)
 703
 704     def close(self):
 705         self.read = None
 706         self.readline = None
 707         self.readlines = None
 708         self.fileno = None
 709         if self.fp: self.fp.close()
 710         self.fp = None
 711
 712 class addclosehook(addbase):
 713     """Class to add a close hook to an open file."""
 714
 715     def __init__(self, fp, closehook, *hookargs):
 716         addbase.__init__(self, fp)
 717         self.closehook = closehook
 718         self.hookargs = hookargs
 719
 720     def close(self):
 721         if self.closehook:
 722             apply(self.closehook, self.hookargs)
 723             self.closehook = None
 724             self.hookargs = None
 725         addbase.close(self)
 726
class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        """Wrap fp and remember headers for info()."""
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers
 736
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        """Wrap fp and remember headers and url for info() and geturl()."""
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers

    def geturl(self):
        """Return the URL given to the constructor."""
        return self.url
 750
 751
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    url is returned unchanged when it already has a type.  Otherwise
    the type, and if url names no host also the host and directory,
    are taken from base.  Leading '../' components of a relative url
    are resolved against the directory of base."""
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = string.rfind(basepath, '/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            # Find the slash that ends the parent directory, ignoring
            # the trailing slash of basepath itself
            i = string.rfind(basepath[:-1], '/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                # Reached the root; further '../' are left in path
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    # Reassemble from whichever of type and host are known
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path
 806
 807
 808 # Utilities to parse URLs (most of these return None for missing parts):
 809 # unwrap('<URL:type://host/path>') --> 'type://host/path'
 810 # splittype('type:opaquestring') --> 'type', 'opaquestring'
 811 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
 812 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
 813 # splitpasswd('user:passwd') -> 'user', 'passwd'
 814 # splitport('host:port') --> 'host', 'port'
 815 # splitquery('/path?query') --> '/path', 'query'
 816 # splittag('/path#tag') --> '/path', 'tag'
 817 # splitattr('/path;attr1=value1;attr2=value2;...') ->
 818 #   '/path', ['attr1=value1', 'attr2=value2', ...]
 819 # splitvalue('attr=value') --> 'attr', 'value'
 820 # splitgophertype('/Xselector') --> 'X', 'selector'
 821 # unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
 823
 824 def unwrap(url):
 825     """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
 826     url = string.strip(url)
 827     if url[:1] == '<' and url[-1:] == '>':
 828         url = string.strip(url[1:-1])
 829     if url[:4] == 'URL:': url = string.strip(url[4:])
 830     return url
 831
 832 _typeprog = None
 833 def splittype(url):
 834     """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
 835     global _typeprog
 836     if _typeprog is None:
 837         import re
 838         _typeprog = re.compile('^([^/:]+):')
 839
 840     match = _typeprog.match(url)
 841     if match:
 842         scheme = match.group(1)
 843         return scheme, url[len(scheme) + 1:]
 844     return None, url
 845
 846 _hostprog = None
 847 def splithost(url):
 848     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
 849     global _hostprog
 850     if _hostprog is None:
 851         import re
 852         _hostprog = re.compile('^//([^/]*)(.*)$')
 853
 854     match = _hostprog.match(url)
 855     if match: return match.group(1, 2)
 856     return None, url
 857
 858 _userprog = None
 859 def splituser(host):
 860     """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
 861     global _userprog
 862     if _userprog is None:
 863         import re
 864         _userprog = re.compile('^([^@]*)@(.*)$')
 865
 866     match = _userprog.match(host)
 867     if match: return match.group(1, 2)
 868     return None, host
 869
 870 _passwdprog = None
 871 def splitpasswd(user):
 872     """splitpasswd('user:passwd') -> 'user', 'passwd'."""
 873     global _passwdprog
 874     if _passwdprog is None:
 875         import re
 876         _passwdprog = re.compile('^([^:]*):(.*)$')
 877
 878     match = _passwdprog.match(user)
 879     if match: return match.group(1, 2)
 880     return user, None
 881
# splitport('host:port') --> 'host', 'port'
 883 _portprog = None
 884 def splitport(host):
 885     """splitport('host:port') --> 'host', 'port'."""
 886     global _portprog
 887     if _portprog is None:
 888         import re
 889         _portprog = re.compile('^(.*):([0-9]+)$')
 890
 891     match = _portprog.match(host)
 892     if match: return match.group(1, 2)
 893     return host, None
 894
 895 _nportprog = None
 896 def splitnport(host, defport=-1):
 897     """Split host and port, returning numeric port.
 898     Return given default port if no ':' found; defaults to -1.
 899     Return numerical port if a valid number are found after ':'.
 900     Return None if ':' but not a valid number."""
 901     global _nportprog
 902     if _nportprog is None:
 903         import re
 904         _nportprog = re.compile('^(.*):(.*)$')
 905
 906     match = _nportprog.match(host)
 907     if match:
 908         host, port = match.group(1, 2)
 909         try:
 910             if not port: raise string.atoi_error, "no digits"
 911             nport = string.atoi(port)
 912         except string.atoi_error:
 913             nport = None
 914         return host, nport
 915     return host, defport
 916
 917 _queryprog = None
 918 def splitquery(url):
 919     """splitquery('/path?query') --> '/path', 'query'."""
 920     global _queryprog
 921     if _queryprog is None:
 922         import re
 923         _queryprog = re.compile('^(.*)\?([^?]*)$')
 924
 925     match = _queryprog.match(url)
 926     if match: return match.group(1, 2)
 927     return url, None
 928
 929 _tagprog = None
 930 def splittag(url):
 931     """splittag('/path#tag') --> '/path', 'tag'."""
 932     global _tagprog
 933     if _tagprog is None:
 934         import re
 935         _tagprog = re.compile('^(.*)#([^#]*)$')
 936
 937     match = _tagprog.match(url)
 938     if match: return match.group(1, 2)
 939     return url, None
 940
 941 def splitattr(url):
 942     """splitattr('/path;attr1=value1;attr2=value2;...') ->
 943         '/path', ['attr1=value1', 'attr2=value2', ...]."""
 944     words = string.splitfields(url, ';')
 945     return words[0], words[1:]
 946
 947 _valueprog = None
 948 def splitvalue(attr):
 949     """splitvalue('attr=value') --> 'attr', 'value'."""
 950     global _valueprog
 951     if _valueprog is None:
 952         import re
 953         _valueprog = re.compile('^([^=]*)=(.*)$')
 954
 955     match = _valueprog.match(attr)
 956     if match: return match.group(1, 2)
 957     return attr, None
 958
 959 def splitgophertype(selector):
 960     """splitgophertype('/Xselector') --> 'X', 'selector'."""
 961     if selector[:1] == '/' and selector[1:2]:
 962         return selector[1], selector[2:]
 963     return None, selector
 964
 965 def unquote(s):
 966     """unquote('abc%20def') -> 'abc def'."""
 967     mychr = chr
 968     myatoi = string.atoi
 969     list = string.split(s, '%')
 970     res = [list[0]]
 971     myappend = res.append
 972     del list[0]
 973     for item in list:
 974         if item[1:2]:
 975             try:
 976                 myappend(mychr(myatoi(item[:2], 16))
 977                      + item[2:])
 978             except:
 979                 myappend('%' + item)
 980         else:
 981             myappend('%' + item)
 982     return string.join(res, "")
 983
 984 def unquote_plus(s):
 985     if '+' in s:
 986         # replace '+' with ' '
 987         s = string.join(string.split(s, '+'), ' ')
 988     return unquote(s)
 989
 990 always_safe = string.letters + string.digits + '_,.-'
 991 def quote(s, safe = '/'):
 992     """quote('abc def') -> 'abc%20def')."""
 993     # XXX Can speed this up an order of magnitude
 994     safe = always_safe + safe
 995     res = list(s)
 996     for i in range(len(res)):
 997         c = res[i]
 998         if c not in safe:
 999             res[i] = '%%%02x' % ord(c)
1000     return string.joinfields(res, '')
1001
def quote_plus(s, safe = '/'):
    """Like quote(), but write each space as '+' instead of escaping it."""
    # XXX Can speed this up an order of magnitude
    if ' ' not in s:
        return quote(s, safe)
    # Quote the pieces between spaces, then glue them with '+'
    words = []
    for word in string.split(s, ' '):
        words.append(quote(word, safe))
    return string.join(words, '+')
def urlencode(dict):
    """Encode a dictionary of form entries into a URL query string.

    Each key and value is converted with str() and quote_plus(); the
    resulting 'key=value' pairs are joined with '&' in the order that
    dict.items() produces them.
    """
    pairs = []
    for key, value in dict.items():
        pairs.append(quote_plus(str(key)) + '=' + quote_plus(str(value)))
    return string.join(pairs, '&')
1021
1022
1023 # Proxy handling
1024 if os.name == 'mac':
1025     def getproxies():
1026         """Return a dictionary of scheme -> proxy server URL mappings.
1027
1028         By convention the mac uses Internet Config to store
1029         proxies.  An HTTP proxy, for instance, is stored under
1030         the HttpProxy key.
1031
1032         """
1033         try:
1034             import ic
1035         except ImportError:
1036             return {}
1037
1038         try:
1039             config = ic.IC()
1040         except ic.error:
1041             return {}
1042         proxies = {}
1043         # HTTP:
1044         if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
1045             try:
1046                 value = config['HTTPProxyHost']
1047             except ic.error:
1048                 pass
1049             else:
1050                 proxies['http'] = 'http://%s' % value
1051         # FTP: XXXX To be done.
1052         # Gopher: XXXX To be done.
1053         return proxies
1054
1055 else:
1056     def getproxies():
1057         """Return a dictionary of scheme -> proxy server URL mappings.
1058
1059         Scan the environment for variables named <scheme>_proxy;
1060         this seems to be the standard convention.  If you need a
1061         different way, you can pass a proxies dictionary to the
1062         [Fancy]URLopener constructor.
1063
1064         """
1065         proxies = {}
1066         for name, value in os.environ.items():
1067             name = string.lower(name)
1068             if value and name[-6:] == '_proxy':
1069                 proxies[name[:-6]] = value
1070         return proxies
1071
1072
1073 # Test and time quote() and unquote()
1074 def test1():
1075     import time
1076     s = ''
1077     for i in range(256): s = s + chr(i)
1078     s = s*4
1079     t0 = time.time()
1080     qs = quote(s)
1081     uqs = unquote(qs)
1082     t1 = time.time()
1083     if uqs != s:
1084         print 'Wrong!'
1085     print `s`
1086     print `qs`
1087     print `uqs`
1088     print round(t1 - t0, 3), 'sec'
1089
1090
1091 def reporthook(blocknum, blocksize, totalsize):
1092     # Report during remote transfers
1093     print "Block number: %d, Block size: %d, Total size: %d" % (blocknum, blocksize, totalsize)
1094
1095 # Test program
1096 def test(args=[]):
1097     if not args:
1098         args = [
1099             '/etc/passwd',
1100             'file:/etc/passwd',
1101             'file://localhost/etc/passwd',
1102             'ftp://ftp.python.org/etc/passwd',
1103 ##          'gopher://gopher.micro.umn.edu/1/',
1104             'http://www.python.org/index.html',
1105             ]
1106         if hasattr(URLopener, "open_https"):
1107             args.append('https://synergy.as.cmu.edu/~geek/')
1108     try:
1109         for url in args:
1110             print '-'*10, url, '-'*10
1111             fn, h = urlretrieve(url, None, reporthook)
1112             print fn, h
1113             if h:
1114                 print '======'
1115                 for k in h.keys(): print k + ':', h[k]
1116                 print '======'
1117             fp = open(fn, 'rb')
1118             data = fp.read()
1119             del fp
1120             if '\r' in data:
1121                 table = string.maketrans("", "")
1122                 data = string.translate(data, table, "\r")
1123             print data
1124             fn, h = None, None
1125         print '-'*40
1126     finally:
1127         urlcleanup()
1128
1129 def main():
1130     import getopt, sys
1131     try:
1132         opts, args = getopt.getopt(sys.argv[1:], "th")
1133     except getopt.error, msg:
1134         print msg
1135         print "Use -h for help"
1136         return
1137     t = 0
1138     for o, a in opts:
1139         if o == '-t':
1140             t = t + 1
1141         if o == '-h':
1142             print "Usage: python urllib.py [-t] [url ...]"
1143             print "-t runs self-test;",
1144             print "otherwise, contents of urls are printed"
1145             return
1146     if t:
1147         if t > 1:
1148             test1()
1149         test(args)
1150     else:
1151         if not args:
1152             print "Use -h for help"
1153         for url in args:
1154             print urlopen(url).read(),
1155
# When executed as a script (rather than imported), hand control to
# main().
if __name__ == '__main__':
    main()