1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that is has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
23 """
import string
import socket
import os
import stat
import time
import sys
import types

__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

__version__ = '1.15'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        return unquote(pathname)
    def pathname2url(pathname):
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object"""
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    if data is None:
        return _urlopener.open(url)
    else:
        return _urlopener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()

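# Illustrative sketch of typical use of the module-level shortcuts.  The URL
# is only a placeholder and the calls need network access, so this helper is
# defined for reference and never invoked here.
def _example_shortcut_usage():
    f = urlopen('http://www.python.org/')             # GET an http: URL
    data = f.read()                                   # file-like read()
    ctype = f.info().getheader('Content-Type')        # mimetools.Message
    f.close()
    filename, headers = urlretrieve('http://www.python.org/')
    urlcleanup()                                      # discard temp files
    return data, ctype, filename, headers
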
ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        if self.tempcache and self.tempcache.has_key(fullurl):
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if self.proxies.has_key(urltype):
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        if '-' in name:
            # replace - with _
            name = '_'.join(name.split('-'))
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, None) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(toBytes(url))
        if self.tempcache and self.tempcache.has_key(url):
            return self.tempcache[url]
        type, url1 = splittype(url)
        if not filename and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url, data)
        headers = fp.info()
        if not filename:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            filename = tempfile.mktemp(suffix)
            self.__tempfiles.append(filename)
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        tfp = open(filename, 'wb')
        bs = 1024*8
        size = -1
        blocknum = 1
        if reporthook:
            if headers.has_key("content-length"):
                size = int(headers["Content-Length"])
            reporthook(0, bs, size)
        block = fp.read(bs)
        if reporthook:
            reporthook(1, bs, size)
        while block:
            tfp.write(block)
            block = fp.read(bs)
            blocknum = blocknum + 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp
        return result

    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        if type(url) is types.StringType:
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')
        if user_passwd:
            import base64
            auth = base64.encodestring(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: apply(h.putheader, args)
        h.endheaders()
        if data is not None:
            h.send(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            return addinfourl(fp, headers, "http:" + url)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)

    if hasattr(socket, "ssl"):
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            import httplib
            user_passwd = None
            if type(url) is types.StringType:
                host, selector = splithost(url)
                if host:
                    user_passwd, host = splituser(host)
                    host = unquote(host)
                realhost = host
            else:
                host, selector = url
                urltype, rest = splittype(selector)
                url = rest
                user_passwd = None
                if urltype.lower() != 'https':
                    realhost = None
                else:
                    realhost, rest = splithost(rest)
                    if realhost:
                        user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
            if user_passwd:
                import base64
                auth = base64.encodestring(user_passwd).strip()
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            # Pass header name and value separately, as putheader() expects
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            if realhost: h.putheader('Host', realhost)
            for args in self.addheaders: apply(h.putheader, args)
            h.endheaders()
            if data is not None:
                h.send(data)
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == 200:
                return addinfourl(fp, headers, "https:" + url)
            else:
                if data is None:
                    return self.http_error(url, fp, errcode, errmsg, headers)
                else:
                    return self.http_error(url, fp, errcode, errmsg, headers,
                                           data)

    def open_gopher(self, url):
        """Use Gopher protocol."""
        import gopherlib
        host, selector = splithost(url)
        if not host: raise IOError, ('gopher error', 'no host given')
        host = unquote(host)
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), "gopher:" + url)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if url[:2] == '//' and url[2:3] != '/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, rfc822, StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        stats = os.stat(localname)
        size = stats[stat.ST_SIZE]
        modified = rfc822.formatdate(stats[stat.ST_MTIME])
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO.StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        import mimetypes, mimetools, StringIO
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not self.ftpcache.has_key(key):
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = mimetools.Message(StringIO.StringIO(headers))
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def open_data(self, url, data=None):
        """Use "data" URL."""
        # ignore POSTed data

        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import StringIO, mimetools, time
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        f = StringIO.StringIO(msg)
        headers = mimetools.Message(f, 0)
        f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)


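# Illustrative sketch of constructing an opener with an explicit proxies
# mapping (which overrides getproxies()) and exercising the "data" URL
# handler, which needs no network.  The proxy address is a placeholder.
def _example_explicit_proxies():
    opener = URLopener(proxies={'http': 'http://proxy.example.com:8080/'})
    f = opener.open('data:text/plain;charset=US-ASCII,hello%20world')
    return f.read()        # -> 'hello world'
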
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args):
        apply(URLopener.__init__, (self,) + args)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        if not headers.has_key('www-authenticate'):
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        key = realm + '@' + host.lower()
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None


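# Illustrative sketch of the documented override hook: a GUI or batch
# program would subclass FancyURLopener and redefine prompt_user_passwd()
# so a 401 response is answered without a terminal prompt.  The credentials
# below are placeholders.
class _ExampleAuthOpener(FancyURLopener):
    def prompt_user_passwd(self, host, realm):
        # Return a (user, password) pair; (None, None) gives up.
        return 'guest', 'guest'
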
# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if not _localhost:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if not _thishost:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if not _ftperrors:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object."""
    global _noheaders
    if not _noheaders:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.init()

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])
    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass

class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
                                             `id(self)`, `self.fp`)

    def close(self):
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        if self.fp: self.fp.close()
        self.fp = None

class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        if self.closehook:
            apply(self.closehook, self.hookargs)
            self.closehook = None
            self.hookargs = None

class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        return self.headers

class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        return self.headers

    def geturl(self):
        return self.url

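# Illustrative sketch: addinfourl is the wrapper every successful open_*()
# handler returns, which is why the object described in the module docstring
# behaves like a file but also answers info() and geturl().  The URL below is
# a placeholder.
def _example_addinfourl():
    import StringIO
    f = addinfourl(StringIO.StringIO('hello'), noheaders(),
                   'http://www.python.org/')
    body = f.read()          # delegated to the wrapped file object
    hdrs = f.info()          # the (empty) mimetools.Message
    where = f.geturl()       # the URL the data came from
    f.close()
    return body, hdrs, where
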
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL."""
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = basepath.rfind('/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            i = basepath[:-1].rfind('/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if host and path and path[0] != '/':
        path = '/' + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path


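# Illustrative sketch of how basejoin() resolves a relative reference
# against a base URL; the URLs are examples only.  This is the helper that
# redirect_internal() relies on when a server sends a relative Location
# header.
def _example_basejoin():
    base = 'http://www.python.org/doc/lib/x.html'
    assert basejoin(base, 'module-urllib.html') == \
           'http://www.python.org/doc/lib/module-urllib.html'
    assert basejoin(base, '../tut/tut.html') == \
           'http://www.python.org/doc/tut/tut.html'
    # An already-absolute URL is returned unchanged.
    assert basejoin(base, 'ftp://ftp.python.org/pub/') == \
           'ftp://ftp.python.org/pub/'
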
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def')

def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if type(url) is types.UnicodeType:
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    match = _userprog.match(host)
    if match: return map(unquote, match.group(1, 2))
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError, "no digits"
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    mychr = chr
    myatoi = int
    list = s.split('%')
    res = [list[0]]
    myappend = res.append
    del list[0]
    for item in list:
        if item[1:2]:
            try:
                myappend(mychr(myatoi(item[:2], 16))
                         + item[2:])
            except ValueError:
                myappend('%' + item)
        else:
            myappend('%' + item)
    return "".join(res)

def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    if '+' in s:
        # replace '+' with ' '
        s = ' '.join(s.split('+'))
    return unquote(s)

always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for c in _fast_safe_test:
            _fast_safe[c] = c
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if not _fast_safe.has_key(c):
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        return _fast_quote(s)
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if c not in safe:
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        l = s.split(' ')
        for i in range(len(l)):
            l[i] = quote(l[i], safe)
        return '+'.join(l)
    else:
        return quote(s, safe)

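# Illustrative sketch of how the quoting helpers relate: quote() leaves '/'
# alone (path quoting), quote_plus() turns spaces into '+' (query quoting),
# and unquote()/unquote_plus() reverse them.
def _example_quoting():
    assert quote('/~user/abc def') == '/%7Euser/abc%20def'
    assert quote_plus('abc def') == 'abc+def'
    assert unquote('abc%20def') == 'abc def'
    assert unquote_plus('abc+def') == 'abc def'
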
def urlencode(query,doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            x = len(query)
            # non-empty strings will fail this
            if len(query) and type(query[0]) != types.TupleType:
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty,va,tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if type(v) == types.StringType:
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif type(v) == types.UnicodeType:
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII","replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)

# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies

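# Illustrative sketch of the environment convention getproxies_environment()
# understands.  The proxy address is only a placeholder.
def _example_proxy_environment():
    os.environ['http_proxy'] = 'http://proxy.example.com:8080/'
    proxies = getproxies_environment()
    # proxies now includes {'http': 'http://proxy.example.com:8080/'};
    # URLopener(proxies=proxies) (or plain urlopen) will route http
    # requests through that server.
    del os.environ['http_proxy']
    return proxies
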
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

    def proxy_bypass(x):
        return 0

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        proxies[protocol] = '%s://%s' % (protocol, address)
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass(host):
        try:
            import _winreg
            import re
            import socket
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        host = [host]
        try:
            addr = socket.gethostbyname(host[0])
            if addr != host:
                host.append(addr)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        return 0

# Test and time quote() and unquote()
def test1():
    import time
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print `s`
    print `qs`
    print `uqs`
    print round(t1 - t0, 3), 'sec'


def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)

# Test program
def test(args=[]):
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
            print '-'*40
    finally:
        urlcleanup()

def main():
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()