1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that is has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
23 """
import string
import socket
import os
import sys
import types

__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

__version__ = '1.15'    # XXX This version is not always updated :-(
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        return unquote(pathname)
    def pathname2url(pathname):
        return quote(pathname)
# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?
# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object"""
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    if data is None:
        return _urlopener.open(url)
    else:
        return _urlopener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
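# A hedged sketch of the shortcuts above (URL and filename are placeholders):
# urlretrieve() copies a URL to a local file, calling an optional progress
# hook once up front and then once per block read; urlcleanup() removes any
# temporary files created by earlier retrievals.
#
#   import urllib
#
#   def progress(blocknum, blocksize, totalsize):
#       print blocknum, blocksize, totalsize
#
#   filename, headers = urllib.urlretrieve('http://www.python.org/',
#                                          'index.html', progress)
#   urllib.urlcleanup()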
ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.
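    # Construction sketch (illustrative only; the proxy host is made up):
    # an explicit proxies mapping overrides the one built by getproxies(),
    # and key_file/cert_file are only used by open_https().
    #
    #   opener = URLopener(proxies={'http': 'http://proxy.example.com:8080/'})
    #   f = opener.open('http://www.python.org/')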
    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)
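    # Sketch: headers registered with addheader() are sent on every HTTP(S)
    # request made through this opener (in addition to the default User-agent).
    #
    #   opener = URLopener()
    #   opener.addheader('Accept', 'text/html')
    #   opener.addheader('Accept-Language', 'en')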
    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        if self.tempcache and self.tempcache.has_key(fullurl):
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if self.proxies.has_key(urltype):
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        if '-' in name:
            # replace - with _
            name = '_'.join(name.split('-'))
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]
    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, None) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(toBytes(url))
        if self.tempcache and self.tempcache.has_key(url):
            return self.tempcache[url]
        type, url1 = splittype(url)
        if not filename and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url, data)
        headers = fp.info()
        if not filename:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            filename = tempfile.mktemp(suffix)
            self.__tempfiles.append(filename)
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        tfp = open(filename, 'wb')
        bs = 1024*8
        size = -1
        blocknum = 1
        if reporthook:
            if headers.has_key("content-length"):
                size = int(headers["Content-Length"])
            reporthook(0, bs, size)
        block = fp.read(bs)
        if reporthook:
            reporthook(1, bs, size)
        while block:
            tfp.write(block)
            block = fp.read(bs)
            blocknum = blocknum + 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp
        return result
    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        if type(url) is types.StringType:
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')
        if user_passwd:
            import base64
            auth = base64.encodestring(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: apply(h.putheader, args)
        h.endheaders()
        if data is not None:
            h.send(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            return addinfourl(fp, headers, "http:" + url)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)
    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)
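    # Sketch of the http_error_DDD dispatch (not shipped with the module):
    # because http_error() looks for a method named after the status code,
    # a subclass can handle one code by defining just that method.  If the
    # handler returns a true value, that value becomes the result of open().
    #
    #   class MyOpener(URLopener):
    #       def http_error_404(self, url, fp, errcode, errmsg, headers):
    #           # hand back the error page instead of raising IOError
    #           return addinfourl(fp, headers, "http:" + url)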
    if hasattr(socket, "ssl"):
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            import httplib
            user_passwd = None
            if type(url) is types.StringType:
                host, selector = splithost(url)
                if host:
                    user_passwd, host = splituser(host)
                    host = unquote(host)
                realhost = host
            else:
                host, selector = url
                urltype, rest = splittype(selector)
                url = rest
                user_passwd = None
                if urltype.lower() != 'https':
                    realhost = None
                else:
                    realhost, rest = splithost(rest)
                    if realhost:
                        user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
            if user_passwd:
                import base64
                auth = base64.encodestring(user_passwd).strip()
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            if realhost: h.putheader('Host', realhost)
            for args in self.addheaders: apply(h.putheader, args)
            h.endheaders()
            if data is not None:
                h.send(data)
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == 200:
                return addinfourl(fp, headers, url)
            else:
                if data is None:
                    return self.http_error(url, fp, errcode, errmsg, headers)
                else:
                    return self.http_error(url, fp, errcode, errmsg, headers,
                                           data)
    def open_gopher(self, url):
        """Use Gopher protocol."""
        import gopherlib
        host, selector = splithost(url)
        if not host: raise IOError, ('gopher error', 'no host given')
        host = unquote(host)
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), "gopher:" + url)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if url[:2] == '//' and url[2:3] != '/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)
    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, StringIO
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO.StringIO(
            'Content-Type: %s\n' % (mtype or 'text/plain')))
        host, file = splithost(url)
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(url2pathname(file), 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(url2pathname(file), 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')
    def open_ftp(self, url):
        """Use FTP protocol."""
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not self.ftpcache.has_key(key):
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            if retrlen is not None and retrlen >= 0:
                import mimetools, StringIO
                headers = mimetools.Message(StringIO.StringIO(
                    'Content-Length: %d\n' % retrlen))
            else:
                headers = noheaders()
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]
    def open_data(self, url, data=None):
        """Use "data" URL."""
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import StringIO, mimetools, time
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        f = StringIO.StringIO(msg)
        headers = mimetools.Message(f, 0)
        f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
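# Sketch: a data: URL carries its payload inline, so open_data() needs no
# network access and synthesizes the headers itself.
#
#   f = urlopen('data:text/plain;charset=US-ASCII,hello%20world')
#   print f.info().gettype()    # expected: 'text/plain'
#   print f.read()              # expected: 'hello world'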
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args):
        apply(URLopener.__init__, (self,) + args)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)
    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        if not headers.has_key('www-authenticate'):
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)
    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        key = realm + '@' + host.lower()
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
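# Sketch (illustrative subclass, not part of the module): prompt_user_passwd()
# is the hook to replace in a GUI or non-interactive program; an override only
# has to return a (user, password) pair, or (None, None) to give up.
#
#   class NonInteractiveOpener(FancyURLopener):
#       def __init__(self, user, passwd, *args):
#           apply(FancyURLopener.__init__, (self,) + args)
#           self._user, self._passwd = user, passwd
#       def prompt_user_passwd(self, host, realm):
#           return self._user, self._passwd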
# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if not _localhost:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if not _thishost:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if not _ftperrors:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object."""
    global _noheaders
    if not _noheaders:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.init()

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
                                             `id(self)`, `self.fp`)

    def close(self):
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        if self.fp: self.fp.close()
        self.fp = None

class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        if self.closehook:
            apply(self.closehook, self.hookargs)
            self.closehook = None
            self.hookargs = None

class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        return self.headers

class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        return self.headers

    def geturl(self):
        return self.url
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL."""
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = basepath.rfind('/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            i = basepath[:-1].rfind('/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if host and path and path[0] != '/':
        path = '/' + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path
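# Sketch of basejoin() behaviour (results worked out from the code above):
#
#   basejoin('http://host/a/b.html', 'c.html')     # -> 'http://host/a/c.html'
#   basejoin('http://host/a/b.html', '../d.html')  # -> 'http://host/d.html'
#   basejoin('http://host/a/b.html', 'ftp://x/y')  # -> 'ftp://x/y' (already absolute)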
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if type(url) is types.UnicodeType:
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    match = _userprog.match(host)
    if match: return map(unquote, match.group(1, 2))
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None
# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError, "no digits"
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
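# Sketch of the three splitnport() outcomes described in its docstring:
#
#   splitnport('www.python.org:80')   # -> ('www.python.org', 80)
#   splitnport('www.python.org')      # -> ('www.python.org', -1)
#   splitnport('www.python.org:foo')  # -> ('www.python.org', None)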
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector
def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    mychr = chr
    myatoi = int
    list = s.split('%')
    res = [list[0]]
    myappend = res.append
    del list[0]
    for item in list:
        if item[1:2]:
            try:
                myappend(mychr(myatoi(item[:2], 16))
                         + item[2:])
            except:
                myappend('%' + item)
        else:
            myappend('%' + item)
    return "".join(res)

def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'."""
    if '+' in s:
        # replace '+' with ' '
        s = ' '.join(s.split('+'))
    return unquote(s)

always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for c in _fast_safe_test:
            _fast_safe[c] = c
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if not _fast_safe.has_key(c):
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        return _fast_quote(s)
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if c not in safe:
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        l = s.split(' ')
        for i in range(len(l)):
            l[i] = quote(l[i], safe)
        return '+'.join(l)
    else:
        return quote(s, safe)
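# Sketch: quote() leaves '/' alone by default, which suits path components;
# quote_plus() is meant for query strings, where spaces become '+'.
#
#   quote('/~user/file name.txt')       # -> '/%7Euser/file%20name.txt'
#   quote_plus('a phrase with spaces')  # -> 'a+phrase+with+spaces'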
def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            x = len(query)
            # non-empty strings will fail this
            if len(query) and type(query[0]) != types.TupleType:
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if type(v) == types.StringType:
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif type(v) == types.UnicodeType:
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII", "replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
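# Sketch of urlencode() (dictionary order is not guaranteed):
#
#   urlencode({'q': 'python urllib', 'page': 2})  # e.g. 'q=python+urllib&page=2'
#   urlencode([('tag', 'a'), ('tag', 'b')])       # -> 'tag=a&tag=b'
#   urlencode({'tag': ['a', 'b']}, doseq=1)       # -> 'tag=a&tag=b'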
# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
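# Sketch (the proxy host is made up): with the environment set up as
#
#   os.environ['http_proxy'] = 'http://proxy.example.com:8080/'
#
# getproxies_environment() would return
#
#   {'http': 'http://proxy.example.com:8080/'}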
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies
elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        proxies[protocol] = '%s://%s' % (protocol, address)
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

else:
    # By default use environment variables
    getproxies = getproxies_environment
# Test and time quote() and unquote()
def test1():
    import time
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print `s`
    print `qs`
    print `uqs`
    print round(t1 - t0, 3), 'sec'


def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)
# Test program
def test(args=[]):
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
            print '-'*40
    finally:
        urlcleanup()
def main():
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()