1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that is has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
23 """
25 import string
26 import socket
27 import os
28 import time
29 import sys
31 __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
32 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
33 "urlencode", "url2pathname", "pathname2url", "splittag",
34 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
35 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
36 "splitnport", "splitquery", "splitattr", "splitvalue",
37 "splitgophertype", "getproxies"]
39 __version__ = '1.15' # XXX This version is not always updated :-(
MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        return unquote(pathname)
    def pathname2url(pathname):
        return quote(pathname)
# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """urlopen(url [, data]) -> open file-like object"""
    global _urlopener
    if proxies is not None:
        opener = FancyURLopener(proxies=proxies)
    elif not _urlopener:
        opener = FancyURLopener()
        _urlopener = opener
    else:
        opener = _urlopener
    if data is None:
        return opener.open(url)
    else:
        return opener.open(url, data)

def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
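
# Illustrative use of the module-level shortcuts above (comment-only sketch;
# the URL is a placeholder, not a guaranteed-working address):
#
#     import urllib
#     f = urllib.urlopen('http://www.python.org/')
#     print f.info().gettype()            # e.g. 'text/html'
#     data = f.read()
#     f.close()
#     # urlretrieve() copies the object to a (temporary) local file:
#     filename, headers = urllib.urlretrieve('http://www.python.org/')
#     urllib.urlcleanup()                 # remove temporary files afterwards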

ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)
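
    # Illustrative use of addheader() on an opener instance (comment-only
    # sketch; the header value and URL are arbitrary placeholders):
    #
    #     opener = URLopener()
    #     opener.addheader('Accept', 'text/html')
    #     f = opener.open('http://www.python.org/')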

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        if '-' in name:
            # replace - with _
            name = '_'.join(name.split('-'))
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, None) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(toBytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url, data)
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        bs = 1024*8
        size = -1
        blocknum = 1
        if reporthook:
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            reporthook(0, bs, size)
        block = fp.read(bs)
        if reporthook:
            reporthook(1, bs, size)
        while block:
            tfp.write(block)
            block = fp.read(bs)
            blocknum = blocknum + 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp
        return result
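
    # Illustrative retrieve() call with a progress callback (comment-only
    # sketch; the URL and the local filename are placeholders):
    #
    #     def hook(blocknum, blocksize, totalsize):
    #         print blocknum * blocksize, 'of', totalsize, 'bytes'
    #
    #     opener = URLopener()
    #     fn, hdrs = opener.retrieve('http://www.python.org/', 'index.html', hook)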

    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')
        if user_passwd:
            import base64
            auth = base64.encodestring(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: h.putheader(*args)
        h.endheaders()
        if data is not None:
            h.send(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            return addinfourl(fp, headers, "http:" + url)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)

    if hasattr(socket, "ssl"):
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            import httplib
            user_passwd = None
            if isinstance(url, str):
                host, selector = splithost(url)
                if host:
                    user_passwd, host = splituser(host)
                    host = unquote(host)
                realhost = host
            else:
                host, selector = url
                urltype, rest = splittype(selector)
                url = rest
                user_passwd = None
                if urltype.lower() != 'https':
                    realhost = None
                else:
                    realhost, rest = splithost(rest)
                    if realhost:
                        user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
            if user_passwd:
                import base64
                auth = base64.encodestring(user_passwd).strip()
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            if realhost: h.putheader('Host', realhost)
            for args in self.addheaders: h.putheader(*args)
            h.endheaders()
            if data is not None:
                h.send(data)
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == 200:
                return addinfourl(fp, headers, "https:" + url)
            else:
                if data is None:
                    return self.http_error(url, fp, errcode, errmsg, headers)
                else:
                    return self.http_error(url, fp, errcode, errmsg, headers,
                                           data)

    def open_gopher(self, url):
        """Use Gopher protocol."""
        import gopherlib
        host, selector = splithost(url)
        if not host: raise IOError, ('gopher error', 'no host given')
        host = unquote(host)
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), "gopher:" + url)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, rfc822, StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError, e:
            raise IOError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = rfc822.formatdate(stats.st_mtime)
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO.StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        import mimetypes, mimetools, StringIO
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not key in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = mimetools.Message(StringIO.StringIO(headers))
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]
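
    # Illustrative ftp URL forms accepted by open_ftp() (comment-only sketch;
    # host, credentials and paths are placeholders):
    #
    #     ftp://ftp.example.com/pub/README              -> file, binary ('I')
    #     ftp://ftp.example.com/pub/                    -> directory listing ('D')
    #     ftp://ftp.example.com/pub/notes.txt;type=a    -> file, ASCII ('A')
    #     ftp://user:password@ftp.example.com/pub/README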

    def open_data(self, url, data=None):
        """Use "data" URL."""
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import StringIO, mimetools
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        f = StringIO.StringIO(msg)
        headers = mimetools.Message(f, 0)
        f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
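
    # Illustrative data: URLs as handled by open_data() (comment-only sketch):
    #
    #     urlopen('data:,Hello%2C%20World').read()           -> 'Hello, World'
    #     urlopen('data:text/plain;base64,SGVsbG8=').read()  -> 'Hello'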


class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
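
    # Illustrative subclass that supplies credentials without prompting
    # (comment-only sketch; the username, password and URL are placeholders):
    #
    #     class MyOpener(FancyURLopener):
    #         def prompt_user_passwd(self, host, realm):
    #             return 'someuser', 'somepassword'
    #
    #     f = MyOpener().open('http://www.example.com/protected/')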


# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost is None:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if _thishost is None:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is None:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object."""
    global _noheaders
    if _noheaders is None:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.init()

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass

class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno
        if hasattr(self.fp, "__iter__"):
            self.__iter__ = self.fp.__iter__
            if hasattr(self.fp, "next"):
                self.next = self.fp.next

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
                                             `id(self)`, `self.fp`)

    def close(self):
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        if self.fp: self.fp.close()
        self.fp = None

class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        if self.closehook:
            self.closehook(*self.hookargs)
            self.closehook = None
            self.hookargs = None

class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        return self.headers

class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        return self.headers

    def geturl(self):
        return self.url


def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL."""
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = basepath.rfind('/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            i = basepath[:-1].rfind('/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if host and path and path[0] != '/':
        path = '/' + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path
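
# Illustrative basejoin() results (comment-only sketch; hosts are placeholders):
#
#     basejoin('http://www.example.com/a/b.html', 'c.html')
#         -> 'http://www.example.com/a/c.html'
#     basejoin('http://www.example.com/a/b.html', '#frag')
#         -> 'http://www.example.com/a/b.html#frag'
#     basejoin('http://www.example.com/a/b.html', 'http://other.example.com/')
#         -> 'http://other.example.com/'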

# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def')

try:
    unicode
except NameError:
    def _is_unicode(x):
        return 0
else:
    def _is_unicode(x):
        return isinstance(x, unicode)

def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if _is_unicode(url):
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match: return map(unquote, match.group(1, 2))
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError, "no digits"
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
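
# Illustrative splitnport() results (comment-only sketch; host is a placeholder):
#
#     splitnport('www.example.com:80')   -> ('www.example.com', 80)
#     splitnport('www.example.com')      -> ('www.example.com', -1)
#     splitnport('www.example.com:foo')  -> ('www.example.com', None)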

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    mychr = chr
    myatoi = int
    list = s.split('%')
    res = [list[0]]
    myappend = res.append
    del list[0]
    for item in list:
        if item[1:2]:
            try:
                myappend(mychr(myatoi(item[:2], 16))
                         + item[2:])
            except ValueError:
                myappend('%' + item)
        else:
            myappend('%' + item)
    return "".join(res)

def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    if '+' in s:
        # replace '+' with ' '
        s = ' '.join(s.split('+'))
    return unquote(s)

always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for c in _fast_safe_test:
            _fast_safe[c] = c
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if not c in _fast_safe:
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        return _fast_quote(s)
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if c not in safe:
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        l = s.split(' ')
        for i in range(len(l)):
            l[i] = quote(l[i], safe)
        return '+'.join(l)
    else:
        return quote(s, safe)
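
# Illustrative quote()/quote_plus() results (comment-only sketch):
#
#     quote('/~user/file name.txt')   -> '/%7Euser/file%20name.txt'
#     quote_plus('key=value & more')  -> 'key%3Dvalue+%26+more'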

def urlencode(query,doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty,va,tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif _is_unicode(v):
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII","replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
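
# Illustrative urlencode() results (comment-only sketch; note that a plain
# dict does not guarantee parameter order):
#
#     urlencode([('q', 'spam eggs'), ('lang', 'en')])  -> 'q=spam+eggs&lang=en'
#     urlencode({'id': [1, 2, 3]}, doseq=1)            -> 'id=1&id=2&id=3'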

# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.
    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
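
# Illustrative environment settings picked up by getproxies_environment()
# (comment-only sketch; the proxy host and port are placeholders):
#
#     http_proxy=http://proxy.example.com:3128/
#     ftp_proxy=http://proxy.example.com:3128/
#
# which would yield {'http': 'http://proxy.example.com:3128/',
#                    'ftp': 'http://proxy.example.com:3128/'}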

if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.
        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

    def proxy_bypass(x):
        return 0

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.
        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.
        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass(host):
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        host = [host]
        try:
            addr = socket.gethostbyname(host[0])
            if addr != host:
                host.append(addr)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        return 0

# Test and time quote() and unquote()
def test1():
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print `s`
    print `qs`
    print `uqs`
    print round(t1 - t0, 3), 'sec'


def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)

# Test program
def test(args=[]):
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/pub/python/README',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
    if hasattr(URLopener, "open_https"):
        args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
            print '-'*40
    finally:
        urlcleanup()

def main():
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()