1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that is has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
23 """
import string
import socket
import os
import time
import sys

__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

__version__ = '1.15'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        return unquote(pathname)

    def pathname2url(pathname):
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """urlopen(url [, data]) -> open file-like object"""
    global _urlopener
    if proxies is not None:
        opener = FancyURLopener(proxies=proxies)
    elif not _urlopener:
        opener = FancyURLopener()
        _urlopener = opener
    else:
        opener = _urlopener
    if data is None:
        return opener.open(url)
    else:
        return opener.open(url, data)

def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
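
# Illustrative usage sketch of the module-level shortcuts; the URL below is
# only a placeholder:
#
#   >>> f = urlopen('http://www.python.org/')          # GET
#   >>> f.info().gettype()                             # e.g. 'text/html'
#   >>> data = f.read()
#   >>> f.close()
#   >>> fn, headers = urlretrieve('http://www.python.org/', 'index.html')
#   >>> urlcleanup()    # discard temporary files created by urlretrieve()
#
# Passing a 'data' string to urlopen() issues a POST instead of a GET.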

ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        if '-' in name:
            # replace - with _
            name = '_'.join(name.split('-'))
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(toBytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url, data)
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        bs = 1024*8
        size = -1
        blocknum = 1
        if reporthook:
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            reporthook(0, bs, size)
        block = fp.read(bs)
        if reporthook:
            reporthook(1, bs, size)
        while block:
            tfp.write(block)
            block = fp.read(bs)
            blocknum = blocknum + 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp
        return result
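
    # Illustrative reporthook sketch: retrieve() calls the hook as
    # reporthook(blocknum, blocksize, totalsize), where totalsize is -1
    # when the server sent no Content-Length header.
    #
    #   >>> def show_progress(blocknum, blocksize, totalsize):
    #   ...     print "block %d of %d bytes (total %d)" % (
    #   ...         blocknum, blocksize, totalsize)
    #   >>> URLopener().retrieve('http://www.python.org/', 'out.html',
    #   ...                      show_progress)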

    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')
        if user_passwd:
            import base64
            auth = base64.encodestring(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: h.putheader(*args)
        h.endheaders()
        if data is not None:
            h.send(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            return addinfourl(fp, headers, "http:" + url)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)

    if hasattr(socket, "ssl"):
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            import httplib
            user_passwd = None
            if isinstance(url, str):
                host, selector = splithost(url)
                if host:
                    user_passwd, host = splituser(host)
                    host = unquote(host)
                realhost = host
            else:
                host, selector = url
                urltype, rest = splittype(selector)
                url = rest
                user_passwd = None
                if urltype.lower() != 'https':
                    realhost = None
                else:
                    realhost, rest = splithost(rest)
                    if realhost:
                        user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
            if user_passwd:
                import base64
                auth = base64.encodestring(user_passwd).strip()
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            if realhost: h.putheader('Host', realhost)
            for args in self.addheaders: h.putheader(*args)
            h.endheaders()
            if data is not None:
                h.send(data)
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == 200:
                return addinfourl(fp, headers, "https:" + url)
            else:
                if data is None:
                    return self.http_error(url, fp, errcode, errmsg, headers)
                else:
                    return self.http_error(url, fp, errcode, errmsg, headers,
                                           data)

    def open_gopher(self, url):
        """Use Gopher protocol."""
        import gopherlib
        host, selector = splithost(url)
        if not host: raise IOError, ('gopher error', 'no host given')
        host = unquote(host)
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), "gopher:" + url)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, rfc822, StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError, e:
            raise IOError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = rfc822.formatdate(stats.st_mtime)
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO.StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        import mimetypes, mimetools, StringIO
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not key in self.ftpcache:
                self.ftpcache[key] = \
                          ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = mimetools.Message(StringIO.StringIO(headers))
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def open_data(self, url, data=None):
        """Use "data" URL."""
        # ignore POSTed data

        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import StringIO, mimetools
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        f = StringIO.StringIO(msg)
        headers = mimetools.Message(f, 0)
        f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
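
    # Illustrative data: URLs as accepted through open() (placeholder
    # payloads; the second one is 'Hello, World!' in base64):
    #
    #   >>> u = URLopener()
    #   >>> u.open('data:,Hello%2C%20World%21').read()
    #   'Hello, World!'
    #   >>> u.open('data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==').read()
    #   'Hello, World!'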

class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
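
# Illustrative subclass sketch: replace the interactive prompt with fixed
# credentials, e.g. for unattended scripts (class name and credentials are
# hypothetical):
#
#   >>> class MyOpener(FancyURLopener):
#   ...     def prompt_user_passwd(self, host, realm):
#   ...         return 'someuser', 'somepassword'
#   >>> f = MyOpener().open('http://www.example.com/protected/')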

# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost is None:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if _thishost is None:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is None:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object."""
    global _noheaders
    if _noheaders is None:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders

# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.init()

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass

class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno
        if hasattr(self.fp, "__iter__"):
            self.__iter__ = self.fp.__iter__
            if hasattr(self.fp, "next"):
                self.next = self.fp.next

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
                                             `id(self)`, `self.fp`)

    def close(self):
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        if self.fp: self.fp.close()
        self.fp = None

class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        if self.closehook:
            self.closehook(*self.hookargs)
            self.closehook = None
            self.hookargs = None

class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        return self.headers

class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        return self.headers

    def geturl(self):
        return self.url

def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL."""
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base)    # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath)        # inherit host
    basepath, basetag = splittag(basepath)      # remove extraneous cruft
    basepath, basequery = splitquery(basepath)  # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = basepath.rfind('/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            i = basepath[:-1].rfind('/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if host and path and path[0] != '/':
        path = '/' + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path    # don't know what this means
    else: return path

# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def')
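
# Illustrative decomposition of one (placeholder) URL with the helpers above:
#
#   >>> type, rest = splittype('http://joe:pw@www.example.com:8080/a/b?x=1')
#   >>> host, path = splithost(rest)         # 'joe:pw@www.example.com:8080', '/a/b?x=1'
#   >>> userinfo, hostport = splituser(host) # 'joe:pw', 'www.example.com:8080'
#   >>> user, pw = splitpasswd(userinfo)     # 'joe', 'pw'
#   >>> hostname, port = splitport(hostport) # 'www.example.com', '8080'
#   >>> path, query = splitquery(path)       # '/a/b', 'x=1'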

try:
    unicode
except NameError:
    def _is_unicode(x):
        return 0
else:
    def _is_unicode(x):
        return isinstance(x, unicode)

def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if _is_unicode(url):
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match: return map(unquote, match.group(1, 2))
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError, "no digits"
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    mychr = chr
    myatoi = int
    list = s.split('%')
    res = [list[0]]
    myappend = res.append
    del list[0]
    for item in list:
        if item[1:2]:
            try:
                myappend(mychr(myatoi(item[:2], 16))
                     + item[2:])
            except ValueError:
                myappend('%' + item)
        else:
            myappend('%' + item)
    return "".join(res)

def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    if '+' in s:
        # replace '+' with ' '
        s = ' '.join(s.split('+'))
    return unquote(s)

always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for c in _fast_safe_test:
            _fast_safe[c] = c
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if not c in _fast_safe:
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        return _fast_quote(s)
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if c not in safe:
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        l = s.split(' ')
        for i in range(len(l)):
            l[i] = quote(l[i], safe)
        return '+'.join(l)
    else:
        return quote(s, safe)
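
# Illustrative round-trips for the quoting helpers:
#
#   >>> quote('/~user/file name.txt')
#   '/%7Euser/file%20name.txt'
#   >>> quote_plus('spam & eggs')
#   'spam+%26+eggs'
#   >>> unquote('abc%20def')
#   'abc def'
#   >>> unquote_plus('spam+%26+eggs')
#   'spam & eggs'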

def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif _is_unicode(v):
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII", "replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)

# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
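
# Illustrative proxy configuration (the proxy host is a placeholder): with
# the environment variable http_proxy set to 'http://proxy.example.com:3128',
# getproxies_environment() returns {'http': 'http://proxy.example.com:3128'}.
# The same mapping can also be passed in explicitly:
#
#   >>> opener = FancyURLopener(proxies={'http': 'http://proxy.example.com:3128'})
#   >>> f = opener.open('http://www.python.org/')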

if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

    def proxy_bypass(x):
        return 0

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass(host):
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        host = [host]
        try:
            addr = socket.gethostbyname(host[0])
            if addr != host:
                host.append(addr)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        return 0

# Test and time quote() and unquote()
def test1():
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print `s`
    print `qs`
    print `uqs`
    print round(t1 - t0, 3), 'sec'


def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)

# Test program
def test(args=[]):
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/pub/python/README',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
    if hasattr(URLopener, "open_https"):
        args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
            print '-'*40
    finally:
        urlcleanup()

def main():
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()