# Lib/urllib.py
1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that it has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
23 """
25 import string
26 import socket
27 import os
28 import time
29 import sys
31 __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
32 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
33 "urlencode", "url2pathname", "pathname2url", "splittag",
34 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
35 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
36 "splitnport", "splitquery", "splitattr", "splitvalue",
37 "splitgophertype", "getproxies"]
39 __version__ = '1.15' # XXX This version is not always updated :-(
41 MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
43 # Helper for non-unix systems
44 if os.name == 'mac':
45 from macurl2path import url2pathname, pathname2url
46 elif os.name == 'nt':
47 from nturl2path import url2pathname, pathname2url
48 elif os.name == 'riscos':
49 from rourl2path import url2pathname, pathname2url
50 else:
51 def url2pathname(pathname):
52 return unquote(pathname)
53 def pathname2url(pathname):
54 return quote(pathname)
56 # This really consists of two pieces:
57 # (1) a class which handles opening of all sorts of URLs
58 # (plus assorted utilities etc.)
59 # (2) a set of functions for parsing URLs
60 # XXX Should these be separated out into different modules?
63 # Shortcut for basic usage
64 _urlopener = None
65 def urlopen(url, data=None, proxies=None):
66 """urlopen(url [, data]) -> open file-like object"""
67 global _urlopener
68 if proxies is not None:
69 opener = FancyURLopener(proxies=proxies)
70 elif not _urlopener:
71 opener = FancyURLopener()
72 _urlopener = opener
73 else:
74 opener = _urlopener
75 if data is None:
76 return opener.open(url)
77 else:
78 return opener.open(url, data)
79 def urlretrieve(url, filename=None, reporthook=None, data=None):
80 global _urlopener
81 if not _urlopener:
82 _urlopener = FancyURLopener()
83 return _urlopener.retrieve(url, filename, reporthook, data)
84 def urlcleanup():
85 if _urlopener:
86 _urlopener.cleanup()
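# Usage sketch for the shortcut functions above (the URL is hypothetical):
#
#     f = urlopen('http://www.example.com/')
#     print f.info().getheader('Content-Type')
#     data = f.read()
#     f.close()
#
#     filename, headers = urlretrieve('http://www.example.com/robots.txt')
#     urlcleanup()    # remove temporary files created by urlretrieve()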
89 ftpcache = {}
90 class URLopener:
91 """Class to open URLs.
92 This is a class rather than just a subroutine because we may need
93 more than one set of global protocol-specific options.
94 Note -- this is a base class for those who don't want the
95 automatic handling of errors type 302 (relocated) and 401
96 (authorization needed)."""
98 __tempfiles = None
100 version = "Python-urllib/%s" % __version__
102 # Constructor
103 def __init__(self, proxies=None, **x509):
104 if proxies is None:
105 proxies = getproxies()
106 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
107 self.proxies = proxies
108 self.key_file = x509.get('key_file')
109 self.cert_file = x509.get('cert_file')
110 self.addheaders = [('User-agent', self.version)]
111 self.__tempfiles = []
112 self.__unlink = os.unlink # See cleanup()
113 self.tempcache = None
114 # Undocumented feature: if you assign {} to tempcache,
115 # it is used to cache files retrieved with
116 # self.retrieve(). This is not enabled by default
117 # since it does not work for changing documents (and I
118 # haven't got the logic to check expiration headers
119 # yet).
120 self.ftpcache = ftpcache
121 # Undocumented feature: you can use a different
122 # ftp cache by assigning to the .ftpcache member;
123 # in case you want logically independent URL openers
124 # XXX This is not threadsafe. Bah.
126 def __del__(self):
127 self.close()
129 def close(self):
130 self.cleanup()
132 def cleanup(self):
133 # This code sometimes runs when the rest of this module
134 # has already been deleted, so it can't use any globals
135 # or import anything.
136 if self.__tempfiles:
137 for file in self.__tempfiles:
138 try:
139 self.__unlink(file)
140 except OSError:
141 pass
142 del self.__tempfiles[:]
143 if self.tempcache:
144 self.tempcache.clear()
146 def addheader(self, *args):
147 """Add a header to be used by the HTTP interface only
148 e.g. u.addheader('Accept', 'sound/basic')"""
149 self.addheaders.append(args)
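# Usage sketch: an opener with an explicit proxy mapping and an extra request
# header (the proxy and target hosts are hypothetical):
#
#     opener = URLopener(proxies={'http': 'http://proxy.example.com:3128/'})
#     opener.addheader('Accept', 'text/html')
#     f = opener.open('http://www.example.com/')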
151 # External interface
152 def open(self, fullurl, data=None):
153 """Use URLopener().open(file) instead of open(file, 'r')."""
154 fullurl = unwrap(toBytes(fullurl))
155 if self.tempcache and fullurl in self.tempcache:
156 filename, headers = self.tempcache[fullurl]
157 fp = open(filename, 'rb')
158 return addinfourl(fp, headers, fullurl)
159 urltype, url = splittype(fullurl)
160 if not urltype:
161 urltype = 'file'
162 if urltype in self.proxies:
163 proxy = self.proxies[urltype]
164 urltype, proxyhost = splittype(proxy)
165 host, selector = splithost(proxyhost)
166 url = (host, fullurl) # Signal special case to open_*()
167 else:
168 proxy = None
169 name = 'open_' + urltype
170 self.type = urltype
171 if '-' in name:
172 # replace - with _
173 name = '_'.join(name.split('-'))
174 if not hasattr(self, name):
175 if proxy:
176 return self.open_unknown_proxy(proxy, fullurl, data)
177 else:
178 return self.open_unknown(fullurl, data)
179 try:
180 if data is None:
181 return getattr(self, name)(url)
182 else:
183 return getattr(self, name)(url, data)
184 except socket.error, msg:
185 raise IOError, ('socket error', msg), sys.exc_info()[2]
187 def open_unknown(self, fullurl, data=None):
188 """Overridable interface to open unknown URL type."""
189 type, url = splittype(fullurl)
190 raise IOError, ('url error', 'unknown url type', type)
192 def open_unknown_proxy(self, proxy, fullurl, data=None):
193 """Overridable interface to open unknown URL type."""
194 type, url = splittype(fullurl)
195 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
197 # External interface
198 def retrieve(self, url, filename=None, reporthook=None, data=None):
199 """retrieve(url) returns (filename, headers) for a local object
200 or (tempfilename, headers) for a remote object."""
201 url = unwrap(toBytes(url))
202 if self.tempcache and url in self.tempcache:
203 return self.tempcache[url]
204 type, url1 = splittype(url)
205 if filename is None and (not type or type == 'file'):
206 try:
207 fp = self.open_local_file(url1)
208 hdrs = fp.info()
209 del fp
210 return url2pathname(splithost(url1)[1]), hdrs
211 except IOError, msg:
212 pass
213 fp = self.open(url, data)
214 headers = fp.info()
215 if not filename:
216 import tempfile
217 garbage, path = splittype(url)
218 garbage, path = splithost(path or "")
219 path, garbage = splitquery(path or "")
220 path, garbage = splitattr(path or "")
221 suffix = os.path.splitext(path)[1]
222 filename = tempfile.mktemp(suffix)
223 self.__tempfiles.append(filename)
224 result = filename, headers
225 if self.tempcache is not None:
226 self.tempcache[url] = result
227 tfp = open(filename, 'wb')
228 bs = 1024*8
229 size = -1
230 blocknum = 1
231 if reporthook:
232 if "content-length" in headers:
233 size = int(headers["Content-Length"])
234 reporthook(0, bs, size)
235 block = fp.read(bs)
236 if reporthook:
237 reporthook(1, bs, size)
238 while block:
239 tfp.write(block)
240 block = fp.read(bs)
241 blocknum = blocknum + 1
242 if reporthook:
243 reporthook(blocknum, bs, size)
244 fp.close()
245 tfp.close()
246 del fp
247 del tfp
248 return result
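# Usage sketch: retrieve() with a progress callback.  The callback receives
# (blocknum, blocksize, totalsize); totalsize is -1 if the server sends no
# Content-Length.  The URL and filename are hypothetical.
#
#     def hook(blocknum, blocksize, totalsize):
#         print "block %d of %d bytes, total %d" % (blocknum, blocksize, totalsize)
#
#     fn, headers = URLopener().retrieve('http://www.example.com/big.tar.gz',
#                                        'big.tar.gz', hook)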
250 # Each method named open_<type> knows how to open that type of URL
252 def open_http(self, url, data=None):
253 """Use HTTP protocol."""
254 import httplib
255 user_passwd = None
256 if isinstance(url, str):
257 host, selector = splithost(url)
258 if host:
259 user_passwd, host = splituser(host)
260 host = unquote(host)
261 realhost = host
262 else:
263 host, selector = url
264 urltype, rest = splittype(selector)
265 url = rest
266 user_passwd = None
267 if urltype.lower() != 'http':
268 realhost = None
269 else:
270 realhost, rest = splithost(rest)
271 if realhost:
272 user_passwd, realhost = splituser(realhost)
273 if user_passwd:
274 selector = "%s://%s%s" % (urltype, realhost, rest)
275 if proxy_bypass(realhost):
276 host = realhost
278 #print "proxy via http:", host, selector
279 if not host: raise IOError, ('http error', 'no host given')
280 if user_passwd:
281 import base64
282 auth = base64.encodestring(user_passwd).strip()
283 else:
284 auth = None
285 h = httplib.HTTP(host)
286 if data is not None:
287 h.putrequest('POST', selector)
288 h.putheader('Content-type', 'application/x-www-form-urlencoded')
289 h.putheader('Content-length', '%d' % len(data))
290 else:
291 h.putrequest('GET', selector)
292 if auth: h.putheader('Authorization', 'Basic %s' % auth)
293 if realhost: h.putheader('Host', realhost)
294 for args in self.addheaders: apply(h.putheader, args)
295 h.endheaders()
296 if data is not None:
297 h.send(data)
298 errcode, errmsg, headers = h.getreply()
299 fp = h.getfile()
300 if errcode == 200:
301 return addinfourl(fp, headers, "http:" + url)
302 else:
303 if data is None:
304 return self.http_error(url, fp, errcode, errmsg, headers)
305 else:
306 return self.http_error(url, fp, errcode, errmsg, headers, data)
308 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
309 """Handle http errors.
310 Derived class can override this, or provide specific handlers
311 named http_error_DDD where DDD is the 3-digit error code."""
312 # First check if there's a specific handler for this error
313 name = 'http_error_%d' % errcode
314 if hasattr(self, name):
315 method = getattr(self, name)
316 if data is None:
317 result = method(url, fp, errcode, errmsg, headers)
318 else:
319 result = method(url, fp, errcode, errmsg, headers, data)
320 if result: return result
321 return self.http_error_default(url, fp, errcode, errmsg, headers)
323 def http_error_default(self, url, fp, errcode, errmsg, headers):
324 """Default error handler: close the connection and raise IOError."""
325 void = fp.read()
326 fp.close()
327 raise IOError, ('http error', errcode, errmsg, headers)
329 if hasattr(socket, "ssl"):
330 def open_https(self, url, data=None):
331 """Use HTTPS protocol."""
332 import httplib
333 user_passwd = None
334 if isinstance(url, str):
335 host, selector = splithost(url)
336 if host:
337 user_passwd, host = splituser(host)
338 host = unquote(host)
339 realhost = host
340 else:
341 host, selector = url
342 urltype, rest = splittype(selector)
343 url = rest
344 user_passwd = None
345 if urltype.lower() != 'https':
346 realhost = None
347 else:
348 realhost, rest = splithost(rest)
349 if realhost:
350 user_passwd, realhost = splituser(realhost)
351 if user_passwd:
352 selector = "%s://%s%s" % (urltype, realhost, rest)
353 #print "proxy via https:", host, selector
354 if not host: raise IOError, ('https error', 'no host given')
355 if user_passwd:
356 import base64
357 auth = base64.encodestring(user_passwd).strip()
358 else:
359 auth = None
360 h = httplib.HTTPS(host, 0,
361 key_file=self.key_file,
362 cert_file=self.cert_file)
363 if data is not None:
364 h.putrequest('POST', selector)
365 h.putheader('Content-type',
366 'application/x-www-form-urlencoded')
367 h.putheader('Content-length', '%d' % len(data))
368 else:
369 h.putrequest('GET', selector)
370 if auth: h.putheader('Authorization', 'Basic %s' % auth)
371 if realhost: h.putheader('Host', realhost)
372 for args in self.addheaders: apply(h.putheader, args)
373 h.endheaders()
374 if data is not None:
375 h.send(data)
376 errcode, errmsg, headers = h.getreply()
377 fp = h.getfile()
378 if errcode == 200:
379 return addinfourl(fp, headers, "https:" + url)
380 else:
381 if data is None:
382 return self.http_error(url, fp, errcode, errmsg, headers)
383 else:
384 return self.http_error(url, fp, errcode, errmsg, headers,
385 data)
387 def open_gopher(self, url):
388 """Use Gopher protocol."""
389 import gopherlib
390 host, selector = splithost(url)
391 if not host: raise IOError, ('gopher error', 'no host given')
392 host = unquote(host)
393 type, selector = splitgophertype(selector)
394 selector, query = splitquery(selector)
395 selector = unquote(selector)
396 if query:
397 query = unquote(query)
398 fp = gopherlib.send_query(selector, query, host)
399 else:
400 fp = gopherlib.send_selector(selector, host)
401 return addinfourl(fp, noheaders(), "gopher:" + url)
403 def open_file(self, url):
404 """Use local file or FTP depending on form of URL."""
405 if url[:2] == '//' and url[2:3] != '/':
406 return self.open_ftp(url)
407 else:
408 return self.open_local_file(url)
410 def open_local_file(self, url):
411 """Use local file."""
412 import mimetypes, mimetools, rfc822, StringIO
413 host, file = splithost(url)
414 localname = url2pathname(file)
415 try:
416 stats = os.stat(localname)
417 except OSError, e:
418 raise IOError(e.errno, e.strerror, e.filename)
419 size = stats.st_size
420 modified = rfc822.formatdate(stats.st_mtime)
421 mtype = mimetypes.guess_type(url)[0]
422 headers = mimetools.Message(StringIO.StringIO(
423 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
424 (mtype or 'text/plain', size, modified)))
425 if not host:
426 urlfile = file
427 if file[:1] == '/':
428 urlfile = 'file://' + file
429 return addinfourl(open(localname, 'rb'),
430 headers, urlfile)
431 host, port = splitport(host)
432 if not port \
433 and socket.gethostbyname(host) in (localhost(), thishost()):
434 urlfile = file
435 if file[:1] == '/':
436 urlfile = 'file://' + file
437 return addinfourl(open(localname, 'rb'),
438 headers, urlfile)
439 raise IOError, ('local file error', 'not on local host')
441 def open_ftp(self, url):
442 """Use FTP protocol."""
443 import mimetypes, mimetools, StringIO
444 host, path = splithost(url)
445 if not host: raise IOError, ('ftp error', 'no host given')
446 host, port = splitport(host)
447 user, host = splituser(host)
448 if user: user, passwd = splitpasswd(user)
449 else: passwd = None
450 host = unquote(host)
451 user = unquote(user or '')
452 passwd = unquote(passwd or '')
453 host = socket.gethostbyname(host)
454 if not port:
455 import ftplib
456 port = ftplib.FTP_PORT
457 else:
458 port = int(port)
459 path, attrs = splitattr(path)
460 path = unquote(path)
461 dirs = path.split('/')
462 dirs, file = dirs[:-1], dirs[-1]
463 if dirs and not dirs[0]: dirs = dirs[1:]
464 if dirs and not dirs[0]: dirs[0] = '/'
465 key = user, host, port, '/'.join(dirs)
466 # XXX thread unsafe!
467 if len(self.ftpcache) > MAXFTPCACHE:
468 # Prune the cache, rather arbitrarily
469 for k in self.ftpcache.keys():
470 if k != key:
471 v = self.ftpcache[k]
472 del self.ftpcache[k]
473 v.close()
474 try:
475 if not key in self.ftpcache:
476 self.ftpcache[key] = \
477 ftpwrapper(user, passwd, host, port, dirs)
478 if not file: type = 'D'
479 else: type = 'I'
480 for attr in attrs:
481 attr, value = splitvalue(attr)
482 if attr.lower() == 'type' and \
483 value in ('a', 'A', 'i', 'I', 'd', 'D'):
484 type = value.upper()
485 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
486 mtype = mimetypes.guess_type("ftp:" + url)[0]
487 headers = ""
488 if mtype:
489 headers += "Content-Type: %s\n" % mtype
490 if retrlen is not None and retrlen >= 0:
491 headers += "Content-Length: %d\n" % retrlen
492 headers = mimetools.Message(StringIO.StringIO(headers))
493 return addinfourl(fp, headers, "ftp:" + url)
494 except ftperrors(), msg:
495 raise IOError, ('ftp error', msg), sys.exc_info()[2]
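# FTP URL forms handled by open_ftp() above (hosts and paths are hypothetical):
#
#     ftp://ftp.example.com/pub/README            - binary ('I') transfer
#     ftp://ftp.example.com/pub/notes.txt;type=a  - force ASCII ('A') transfer
#     ftp://ftp.example.com/pub/                  - directory listing ('D')
#     ftp://user:password@ftp.example.com/file    - log in as 'user'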
497 def open_data(self, url, data=None):
498 """Use "data" URL."""
499 # ignore POSTed data
501 # syntax of data URLs:
502 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
503 # mediatype := [ type "/" subtype ] *( ";" parameter )
504 # data := *urlchar
505 # parameter := attribute "=" value
506 import StringIO, mimetools
507 try:
508 [type, data] = url.split(',', 1)
509 except ValueError:
510 raise IOError, ('data error', 'bad data URL')
511 if not type:
512 type = 'text/plain;charset=US-ASCII'
513 semi = type.rfind(';')
514 if semi >= 0 and '=' not in type[semi:]:
515 encoding = type[semi+1:]
516 type = type[:semi]
517 else:
518 encoding = ''
519 msg = []
520 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
521 time.gmtime(time.time())))
522 msg.append('Content-type: %s' % type)
523 if encoding == 'base64':
524 import base64
525 data = base64.decodestring(data)
526 else:
527 data = unquote(data)
528 msg.append('Content-length: %d' % len(data))
529 msg.append('')
530 msg.append(data)
531 msg = '\n'.join(msg)
532 f = StringIO.StringIO(msg)
533 headers = mimetools.Message(f, 0)
534 f.fileno = None # needed for addinfourl
535 return addinfourl(f, headers, url)
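# Sample "data" URLs following the syntax sketched above (contents are
# illustrative only):
#
#     data:,A%20brief%20note              --> text/plain;charset=US-ASCII
#     data:text/plain;base64,SGVsbG8=     --> decodes to 'Hello'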
538 class FancyURLopener(URLopener):
539 """Derived class with handlers for errors we can handle (perhaps)."""
541 def __init__(self, *args, **kwargs):
542 apply(URLopener.__init__, (self,) + args, kwargs)
543 self.auth_cache = {}
544 self.tries = 0
545 self.maxtries = 10
547 def http_error_default(self, url, fp, errcode, errmsg, headers):
548 """Default error handling -- don't raise an exception."""
549 return addinfourl(fp, headers, "http:" + url)
551 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
552 """Error 302 -- relocated (temporarily)."""
553 self.tries += 1
554 if self.maxtries and self.tries >= self.maxtries:
555 if hasattr(self, "http_error_500"):
556 meth = self.http_error_500
557 else:
558 meth = self.http_error_default
559 self.tries = 0
560 return meth(url, fp, 500,
561 "Internal Server Error: Redirect Recursion", headers)
562 result = self.redirect_internal(url, fp, errcode, errmsg, headers,
563 data)
564 self.tries = 0
565 return result
567 def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
568 if 'location' in headers:
569 newurl = headers['location']
570 elif 'uri' in headers:
571 newurl = headers['uri']
572 else:
573 return
574 void = fp.read()
575 fp.close()
576 # In case the server sent a relative URL, join with original:
577 newurl = basejoin(self.type + ":" + url, newurl)
578 if data is None:
579 return self.open(newurl)
580 else:
581 return self.open(newurl, data)
583 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
584 """Error 301 -- also relocated (permanently)."""
585 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
587 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
588 """Error 401 -- authentication required.
589 See this URL for a description of the basic authentication scheme:
590 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
591 if not 'www-authenticate' in headers:
592 URLopener.http_error_default(self, url, fp,
593 errcode, errmsg, headers)
594 stuff = headers['www-authenticate']
595 import re
596 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
597 if not match:
598 URLopener.http_error_default(self, url, fp,
599 errcode, errmsg, headers)
600 scheme, realm = match.groups()
601 if scheme.lower() != 'basic':
602 URLopener.http_error_default(self, url, fp,
603 errcode, errmsg, headers)
604 name = 'retry_' + self.type + '_basic_auth'
605 if data is None:
606 return getattr(self,name)(url, realm)
607 else:
608 return getattr(self,name)(url, realm, data)
610 def retry_http_basic_auth(self, url, realm, data=None):
611 host, selector = splithost(url)
612 i = host.find('@') + 1
613 host = host[i:]
614 user, passwd = self.get_user_passwd(host, realm, i)
615 if not (user or passwd): return None
616 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
617 newurl = 'http://' + host + selector
618 if data is None:
619 return self.open(newurl)
620 else:
621 return self.open(newurl, data)
623 def retry_https_basic_auth(self, url, realm, data=None):
624 host, selector = splithost(url)
625 i = host.find('@') + 1
626 host = host[i:]
627 user, passwd = self.get_user_passwd(host, realm, i)
628 if not (user or passwd): return None
629 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
630 newurl = '//' + host + selector
631 return self.open_https(newurl, data)
633 def get_user_passwd(self, host, realm, clear_cache = 0):
634 key = realm + '@' + host.lower()
635 if key in self.auth_cache:
636 if clear_cache:
637 del self.auth_cache[key]
638 else:
639 return self.auth_cache[key]
640 user, passwd = self.prompt_user_passwd(host, realm)
641 if user or passwd: self.auth_cache[key] = (user, passwd)
642 return user, passwd
644 def prompt_user_passwd(self, host, realm):
645 """Override this in a GUI environment!"""
646 import getpass
647 try:
648 user = raw_input("Enter username for %s at %s: " % (realm,
649 host))
650 passwd = getpass.getpass("Enter password for %s in %s at %s: " %
651 (user, realm, host))
652 return user, passwd
653 except KeyboardInterrupt:
654 print
655 return None, None
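# Usage sketch: overriding prompt_user_passwd() so Basic authentication works
# without a terminal (the credentials and URL are hypothetical):
#
#     class MyOpener(FancyURLopener):
#         def prompt_user_passwd(self, host, realm):
#             return 'user', 'secret'
#
#     f = MyOpener().open('http://www.example.com/private/')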
658 # Utility functions
660 _localhost = None
661 def localhost():
662 """Return the IP address of the magic hostname 'localhost'."""
663 global _localhost
664 if _localhost is None:
665 _localhost = socket.gethostbyname('localhost')
666 return _localhost
668 _thishost = None
669 def thishost():
670 """Return the IP address of the current host."""
671 global _thishost
672 if _thishost is None:
673 _thishost = socket.gethostbyname(socket.gethostname())
674 return _thishost
676 _ftperrors = None
677 def ftperrors():
678 """Return the set of errors raised by the FTP class."""
679 global _ftperrors
680 if _ftperrors is None:
681 import ftplib
682 _ftperrors = ftplib.all_errors
683 return _ftperrors
685 _noheaders = None
686 def noheaders():
687 """Return an empty mimetools.Message object."""
688 global _noheaders
689 if _noheaders is None:
690 import mimetools
691 import StringIO
692 _noheaders = mimetools.Message(StringIO.StringIO(), 0)
693 _noheaders.fp.close() # Recycle file descriptor
694 return _noheaders
697 # Utility classes
699 class ftpwrapper:
700 """Class used by open_ftp() for cache of open FTP connections."""
702 def __init__(self, user, passwd, host, port, dirs):
703 self.user = user
704 self.passwd = passwd
705 self.host = host
706 self.port = port
707 self.dirs = dirs
708 self.init()
710 def init(self):
711 import ftplib
712 self.busy = 0
713 self.ftp = ftplib.FTP()
714 self.ftp.connect(self.host, self.port)
715 self.ftp.login(self.user, self.passwd)
716 for dir in self.dirs:
717 self.ftp.cwd(dir)
719 def retrfile(self, file, type):
720 import ftplib
721 self.endtransfer()
722 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
723 else: cmd = 'TYPE ' + type; isdir = 0
724 try:
725 self.ftp.voidcmd(cmd)
726 except ftplib.all_errors:
727 self.init()
728 self.ftp.voidcmd(cmd)
729 conn = None
730 if file and not isdir:
731 # Use nlst to see if the file exists at all
732 try:
733 self.ftp.nlst(file)
734 except ftplib.error_perm, reason:
735 raise IOError, ('ftp error', reason), sys.exc_info()[2]
736 # Restore the transfer mode!
737 self.ftp.voidcmd(cmd)
738 # Try to retrieve as a file
739 try:
740 cmd = 'RETR ' + file
741 conn = self.ftp.ntransfercmd(cmd)
742 except ftplib.error_perm, reason:
743 if str(reason)[:3] != '550':
744 raise IOError, ('ftp error', reason), sys.exc_info()[2]
745 if not conn:
746 # Set transfer mode to ASCII!
747 self.ftp.voidcmd('TYPE A')
748 # Try a directory listing
749 if file: cmd = 'LIST ' + file
750 else: cmd = 'LIST'
751 conn = self.ftp.ntransfercmd(cmd)
752 self.busy = 1
753 # Pass back both a suitably decorated object and a retrieval length
754 return (addclosehook(conn[0].makefile('rb'),
755 self.endtransfer), conn[1])
756 def endtransfer(self):
757 if not self.busy:
758 return
759 self.busy = 0
760 try:
761 self.ftp.voidresp()
762 except ftperrors():
763 pass
765 def close(self):
766 self.endtransfer()
767 try:
768 self.ftp.close()
769 except ftperrors():
770 pass
772 class addbase:
773 """Base class for addinfo and addclosehook."""
775 def __init__(self, fp):
776 self.fp = fp
777 self.read = self.fp.read
778 self.readline = self.fp.readline
779 if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
780 if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno
782 def __repr__(self):
783 return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
784 `id(self)`, `self.fp`)
786 def close(self):
787 self.read = None
788 self.readline = None
789 self.readlines = None
790 self.fileno = None
791 if self.fp: self.fp.close()
792 self.fp = None
794 class addclosehook(addbase):
795 """Class to add a close hook to an open file."""
797 def __init__(self, fp, closehook, *hookargs):
798 addbase.__init__(self, fp)
799 self.closehook = closehook
800 self.hookargs = hookargs
802 def close(self):
803 addbase.close(self)
804 if self.closehook:
805 apply(self.closehook, self.hookargs)
806 self.closehook = None
807 self.hookargs = None
809 class addinfo(addbase):
810 """class to add an info() method to an open file."""
812 def __init__(self, fp, headers):
813 addbase.__init__(self, fp)
814 self.headers = headers
816 def info(self):
817 return self.headers
819 class addinfourl(addbase):
820 """class to add info() and geturl() methods to an open file."""
822 def __init__(self, fp, headers, url):
823 addbase.__init__(self, fp)
824 self.headers = headers
825 self.url = url
827 def info(self):
828 return self.headers
830 def geturl(self):
831 return self.url
834 def basejoin(base, url):
835 """Utility to combine a URL with a base URL to form a new URL."""
836 type, path = splittype(url)
837 if type:
838 # if url is complete (i.e., it contains a type), return it
839 return url
840 host, path = splithost(path)
841 type, basepath = splittype(base) # inherit type from base
842 if host:
843 # if url contains host, just inherit type
844 if type: return type + '://' + host + path
845 else:
846 # no type inherited, so url must have started with //
847 # just return it
848 return url
849 host, basepath = splithost(basepath) # inherit host
850 basepath, basetag = splittag(basepath) # remove extraneous cruft
851 basepath, basequery = splitquery(basepath) # idem
852 if path[:1] != '/':
853 # non-absolute path name
854 if path[:1] in ('#', '?'):
855 # path is just a tag or query, attach to basepath
856 i = len(basepath)
857 else:
858 # else replace last component
859 i = basepath.rfind('/')
860 if i < 0:
861 # basepath not absolute
862 if host:
863 # host present, make absolute
864 basepath = '/'
865 else:
866 # else keep non-absolute
867 basepath = ''
868 else:
869 # remove last file component
870 basepath = basepath[:i+1]
871 # Interpret ../ (important because of symlinks)
872 while basepath and path[:3] == '../':
873 path = path[3:]
874 i = basepath[:-1].rfind('/')
875 if i > 0:
876 basepath = basepath[:i+1]
877 elif i == 0:
878 basepath = '/'
879 break
880 else:
881 basepath = ''
883 path = basepath + path
884 if host and path and path[0] != '/':
885 path = '/' + path
886 if type and host: return type + '://' + host + path
887 elif type: return type + ':' + path
888 elif host: return '//' + host + path # don't know what this means
889 else: return path
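# Sample results of basejoin() (hostnames are hypothetical):
#     basejoin('http://host/a/b.html', 'c.html')        --> 'http://host/a/c.html'
#     basejoin('http://host/a/b.html', '/c.html')       --> 'http://host/c.html'
#     basejoin('http://host/a/b.html', '../c.html')     --> 'http://host/c.html'
#     basejoin('http://host/a/b.html', 'ftp://other/x') --> 'ftp://other/x'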
892 # Utilities to parse URLs (most of these return None for missing parts):
893 # unwrap('<URL:type://host/path>') --> 'type://host/path'
894 # splittype('type:opaquestring') --> 'type', 'opaquestring'
895 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
896 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
897 # splitpasswd('user:passwd') -> 'user', 'passwd'
898 # splitport('host:port') --> 'host', 'port'
899 # splitquery('/path?query') --> '/path', 'query'
900 # splittag('/path#tag') --> '/path', 'tag'
901 # splitattr('/path;attr1=value1;attr2=value2;...') ->
902 # '/path', ['attr1=value1', 'attr2=value2', ...]
903 # splitvalue('attr=value') --> 'attr', 'value'
904 # splitgophertype('/Xselector') --> 'X', 'selector'
905 # unquote('abc%20def') -> 'abc def'
906 # quote('abc def') -> 'abc%20def')
908 try:
909 unicode
910 except NameError:
911 def _is_unicode(x):
912 return 0
913 else:
914 def _is_unicode(x):
915 return isinstance(x, unicode)
917 def toBytes(url):
918 """toBytes(u"URL") --> 'URL'."""
919 # Most URL schemes require ASCII. If that changes, the conversion
920 # can be relaxed
921 if _is_unicode(url):
922 try:
923 url = url.encode("ASCII")
924 except UnicodeError:
925 raise UnicodeError("URL " + repr(url) +
926 " contains non-ASCII characters")
927 return url
929 def unwrap(url):
930 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
931 url = url.strip()
932 if url[:1] == '<' and url[-1:] == '>':
933 url = url[1:-1].strip()
934 if url[:4] == 'URL:': url = url[4:].strip()
935 return url
937 _typeprog = None
938 def splittype(url):
939 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
940 global _typeprog
941 if _typeprog is None:
942 import re
943 _typeprog = re.compile('^([^/:]+):')
945 match = _typeprog.match(url)
946 if match:
947 scheme = match.group(1)
948 return scheme.lower(), url[len(scheme) + 1:]
949 return None, url
951 _hostprog = None
952 def splithost(url):
953 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
954 global _hostprog
955 if _hostprog is None:
956 import re
957 _hostprog = re.compile('^//([^/]*)(.*)$')
959 match = _hostprog.match(url)
960 if match: return match.group(1, 2)
961 return None, url
963 _userprog = None
964 def splituser(host):
965 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
966 global _userprog
967 if _userprog is None:
968 import re
969 _userprog = re.compile('^([^@]*)@(.*)$')
971 match = _userprog.match(host)
972 if match: return map(unquote, match.group(1, 2))
973 return None, host
975 _passwdprog = None
976 def splitpasswd(user):
977 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
978 global _passwdprog
979 if _passwdprog is None:
980 import re
981 _passwdprog = re.compile('^([^:]*):(.*)$')
983 match = _passwdprog.match(user)
984 if match: return match.group(1, 2)
985 return user, None
987 # splittag('/path#tag') --> '/path', 'tag'
988 _portprog = None
989 def splitport(host):
990 """splitport('host:port') --> 'host', 'port'."""
991 global _portprog
992 if _portprog is None:
993 import re
994 _portprog = re.compile('^(.*):([0-9]+)$')
996 match = _portprog.match(host)
997 if match: return match.group(1, 2)
998 return host, None
1000 _nportprog = None
1001 def splitnport(host, defport=-1):
1002 """Split host and port, returning numeric port.
1003 Return given default port if no ':' found; defaults to -1.
1004 Return numerical port if a valid number is found after ':'.
1005 Return None if ':' but not a valid number."""
1006 global _nportprog
1007 if _nportprog is None:
1008 import re
1009 _nportprog = re.compile('^(.*):(.*)$')
1011 match = _nportprog.match(host)
1012 if match:
1013 host, port = match.group(1, 2)
1014 try:
1015 if not port: raise ValueError, "no digits"
1016 nport = int(port)
1017 except ValueError:
1018 nport = None
1019 return host, nport
1020 return host, defport
1022 _queryprog = None
1023 def splitquery(url):
1024 """splitquery('/path?query') --> '/path', 'query'."""
1025 global _queryprog
1026 if _queryprog is None:
1027 import re
1028 _queryprog = re.compile('^(.*)\?([^?]*)$')
1030 match = _queryprog.match(url)
1031 if match: return match.group(1, 2)
1032 return url, None
1034 _tagprog = None
1035 def splittag(url):
1036 """splittag('/path#tag') --> '/path', 'tag'."""
1037 global _tagprog
1038 if _tagprog is None:
1039 import re
1040 _tagprog = re.compile('^(.*)#([^#]*)$')
1042 match = _tagprog.match(url)
1043 if match: return match.group(1, 2)
1044 return url, None
1046 def splitattr(url):
1047 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1048 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1049 words = url.split(';')
1050 return words[0], words[1:]
1052 _valueprog = None
1053 def splitvalue(attr):
1054 """splitvalue('attr=value') --> 'attr', 'value'."""
1055 global _valueprog
1056 if _valueprog is None:
1057 import re
1058 _valueprog = re.compile('^([^=]*)=(.*)$')
1060 match = _valueprog.match(attr)
1061 if match: return match.group(1, 2)
1062 return attr, None
1064 def splitgophertype(selector):
1065 """splitgophertype('/Xselector') --> 'X', 'selector'."""
1066 if selector[:1] == '/' and selector[1:2]:
1067 return selector[1], selector[2:]
1068 return None, selector
1070 def unquote(s):
1071 """unquote('abc%20def') -> 'abc def'."""
1072 mychr = chr
1073 myatoi = int
1074 list = s.split('%')
1075 res = [list[0]]
1076 myappend = res.append
1077 del list[0]
1078 for item in list:
1079 if item[1:2]:
1080 try:
1081 myappend(mychr(myatoi(item[:2], 16))
1082 + item[2:])
1083 except ValueError:
1084 myappend('%' + item)
1085 else:
1086 myappend('%' + item)
1087 return "".join(res)
1089 def unquote_plus(s):
1090 """unquote_plus('%7e/abc+def') -> '~/abc def'"""
1091 if '+' in s:
1092 # replace '+' with ' '
1093 s = ' '.join(s.split('+'))
1094 return unquote(s)
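# Sample results:
#     unquote('abc%20def')        --> 'abc def'
#     unquote('100%')             --> '100%'  (malformed escapes pass through)
#     unquote_plus('abc+def%21')  --> 'abc def!'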
1096 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1097 'abcdefghijklmnopqrstuvwxyz'
1098 '0123456789' '_.-')
1100 _fast_safe_test = always_safe + '/'
1101 _fast_safe = None
1103 def _fast_quote(s):
1104 global _fast_safe
1105 if _fast_safe is None:
1106 _fast_safe = {}
1107 for c in _fast_safe_test:
1108 _fast_safe[c] = c
1109 res = list(s)
1110 for i in range(len(res)):
1111 c = res[i]
1112 if not c in _fast_safe:
1113 res[i] = '%%%02X' % ord(c)
1114 return ''.join(res)
1116 def quote(s, safe = '/'):
1117 """quote('abc def') -> 'abc%20def'
1119 Each part of a URL, e.g. the path info, the query, etc., has a
1120 different set of reserved characters that must be quoted.
1122 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1123 the following reserved characters.
1125 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1126 "$" | ","
1128 Each of these characters is reserved in some component of a URL,
1129 but not necessarily in all of them.
1131 By default, the quote function is intended for quoting the path
1132 section of a URL. Thus, it will not encode '/'. This character
1133 is reserved, but in typical usage the quote function is being
1134 called on a path where the existing slash characters are used as
1135 reserved characters."""
1137 safe = always_safe + safe
1138 if _fast_safe_test == safe:
1139 return _fast_quote(s)
1140 res = list(s)
1141 for i in range(len(res)):
1142 c = res[i]
1143 if c not in safe:
1144 res[i] = '%%%02X' % ord(c)
1145 return ''.join(res)
1147 def quote_plus(s, safe = ''):
1148 """Quote the query fragment of a URL, replacing ' ' with '+'"""
1149 if ' ' in s:
1150 l = s.split(' ')
1151 for i in range(len(l)):
1152 l[i] = quote(l[i], safe)
1153 return '+'.join(l)
1154 else:
1155 return quote(s, safe)
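# Sample results:
#     quote('/~user/file name.txt')  --> '/%7Euser/file%20name.txt'
#     quote('/~user/x', safe='')     --> '%2F%7Euser%2Fx'
#     quote_plus('a b & c')          --> 'a+b+%26+c'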
1157 def urlencode(query,doseq=0):
1158 """Encode a sequence of two-element tuples or dictionary into a URL query string.
1160 If any values in the query arg are sequences and doseq is true, each
1161 sequence element is converted to a separate parameter.
1163 If the query arg is a sequence of two-element tuples, the order of the
1164 parameters in the output will match the order of parameters in the
1165 input."""
1168 if hasattr(query,"items"):
1169 # mapping objects
1170 query = query.items()
1171 else:
1172 # it's a bother at times that strings and string-like objects are
1173 # sequences...
1174 try:
1175 # non-sequence items should not work with len()
1176 # non-empty strings will fail this
1177 if len(query) and not isinstance(query[0], tuple):
1178 raise TypeError
1179 # zero-length sequences of all types will get here and succeed,
1180 # but that's a minor nit - since the original implementation
1181 # allowed empty dicts that type of behavior probably should be
1182 # preserved for consistency
1183 except TypeError:
1184 ty,va,tb = sys.exc_info()
1185 raise TypeError, "not a valid non-string sequence or mapping object", tb
1187 l = []
1188 if not doseq:
1189 # preserve old behavior
1190 for k, v in query:
1191 k = quote_plus(str(k))
1192 v = quote_plus(str(v))
1193 l.append(k + '=' + v)
1194 else:
1195 for k, v in query:
1196 k = quote_plus(str(k))
1197 if isinstance(v, str):
1198 v = quote_plus(v)
1199 l.append(k + '=' + v)
1200 elif _is_unicode(v):
1201 # is there a reasonable way to convert to ASCII?
1202 # encode generates a string, but "replace" or "ignore"
1203 # lose information and "strict" can raise UnicodeError
1204 v = quote_plus(v.encode("ASCII","replace"))
1205 l.append(k + '=' + v)
1206 else:
1207 try:
1208 # is this a sufficient test for sequence-ness?
1209 x = len(v)
1210 except TypeError:
1211 # not a sequence
1212 v = quote_plus(str(v))
1213 l.append(k + '=' + v)
1214 else:
1215 # loop over the sequence
1216 for elt in v:
1217 l.append(k + '=' + quote_plus(str(elt)))
1218 return '&'.join(l)
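# Sample results of urlencode() (note that the ordering of items taken from a
# dictionary is undefined):
#     urlencode([('q', 'spam & eggs'), ('n', 2)])  --> 'q=spam+%26+eggs&n=2'
#     urlencode({'q': ['spam', 'eggs']}, doseq=1)  --> 'q=spam&q=eggs'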
1220 # Proxy handling
1221 def getproxies_environment():
1222 """Return a dictionary of scheme -> proxy server URL mappings.
1224 Scan the environment for variables named <scheme>_proxy;
1225 this seems to be the standard convention. If you need a
1226 different way, you can pass a proxies dictionary to the
1227 [Fancy]URLopener constructor."""
1230 proxies = {}
1231 for name, value in os.environ.items():
1232 name = name.lower()
1233 if value and name[-6:] == '_proxy':
1234 proxies[name[:-6]] = value
1235 return proxies
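# Usage sketch: with a hypothetical proxy configured in the environment, e.g.
#
#     http_proxy=http://proxy.example.com:3128/
#     ftp_proxy=http://proxy.example.com:3128/
#
# getproxies_environment() returns
#
#     {'http': 'http://proxy.example.com:3128/',
#      'ftp': 'http://proxy.example.com:3128/'}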
1237 if os.name == 'mac':
1238 def getproxies():
1239 """Return a dictionary of scheme -> proxy server URL mappings.
1241 By convention the mac uses Internet Config to store
1242 proxies. An HTTP proxy, for instance, is stored under
1243 the HttpProxy key."""
1246 try:
1247 import ic
1248 except ImportError:
1249 return {}
1251 try:
1252 config = ic.IC()
1253 except ic.error:
1254 return {}
1255 proxies = {}
1256 # HTTP:
1257 if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
1258 try:
1259 value = config['HTTPProxyHost']
1260 except ic.error:
1261 pass
1262 else:
1263 proxies['http'] = 'http://%s' % value
1264 # FTP: XXXX To be done.
1265 # Gopher: XXXX To be done.
1266 return proxies
1268 def proxy_bypass(x):
1269 return 0
1271 elif os.name == 'nt':
1272 def getproxies_registry():
1273 """Return a dictionary of scheme -> proxy server URL mappings.
1275 Win32 uses the registry to store proxies."""
1278 proxies = {}
1279 try:
1280 import _winreg
1281 except ImportError:
1282 # Std module, so should be around - but you never know!
1283 return proxies
1284 try:
1285 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1286 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1287 proxyEnable = _winreg.QueryValueEx(internetSettings,
1288 'ProxyEnable')[0]
1289 if proxyEnable:
1290 # Returned as Unicode but problems if not converted to ASCII
1291 proxyServer = str(_winreg.QueryValueEx(internetSettings,
1292 'ProxyServer')[0])
1293 if '=' in proxyServer:
1294 # Per-protocol settings
1295 for p in proxyServer.split(';'):
1296 protocol, address = p.split('=', 1)
1297 # See if address has a type:// prefix
1298 import re
1299 if not re.match('^([^/:]+)://', address):
1300 address = '%s://%s' % (protocol, address)
1301 proxies[protocol] = address
1302 else:
1303 # Use one setting for all protocols
1304 if proxyServer[:5] == 'http:':
1305 proxies['http'] = proxyServer
1306 else:
1307 proxies['http'] = 'http://%s' % proxyServer
1308 proxies['ftp'] = 'ftp://%s' % proxyServer
1309 internetSettings.Close()
1310 except (WindowsError, ValueError, TypeError):
1311 # Either registry key not found etc, or the value in an
1312 # unexpected format.
1313 # proxies already set up to be empty so nothing to do
1314 pass
1315 return proxies
1317 def getproxies():
1318 """Return a dictionary of scheme -> proxy server URL mappings.
1320 Returns settings gathered from the environment, if specified,
1321 or the registry."""
1324 return getproxies_environment() or getproxies_registry()
1326 def proxy_bypass(host):
1327 try:
1328 import _winreg
1329 import re
1330 except ImportError:
1331 # Std modules, so should be around - but you never know!
1332 return 0
1333 try:
1334 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1335 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1336 proxyEnable = _winreg.QueryValueEx(internetSettings,
1337 'ProxyEnable')[0]
1338 proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1339 'ProxyOverride')[0])
1340 # ^^^^ Returned as Unicode but problems if not converted to ASCII
1341 except WindowsError:
1342 return 0
1343 if not proxyEnable or not proxyOverride:
1344 return 0
1345 # try to make a host list from name and IP address.
1346 host = [host]
1347 try:
1348 addr = socket.gethostbyname(host[0])
1349 if addr != host:
1350 host.append(addr)
1351 except socket.error:
1352 pass
1353 # make a check value list from the registry entry: replace the
1354 # '<local>' string by the localhost entry and the corresponding
1355 # canonical entry.
1356 proxyOverride = proxyOverride.split(';')
1357 i = 0
1358 while i < len(proxyOverride):
1359 if proxyOverride[i] == '<local>':
1360 proxyOverride[i:i+1] = ['localhost',
1361 '127.0.0.1',
1362 socket.gethostname(),
1363 socket.gethostbyname(
1364 socket.gethostname())]
1365 i += 1
1366 # print proxyOverride
1367 # now check if we match one of the registry values.
1368 for test in proxyOverride:
1369 test = test.replace(".", r"\.") # mask dots
1370 test = test.replace("*", r".*") # change glob sequence
1371 test = test.replace("?", r".") # change glob char
1372 for val in host:
1373 # print "%s <--> %s" %( test, val )
1374 if re.match(test, val, re.I):
1375 return 1
1376 return 0
1378 else:
1379 # By default use environment variables
1380 getproxies = getproxies_environment
1382 def proxy_bypass(host):
1383 return 0
1385 # Test and time quote() and unquote()
1386 def test1():
1387 s = ''
1388 for i in range(256): s = s + chr(i)
1389 s = s*4
1390 t0 = time.time()
1391 qs = quote(s)
1392 uqs = unquote(qs)
1393 t1 = time.time()
1394 if uqs != s:
1395 print 'Wrong!'
1396 print `s`
1397 print `qs`
1398 print `uqs`
1399 print round(t1 - t0, 3), 'sec'
1402 def reporthook(blocknum, blocksize, totalsize):
1403 # Report during remote transfers
1404 print "Block number: %d, Block size: %d, Total size: %d" % (
1405 blocknum, blocksize, totalsize)
1407 # Test program
1408 def test(args=[]):
1409 if not args:
1410 args = [
1411 '/etc/passwd',
1412 'file:/etc/passwd',
1413 'file://localhost/etc/passwd',
1414 'ftp://ftp.python.org/pub/python/README',
1415 ## 'gopher://gopher.micro.umn.edu/1/',
1416 'http://www.python.org/index.html',
1417 ]
1418 if hasattr(URLopener, "open_https"):
1419 args.append('https://synergy.as.cmu.edu/~geek/')
1420 try:
1421 for url in args:
1422 print '-'*10, url, '-'*10
1423 fn, h = urlretrieve(url, None, reporthook)
1424 print fn
1425 if h:
1426 print '======'
1427 for k in h.keys(): print k + ':', h[k]
1428 print '======'
1429 fp = open(fn, 'rb')
1430 data = fp.read()
1431 del fp
1432 if '\r' in data:
1433 table = string.maketrans("", "")
1434 data = data.translate(table, "\r")
1435 print data
1436 fn, h = None, None
1437 print '-'*40
1438 finally:
1439 urlcleanup()
1441 def main():
1442 import getopt, sys
1443 try:
1444 opts, args = getopt.getopt(sys.argv[1:], "th")
1445 except getopt.error, msg:
1446 print msg
1447 print "Use -h for help"
1448 return
1449 t = 0
1450 for o, a in opts:
1451 if o == '-t':
1452 t = t + 1
1453 if o == '-h':
1454 print "Usage: python urllib.py [-t] [url ...]"
1455 print "-t runs self-test;",
1456 print "otherwise, contents of urls are printed"
1457 return
1458 if t:
1459 if t > 1:
1460 test1()
1461 test(args)
1462 else:
1463 if not args:
1464 print "Use -h for help"
1465 for url in args:
1466 print urlopen(url).read(),
1468 # Run test program when run as a script
1469 if __name__ == '__main__':
1470 main()