1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that is has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
23 """

import string
import socket
import os
import sys


__version__ = '1.12'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        return unquote(pathname)
    def pathname2url(pathname):
        return quote(pathname)


# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?

# Shortcut for basic usage
_urlopener = None

def urlopen(url, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    if data is None:
        return _urlopener.open(url)
    else:
        return _urlopener.open(url, data)

def urlretrieve(url, filename=None, reporthook=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
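
# Illustrative sketch (not part of the module): typical use of the shortcut
# functions above from an interactive session.  The URL is only a placeholder;
# any reachable HTTP URL would do.
#
#   >>> import urllib
#   >>> f = urllib.urlopen('http://www.python.org/')
#   >>> data = f.read()                         # read the whole document
#   >>> f.info().gettype()                      # e.g. 'text/html'
#   >>> filename, headers = urllib.urlretrieve('http://www.python.org/')
#   >>> urllib.urlcleanup()                     # remove any temporary files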


ftpcache = {}

class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        server_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', server_version)]
        self.__tempfiles = []
        self.__unlink = os.unlink       # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(fullurl)
        if self.tempcache and self.tempcache.has_key(fullurl):
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        type, url = splittype(fullurl)
        if not type: type = 'file'
        if self.proxies.has_key(type):
            proxy = self.proxies[type]
            type, proxy = splittype(proxy)
            host, selector = splithost(proxy)
            url = (host, fullurl)   # Signal special case to open_*()
        name = 'open_' + type
        self.type = type
        if '-' in name:
            # replace - with _
            name = string.join(string.split(name, '-'), '_')
        if not hasattr(self, name):
            if data is None:
                return self.open_unknown(fullurl)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None):
        """retrieve(url) returns (filename, None) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(url)
        if self.tempcache and self.tempcache.has_key(url):
            return self.tempcache[url]
        type, url1 = splittype(url)
        if not filename and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url)
        headers = fp.info()
        if not filename:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            filename = tempfile.mktemp(suffix)
            self.__tempfiles.append(filename)
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        tfp = open(filename, 'wb')
        bs = 1024*8
        size = -1
        blocknum = 1
        if reporthook:
            if headers.has_key("content-length"):
                size = int(headers["Content-Length"])
            reporthook(0, bs, size)
        block = fp.read(bs)
        if reporthook:
            reporthook(1, bs, size)
        while block:
            tfp.write(block)
            block = fp.read(bs)
            blocknum = blocknum + 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp
        return result
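
    # Illustrative sketch (not part of the class): a minimal progress callback
    # for retrieve()/urlretrieve().  The hook receives the number of blocks
    # transferred so far, the block size in bytes, and the total size taken
    # from Content-Length (or -1 if unknown).  The URL and file name below
    # are placeholders.
    #
    #   >>> def hook(blocknum, blocksize, totalsize):
    #   ...     print 'got %d blocks of %d bytes (total %d)' % (
    #   ...         blocknum, blocksize, totalsize)
    #   >>> urlretrieve('http://www.python.org/', 'index.html', hook)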

    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        if type(url) is type(""):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if string.lower(urltype) != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')
        if user_passwd:
            import base64
            auth = string.strip(base64.encodestring(user_passwd))
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: apply(h.putheader, args)
        h.endheaders()
        if data is not None:
            h.send(data + '\r\n')
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            return addinfourl(fp, headers, "http:" + url)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)

    if hasattr(socket, "ssl"):
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            import httplib
            if type(url) is type(""):
                host, selector = splithost(url)
                user_passwd, host = splituser(host)
            else:
                host, selector = url
                urltype, rest = splittype(selector)
                if string.lower(urltype) == 'https':
                    realhost, rest = splithost(rest)
                    user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
            if user_passwd:
                import base64
                auth = string.strip(base64.encodestring(user_passwd))
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            # Pass the header name and value separately, as open_http() does
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            for args in self.addheaders: apply(h.putheader, args)
            h.endheaders()
            if data is not None:
                h.send(data + '\r\n')
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == 200:
                return addinfourl(fp, headers, url)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers)

    def open_gopher(self, url):
        """Use Gopher protocol."""
        import gopherlib
        host, selector = splithost(url)
        if not host: raise IOError, ('gopher error', 'no host given')
        host = unquote(host)
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), "gopher:" + url)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if url[:2] == '//' and url[2:3] != '/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, StringIO
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO.StringIO(
            'Content-Type: %s\n' % (mtype or 'text/plain')))
        host, file = splithost(url)
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(url2pathname(file), 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(url2pathname(file), 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = string.splitfields(path, '/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = (user, host, port, string.joinfields(dirs, '/'))
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not self.ftpcache.has_key(key):
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if string.lower(attr) == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = string.upper(value)
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            if retrlen is not None and retrlen >= 0:
                import mimetools, StringIO
                headers = mimetools.Message(StringIO.StringIO(
                    'Content-Length: %d\n' % retrlen))
            else:
                headers = noheaders()
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def open_data(self, url, data=None):
        """Use "data" URL."""
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import StringIO, mimetools, time
        try:
            [type, data] = string.split(url, ',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = string.rfind(type, ';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s' % time.strftime('%a, %d %b %Y %T GMT',
                                              time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = string.join(msg, '\n')
        f = StringIO.StringIO(msg)
        headers = mimetools.Message(f, 0)
        f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
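
# Illustrative sketch (not part of the module): what open_data() produces for
# a simple data: URL.  The values in the comments are what the code above
# computes, not captured output.
#
#   >>> u = URLopener()
#   >>> f = u.open("data:,Hello%20World")
#   >>> f.info().gettype()      # 'text/plain' (the default mediatype)
#   >>> f.read()                # 'Hello World'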


class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args):
        apply(URLopener.__init__, (self,) + args)
        self.auth_cache = {}

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        # XXX The server can force infinite recursion here!
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin("http:" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        if headers.has_key('www-authenticate'):
            stuff = headers['www-authenticate']
            import re
            match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
            if match:
                scheme, realm = match.groups()
                if string.lower(scheme) == 'basic':
                    name = 'retry_' + self.type + '_basic_auth'
                    if data is None:
                        return getattr(self, name)(url, realm)
                    else:
                        return getattr(self, name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl)

    def get_user_passwd(self, host, realm, clear_cache=0):
        key = realm + '@' + string.lower(host)
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                                     (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
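
# Illustrative sketch (not part of the module): a subclass that supplies
# credentials programmatically instead of prompting on the terminal.  The
# username, password and URL below are placeholders.
#
#   >>> class MyOpener(FancyURLopener):
#   ...     def prompt_user_passwd(self, host, realm):
#   ...         return 'user', 'secret'
#   >>> opener = MyOpener()
#   >>> f = opener.open('http://www.example.com/protected/')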


# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if not _localhost:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if not _thishost:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if not _ftperrors:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object."""
    global _noheaders
    if not _noheaders:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.init()

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                if reason[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass


class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
                                             `id(self)`, `self.fp`)

    def close(self):
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        if self.fp: self.fp.close()
        self.fp = None

class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        if self.closehook:
            apply(self.closehook, self.hookargs)
            self.closehook = None
            self.hookargs = None

class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        return self.headers

class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        return self.headers

    def geturl(self):
        return self.url


def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL."""
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base)    # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath)        # inherit host
    basepath, basetag = splittag(basepath)      # remove extraneous cruft
    basepath, basequery = splitquery(basepath)  # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = string.rfind(basepath, '/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            i = string.rfind(basepath[:-1], '/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path    # don't know what this means
    else: return path
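
# Illustrative sketch (not part of the module): how basejoin() resolves a
# relative reference against a base URL (this is what http_error_302 relies
# on for relative Location headers).
#
#   >>> basejoin('http://www.python.org/doc/index.html', 'lib/lib.html')
#   'http://www.python.org/doc/lib/lib.html'
#   >>> basejoin('http://www.python.org/doc/index.html', '#section2')
#   'http://www.python.org/doc/index.html#section2'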


# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = string.strip(url)
    if url[:1] == '<' and url[-1:] == '>':
        url = string.strip(url[1:-1])
    if url[:4] == 'URL:': url = string.strip(url[4:])
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    match = _userprog.match(host)
    if match: return match.group(1, 2)
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise string.atoi_error, "no digits"
            nport = string.atoi(port)
        except string.atoi_error:
            nport = None
        return host, nport
    return host, defport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
    '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = string.splitfields(url, ';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    mychr = chr
    myatoi = string.atoi
    list = string.split(s, '%')
    res = [list[0]]
    myappend = res.append
    del list[0]
    for item in list:
        if item[1:2]:
            try:
                myappend(mychr(myatoi(item[:2], 16))
                         + item[2:])
            except:
                myappend('%' + item)
        else:
            myappend('%' + item)
    return string.join(res, "")

def unquote_plus(s):
    if '+' in s:
        # replace '+' with ' '
        s = string.join(string.split(s, '+'), ' ')
    return unquote(s)

always_safe = string.letters + string.digits + '_,.-'
def quote(s, safe='/'):
    """quote('abc def') -> 'abc%20def'."""
    # XXX Can speed this up an order of magnitude
    safe = always_safe + safe
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if c not in safe:
            res[i] = '%%%02x' % ord(c)
    return string.joinfields(res, '')

def quote_plus(s, safe='/'):
    # XXX Can speed this up an order of magnitude
    if ' ' in s:
        # replace ' ' with '+'
        l = string.split(s, ' ')
        for i in range(len(l)):
            l[i] = quote(l[i], safe)
        return string.join(l, '+')
    else:
        return quote(s, safe)

def urlencode(dict):
    """Encode a dictionary of form entries into a URL query string."""
    l = []
    for k, v in dict.items():
        k = quote_plus(str(k))
        v = quote_plus(str(v))
        l.append(k + '=' + v)
    return string.join(l, '&')
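
# Illustrative sketch (not part of the module): quoting and form encoding
# with the functions above.
#
#   >>> quote('/~user/file name.txt')
#   '/%7euser/file%20name.txt'
#   >>> unquote('/%7euser/file%20name.txt')
#   '/~user/file name.txt'
#   >>> quote_plus('a few words')
#   'a+few+words'
#   >>> urlencode({'q': 'spam & eggs'})
#   'q=spam+%26+eggs'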


# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = string.lower(name)
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies

if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                'Software\\Microsoft\\Windows\\CurrentVersion\\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if ';' in proxyServer:  # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=')
                        proxies[protocol] = '%s://%s' % (protocol, address)
                else:   # Use one setting for all protocols
                    proxies['http'] = 'http://%s' % proxyServer
                    proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

else:
    # By default use environment variables
    getproxies = getproxies_environment
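
# Illustrative sketch (not part of the module): the two ways proxies reach a
# URLopener.  The proxy host and port below are placeholders.
#
#   Environment convention picked up by getproxies_environment():
#       http_proxy=http://proxy.example.com:8080
#
#   Explicit dictionary, bypassing the environment/registry lookup:
#   >>> opener = FancyURLopener({'http': 'http://proxy.example.com:8080'})
#   >>> f = opener.open('http://www.python.org/')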


# Test and time quote() and unquote()
def test1():
    import time
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print `s`
    print `qs`
    print `uqs`
    print round(t1 - t0, 3), 'sec'

def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)

# Test program
def test(args=[]):
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn, h
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                table = string.maketrans("", "")
                data = string.translate(data, table, "\r")
            print data
            fn, h = None, None
            print '-'*40
    finally:
        urlcleanup()

def main():
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()