Ditched '_find_SET()', since it was a no-value-added wrapper around
[python/dscho.git] / Lib / urllib.py
blob7bc9f1789fb008c0191d33a0e2df05cb22c87192
"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""
25 import string
26 import socket
27 import os
28 import sys
# Version string; it is interpolated into the default User-agent header
# built in URLopener.__init__.
__version__ = '1.12' # XXX This version is not always updated :-(

MAXFTPCACHE = 10 # Trim the ftp cache beyond this size

# Helper for non-unix systems
# On 'mac' and 'nt' the path<->URL converters come from platform-specific
# modules; everywhere else they are thin wrappers around unquote()/quote().
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Convert the path part of a URL to a local path by unquoting it."""
        return unquote(pathname)
    def pathname2url(pathname):
        """Convert a local path to the path part of a URL by quoting it."""
        return quote(pathname)
46 # This really consists of two pieces:
47 # (1) a class which handles opening of all sorts of URLs
48 # (plus assorted utilities etc.)
49 # (2) a set of functions for parsing URLs
50 # XXX Should these be separated out into different modules?
53 # Shortcut for basic usage
# Module-wide opener shared by urlopen(), urlretrieve() and urlcleanup();
# it is created on first use.
_urlopener = None
def urlopen(url, data=None):
    """Open *url* with the shared FancyURLopener and return the result.

    When *data* is given (not None) it is passed through to the opener's
    open() method as the second argument.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    if data is not None:
        return _urlopener.open(url, data)
    return _urlopener.open(url)
def urlretrieve(url, filename=None, reporthook=None):
    """Retrieve *url* through the shared FancyURLopener.

    The arguments are forwarded unchanged to the opener's retrieve()
    method and its return value is passed back to the caller.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        # First use: create the shared opener and remember it.
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook)
def urlcleanup():
    """Call cleanup() on the shared opener, if one has been created."""
    if not _urlopener:
        return
    _urlopener.cleanup()
73 ftpcache = {}
74 class URLopener:
75 """Class to open URLs.
76 This is a class rather than just a subroutine because we may need
77 more than one set of global protocol-specific options.
78 Note -- this is a base class for those who don't want the
79 automatic handling of errors type 302 (relocated) and 401
80 (authorization needed)."""
82 __tempfiles = None
84 # Constructor
85 def __init__(self, proxies=None, **x509):
86 if proxies is None:
87 proxies = getproxies()
88 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
89 self.proxies = proxies
90 self.key_file = x509.get('key_file')
91 self.cert_file = x509.get('cert_file')
92 server_version = "Python-urllib/%s" % __version__
93 self.addheaders = [('User-agent', server_version)]
94 self.__tempfiles = []
95 self.__unlink = os.unlink # See cleanup()
96 self.tempcache = None
97 # Undocumented feature: if you assign {} to tempcache,
98 # it is used to cache files retrieved with
99 # self.retrieve(). This is not enabled by default
100 # since it does not work for changing documents (and I
101 # haven't got the logic to check expiration headers
102 # yet).
103 self.ftpcache = ftpcache
104 # Undocumented feature: you can use a different
105 # ftp cache by assigning to the .ftpcache member;
106 # in case you want logically independent URL openers
107 # XXX This is not threadsafe. Bah.
109 def __del__(self):
110 self.close()
112 def close(self):
113 self.cleanup()
115 def cleanup(self):
116 # This code sometimes runs when the rest of this module
117 # has already been deleted, so it can't use any globals
118 # or import anything.
119 if self.__tempfiles:
120 for file in self.__tempfiles:
121 try:
122 self.__unlink(file)
123 except:
124 pass
125 del self.__tempfiles[:]
126 if self.tempcache:
127 self.tempcache.clear()
129 def addheader(self, *args):
130 """Add a header to be used by the HTTP interface only
131 e.g. u.addheader('Accept', 'sound/basic')"""
132 self.addheaders.append(args)
134 # External interface
135 def open(self, fullurl, data=None):
136 """Use URLopener().open(file) instead of open(file, 'r')."""
137 fullurl = unwrap(fullurl)
138 if self.tempcache and self.tempcache.has_key(fullurl):
139 filename, headers = self.tempcache[fullurl]
140 fp = open(filename, 'rb')
141 return addinfourl(fp, headers, fullurl)
142 type, url = splittype(fullurl)
143 if not type: type = 'file'
144 if self.proxies.has_key(type):
145 proxy = self.proxies[type]
146 type, proxy = splittype(proxy)
147 host, selector = splithost(proxy)
148 url = (host, fullurl) # Signal special case to open_*()
149 name = 'open_' + type
150 self.type = type
151 if '-' in name:
152 # replace - with _
153 name = string.join(string.split(name, '-'), '_')
154 if not hasattr(self, name):
155 if data is None:
156 return self.open_unknown(fullurl)
157 else:
158 return self.open_unknown(fullurl, data)
159 try:
160 if data is None:
161 return getattr(self, name)(url)
162 else:
163 return getattr(self, name)(url, data)
164 except socket.error, msg:
165 raise IOError, ('socket error', msg), sys.exc_info()[2]
167 def open_unknown(self, fullurl, data=None):
168 """Overridable interface to open unknown URL type."""
169 type, url = splittype(fullurl)
170 raise IOError, ('url error', 'unknown url type', type)
172 # External interface
173 def retrieve(self, url, filename=None, reporthook=None):
174 """retrieve(url) returns (filename, None) for a local object
175 or (tempfilename, headers) for a remote object."""
176 url = unwrap(url)
177 if self.tempcache and self.tempcache.has_key(url):
178 return self.tempcache[url]
179 type, url1 = splittype(url)
180 if not filename and (not type or type == 'file'):
181 try:
182 fp = self.open_local_file(url1)
183 hdrs = fp.info()
184 del fp
185 return url2pathname(splithost(url1)[1]), hdrs
186 except IOError, msg:
187 pass
188 fp = self.open(url)
189 headers = fp.info()
190 if not filename:
191 import tempfile
192 garbage, path = splittype(url)
193 garbage, path = splithost(path or "")
194 path, garbage = splitquery(path or "")
195 path, garbage = splitattr(path or "")
196 suffix = os.path.splitext(path)[1]
197 filename = tempfile.mktemp(suffix)
198 self.__tempfiles.append(filename)
199 result = filename, headers
200 if self.tempcache is not None:
201 self.tempcache[url] = result
202 tfp = open(filename, 'wb')
203 bs = 1024*8
204 size = -1
205 blocknum = 1
206 if reporthook:
207 if headers.has_key("content-length"):
208 size = int(headers["Content-Length"])
209 reporthook(0, bs, size)
210 block = fp.read(bs)
211 if reporthook:
212 reporthook(1, bs, size)
213 while block:
214 tfp.write(block)
215 block = fp.read(bs)
216 blocknum = blocknum + 1
217 if reporthook:
218 reporthook(blocknum, bs, size)
219 fp.close()
220 tfp.close()
221 del fp
222 del tfp
223 return result
225 # Each method named open_<type> knows how to open that type of URL
227 def open_http(self, url, data=None):
228 """Use HTTP protocol."""
229 import httplib
230 user_passwd = None
231 if type(url) is type(""):
232 host, selector = splithost(url)
233 if host:
234 user_passwd, host = splituser(host)
235 host = unquote(host)
236 realhost = host
237 else:
238 host, selector = url
239 urltype, rest = splittype(selector)
240 url = rest
241 user_passwd = None
242 if string.lower(urltype) != 'http':
243 realhost = None
244 else:
245 realhost, rest = splithost(rest)
246 if realhost:
247 user_passwd, realhost = splituser(realhost)
248 if user_passwd:
249 selector = "%s://%s%s" % (urltype, realhost, rest)
250 #print "proxy via http:", host, selector
251 if not host: raise IOError, ('http error', 'no host given')
252 if user_passwd:
253 import base64
254 auth = string.strip(base64.encodestring(user_passwd))
255 else:
256 auth = None
257 h = httplib.HTTP(host)
258 if data is not None:
259 h.putrequest('POST', selector)
260 h.putheader('Content-type', 'application/x-www-form-urlencoded')
261 h.putheader('Content-length', '%d' % len(data))
262 else:
263 h.putrequest('GET', selector)
264 if auth: h.putheader('Authorization', 'Basic %s' % auth)
265 if realhost: h.putheader('Host', realhost)
266 for args in self.addheaders: apply(h.putheader, args)
267 h.endheaders()
268 if data is not None:
269 h.send(data + '\r\n')
270 errcode, errmsg, headers = h.getreply()
271 fp = h.getfile()
272 if errcode == 200:
273 return addinfourl(fp, headers, "http:" + url)
274 else:
275 if data is None:
276 return self.http_error(url, fp, errcode, errmsg, headers)
277 else:
278 return self.http_error(url, fp, errcode, errmsg, headers, data)
280 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
281 """Handle http errors.
282 Derived class can override this, or provide specific handlers
283 named http_error_DDD where DDD is the 3-digit error code."""
284 # First check if there's a specific handler for this error
285 name = 'http_error_%d' % errcode
286 if hasattr(self, name):
287 method = getattr(self, name)
288 if data is None:
289 result = method(url, fp, errcode, errmsg, headers)
290 else:
291 result = method(url, fp, errcode, errmsg, headers, data)
292 if result: return result
293 return self.http_error_default(url, fp, errcode, errmsg, headers)
295 def http_error_default(self, url, fp, errcode, errmsg, headers):
296 """Default error handler: close the connection and raise IOError."""
297 void = fp.read()
298 fp.close()
299 raise IOError, ('http error', errcode, errmsg, headers)
301 if hasattr(socket, "ssl"):
302 def open_https(self, url):
303 """Use HTTPS protocol."""
304 import httplib
305 if type(url) is type(""):
306 host, selector = splithost(url)
307 user_passwd, host = splituser(host)
308 else:
309 host, selector = url
310 urltype, rest = splittype(selector)
311 if string.lower(urltype) == 'https':
312 realhost, rest = splithost(rest)
313 user_passwd, realhost = splituser(realhost)
314 if user_passwd:
315 selector = "%s://%s%s" % (urltype, realhost, rest)
316 print "proxy via https:", host, selector
317 if not host: raise IOError, ('https error', 'no host given')
318 if user_passwd:
319 import base64
320 auth = string.strip(base64.encodestring(user_passwd))
321 else:
322 auth = None
323 h = httplib.HTTPS(host, 0,
324 key_file=self.key_file,
325 cert_file=self.cert_file)
326 h.putrequest('GET', selector)
327 if auth: h.putheader('Authorization: Basic %s' % auth)
328 for args in self.addheaders: apply(h.putheader, args)
329 h.endheaders()
330 errcode, errmsg, headers = h.getreply()
331 fp = h.getfile()
332 if errcode == 200:
333 return addinfourl(fp, headers, url)
334 else:
335 return self.http_error(url, fp, errcode, errmsg, headers)
337 def open_gopher(self, url):
338 """Use Gopher protocol."""
339 import gopherlib
340 host, selector = splithost(url)
341 if not host: raise IOError, ('gopher error', 'no host given')
342 host = unquote(host)
343 type, selector = splitgophertype(selector)
344 selector, query = splitquery(selector)
345 selector = unquote(selector)
346 if query:
347 query = unquote(query)
348 fp = gopherlib.send_query(selector, query, host)
349 else:
350 fp = gopherlib.send_selector(selector, host)
351 return addinfourl(fp, noheaders(), "gopher:" + url)
353 def open_file(self, url):
354 """Use local file or FTP depending on form of URL."""
355 if url[:2] == '//' and url[2:3] != '/':
356 return self.open_ftp(url)
357 else:
358 return self.open_local_file(url)
360 def open_local_file(self, url):
361 """Use local file."""
362 import mimetypes, mimetools, StringIO
363 mtype = mimetypes.guess_type(url)[0]
364 headers = mimetools.Message(StringIO.StringIO(
365 'Content-Type: %s\n' % (mtype or 'text/plain')))
366 host, file = splithost(url)
367 if not host:
368 urlfile = file
369 if file[:1] == '/':
370 urlfile = 'file://' + file
371 return addinfourl(open(url2pathname(file), 'rb'),
372 headers, urlfile)
373 host, port = splitport(host)
374 if not port \
375 and socket.gethostbyname(host) in (localhost(), thishost()):
376 urlfile = file
377 if file[:1] == '/':
378 urlfile = 'file://' + file
379 return addinfourl(open(url2pathname(file), 'rb'),
380 headers, urlfile)
381 raise IOError, ('local file error', 'not on local host')
383 def open_ftp(self, url):
384 """Use FTP protocol."""
385 host, path = splithost(url)
386 if not host: raise IOError, ('ftp error', 'no host given')
387 host, port = splitport(host)
388 user, host = splituser(host)
389 if user: user, passwd = splitpasswd(user)
390 else: passwd = None
391 host = unquote(host)
392 user = unquote(user or '')
393 passwd = unquote(passwd or '')
394 host = socket.gethostbyname(host)
395 if not port:
396 import ftplib
397 port = ftplib.FTP_PORT
398 else:
399 port = int(port)
400 path, attrs = splitattr(path)
401 path = unquote(path)
402 dirs = string.splitfields(path, '/')
403 dirs, file = dirs[:-1], dirs[-1]
404 if dirs and not dirs[0]: dirs = dirs[1:]
405 if dirs and not dirs[0]: dirs[0] = '/'
406 key = (user, host, port, string.joinfields(dirs, '/'))
407 # XXX thread unsafe!
408 if len(self.ftpcache) > MAXFTPCACHE:
409 # Prune the cache, rather arbitrarily
410 for k in self.ftpcache.keys():
411 if k != key:
412 v = self.ftpcache[k]
413 del self.ftpcache[k]
414 v.close()
415 try:
416 if not self.ftpcache.has_key(key):
417 self.ftpcache[key] = \
418 ftpwrapper(user, passwd, host, port, dirs)
419 if not file: type = 'D'
420 else: type = 'I'
421 for attr in attrs:
422 attr, value = splitvalue(attr)
423 if string.lower(attr) == 'type' and \
424 value in ('a', 'A', 'i', 'I', 'd', 'D'):
425 type = string.upper(value)
426 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
427 if retrlen is not None and retrlen >= 0:
428 import mimetools, StringIO
429 headers = mimetools.Message(StringIO.StringIO(
430 'Content-Length: %d\n' % retrlen))
431 else:
432 headers = noheaders()
433 return addinfourl(fp, headers, "ftp:" + url)
434 except ftperrors(), msg:
435 raise IOError, ('ftp error', msg), sys.exc_info()[2]
437 def open_data(self, url, data=None):
438 """Use "data" URL."""
439 # ignore POSTed data
441 # syntax of data URLs:
442 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
443 # mediatype := [ type "/" subtype ] *( ";" parameter )
444 # data := *urlchar
445 # parameter := attribute "=" value
446 import StringIO, mimetools, time
447 try:
448 [type, data] = string.split(url, ',', 1)
449 except ValueError:
450 raise IOError, ('data error', 'bad data URL')
451 if not type:
452 type = 'text/plain;charset=US-ASCII'
453 semi = string.rfind(type, ';')
454 if semi >= 0 and '=' not in type[semi:]:
455 encoding = type[semi+1:]
456 type = type[:semi]
457 else:
458 encoding = ''
459 msg = []
460 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
461 time.gmtime(time.time())))
462 msg.append('Content-type: %s' % type)
463 if encoding == 'base64':
464 import base64
465 data = base64.decodestring(data)
466 else:
467 data = unquote(data)
468 msg.append('Content-length: %d' % len(data))
469 msg.append('')
470 msg.append(data)
471 msg = string.join(msg, '\n')
472 f = StringIO.StringIO(msg)
473 headers = mimetools.Message(f, 0)
474 f.fileno = None # needed for addinfourl
475 return addinfourl(f, headers, url)
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    # Redirect bookkeeping: 'tries' counts nested 301/302 redirects for the
    # request in progress; 'maxtries' bounds them (0 disables the limit).
    tries = 0
    maxtries = 10

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as URLopener, including the
        key_file/cert_file keyword arguments."""
        apply(URLopener.__init__, (self,) + args, kwargs)
        self.auth_cache = {}
        self.tries = 0

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the 'location' (or 'uri') header.  After maxtries nested
        redirects the chain is abandoned and reported as a 500 error so a
        server cannot force unbounded recursion.
        """
        self.tries = self.tries + 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        try:
            return self._follow_redirect(url, fp, headers, data)
        finally:
            self.tries = 0

    def _follow_redirect(self, url, fp, headers, data):
        """Open the redirect target named in headers; None if there is none."""
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original.
        # Use the scheme of the request in progress (set by open()) so an
        # https redirect is not rebased onto http.
        scheme = getattr(self, 'type', 'http')
        newurl = basejoin(scheme + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt

        Returns None (so the default handler runs) unless the server asks
        for 'basic' authentication and a retry_<type>_basic_auth method
        exists for the current URL type."""
        if headers.has_key('www-authenticate'):
            stuff = headers['www-authenticate']
            import re
            match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
            if match:
                scheme, realm = match.groups()
                if string.lower(scheme) == 'basic':
                    name = 'retry_' + self.type + '_basic_auth'
                    # No retry method for this type: fall through to the
                    # default handler rather than raise AttributeError.
                    if not hasattr(self, name):
                        return None
                    if data is None:
                        return getattr(self,name)(url, realm)
                    else:
                        return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request with user:passwd@ inserted in the host."""
        host, selector = splithost(url)
        # i is non-zero when the URL already carried credentials, i.e. a
        # previous attempt failed; get_user_passwd() then drops its cache.
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request with user:passwd@ inserted in the host.

        data is accepted for signature compatibility but not sent.
        """
        host, selector = splithost(url)
        i = string.find(host, '@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = user + ':' + passwd + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for realm at host, prompting if needed.

        Answers are cached per realm@host; a true clear_cache discards
        the cached entry first.
        """
        key = realm + '@' + string.lower(host)
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                                     (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
576 # Utility functions
# Cached result of localhost(); filled in on first call.
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
# Cached result of thishost(); filled in on first call.
_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if _thishost:
        return _thishost
    hostname = socket.gethostname()
    _thishost = socket.gethostbyname(hostname)
    return _thishost
# Cached value of ftplib.all_errors; filled in on first call.
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors:
        return _ftperrors
    # ftplib is imported only when first needed.
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
# Cached empty Message returned by noheaders(); created on first call.
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The same object is returned on every call.
    """
    global _noheaders
    # Compare with None rather than testing truth: the cached message has
    # no headers and may therefore evaluate as false, which would defeat
    # the cache and rebuild the object on every call.
    if _noheaders is None:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
615 # Utility classes
617 class ftpwrapper:
618 """Class used by open_ftp() for cache of open FTP connections."""
620 def __init__(self, user, passwd, host, port, dirs):
621 self.user = user
622 self.passwd = passwd
623 self.host = host
624 self.port = port
625 self.dirs = dirs
626 self.init()
628 def init(self):
629 import ftplib
630 self.busy = 0
631 self.ftp = ftplib.FTP()
632 self.ftp.connect(self.host, self.port)
633 self.ftp.login(self.user, self.passwd)
634 for dir in self.dirs:
635 self.ftp.cwd(dir)
637 def retrfile(self, file, type):
638 import ftplib
639 self.endtransfer()
640 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
641 else: cmd = 'TYPE ' + type; isdir = 0
642 try:
643 self.ftp.voidcmd(cmd)
644 except ftplib.all_errors:
645 self.init()
646 self.ftp.voidcmd(cmd)
647 conn = None
648 if file and not isdir:
649 # Use nlst to see if the file exists at all
650 try:
651 self.ftp.nlst(file)
652 except ftplib.error_perm, reason:
653 raise IOError, ('ftp error', reason), sys.exc_info()[2]
654 # Restore the transfer mode!
655 self.ftp.voidcmd(cmd)
656 # Try to retrieve as a file
657 try:
658 cmd = 'RETR ' + file
659 conn = self.ftp.ntransfercmd(cmd)
660 except ftplib.error_perm, reason:
661 if reason[:3] != '550':
662 raise IOError, ('ftp error', reason), sys.exc_info()[2]
663 if not conn:
664 # Set transfer mode to ASCII!
665 self.ftp.voidcmd('TYPE A')
666 # Try a directory listing
667 if file: cmd = 'LIST ' + file
668 else: cmd = 'LIST'
669 conn = self.ftp.ntransfercmd(cmd)
670 self.busy = 1
671 # Pass back both a suitably decorated object and a retrieval length
672 return (addclosehook(conn[0].makefile('rb'),
673 self.endtransfer), conn[1])
674 def endtransfer(self):
675 if not self.busy:
676 return
677 self.busy = 0
678 try:
679 self.ftp.voidresp()
680 except ftperrors():
681 pass
683 def close(self):
684 self.endtransfer()
685 try:
686 self.ftp.close()
687 except ftperrors():
688 pass
class addbase:
    """Base class for addinfo and addclosehook.

    Wraps a file-like object and re-exports its read methods as
    attributes of the wrapper.
    """

    def __init__(self, fp):
        self.fp = fp
        # read() and readline() are required of the wrapped object ...
        self.read = fp.read
        self.readline = fp.readline
        # ... readlines() and fileno() are copied only when present.
        for optional in ("readlines", "fileno"):
            if hasattr(fp, optional):
                setattr(self, optional, getattr(fp, optional))

    def __repr__(self):
        parts = (self.__class__.__name__, repr(id(self)), repr(self.fp))
        return '<%s at %s whose fp = %s>' % parts

    def close(self):
        # Drop the bound methods first, then close and forget the file.
        self.read = self.readline = self.readlines = self.fileno = None
        if self.fp:
            self.fp.close()
        self.fp = None
class addclosehook(addbase):
    """Class to add a close hook to an open file.

    The hook is called with hookargs the first time close() runs,
    before the underlying file is closed.
    """

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        if self.closehook:
            self.closehook(*self.hookargs)
            # Forget the hook so a second close() does not run it again.
            self.closehook = self.hookargs = None
        addbase.close(self)
class addinfo(addbase):
    """Wrap an open file and give it an info() method."""

    def __init__(self, fp, headers):
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers
class addinfourl(addbase):
    """Wrap an open file and give it info() and geturl() methods."""

    def __init__(self, fp, headers, url):
        self.url = url
        self.headers = headers
        addbase.__init__(self, fp)

    def info(self):
        """Return the headers object passed to the constructor."""
        return self.headers

    def geturl(self):
        """Return the URL passed to the constructor."""
        return self.url
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    url is returned unchanged when it already contains a type, or when
    it names a host and base supplies no type.  Otherwise the missing
    type, host and leading path are inherited from base, and leading
    '../' components of a relative url are resolved against base's path.
    """
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = string.rfind(basepath, '/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        # Each pass strips one leading '../' from path and one trailing
        # directory from basepath; it stops once basepath is the root
        # (break) or has been emptied (loop condition).
        while basepath and path[:3] == '../':
            path = path[3:]
            i = string.rfind(basepath[:-1], '/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
824 def unwrap(url):
825 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
826 url = string.strip(url)
827 if url[:1] == '<' and url[-1:] == '>':
828 url = string.strip(url[1:-1])
829 if url[:4] == 'URL:': url = string.strip(url[4:])
830 return url
# Compiled pattern for splittype(); built on first call.
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    Returns (None, url) when url does not start with a 'type:' prefix.
    """
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    # The match is anchored at 0, so its end is just past the colon.
    return found.group(1), url[found.end():]
# Compiled pattern for splithost(); built on first call.
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.
    """
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    return found.group(1), found.group(2)
# Compiled pattern for splituser(); built on first call.
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Returns (None, host) when host contains no '@'.
    """
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return found.group(1), found.group(2)
# Compiled pattern for splitpasswd(); built on first call.
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    Returns (user, None) when user contains no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1), found.group(2)
# Compiled pattern for splitport(); built on first call.
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  Returns (host, None)
    when host does not end in ':' followed by digits.
    """
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1), found.group(2)
# Compiled pattern for splitnport(); built on first call.
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if found is None:
        return host, defport
    host, port = found.group(1), found.group(2)
    try:
        # An empty port also fails the conversion and yields None.
        nport = int(port)
    except ValueError:
        nport = None
    return host, nport
# Compiled pattern for splitquery(); built on first call.
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split is made at the last '?'.  Returns (url, None) when url
    contains no '?'.
    """
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    found = _queryprog.match(url)
    if found is None:
        return url, None
    return found.group(1), found.group(2)
# Compiled pattern for splittag(); built on first call.
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split is made at the last '#'.  Returns (url, None) when url
    contains no '#'.
    """
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1), found.group(2)
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    pieces = url.split(';')
    path = pieces[0]
    attrs = pieces[1:]
    return path, attrs
# Compiled pattern for splitvalue(); built on first call.
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split is made at the first '='.  Returns (attr, None) when attr
    contains no '='.
    """
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1), found.group(2)
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    Returns (None, selector) unless selector is '/' followed by at
    least one more character.
    """
    if len(selector) < 2 or selector[0] != '/':
        return None, selector
    return selector[1], selector[2:]
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each '%' followed by exactly two hexadecimal digits is replaced by
    the character with that code.  Any other '%' is left untouched.
    """
    hexdigits = '0123456789abcdefABCDEF'
    pieces = s.split('%')
    res = [pieces[0]]
    for item in pieces[1:]:
        code = item[:2]
        # Check the digits explicitly: a numeric conversion would also
        # accept a sign or whitespace (e.g. '%+1', '% 1'), which are not
        # valid escapes.
        if len(code) == 2 and code[0] in hexdigits and code[1] in hexdigits:
            res.append(chr(int(code, 16)) + item[2:])
        else:
            res.append('%' + item)
    return ''.join(res)
def unquote_plus(s):
    """Like unquote(), but first turn every '+' into a space."""
    return unquote(s.replace('+', ' '))
# Characters quote() never escapes.  Spelled out explicitly instead of
# using string.letters, whose contents depend on the current locale.
always_safe = ('abcdefghijklmnopqrstuvwxyz'
               'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               '0123456789' '_,.-')
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'.

    Every character of s that is neither in always_safe nor in safe is
    replaced by '%' followed by its two-digit lowercase hex code.
    """
    # XXX Can speed this up an order of magnitude
    safe = always_safe + safe
    res = []
    for c in s:
        if c in safe:
            res.append(c)
        else:
            res.append('%%%02x' % ord(c))
    return ''.join(res)
def quote_plus(s, safe = '/'):
    """Like quote(), but represent each space as '+'."""
    # XXX Can speed this up an order of magnitude
    # Quote the space-separated pieces and rejoin them with '+'; with no
    # space there is a single piece and this is just quote(s, safe).
    quoted = []
    for piece in s.split(' '):
        quoted.append(quote(piece, safe))
    return '+'.join(quoted)
def urlencode(dict):
    """Encode a dictionary of form entries into a URL query string.

    Keys and values are converted with str() and quoted with
    quote_plus(); pairs are joined with '&'.
    """
    pairs = []
    for key, value in dict.items():
        quoted_key = quote_plus(str(key))
        quoted_value = quote_plus(str(value))
        pairs.append('%s=%s' % (quoted_key, quoted_value))
    return '&'.join(pairs)
# Proxy handling
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        found = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                proxyhost = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                found['http'] = 'http://%s' % proxyhost
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return found

else:
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Scan the environment for variables named <scheme>_proxy;
        this seems to be the standard convention.  If you need a
        different way, you can pass a proxies dictionary to the
        [Fancy]URLopener constructor.

        """
        suffix = '_proxy'
        found = {}
        for envname, setting in os.environ.items():
            lowered = envname.lower()
            # Skip empty settings and variables without the suffix.
            if not setting or not lowered.endswith(suffix):
                continue
            found[lowered[:-len(suffix)]] = setting
        return found
1073 # Test and time quote() and unquote()
1074 def test1():
1075 import time
1076 s = ''
1077 for i in range(256): s = s + chr(i)
1078 s = s*4
1079 t0 = time.time()
1080 qs = quote(s)
1081 uqs = unquote(qs)
1082 t1 = time.time()
1083 if uqs != s:
1084 print 'Wrong!'
1085 print `s`
1086 print `qs`
1087 print `uqs`
1088 print round(t1 - t0, 3), 'sec'
def reporthook(blocknum, blocksize, totalsize):
    """Print one progress line; used as the retrieve() hook by test()."""
    # Report during remote transfers
    template = "Block number: %d, Block size: %d, Total size: %d"
    print(template % (blocknum, blocksize, totalsize))
1095 # Test program
1096 def test(args=[]):
1097 if not args:
1098 args = [
1099 '/etc/passwd',
1100 'file:/etc/passwd',
1101 'file://localhost/etc/passwd',
1102 'ftp://ftp.python.org/etc/passwd',
1103 ## 'gopher://gopher.micro.umn.edu/1/',
1104 'http://www.python.org/index.html',
1106 if hasattr(URLopener, "open_https"):
1107 args.append('https://synergy.as.cmu.edu/~geek/')
1108 try:
1109 for url in args:
1110 print '-'*10, url, '-'*10
1111 fn, h = urlretrieve(url, None, reporthook)
1112 print fn, h
1113 if h:
1114 print '======'
1115 for k in h.keys(): print k + ':', h[k]
1116 print '======'
1117 fp = open(fn, 'rb')
1118 data = fp.read()
1119 del fp
1120 if '\r' in data:
1121 table = string.maketrans("", "")
1122 data = string.translate(data, table, "\r")
1123 print data
1124 fn, h = None, None
1125 print '-'*40
1126 finally:
1127 urlcleanup()
1129 def main():
1130 import getopt, sys
1131 try:
1132 opts, args = getopt.getopt(sys.argv[1:], "th")
1133 except getopt.error, msg:
1134 print msg
1135 print "Use -h for help"
1136 return
1137 t = 0
1138 for o, a in opts:
1139 if o == '-t':
1140 t = t + 1
1141 if o == '-h':
1142 print "Usage: python urllib.py [-t] [url ...]"
1143 print "-t runs self-test;",
1144 print "otherwise, contents of urls are printed"
1145 return
1146 if t:
1147 if t > 1:
1148 test1()
1149 test(args)
1150 else:
1151 if not args:
1152 print "Use -h for help"
1153 for url in args:
1154 print urlopen(url).read(),
1156 # Run test program when run as a script
1157 if __name__ == '__main__':
1158 main()