1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that it has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
31 __all__
= ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
32 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
33 "urlencode", "url2pathname", "pathname2url", "splittag",
34 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
35 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
36 "splitnport", "splitquery", "splitattr", "splitvalue",
37 "splitgophertype", "getproxies"]
39 __version__
= '1.15' # XXX This version is not always updated :-(
41 MAXFTPCACHE
= 10 # Trim the ftp cache beyond this size
43 # Helper for non-unix systems
45 from macurl2path
import url2pathname
, pathname2url
47 from nturl2path
import url2pathname
, pathname2url
48 elif os
.name
== 'riscos':
49 from rourl2path
import url2pathname
, pathname2url
def url2pathname(pathname):
    """Convert the path component of a URL into a local path name.

    Generic counterpart of the converters imported from macurl2path,
    nturl2path and rourl2path: the only work done here is passing
    the argument through unquote().
    """
    local_path = unquote(pathname)
    return local_path
def pathname2url(pathname):
    """Convert a local path name into the path component of a URL.

    Generic counterpart of the converters imported from macurl2path,
    nturl2path and rourl2path: the only work done here is passing
    the argument through quote().
    """
    url_path = quote(pathname)
    return url_path
56 # This really consists of two pieces:
57 # (1) a class which handles opening of all sorts of URLs
58 # (plus assorted utilities etc.)
59 # (2) a set of functions for parsing URLs
60 # XXX Should these be separated out into different modules?
63 # Shortcut for basic usage
65 def urlopen(url
, data
=None):
66 """urlopen(url [, data]) -> open file-like object"""
69 _urlopener
= FancyURLopener()
71 return _urlopener
.open(url
)
73 return _urlopener
.open(url
, data
)
74 def urlretrieve(url
, filename
=None, reporthook
=None, data
=None):
77 _urlopener
= FancyURLopener()
78 return _urlopener
.retrieve(url
, filename
, reporthook
, data
)
86 """Class to open URLs.
87 This is a class rather than just a subroutine because we may need
88 more than one set of global protocol-specific options.
89 Note -- this is a base class for those who don't want the
90 automatic handling of errors type 302 (relocated) and 401
91 (authorization needed)."""
95 version
= "Python-urllib/%s" % __version__
98 def __init__(self
, proxies
=None, **x509
):
100 proxies
= getproxies()
101 assert hasattr(proxies
, 'has_key'), "proxies must be a mapping"
102 self
.proxies
= proxies
103 self
.key_file
= x509
.get('key_file')
104 self
.cert_file
= x509
.get('cert_file')
105 self
.addheaders
= [('User-agent', self
.version
)]
106 self
.__tempfiles
= []
107 self
.__unlink
= os
.unlink
# See cleanup()
108 self
.tempcache
= None
109 # Undocumented feature: if you assign {} to tempcache,
110 # it is used to cache files retrieved with
111 # self.retrieve(). This is not enabled by default
112 # since it does not work for changing documents (and I
113 # haven't got the logic to check expiration headers
115 self
.ftpcache
= ftpcache
116 # Undocumented feature: you can use a different
117 # ftp cache by assigning to the .ftpcache member;
118 # in case you want logically independent URL openers
119 # XXX This is not threadsafe. Bah.
128 # This code sometimes runs when the rest of this module
129 # has already been deleted, so it can't use any globals
130 # or import anything.
132 for file in self
.__tempfiles
:
137 del self
.__tempfiles
[:]
139 self
.tempcache
.clear()
def addheader(self, *args):
    """Register an extra header for the HTTP interface.

    The positional arguments are stored together as one tuple at the
    end of self.addheaders, e.g.

        u.addheader('Accept', 'sound/basic')

    appends ('Accept', 'sound/basic').
    """
    registered = self.addheaders
    registered.append(args)
147 def open(self
, fullurl
, data
=None):
148 """Use URLopener().open(file) instead of open(file, 'r')."""
149 fullurl
= unwrap(toBytes(fullurl
))
150 if self
.tempcache
and self
.tempcache
.has_key(fullurl
):
151 filename
, headers
= self
.tempcache
[fullurl
]
152 fp
= open(filename
, 'rb')
153 return addinfourl(fp
, headers
, fullurl
)
154 urltype
, url
= splittype(fullurl
)
157 if self
.proxies
.has_key(urltype
):
158 proxy
= self
.proxies
[urltype
]
159 urltype
, proxyhost
= splittype(proxy
)
160 host
, selector
= splithost(proxyhost
)
161 url
= (host
, fullurl
) # Signal special case to open_*()
164 name
= 'open_' + urltype
168 name
= '_'.join(name
.split('-'))
169 if not hasattr(self
, name
):
171 return self
.open_unknown_proxy(proxy
, fullurl
, data
)
173 return self
.open_unknown(fullurl
, data
)
176 return getattr(self
, name
)(url
)
178 return getattr(self
, name
)(url
, data
)
179 except socket
.error
, msg
:
180 raise IOError, ('socket error', msg
), sys
.exc_info()[2]
def open_unknown(self, fullurl, data=None):
    """Overridable hook for URLs whose scheme has no open_<type> method.

    This implementation never returns: it raises IOError with the
    arguments ('url error', 'unknown url type', scheme), where scheme
    is the first item returned by splittype(fullurl).  The data
    argument is accepted but not used here.
    """
    scheme, _rest = splittype(fullurl)
    raise IOError('url error', 'unknown url type', scheme)
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable hook for a proxy whose URL type cannot be opened.

    This implementation never returns: it raises IOError with the
    arguments ('url error', 'invalid proxy for <scheme>', proxy),
    where <scheme> is the first item returned by splittype(fullurl).
    The data argument is accepted but not used here.
    """
    scheme, _rest = splittype(fullurl)
    reason = 'invalid proxy for %s' % scheme
    raise IOError('url error', reason, proxy)
193 def retrieve(self
, url
, filename
=None, reporthook
=None, data
=None):
194 """retrieve(url) returns (filename, None) for a local object
195 or (tempfilename, headers) for a remote object."""
196 url
= unwrap(toBytes(url
))
197 if self
.tempcache
and self
.tempcache
.has_key(url
):
198 return self
.tempcache
[url
]
199 type, url1
= splittype(url
)
200 if not filename
and (not type or type == 'file'):
202 fp
= self
.open_local_file(url1
)
205 return url2pathname(splithost(url1
)[1]), hdrs
208 fp
= self
.open(url
, data
)
212 garbage
, path
= splittype(url
)
213 garbage
, path
= splithost(path
or "")
214 path
, garbage
= splitquery(path
or "")
215 path
, garbage
= splitattr(path
or "")
216 suffix
= os
.path
.splitext(path
)[1]
217 filename
= tempfile
.mktemp(suffix
)
218 self
.__tempfiles
.append(filename
)
219 result
= filename
, headers
220 if self
.tempcache
is not None:
221 self
.tempcache
[url
] = result
222 tfp
= open(filename
, 'wb')
227 if headers
.has_key("content-length"):
228 size
= int(headers
["Content-Length"])
229 reporthook(0, bs
, size
)
232 reporthook(1, bs
, size
)
236 blocknum
= blocknum
+ 1
238 reporthook(blocknum
, bs
, size
)
245 # Each method named open_<type> knows how to open that type of URL
247 def open_http(self
, url
, data
=None):
248 """Use HTTP protocol."""
251 if type(url
) is types
.StringType
:
252 host
, selector
= splithost(url
)
254 user_passwd
, host
= splituser(host
)
259 urltype
, rest
= splittype(selector
)
262 if urltype
.lower() != 'http':
265 realhost
, rest
= splithost(rest
)
267 user_passwd
, realhost
= splituser(realhost
)
269 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
270 #print "proxy via http:", host, selector
271 if not host
: raise IOError, ('http error', 'no host given')
274 auth
= base64
.encodestring(user_passwd
).strip()
277 h
= httplib
.HTTP(host
)
279 h
.putrequest('POST', selector
)
280 h
.putheader('Content-type', 'application/x-www-form-urlencoded')
281 h
.putheader('Content-length', '%d' % len(data
))
283 h
.putrequest('GET', selector
)
284 if auth
: h
.putheader('Authorization', 'Basic %s' % auth
)
285 if realhost
: h
.putheader('Host', realhost
)
286 for args
in self
.addheaders
: apply(h
.putheader
, args
)
289 h
.send(data
+ '\r\n')
290 errcode
, errmsg
, headers
= h
.getreply()
293 return addinfourl(fp
, headers
, "http:" + url
)
296 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
298 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
, data
)
300 def http_error(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
301 """Handle http errors.
302 Derived class can override this, or provide specific handlers
303 named http_error_DDD where DDD is the 3-digit error code."""
304 # First check if there's a specific handler for this error
305 name
= 'http_error_%d' % errcode
306 if hasattr(self
, name
):
307 method
= getattr(self
, name
)
309 result
= method(url
, fp
, errcode
, errmsg
, headers
)
311 result
= method(url
, fp
, errcode
, errmsg
, headers
, data
)
312 if result
: return result
313 return self
.http_error_default(url
, fp
, errcode
, errmsg
, headers
)
315 def http_error_default(self
, url
, fp
, errcode
, errmsg
, headers
):
316 """Default error handler: close the connection and raise IOError."""
319 raise IOError, ('http error', errcode
, errmsg
, headers
)
321 if hasattr(socket
, "ssl"):
def open_https(self, url, data=None):
    """Use HTTPS protocol.

    url is either a string of the form '//host/selector' or, when
    open() routes the request through a proxy, a (proxyhost, fullurl)
    tuple.  If data is not None the request is sent as a POST with
    data as an application/x-www-form-urlencoded body; otherwise a
    GET is sent.

    Returns an addinfourl object for a 200 reply.  Any other reply
    code is handed to self.http_error().  Raises IOError with the
    arguments ('https error', 'no host given') when no host can be
    determined.
    """
    import httplib
    user_passwd = None
    if type(url) is types.StringType:
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = unquote(host)
        realhost = host
    else:
        # Proxy case: connect to the proxy host and request the full URL.
        host, selector = url
        urltype, rest = splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'https':
            realhost = None
        else:
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                # Rebuild the selector without the user:passwd@ part.
                selector = "%s://%s%s" % (urltype, realhost, rest)
        #print "proxy via https:", host, selector
    if not host:
        raise IOError('https error', 'no host given')
    if user_passwd:
        import base64
        auth = base64.encodestring(user_passwd).strip()
    else:
        auth = None
    h = httplib.HTTPS(host, 0,
                      key_file=self.key_file,
                      cert_file=self.cert_file)
    if data is not None:
        h.putrequest('POST', selector)
        h.putheader('Content-type',
                    'application/x-www-form-urlencoded')
        h.putheader('Content-length', '%d' % len(data))
    else:
        h.putrequest('GET', selector)
    # putheader() takes the header name and its value as separate
    # arguments, exactly as open_http() passes them; fusing them into
    # one 'Authorization: Basic ...' string does not send a valid header.
    if auth:
        h.putheader('Authorization', 'Basic %s' % auth)
    if realhost:
        h.putheader('Host', realhost)
    for args in self.addheaders:
        apply(h.putheader, args)
    h.endheaders()
    if data is not None:
        h.send(data + '\r\n')
    errcode, errmsg, headers = h.getreply()
    fp = h.getfile()
    if errcode == 200:
        return addinfourl(fp, headers, url)
    else:
        if data is None:
            return self.http_error(url, fp, errcode, errmsg, headers)
        else:
            return self.http_error(url, fp, errcode, errmsg, headers,
                                   data)
379 def open_gopher(self
, url
):
380 """Use Gopher protocol."""
382 host
, selector
= splithost(url
)
383 if not host
: raise IOError, ('gopher error', 'no host given')
385 type, selector
= splitgophertype(selector
)
386 selector
, query
= splitquery(selector
)
387 selector
= unquote(selector
)
389 query
= unquote(query
)
390 fp
= gopherlib
.send_query(selector
, query
, host
)
392 fp
= gopherlib
.send_selector(selector
, host
)
393 return addinfourl(fp
, noheaders(), "gopher:" + url
)
def open_file(self, url):
    """Dispatch a file: URL to the local-file or the FTP opener.

    A URL that starts with exactly two slashes ('//host/...') is
    passed to self.open_ftp(); anything else, including the
    three-slash form '///path', is passed to self.open_local_file().
    """
    # Guard clause: no leading '//' at all, or a third slash right
    # after it, means the local-file opener is used.
    if url[:2] != '//' or url[2:3] == '/':
        return self.open_local_file(url)
    return self.open_ftp(url)
402 def open_local_file(self
, url
):
403 """Use local file."""
404 import mimetypes
, mimetools
, StringIO
405 mtype
= mimetypes
.guess_type(url
)[0]
406 headers
= mimetools
.Message(StringIO
.StringIO(
407 'Content-Type: %s\n' % (mtype
or 'text/plain')))
408 host
, file = splithost(url
)
412 urlfile
= 'file://' + file
413 return addinfourl(open(url2pathname(file), 'rb'),
415 host
, port
= splitport(host
)
417 and socket
.gethostbyname(host
) in (localhost(), thishost()):
420 urlfile
= 'file://' + file
421 return addinfourl(open(url2pathname(file), 'rb'),
423 raise IOError, ('local file error', 'not on local host')
425 def open_ftp(self
, url
):
426 """Use FTP protocol."""
427 host
, path
= splithost(url
)
428 if not host
: raise IOError, ('ftp error', 'no host given')
429 host
, port
= splitport(host
)
430 user
, host
= splituser(host
)
431 if user
: user
, passwd
= splitpasswd(user
)
434 user
= unquote(user
or '')
435 passwd
= unquote(passwd
or '')
436 host
= socket
.gethostbyname(host
)
439 port
= ftplib
.FTP_PORT
442 path
, attrs
= splitattr(path
)
444 dirs
= path
.split('/')
445 dirs
, file = dirs
[:-1], dirs
[-1]
446 if dirs
and not dirs
[0]: dirs
= dirs
[1:]
447 if dirs
and not dirs
[0]: dirs
[0] = '/'
448 key
= user
, host
, port
, '/'.join(dirs
)
450 if len(self
.ftpcache
) > MAXFTPCACHE
:
451 # Prune the cache, rather arbitrarily
452 for k
in self
.ftpcache
.keys():
458 if not self
.ftpcache
.has_key(key
):
459 self
.ftpcache
[key
] = \
460 ftpwrapper(user
, passwd
, host
, port
, dirs
)
461 if not file: type = 'D'
464 attr
, value
= splitvalue(attr
)
465 if attr
.lower() == 'type' and \
466 value
in ('a', 'A', 'i', 'I', 'd', 'D'):
468 (fp
, retrlen
) = self
.ftpcache
[key
].retrfile(file, type)
469 if retrlen
is not None and retrlen
>= 0:
470 import mimetools
, StringIO
471 headers
= mimetools
.Message(StringIO
.StringIO(
472 'Content-Length: %d\n' % retrlen
))
474 headers
= noheaders()
475 return addinfourl(fp
, headers
, "ftp:" + url
)
476 except ftperrors(), msg
:
477 raise IOError, ('ftp error', msg
), sys
.exc_info()[2]
479 def open_data(self
, url
, data
=None):
480 """Use "data" URL."""
483 # syntax of data URLs:
484 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
485 # mediatype := [ type "/" subtype ] *( ";" parameter )
487 # parameter := attribute "=" value
488 import StringIO
, mimetools
, time
490 [type, data
] = url
.split(',', 1)
492 raise IOError, ('data error', 'bad data URL')
494 type = 'text/plain;charset=US-ASCII'
495 semi
= type.rfind(';')
496 if semi
>= 0 and '=' not in type[semi
:]:
497 encoding
= type[semi
+1:]
502 msg
.append('Date: %s'%time
.strftime('%a, %d %b %Y %T GMT',
503 time
.gmtime(time
.time())))
504 msg
.append('Content-type: %s' % type)
505 if encoding
== 'base64':
507 data
= base64
.decodestring(data
)
510 msg
.append('Content-length: %d' % len(data
))
514 f
= StringIO
.StringIO(msg
)
515 headers
= mimetools
.Message(f
, 0)
516 f
.fileno
= None # needed for addinfourl
517 return addinfourl(f
, headers
, url
)
520 class FancyURLopener(URLopener
):
521 """Derived class with handlers for errors we can handle (perhaps)."""
523 def __init__(self
, *args
):
524 apply(URLopener
.__init
__, (self
,) + args
)
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handling -- hand the reply back instead of raising.

    fp and headers are wrapped in an addinfourl object whose URL is
    'http:' + url.  errcode and errmsg are accepted but not used here.
    """
    reply_url = "http:" + url
    return addinfourl(fp, headers, reply_url)
533 def http_error_302(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
534 """Error 302 -- relocated (temporarily)."""
536 if self
.maxtries
and self
.tries
>= self
.maxtries
:
537 if hasattr(self
, "http_error_500"):
538 meth
= self
.http_error_500
540 meth
= self
.http_error_default
542 return meth(url
, fp
, 500,
543 "Internal Server Error: Redirect Recursion", headers
)
544 result
= self
.redirect_internal(url
, fp
, errcode
, errmsg
, headers
,
549 def redirect_internal(self
, url
, fp
, errcode
, errmsg
, headers
, data
):
550 if headers
.has_key('location'):
551 newurl
= headers
['location']
552 elif headers
.has_key('uri'):
553 newurl
= headers
['uri']
558 # In case the server sent a relative URL, join with original:
559 newurl
= basejoin("http:" + url
, newurl
)
561 return self
.open(newurl
)
563 return self
.open(newurl
, data
)
def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 301 -- relocated (permanently).

    Handled exactly like a 302: all arguments are forwarded unchanged,
    positionally, to self.http_error_302() and its result is returned.
    """
    handle_redirect = self.http_error_302
    return handle_redirect(url, fp, errcode, errmsg, headers, data)
569 def http_error_401(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
570 """Error 401 -- authentication required.
571 See this URL for a description of the basic authentication scheme:
572 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
573 if not headers
.has_key('www-authenticate'):
574 URLopener
.http_error_default(self
, url
, fp
,
576 stuff
= headers
['www-authenticate']
578 match
= re
.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff
)
580 URLopener
.http_error_default(self
, url
, fp
,
581 errcode
, errmsg
, headers
)
582 scheme
, realm
= match
.groups()
583 if scheme
.lower() != 'basic':
584 URLopener
.http_error_default(self
, url
, fp
,
585 errcode
, errmsg
, headers
)
586 name
= 'retry_' + self
.type + '_basic_auth'
588 return getattr(self
,name
)(url
, realm
)
590 return getattr(self
,name
)(url
, realm
, data
)
592 def retry_http_basic_auth(self
, url
, realm
, data
=None):
593 host
, selector
= splithost(url
)
594 i
= host
.find('@') + 1
596 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
597 if not (user
or passwd
): return None
598 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
599 newurl
= 'http://' + host
+ selector
601 return self
.open(newurl
)
603 return self
.open(newurl
, data
)
605 def retry_https_basic_auth(self
, url
, realm
, data
=None):
606 host
, selector
= splithost(url
)
607 i
= host
.find('@') + 1
609 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
610 if not (user
or passwd
): return None
611 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
612 newurl
= '//' + host
+ selector
613 return self
.open_https(newurl
, data
)
615 def get_user_passwd(self
, host
, realm
, clear_cache
= 0):
616 key
= realm
+ '@' + host
.lower()
617 if self
.auth_cache
.has_key(key
):
619 del self
.auth_cache
[key
]
621 return self
.auth_cache
[key
]
622 user
, passwd
= self
.prompt_user_passwd(host
, realm
)
623 if user
or passwd
: self
.auth_cache
[key
] = (user
, passwd
)
626 def prompt_user_passwd(self
, host
, realm
):
627 """Override this in a GUI environment!"""
630 user
= raw_input("Enter username for %s at %s: " % (realm
,
632 passwd
= getpass
.getpass("Enter password for %s in %s at %s: " %
635 except KeyboardInterrupt:
644 """Return the IP address of the magic hostname 'localhost'."""
647 _localhost
= socket
.gethostbyname('localhost')
652 """Return the IP address of the current host."""
655 _thishost
= socket
.gethostbyname(socket
.gethostname())
660 """Return the set of errors raised by the FTP class."""
664 _ftperrors
= ftplib
.all_errors
669 """Return an empty mimetools.Message object."""
674 _noheaders
= mimetools
.Message(StringIO
.StringIO(), 0)
675 _noheaders
.fp
.close() # Recycle file descriptor
682 """Class used by open_ftp() for cache of open FTP connections."""
684 def __init__(self
, user
, passwd
, host
, port
, dirs
):
695 self
.ftp
= ftplib
.FTP()
696 self
.ftp
.connect(self
.host
, self
.port
)
697 self
.ftp
.login(self
.user
, self
.passwd
)
698 for dir in self
.dirs
:
701 def retrfile(self
, file, type):
704 if type in ('d', 'D'): cmd
= 'TYPE A'; isdir
= 1
705 else: cmd
= 'TYPE ' + type; isdir
= 0
707 self
.ftp
.voidcmd(cmd
)
708 except ftplib
.all_errors
:
710 self
.ftp
.voidcmd(cmd
)
712 if file and not isdir
:
713 # Use nlst to see if the file exists at all
716 except ftplib
.error_perm
, reason
:
717 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
718 # Restore the transfer mode!
719 self
.ftp
.voidcmd(cmd
)
720 # Try to retrieve as a file
723 conn
= self
.ftp
.ntransfercmd(cmd
)
724 except ftplib
.error_perm
, reason
:
725 if str(reason
)[:3] != '550':
726 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
728 # Set transfer mode to ASCII!
729 self
.ftp
.voidcmd('TYPE A')
730 # Try a directory listing
731 if file: cmd
= 'LIST ' + file
733 conn
= self
.ftp
.ntransfercmd(cmd
)
735 # Pass back both a suitably decorated object and a retrieval length
736 return (addclosehook(conn
[0].makefile('rb'),
737 self
.endtransfer
), conn
[1])
738 def endtransfer(self
):
755 """Base class for addinfo and addclosehook."""
757 def __init__(self
, fp
):
759 self
.read
= self
.fp
.read
760 self
.readline
= self
.fp
.readline
761 if hasattr(self
.fp
, "readlines"): self
.readlines
= self
.fp
.readlines
762 if hasattr(self
.fp
, "fileno"): self
.fileno
= self
.fp
.fileno
765 return '<%s at %s whose fp = %s>' % (self
.__class
__.__name
__,
766 `
id(self
)`
, `self
.fp`
)
771 self
.readlines
= None
773 if self
.fp
: self
.fp
.close()
776 class addclosehook(addbase
):
777 """Class to add a close hook to an open file."""
779 def __init__(self
, fp
, closehook
, *hookargs
):
780 addbase
.__init
__(self
, fp
)
781 self
.closehook
= closehook
782 self
.hookargs
= hookargs
787 apply(self
.closehook
, self
.hookargs
)
788 self
.closehook
= None
791 class addinfo(addbase
):
792 """class to add an info() method to an open file."""
794 def __init__(self
, fp
, headers
):
795 addbase
.__init
__(self
, fp
)
796 self
.headers
= headers
801 class addinfourl(addbase
):
802 """class to add info() and geturl() methods to an open file."""
804 def __init__(self
, fp
, headers
, url
):
805 addbase
.__init
__(self
, fp
)
806 self
.headers
= headers
816 def basejoin(base
, url
):
817 """Utility to combine a URL with a base URL to form a new URL."""
818 type, path
= splittype(url
)
820 # if url is complete (i.e., it contains a type), return it
822 host
, path
= splithost(path
)
823 type, basepath
= splittype(base
) # inherit type from base
825 # if url contains host, just inherit type
826 if type: return type + '://' + host
+ path
828 # no type inherited, so url must have started with //
831 host
, basepath
= splithost(basepath
) # inherit host
832 basepath
, basetag
= splittag(basepath
) # remove extraneous cruft
833 basepath
, basequery
= splitquery(basepath
) # idem
835 # non-absolute path name
836 if path
[:1] in ('#', '?'):
837 # path is just a tag or query, attach to basepath
840 # else replace last component
841 i
= basepath
.rfind('/')
843 # basepath not absolute
845 # host present, make absolute
848 # else keep non-absolute
851 # remove last file component
852 basepath
= basepath
[:i
+1]
853 # Interpret ../ (important because of symlinks)
854 while basepath
and path
[:3] == '../':
856 i
= basepath
[:-1].rfind('/')
858 basepath
= basepath
[:i
+1]
865 path
= basepath
+ path
866 if type and host
: return type + '://' + host
+ path
867 elif type: return type + ':' + path
868 elif host
: return '//' + host
+ path
# don't know what this means
872 # Utilities to parse URLs (most of these return None for missing parts):
873 # unwrap('<URL:type://host/path>') --> 'type://host/path'
874 # splittype('type:opaquestring') --> 'type', 'opaquestring'
875 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
876 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
877 # splitpasswd('user:passwd') -> 'user', 'passwd'
878 # splitport('host:port') --> 'host', 'port'
879 # splitquery('/path?query') --> '/path', 'query'
880 # splittag('/path#tag') --> '/path', 'tag'
881 # splitattr('/path;attr1=value1;attr2=value2;...') ->
882 # '/path', ['attr1=value1', 'attr2=value2', ...]
883 # splitvalue('attr=value') --> 'attr', 'value'
884 # splitgophertype('/Xselector') --> 'X', 'selector'
885 # unquote('abc%20def') -> 'abc def'
886 # quote('abc def') -> 'abc%20def'
889 """toBytes(u"URL") --> 'URL'."""
890 # Most URL schemes require ASCII. If that changes, the conversion
892 if type(url
) is types
.UnicodeType
:
894 url
= url
.encode("ASCII")
896 raise UnicodeError("URL " + repr(url
) +
897 " contains non-ASCII characters")
901 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
903 if url
[:1] == '<' and url
[-1:] == '>':
904 url
= url
[1:-1].strip()
905 if url
[:4] == 'URL:': url
= url
[4:].strip()
910 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
912 if _typeprog
is None:
914 _typeprog
= re
.compile('^([^/:]+):')
916 match
= _typeprog
.match(url
)
918 scheme
= match
.group(1)
919 return scheme
.lower(), url
[len(scheme
) + 1:]
924 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
926 if _hostprog
is None:
928 _hostprog
= re
.compile('^//([^/]*)(.*)$')
930 match
= _hostprog
.match(url
)
931 if match
: return match
.group(1, 2)
936 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
938 if _userprog
is None:
940 _userprog
= re
.compile('^([^@]*)@(.*)$')
942 match
= _userprog
.match(host
)
943 if match
: return map(unquote
, match
.group(1, 2))
947 def splitpasswd(user
):
948 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
950 if _passwdprog
is None:
952 _passwdprog
= re
.compile('^([^:]*):(.*)$')
954 match
= _passwdprog
.match(user
)
955 if match
: return match
.group(1, 2)
958 # splitport('host:port') --> 'host', 'port'
961 """splitport('host:port') --> 'host', 'port'."""
963 if _portprog
is None:
965 _portprog
= re
.compile('^(.*):([0-9]+)$')
967 match
= _portprog
.match(host
)
968 if match
: return match
.group(1, 2)
972 def splitnport(host
, defport
=-1):
973 """Split host and port, returning numeric port.
974 Return given default port if no ':' found; defaults to -1.
975 Return numerical port if a valid number is found after ':'.
976 Return None if ':' but not a valid number."""
978 if _nportprog
is None:
980 _nportprog
= re
.compile('^(.*):(.*)$')
982 match
= _nportprog
.match(host
)
984 host
, port
= match
.group(1, 2)
986 if not port
: raise ValueError, "no digits"
995 """splitquery('/path?query') --> '/path', 'query'."""
997 if _queryprog
is None:
999 _queryprog
= re
.compile('^(.*)\?([^?]*)$')
1001 match
= _queryprog
.match(url
)
1002 if match
: return match
.group(1, 2)
1007 """splittag('/path#tag') --> '/path', 'tag'."""
1009 if _tagprog
is None:
1011 _tagprog
= re
.compile('^(.*)#([^#]*)$')
1013 match
= _tagprog
.match(url
)
1014 if match
: return match
.group(1, 2)
1018 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1019 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1020 words
= url
.split(';')
1021 return words
[0], words
[1:]
1024 def splitvalue(attr
):
1025 """splitvalue('attr=value') --> 'attr', 'value'."""
1027 if _valueprog
is None:
1029 _valueprog
= re
.compile('^([^=]*)=(.*)$')
1031 match
= _valueprog
.match(attr
)
1032 if match
: return match
.group(1, 2)
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    The type character is the one right after a leading '/'.  When the
    selector does not start with '/', or nothing follows the '/', the
    result is (None, selector) with the selector left unchanged.
    """
    has_type_char = selector[:1] == '/' and selector[1:2]
    if not has_type_char:
        return None, selector
    gopher_type = selector[1]
    remainder = selector[2:]
    return gopher_type, remainder
1042 """unquote('abc%20def') -> 'abc def'."""
1047 myappend
= res
.append
1052 myappend(mychr(myatoi(item
[:2], 16))
1055 myappend('%' + item
)
1057 myappend('%' + item
)
1060 def unquote_plus(s
):
1061 """unquote_plus('%7e/abc+def') -> '~/abc def'"""
1063 # replace '+' with ' '
1064 s
= ' '.join(s
.split('+'))
1067 always_safe
= ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1068 'abcdefghijklmnopqrstuvwxyz'
1071 _fast_safe_test
= always_safe
+ '/'
1076 if _fast_safe
is None:
1078 for c
in _fast_safe_test
:
1081 for i
in range(len(res
)):
1083 if not _fast_safe
.has_key(c
):
1084 res
[i
] = '%%%02X' % ord(c
)
1087 def quote(s
, safe
= '/'):
1088 """quote('abc def') -> 'abc%20def'
1090 Each part of a URL, e.g. the path info, the query, etc., has a
1091 different set of reserved characters that must be quoted.
1093 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1094 the following reserved characters.
1096 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1099 Each of these characters is reserved in some component of a URL,
1100 but not necessarily in all of them.
1102 By default, the quote function is intended for quoting the path
1103 section of a URL. Thus, it will not encode '/'. This character
1104 is reserved, but in typical usage the quote function is being
1105 called on a path where the existing slash characters are used as
1106 reserved characters.
1108 safe
= always_safe
+ safe
1109 if _fast_safe_test
== safe
:
1110 return _fast_quote(s
)
1112 for i
in range(len(res
)):
1115 res
[i
] = '%%%02X' % ord(c
)
1118 def quote_plus(s
, safe
= ''):
1119 """Quote the query fragment of a URL; replacing ' ' with '+'"""
1122 for i
in range(len(l
)):
1123 l
[i
] = quote(l
[i
], safe
)
1126 return quote(s
, safe
)
1128 def urlencode(query
,doseq
=0):
1129 """Encode a sequence of two-element tuples or dictionary into a URL query string.
1131 If any values in the query arg are sequences and doseq is true, each
1132 sequence element is converted to a separate parameter.
1134 If the query arg is a sequence of two-element tuples, the order of the
1135 parameters in the output will match the order of parameters in the
1139 if hasattr(query
,"items"):
1141 query
= query
.items()
1143 # it's a bother at times that strings and string-like objects are
1146 # non-sequence items should not work with len()
1148 # non-empty strings will fail this
1149 if len(query
) and type(query
[0]) != types
.TupleType
:
1151 # zero-length sequences of all types will get here and succeed,
1152 # but that's a minor nit - since the original implementation
1153 # allowed empty dicts that type of behavior probably should be
1154 # preserved for consistency
1156 ty
,va
,tb
= sys
.exc_info()
1157 raise TypeError, "not a valid non-string sequence or mapping object", tb
1161 # preserve old behavior
1163 k
= quote_plus(str(k
))
1164 v
= quote_plus(str(v
))
1165 l
.append(k
+ '=' + v
)
1168 k
= quote_plus(str(k
))
1169 if type(v
) == types
.StringType
:
1171 l
.append(k
+ '=' + v
)
1172 elif type(v
) == types
.UnicodeType
:
1173 # is there a reasonable way to convert to ASCII?
1174 # encode generates a string, but "replace" or "ignore"
1175 # lose information and "strict" can raise UnicodeError
1176 v
= quote_plus(v
.encode("ASCII","replace"))
1177 l
.append(k
+ '=' + v
)
1180 # is this a sufficient test for sequence-ness?
1184 v
= quote_plus(str(v
))
1185 l
.append(k
+ '=' + v
)
1187 # loop over the sequence
1189 l
.append(k
+ '=' + quote_plus(str(elt
)))
1193 def getproxies_environment():
1194 """Return a dictionary of scheme -> proxy server URL mappings.
1196 Scan the environment for variables named <scheme>_proxy;
1197 this seems to be the standard convention. If you need a
1198 different way, you can pass a proxies dictionary to the
1199 [Fancy]URLopener constructor.
1203 for name
, value
in os
.environ
.items():
1205 if value
and name
[-6:] == '_proxy':
1206 proxies
[name
[:-6]] = value
1209 if os
.name
== 'mac':
1211 """Return a dictionary of scheme -> proxy server URL mappings.
1213 By convention the mac uses Internet Config to store
1214 proxies. An HTTP proxy, for instance, is stored under
1229 if config
.has_key('UseHTTPProxy') and config
['UseHTTPProxy']:
1231 value
= config
['HTTPProxyHost']
1235 proxies
['http'] = 'http://%s' % value
1236 # FTP: XXXX To be done.
1237 # Gopher: XXXX To be done.
1240 elif os
.name
== 'nt':
1241 def getproxies_registry():
1242 """Return a dictionary of scheme -> proxy server URL mappings.
1244 Win32 uses the registry to store proxies.
1251 # Std module, so should be around - but you never know!
1254 internetSettings
= _winreg
.OpenKey(_winreg
.HKEY_CURRENT_USER
,
1255 r
'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1256 proxyEnable
= _winreg
.QueryValueEx(internetSettings
,
1259 # Returned as Unicode but problems if not converted to ASCII
1260 proxyServer
= str(_winreg
.QueryValueEx(internetSettings
,
1262 if '=' in proxyServer
:
1263 # Per-protocol settings
1264 for p
in proxyServer
.split(';'):
1265 protocol
, address
= p
.split('=', 1)
1266 proxies
[protocol
] = '%s://%s' % (protocol
, address
)
1268 # Use one setting for all protocols
1269 if proxyServer
[:5] == 'http:':
1270 proxies
['http'] = proxyServer
1272 proxies
['http'] = 'http://%s' % proxyServer
1273 proxies
['ftp'] = 'ftp://%s' % proxyServer
1274 internetSettings
.Close()
1275 except (WindowsError, ValueError, TypeError):
1276 # Either registry key not found etc, or the value in an
1277 # unexpected format.
1278 # proxies already set up to be empty so nothing to do
1283 """Return a dictionary of scheme -> proxy server URL mappings.
1285 Returns settings gathered from the environment, if specified,
1289 return getproxies_environment() or getproxies_registry()
1291 # By default use environment variables
1292 getproxies
= getproxies_environment
1295 # Test and time quote() and unquote()
1299 for i
in range(256): s
= s
+ chr(i
)
1310 print round(t1
- t0
, 3), 'sec'
1313 def reporthook(blocknum
, blocksize
, totalsize
):
1314 # Report during remote transfers
1315 print "Block number: %d, Block size: %d, Total size: %d" % (
1316 blocknum
, blocksize
, totalsize
)
1324 'file://localhost/etc/passwd',
1325 'ftp://ftp.python.org/etc/passwd',
1326 ## 'gopher://gopher.micro.umn.edu/1/',
1327 'http://www.python.org/index.html',
1329 if hasattr(URLopener
, "open_https"):
1330 args
.append('https://synergy.as.cmu.edu/~geek/')
1333 print '-'*10, url
, '-'*10
1334 fn
, h
= urlretrieve(url
, None, reporthook
)
1338 for k
in h
.keys(): print k
+ ':', h
[k
]
1344 table
= string
.maketrans("", "")
1345 data
= data
.translate(table
, "\r")
1355 opts
, args
= getopt
.getopt(sys
.argv
[1:], "th")
1356 except getopt
.error
, msg
:
1358 print "Use -h for help"
1365 print "Usage: python urllib.py [-t] [url ...]"
1366 print "-t runs self-test;",
1367 print "otherwise, contents of urls are printed"
1375 print "Use -h for help"
1377 print urlopen(url
).read(),
1379 # Run test program when run as a script
1380 if __name__
== '__main__':