1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that is has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
# Names exported on "from urllib import *".
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

__version__ = '1.15'    # XXX This version is not always updated :-(

# Trim the ftp cache beyond this size
MAXFTPCACHE = 10
43 # Helper for non-unix systems
45 from macurl2path
import url2pathname
, pathname2url
47 from nturl2path
import url2pathname
, pathname2url
48 elif os
.name
== 'riscos':
49 from rourl2path
import url2pathname
, pathname2url
def url2pathname(pathname):
    """Generic fallback: map a URL path to a local path by unquoting.

    NOTE(review): this is the branch used when no platform-specific
    converter (macurl2path/nturl2path/rourl2path) applies.
    """
    return unquote(pathname)

def pathname2url(pathname):
    """Generic fallback, inverse of url2pathname(): quote a local path."""
    return quote(pathname)
56 # This really consists of two pieces:
57 # (1) a class which handles opening of all sorts of URLs
58 # (plus assorted utilities etc.)
59 # (2) a set of functions for parsing URLs
60 # XXX Should these be separated out into different modules?
63 # Shortcut for basic usage
65 def urlopen(url
, data
=None, proxies
=None):
66 """urlopen(url [, data]) -> open file-like object"""
68 if proxies
is not None:
69 opener
= FancyURLopener(proxies
=proxies
)
71 opener
= FancyURLopener()
76 return opener
.open(url
)
78 return opener
.open(url
, data
)
79 def urlretrieve(url
, filename
=None, reporthook
=None, data
=None):
82 _urlopener
= FancyURLopener()
83 return _urlopener
.retrieve(url
, filename
, reporthook
, data
)
91 """Class to open URLs.
92 This is a class rather than just a subroutine because we may need
93 more than one set of global protocol-specific options.
94 Note -- this is a base class for those who don't want the
95 automatic handling of errors type 302 (relocated) and 401
96 (authorization needed)."""
100 version
= "Python-urllib/%s" % __version__
103 def __init__(self
, proxies
=None, **x509
):
105 proxies
= getproxies()
106 assert hasattr(proxies
, 'has_key'), "proxies must be a mapping"
107 self
.proxies
= proxies
108 self
.key_file
= x509
.get('key_file')
109 self
.cert_file
= x509
.get('cert_file')
110 self
.addheaders
= [('User-agent', self
.version
)]
111 self
.__tempfiles
= []
112 self
.__unlink
= os
.unlink
# See cleanup()
113 self
.tempcache
= None
114 # Undocumented feature: if you assign {} to tempcache,
115 # it is used to cache files retrieved with
116 # self.retrieve(). This is not enabled by default
117 # since it does not work for changing documents (and I
118 # haven't got the logic to check expiration headers
120 self
.ftpcache
= ftpcache
121 # Undocumented feature: you can use a different
122 # ftp cache by assigning to the .ftpcache member;
123 # in case you want logically independent URL openers
124 # XXX This is not threadsafe. Bah.
133 # This code sometimes runs when the rest of this module
134 # has already been deleted, so it can't use any globals
135 # or import anything.
137 for file in self
.__tempfiles
:
142 del self
.__tempfiles
[:]
144 self
.tempcache
.clear()
def addheader(self, *args):
    """Add a header to be used by the HTTP interface only
    e.g. u.addheader('Accept', 'sound/basic')"""
    # Each call appends one (name, value, ...) tuple; nothing
    # deduplicates repeated header names.
    self.addheaders += [args]
152 def open(self
, fullurl
, data
=None):
153 """Use URLopener().open(file) instead of open(file, 'r')."""
154 fullurl
= unwrap(toBytes(fullurl
))
155 if self
.tempcache
and fullurl
in self
.tempcache
:
156 filename
, headers
= self
.tempcache
[fullurl
]
157 fp
= open(filename
, 'rb')
158 return addinfourl(fp
, headers
, fullurl
)
159 urltype
, url
= splittype(fullurl
)
162 if urltype
in self
.proxies
:
163 proxy
= self
.proxies
[urltype
]
164 urltype
, proxyhost
= splittype(proxy
)
165 host
, selector
= splithost(proxyhost
)
166 url
= (host
, fullurl
) # Signal special case to open_*()
169 name
= 'open_' + urltype
173 name
= '_'.join(name
.split('-'))
174 if not hasattr(self
, name
):
176 return self
.open_unknown_proxy(proxy
, fullurl
, data
)
178 return self
.open_unknown(fullurl
, data
)
181 return getattr(self
, name
)(url
)
183 return getattr(self
, name
)(url
, data
)
184 except socket
.error
, msg
:
185 raise IOError, ('socket error', msg
), sys
.exc_info()[2]
187 def open_unknown(self
, fullurl
, data
=None):
188 """Overridable interface to open unknown URL type."""
189 type, url
= splittype(fullurl
)
190 raise IOError, ('url error', 'unknown url type', type)
192 def open_unknown_proxy(self
, proxy
, fullurl
, data
=None):
193 """Overridable interface to open unknown URL type."""
194 type, url
= splittype(fullurl
)
195 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy
)
198 def retrieve(self
, url
, filename
=None, reporthook
=None, data
=None):
199 """retrieve(url) returns (filename, None) for a local object
200 or (tempfilename, headers) for a remote object."""
201 url
= unwrap(toBytes(url
))
202 if self
.tempcache
and url
in self
.tempcache
:
203 return self
.tempcache
[url
]
204 type, url1
= splittype(url
)
205 if filename
is None and (not type or type == 'file'):
207 fp
= self
.open_local_file(url1
)
210 return url2pathname(splithost(url1
)[1]), hdrs
213 fp
= self
.open(url
, data
)
216 tfp
= open(filename
, 'wb')
219 garbage
, path
= splittype(url
)
220 garbage
, path
= splithost(path
or "")
221 path
, garbage
= splitquery(path
or "")
222 path
, garbage
= splitattr(path
or "")
223 suffix
= os
.path
.splitext(path
)[1]
224 (fd
, filename
) = tempfile
.mkstemp(suffix
)
225 self
.__tempfiles
.append(filename
)
226 tfp
= os
.fdopen(fd
, 'wb')
227 result
= filename
, headers
228 if self
.tempcache
is not None:
229 self
.tempcache
[url
] = result
234 if "content-length" in headers
:
235 size
= int(headers
["Content-Length"])
236 reporthook(0, bs
, size
)
239 reporthook(1, bs
, size
)
243 blocknum
= blocknum
+ 1
245 reporthook(blocknum
, bs
, size
)
252 # Each method named open_<type> knows how to open that type of URL
254 def open_http(self
, url
, data
=None):
255 """Use HTTP protocol."""
258 if isinstance(url
, str):
259 host
, selector
= splithost(url
)
261 user_passwd
, host
= splituser(host
)
266 urltype
, rest
= splittype(selector
)
269 if urltype
.lower() != 'http':
272 realhost
, rest
= splithost(rest
)
274 user_passwd
, realhost
= splituser(realhost
)
276 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
277 if proxy_bypass(realhost
):
280 #print "proxy via http:", host, selector
281 if not host
: raise IOError, ('http error', 'no host given')
284 auth
= base64
.encodestring(user_passwd
).strip()
287 h
= httplib
.HTTP(host
)
289 h
.putrequest('POST', selector
)
290 h
.putheader('Content-type', 'application/x-www-form-urlencoded')
291 h
.putheader('Content-length', '%d' % len(data
))
293 h
.putrequest('GET', selector
)
294 if auth
: h
.putheader('Authorization', 'Basic %s' % auth
)
295 if realhost
: h
.putheader('Host', realhost
)
296 for args
in self
.addheaders
: apply(h
.putheader
, args
)
300 errcode
, errmsg
, headers
= h
.getreply()
303 return addinfourl(fp
, headers
, "http:" + url
)
306 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
308 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
, data
)
310 def http_error(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
311 """Handle http errors.
312 Derived class can override this, or provide specific handlers
313 named http_error_DDD where DDD is the 3-digit error code."""
314 # First check if there's a specific handler for this error
315 name
= 'http_error_%d' % errcode
316 if hasattr(self
, name
):
317 method
= getattr(self
, name
)
319 result
= method(url
, fp
, errcode
, errmsg
, headers
)
321 result
= method(url
, fp
, errcode
, errmsg
, headers
, data
)
322 if result
: return result
323 return self
.http_error_default(url
, fp
, errcode
, errmsg
, headers
)
325 def http_error_default(self
, url
, fp
, errcode
, errmsg
, headers
):
326 """Default error handler: close the connection and raise IOError."""
329 raise IOError, ('http error', errcode
, errmsg
, headers
)
331 if hasattr(socket
, "ssl"):
332 def open_https(self
, url
, data
=None):
333 """Use HTTPS protocol."""
336 if isinstance(url
, str):
337 host
, selector
= splithost(url
)
339 user_passwd
, host
= splituser(host
)
344 urltype
, rest
= splittype(selector
)
347 if urltype
.lower() != 'https':
350 realhost
, rest
= splithost(rest
)
352 user_passwd
, realhost
= splituser(realhost
)
354 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
355 #print "proxy via https:", host, selector
356 if not host
: raise IOError, ('https error', 'no host given')
359 auth
= base64
.encodestring(user_passwd
).strip()
362 h
= httplib
.HTTPS(host
, 0,
363 key_file
=self
.key_file
,
364 cert_file
=self
.cert_file
)
366 h
.putrequest('POST', selector
)
367 h
.putheader('Content-type',
368 'application/x-www-form-urlencoded')
369 h
.putheader('Content-length', '%d' % len(data
))
371 h
.putrequest('GET', selector
)
372 if auth
: h
.putheader('Authorization: Basic %s' % auth
)
373 if realhost
: h
.putheader('Host', realhost
)
374 for args
in self
.addheaders
: apply(h
.putheader
, args
)
378 errcode
, errmsg
, headers
= h
.getreply()
381 return addinfourl(fp
, headers
, "https:" + url
)
384 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
386 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
,
389 def open_gopher(self
, url
):
390 """Use Gopher protocol."""
392 host
, selector
= splithost(url
)
393 if not host
: raise IOError, ('gopher error', 'no host given')
395 type, selector
= splitgophertype(selector
)
396 selector
, query
= splitquery(selector
)
397 selector
= unquote(selector
)
399 query
= unquote(query
)
400 fp
= gopherlib
.send_query(selector
, query
, host
)
402 fp
= gopherlib
.send_selector(selector
, host
)
403 return addinfourl(fp
, noheaders(), "gopher:" + url
)
def open_file(self, url):
    """Use local file or FTP depending on form of URL."""
    # A '//host/...' URL with a non-empty host other than 'localhost'
    # is fetched over FTP; anything else comes from the local filesystem.
    looks_remote = (url[:2] == '//'
                    and url[2:3] != '/'
                    and url[2:12].lower() != 'localhost/')
    if looks_remote:
        return self.open_ftp(url)
    return self.open_local_file(url)
412 def open_local_file(self
, url
):
413 """Use local file."""
414 import mimetypes
, mimetools
, rfc822
, StringIO
415 host
, file = splithost(url
)
416 localname
= url2pathname(file)
418 stats
= os
.stat(localname
)
420 raise IOError(e
.errno
, e
.strerror
, e
.filename
)
422 modified
= rfc822
.formatdate(stats
.st_mtime
)
423 mtype
= mimetypes
.guess_type(url
)[0]
424 headers
= mimetools
.Message(StringIO
.StringIO(
425 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
426 (mtype
or 'text/plain', size
, modified
)))
430 urlfile
= 'file://' + file
431 return addinfourl(open(localname
, 'rb'),
433 host
, port
= splitport(host
)
435 and socket
.gethostbyname(host
) in (localhost(), thishost()):
438 urlfile
= 'file://' + file
439 return addinfourl(open(localname
, 'rb'),
441 raise IOError, ('local file error', 'not on local host')
443 def open_ftp(self
, url
):
444 """Use FTP protocol."""
445 import mimetypes
, mimetools
, StringIO
446 host
, path
= splithost(url
)
447 if not host
: raise IOError, ('ftp error', 'no host given')
448 host
, port
= splitport(host
)
449 user
, host
= splituser(host
)
450 if user
: user
, passwd
= splitpasswd(user
)
453 user
= unquote(user
or '')
454 passwd
= unquote(passwd
or '')
455 host
= socket
.gethostbyname(host
)
458 port
= ftplib
.FTP_PORT
461 path
, attrs
= splitattr(path
)
463 dirs
= path
.split('/')
464 dirs
, file = dirs
[:-1], dirs
[-1]
465 if dirs
and not dirs
[0]: dirs
= dirs
[1:]
466 if dirs
and not dirs
[0]: dirs
[0] = '/'
467 key
= user
, host
, port
, '/'.join(dirs
)
469 if len(self
.ftpcache
) > MAXFTPCACHE
:
470 # Prune the cache, rather arbitrarily
471 for k
in self
.ftpcache
.keys():
477 if not key
in self
.ftpcache
:
478 self
.ftpcache
[key
] = \
479 ftpwrapper(user
, passwd
, host
, port
, dirs
)
480 if not file: type = 'D'
483 attr
, value
= splitvalue(attr
)
484 if attr
.lower() == 'type' and \
485 value
in ('a', 'A', 'i', 'I', 'd', 'D'):
487 (fp
, retrlen
) = self
.ftpcache
[key
].retrfile(file, type)
488 mtype
= mimetypes
.guess_type("ftp:" + url
)[0]
491 headers
+= "Content-Type: %s\n" % mtype
492 if retrlen
is not None and retrlen
>= 0:
493 headers
+= "Content-Length: %d\n" % retrlen
494 headers
= mimetools
.Message(StringIO
.StringIO(headers
))
495 return addinfourl(fp
, headers
, "ftp:" + url
)
496 except ftperrors(), msg
:
497 raise IOError, ('ftp error', msg
), sys
.exc_info()[2]
499 def open_data(self
, url
, data
=None):
500 """Use "data" URL."""
503 # syntax of data URLs:
504 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
505 # mediatype := [ type "/" subtype ] *( ";" parameter )
507 # parameter := attribute "=" value
508 import StringIO
, mimetools
510 [type, data
] = url
.split(',', 1)
512 raise IOError, ('data error', 'bad data URL')
514 type = 'text/plain;charset=US-ASCII'
515 semi
= type.rfind(';')
516 if semi
>= 0 and '=' not in type[semi
:]:
517 encoding
= type[semi
+1:]
522 msg
.append('Date: %s'%time
.strftime('%a, %d %b %Y %T GMT',
523 time
.gmtime(time
.time())))
524 msg
.append('Content-type: %s' % type)
525 if encoding
== 'base64':
527 data
= base64
.decodestring(data
)
530 msg
.append('Content-length: %d' % len(data
))
534 f
= StringIO
.StringIO(msg
)
535 headers
= mimetools
.Message(f
, 0)
536 f
.fileno
= None # needed for addinfourl
537 return addinfourl(f
, headers
, url
)
540 class FancyURLopener(URLopener
):
541 """Derived class with handlers for errors we can handle (perhaps)."""
543 def __init__(self
, *args
, **kwargs
):
544 apply(URLopener
.__init
__, (self
,) + args
, kwargs
)
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handling -- don't raise an exception."""
    # Unlike the base class, hand the (error) response body back to the
    # caller as an ordinary file-like result instead of raising IOError.
    result = addinfourl(fp, headers, "http:" + url)
    return result
553 def http_error_302(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
554 """Error 302 -- relocated (temporarily)."""
556 if self
.maxtries
and self
.tries
>= self
.maxtries
:
557 if hasattr(self
, "http_error_500"):
558 meth
= self
.http_error_500
560 meth
= self
.http_error_default
562 return meth(url
, fp
, 500,
563 "Internal Server Error: Redirect Recursion", headers
)
564 result
= self
.redirect_internal(url
, fp
, errcode
, errmsg
, headers
,
569 def redirect_internal(self
, url
, fp
, errcode
, errmsg
, headers
, data
):
570 if 'location' in headers
:
571 newurl
= headers
['location']
572 elif 'uri' in headers
:
573 newurl
= headers
['uri']
578 # In case the server sent a relative URL, join with original:
579 newurl
= basejoin(self
.type + ":" + url
, newurl
)
581 return self
.open(newurl
)
583 return self
.open(newurl
, data
)
def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 301 -- also relocated (permanently)."""
    # A permanent redirect is handled exactly like a temporary one.
    return self.http_error_302(url, fp, errcode, errmsg,
                               headers, data)
589 def http_error_401(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
590 """Error 401 -- authentication required.
591 See this URL for a description of the basic authentication scheme:
592 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
593 if not 'www-authenticate' in headers
:
594 URLopener
.http_error_default(self
, url
, fp
,
595 errcode
, errmsg
, headers
)
596 stuff
= headers
['www-authenticate']
598 match
= re
.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff
)
600 URLopener
.http_error_default(self
, url
, fp
,
601 errcode
, errmsg
, headers
)
602 scheme
, realm
= match
.groups()
603 if scheme
.lower() != 'basic':
604 URLopener
.http_error_default(self
, url
, fp
,
605 errcode
, errmsg
, headers
)
606 name
= 'retry_' + self
.type + '_basic_auth'
608 return getattr(self
,name
)(url
, realm
)
610 return getattr(self
,name
)(url
, realm
, data
)
612 def retry_http_basic_auth(self
, url
, realm
, data
=None):
613 host
, selector
= splithost(url
)
614 i
= host
.find('@') + 1
616 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
617 if not (user
or passwd
): return None
618 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
619 newurl
= 'http://' + host
+ selector
621 return self
.open(newurl
)
623 return self
.open(newurl
, data
)
625 def retry_https_basic_auth(self
, url
, realm
, data
=None):
626 host
, selector
= splithost(url
)
627 i
= host
.find('@') + 1
629 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
630 if not (user
or passwd
): return None
631 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
632 newurl
= '//' + host
+ selector
633 return self
.open_https(newurl
, data
)
635 def get_user_passwd(self
, host
, realm
, clear_cache
= 0):
636 key
= realm
+ '@' + host
.lower()
637 if key
in self
.auth_cache
:
639 del self
.auth_cache
[key
]
641 return self
.auth_cache
[key
]
642 user
, passwd
= self
.prompt_user_passwd(host
, realm
)
643 if user
or passwd
: self
.auth_cache
[key
] = (user
, passwd
)
646 def prompt_user_passwd(self
, host
, realm
):
647 """Override this in a GUI environment!"""
650 user
= raw_input("Enter username for %s at %s: " % (realm
,
652 passwd
= getpass
.getpass("Enter password for %s in %s at %s: " %
655 except KeyboardInterrupt:
664 """Return the IP address of the magic hostname 'localhost'."""
666 if _localhost
is None:
667 _localhost
= socket
.gethostbyname('localhost')
672 """Return the IP address of the current host."""
674 if _thishost
is None:
675 _thishost
= socket
.gethostbyname(socket
.gethostname())
680 """Return the set of errors raised by the FTP class."""
682 if _ftperrors
is None:
684 _ftperrors
= ftplib
.all_errors
689 """Return an empty mimetools.Message object."""
691 if _noheaders
is None:
694 _noheaders
= mimetools
.Message(StringIO
.StringIO(), 0)
695 _noheaders
.fp
.close() # Recycle file descriptor
702 """Class used by open_ftp() for cache of open FTP connections."""
704 def __init__(self
, user
, passwd
, host
, port
, dirs
):
715 self
.ftp
= ftplib
.FTP()
716 self
.ftp
.connect(self
.host
, self
.port
)
717 self
.ftp
.login(self
.user
, self
.passwd
)
718 for dir in self
.dirs
:
721 def retrfile(self
, file, type):
724 if type in ('d', 'D'): cmd
= 'TYPE A'; isdir
= 1
725 else: cmd
= 'TYPE ' + type; isdir
= 0
727 self
.ftp
.voidcmd(cmd
)
728 except ftplib
.all_errors
:
730 self
.ftp
.voidcmd(cmd
)
732 if file and not isdir
:
733 # Use nlst to see if the file exists at all
736 except ftplib
.error_perm
, reason
:
737 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
738 # Restore the transfer mode!
739 self
.ftp
.voidcmd(cmd
)
740 # Try to retrieve as a file
743 conn
= self
.ftp
.ntransfercmd(cmd
)
744 except ftplib
.error_perm
, reason
:
745 if str(reason
)[:3] != '550':
746 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
748 # Set transfer mode to ASCII!
749 self
.ftp
.voidcmd('TYPE A')
750 # Try a directory listing
751 if file: cmd
= 'LIST ' + file
753 conn
= self
.ftp
.ntransfercmd(cmd
)
755 # Pass back both a suitably decorated object and a retrieval length
756 return (addclosehook(conn
[0].makefile('rb'),
757 self
.endtransfer
), conn
[1])
758 def endtransfer(self
):
775 """Base class for addinfo and addclosehook."""
777 def __init__(self
, fp
):
779 self
.read
= self
.fp
.read
780 self
.readline
= self
.fp
.readline
781 if hasattr(self
.fp
, "readlines"): self
.readlines
= self
.fp
.readlines
782 if hasattr(self
.fp
, "fileno"): self
.fileno
= self
.fp
.fileno
785 return '<%s at %s whose fp = %s>' % (self
.__class
__.__name
__,
786 `
id(self
)`
, `self
.fp`
)
791 self
.readlines
= None
793 if self
.fp
: self
.fp
.close()
796 class addclosehook(addbase
):
797 """Class to add a close hook to an open file."""
def __init__(self, fp, closehook, *hookargs):
    # Wire up the wrapped file object via the base class, then remember
    # the hook callable (and its arguments) to run at close time.
    addbase.__init__(self, fp)
    self.hookargs = hookargs
    self.closehook = closehook
807 apply(self
.closehook
, self
.hookargs
)
808 self
.closehook
= None
811 class addinfo(addbase
):
812 """class to add an info() method to an open file."""
def __init__(self, fp, headers):
    # Base class wires up the file-object interface; this class only
    # carries the headers object returned by info().
    addbase.__init__(self, fp)
    self.headers = headers
821 class addinfourl(addbase
):
822 """class to add info() and geturl() methods to an open file."""
824 def __init__(self
, fp
, headers
, url
):
825 addbase
.__init
__(self
, fp
)
826 self
.headers
= headers
836 def basejoin(base
, url
):
837 """Utility to combine a URL with a base URL to form a new URL."""
838 type, path
= splittype(url
)
840 # if url is complete (i.e., it contains a type), return it
842 host
, path
= splithost(path
)
843 type, basepath
= splittype(base
) # inherit type from base
845 # if url contains host, just inherit type
846 if type: return type + '://' + host
+ path
848 # no type inherited, so url must have started with //
851 host
, basepath
= splithost(basepath
) # inherit host
852 basepath
, basetag
= splittag(basepath
) # remove extraneous cruft
853 basepath
, basequery
= splitquery(basepath
) # idem
855 # non-absolute path name
856 if path
[:1] in ('#', '?'):
857 # path is just a tag or query, attach to basepath
860 # else replace last component
861 i
= basepath
.rfind('/')
863 # basepath not absolute
865 # host present, make absolute
868 # else keep non-absolute
871 # remove last file component
872 basepath
= basepath
[:i
+1]
873 # Interpret ../ (important because of symlinks)
874 while basepath
and path
[:3] == '../':
876 i
= basepath
[:-1].rfind('/')
878 basepath
= basepath
[:i
+1]
885 path
= basepath
+ path
886 if host
and path
and path
[0] != '/':
888 if type and host
: return type + '://' + host
+ path
889 elif type: return type + ':' + path
890 elif host
: return '//' + host
+ path
# don't know what this means
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def')
917 return isinstance(x
, unicode)
920 """toBytes(u"URL") --> 'URL'."""
921 # Most URL schemes require ASCII. If that changes, the conversion
925 url
= url
.encode("ASCII")
927 raise UnicodeError("URL " + repr(url
) +
928 " contains non-ASCII characters")
932 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
934 if url
[:1] == '<' and url
[-1:] == '>':
935 url
= url
[1:-1].strip()
936 if url
[:4] == 'URL:': url
= url
[4:].strip()
941 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
943 if _typeprog
is None:
945 _typeprog
= re
.compile('^([^/:]+):')
947 match
= _typeprog
.match(url
)
949 scheme
= match
.group(1)
950 return scheme
.lower(), url
[len(scheme
) + 1:]
955 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
957 if _hostprog
is None:
959 _hostprog
= re
.compile('^//([^/]*)(.*)$')
961 match
= _hostprog
.match(url
)
962 if match
: return match
.group(1, 2)
967 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
969 if _userprog
is None:
971 _userprog
= re
.compile('^(.*)@(.*)$')
973 match
= _userprog
.match(host
)
974 if match
: return map(unquote
, match
.group(1, 2))
978 def splitpasswd(user
):
979 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
981 if _passwdprog
is None:
983 _passwdprog
= re
.compile('^([^:]*):(.*)$')
985 match
= _passwdprog
.match(user
)
986 if match
: return match
.group(1, 2)
989 # splittag('/path#tag') --> '/path', 'tag'
992 """splitport('host:port') --> 'host', 'port'."""
994 if _portprog
is None:
996 _portprog
= re
.compile('^(.*):([0-9]+)$')
998 match
= _portprog
.match(host
)
999 if match
: return match
.group(1, 2)
1003 def splitnport(host
, defport
=-1):
1004 """Split host and port, returning numeric port.
1005 Return given default port if no ':' found; defaults to -1.
1006 Return numerical port if a valid number are found after ':'.
1007 Return None if ':' but not a valid number."""
1009 if _nportprog
is None:
1011 _nportprog
= re
.compile('^(.*):(.*)$')
1013 match
= _nportprog
.match(host
)
1015 host
, port
= match
.group(1, 2)
1017 if not port
: raise ValueError, "no digits"
1022 return host
, defport
1025 def splitquery(url
):
1026 """splitquery('/path?query') --> '/path', 'query'."""
1028 if _queryprog
is None:
1030 _queryprog
= re
.compile('^(.*)\?([^?]*)$')
1032 match
= _queryprog
.match(url
)
1033 if match
: return match
.group(1, 2)
1038 """splittag('/path#tag') --> '/path', 'tag'."""
1040 if _tagprog
is None:
1042 _tagprog
= re
.compile('^(.*)#([^#]*)$')
1044 match
= _tagprog
.match(url
)
1045 if match
: return match
.group(1, 2)
1049 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1050 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1051 words
= url
.split(';')
1052 return words
[0], words
[1:]
1055 def splitvalue(attr
):
1056 """splitvalue('attr=value') --> 'attr', 'value'."""
1058 if _valueprog
is None:
1060 _valueprog
= re
.compile('^([^=]*)=(.*)$')
1062 match
= _valueprog
.match(attr
)
1063 if match
: return match
.group(1, 2)
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    # A selector of the form '/Xrest' carries the gopher item type as
    # the single character X; anything else has no type.
    has_type = selector[:1] == '/' and len(selector) > 1
    if has_type:
        return selector[1], selector[2:]
    return None, selector
1073 """unquote('abc%20def') -> 'abc def'."""
1078 myappend
= res
.append
1083 myappend(mychr(myatoi(item
[:2], 16))
1086 myappend('%' + item
)
1088 myappend('%' + item
)
1091 def unquote_plus(s
):
1092 """unquote('%7e/abc+def') -> '~/abc def'"""
1094 # replace '+' with ' '
1095 s
= ' '.join(s
.split('+'))
1098 always_safe
= ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1099 'abcdefghijklmnopqrstuvwxyz'
1102 _fast_safe_test
= always_safe
+ '/'
1107 if _fast_safe
is None:
1109 for c
in _fast_safe_test
:
1112 for i
in range(len(res
)):
1114 if not c
in _fast_safe
:
1115 res
[i
] = '%%%02X' % ord(c
)
1118 def quote(s
, safe
= '/'):
1119 """quote('abc def') -> 'abc%20def'
1121 Each part of a URL, e.g. the path info, the query, etc., has a
1122 different set of reserved characters that must be quoted.
1124 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1125 the following reserved characters.
1127 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1130 Each of these characters is reserved in some component of a URL,
1131 but not necessarily in all of them.
1133 By default, the quote function is intended for quoting the path
1134 section of a URL. Thus, it will not encode '/'. This character
1135 is reserved, but in typical usage the quote function is being
1136 called on a path where the existing slash characters are used as
1137 reserved characters.
1139 safe
= always_safe
+ safe
1140 if _fast_safe_test
== safe
:
1141 return _fast_quote(s
)
1143 for i
in range(len(res
)):
1146 res
[i
] = '%%%02X' % ord(c
)
1149 def quote_plus(s
, safe
= ''):
1150 """Quote the query fragment of a URL; replacing ' ' with '+'"""
1153 for i
in range(len(l
)):
1154 l
[i
] = quote(l
[i
], safe
)
1157 return quote(s
, safe
)
1159 def urlencode(query
,doseq
=0):
1160 """Encode a sequence of two-element tuples or dictionary into a URL query string.
1162 If any values in the query arg are sequences and doseq is true, each
1163 sequence element is converted to a separate parameter.
1165 If the query arg is a sequence of two-element tuples, the order of the
1166 parameters in the output will match the order of parameters in the
1170 if hasattr(query
,"items"):
1172 query
= query
.items()
1174 # it's a bother at times that strings and string-like objects are
1177 # non-sequence items should not work with len()
1178 # non-empty strings will fail this
1179 if len(query
) and not isinstance(query
[0], tuple):
1181 # zero-length sequences of all types will get here and succeed,
1182 # but that's a minor nit - since the original implementation
1183 # allowed empty dicts that type of behavior probably should be
1184 # preserved for consistency
1186 ty
,va
,tb
= sys
.exc_info()
1187 raise TypeError, "not a valid non-string sequence or mapping object", tb
1191 # preserve old behavior
1193 k
= quote_plus(str(k
))
1194 v
= quote_plus(str(v
))
1195 l
.append(k
+ '=' + v
)
1198 k
= quote_plus(str(k
))
1199 if isinstance(v
, str):
1201 l
.append(k
+ '=' + v
)
1202 elif _is_unicode(v
):
1203 # is there a reasonable way to convert to ASCII?
1204 # encode generates a string, but "replace" or "ignore"
1205 # lose information and "strict" can raise UnicodeError
1206 v
= quote_plus(v
.encode("ASCII","replace"))
1207 l
.append(k
+ '=' + v
)
1210 # is this a sufficient test for sequence-ness?
1214 v
= quote_plus(str(v
))
1215 l
.append(k
+ '=' + v
)
1217 # loop over the sequence
1219 l
.append(k
+ '=' + quote_plus(str(elt
)))
1223 def getproxies_environment():
1224 """Return a dictionary of scheme -> proxy server URL mappings.
1226 Scan the environment for variables named <scheme>_proxy;
1227 this seems to be the standard convention. If you need a
1228 different way, you can pass a proxies dictionary to the
1229 [Fancy]URLopener constructor.
1233 for name
, value
in os
.environ
.items():
1235 if value
and name
[-6:] == '_proxy':
1236 proxies
[name
[:-6]] = value
1239 if os
.name
== 'mac':
1241 """Return a dictionary of scheme -> proxy server URL mappings.
1243 By convention the mac uses Internet Config to store
1244 proxies. An HTTP proxy, for instance, is stored under
1259 if 'UseHTTPProxy' in config
and config
['UseHTTPProxy']:
1261 value
= config
['HTTPProxyHost']
1265 proxies
['http'] = 'http://%s' % value
1266 # FTP: XXXX To be done.
1267 # Gopher: XXXX To be done.
1270 def proxy_bypass(x
):
1273 elif os
.name
== 'nt':
1274 def getproxies_registry():
1275 """Return a dictionary of scheme -> proxy server URL mappings.
1277 Win32 uses the registry to store proxies.
1284 # Std module, so should be around - but you never know!
1287 internetSettings
= _winreg
.OpenKey(_winreg
.HKEY_CURRENT_USER
,
1288 r
'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1289 proxyEnable
= _winreg
.QueryValueEx(internetSettings
,
1292 # Returned as Unicode but problems if not converted to ASCII
1293 proxyServer
= str(_winreg
.QueryValueEx(internetSettings
,
1295 if '=' in proxyServer
:
1296 # Per-protocol settings
1297 for p
in proxyServer
.split(';'):
1298 protocol
, address
= p
.split('=', 1)
1299 # See if address has a type:// prefix
1301 if not re
.match('^([^/:]+)://', address
):
1302 address
= '%s://%s' % (protocol
, address
)
1303 proxies
[protocol
] = address
1305 # Use one setting for all protocols
1306 if proxyServer
[:5] == 'http:':
1307 proxies
['http'] = proxyServer
1309 proxies
['http'] = 'http://%s' % proxyServer
1310 proxies
['ftp'] = 'ftp://%s' % proxyServer
1311 internetSettings
.Close()
1312 except (WindowsError, ValueError, TypeError):
1313 # Either registry key not found etc, or the value in an
1314 # unexpected format.
1315 # proxies already set up to be empty so nothing to do
1320 """Return a dictionary of scheme -> proxy server URL mappings.
1322 Returns settings gathered from the environment, if specified,
1326 return getproxies_environment() or getproxies_registry()
1328 def proxy_bypass(host
):
1333 # Std modules, so should be around - but you never know!
1336 internetSettings
= _winreg
.OpenKey(_winreg
.HKEY_CURRENT_USER
,
1337 r
'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1338 proxyEnable
= _winreg
.QueryValueEx(internetSettings
,
1340 proxyOverride
= str(_winreg
.QueryValueEx(internetSettings
,
1341 'ProxyOverride')[0])
1342 # ^^^^ Returned as Unicode but problems if not converted to ASCII
1343 except WindowsError:
1345 if not proxyEnable
or not proxyOverride
:
1347 # try to make a host list from name and IP address.
1350 addr
= socket
.gethostbyname(host
[0])
1353 except socket
.error
:
1355 # make a check value list from the registry entry: replace the
1356 # '<local>' string by the localhost entry and the corresponding
1358 proxyOverride
= proxyOverride
.split(';')
1360 while i
< len(proxyOverride
):
1361 if proxyOverride
[i
] == '<local>':
1362 proxyOverride
[i
:i
+1] = ['localhost',
1364 socket
.gethostname(),
1365 socket
.gethostbyname(
1366 socket
.gethostname())]
1368 # print proxyOverride
1369 # now check if we match one of the registry values.
1370 for test
in proxyOverride
:
1371 test
= test
.replace(".", r
"\.") # mask dots
1372 test
= test
.replace("*", r
".*") # change glob sequence
1373 test
= test
.replace("?", r
".") # change glob char
1375 # print "%s <--> %s" %( test, val )
1376 if re
.match(test
, val
, re
.I
):
1381 # By default use environment variables
1382 getproxies
= getproxies_environment
1384 def proxy_bypass(host
):
1387 # Test and time quote() and unquote()
1390 for i
in range(256): s
= s
+ chr(i
)
1401 print round(t1
- t0
, 3), 'sec'
1404 def reporthook(blocknum
, blocksize
, totalsize
):
1405 # Report during remote transfers
1406 print "Block number: %d, Block size: %d, Total size: %d" % (
1407 blocknum
, blocksize
, totalsize
)
1415 'file://localhost/etc/passwd',
1416 'ftp://ftp.python.org/pub/python/README',
1417 ## 'gopher://gopher.micro.umn.edu/1/',
1418 'http://www.python.org/index.html',
1420 if hasattr(URLopener
, "open_https"):
1421 args
.append('https://synergy.as.cmu.edu/~geek/')
1424 print '-'*10, url
, '-'*10
1425 fn
, h
= urlretrieve(url
, None, reporthook
)
1429 for k
in h
.keys(): print k
+ ':', h
[k
]
1435 table
= string
.maketrans("", "")
1436 data
= data
.translate(table
, "\r")
1446 opts
, args
= getopt
.getopt(sys
.argv
[1:], "th")
1447 except getopt
.error
, msg
:
1449 print "Use -h for help"
1456 print "Usage: python urllib.py [-t] [url ...]"
1457 print "-t runs self-test;",
1458 print "otherwise, contents of urls are printed"
1466 print "Use -h for help"
1468 print urlopen(url
).read(),
1470 # Run test program when run as a script
1471 if __name__
== '__main__':