1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
protocol. All you know is that it has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
# Public API of this module.
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]
__version__ = '1.15'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Convert a URL path to a local path (POSIX: just unquote)."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Convert a local path to a URL path (POSIX: just quote)."""
        return quote(pathname)
58 # This really consists of two pieces:
59 # (1) a class which handles opening of all sorts of URLs
60 # (plus assorted utilities etc.)
61 # (2) a set of functions for parsing URLs
62 # XXX Should these be separated out into different modules?
65 # Shortcut for basic usage
# Shared opener instance, created lazily on first use.
_urlopener = None

def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object"""
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    if data is None:
        return _urlopener.open(url)
    else:
        return _urlopener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve url into filename (a temp file by default).

    Returns (filename, headers); delegates to the shared opener."""
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
88 """Class to open URLs.
89 This is a class rather than just a subroutine because we may need
90 more than one set of global protocol-specific options.
91 Note -- this is a base class for those who don't want the
92 automatic handling of errors type 302 (relocated) and 401
93 (authorization needed)."""
97 version
= "Python-urllib/%s" % __version__
100 def __init__(self
, proxies
=None, **x509
):
102 proxies
= getproxies()
103 assert hasattr(proxies
, 'has_key'), "proxies must be a mapping"
104 self
.proxies
= proxies
105 self
.key_file
= x509
.get('key_file')
106 self
.cert_file
= x509
.get('cert_file')
107 self
.addheaders
= [('User-agent', self
.version
)]
108 self
.__tempfiles
= []
109 self
.__unlink
= os
.unlink
# See cleanup()
110 self
.tempcache
= None
111 # Undocumented feature: if you assign {} to tempcache,
112 # it is used to cache files retrieved with
113 # self.retrieve(). This is not enabled by default
114 # since it does not work for changing documents (and I
115 # haven't got the logic to check expiration headers
117 self
.ftpcache
= ftpcache
118 # Undocumented feature: you can use a different
119 # ftp cache by assigning to the .ftpcache member;
120 # in case you want logically independent URL openers
121 # XXX This is not threadsafe. Bah.
130 # This code sometimes runs when the rest of this module
131 # has already been deleted, so it can't use any globals
132 # or import anything.
134 for file in self
.__tempfiles
:
139 del self
.__tempfiles
[:]
141 self
.tempcache
.clear()
143 def addheader(self
, *args
):
144 """Add a header to be used by the HTTP interface only
145 e.g. u.addheader('Accept', 'sound/basic')"""
146 self
.addheaders
.append(args
)
149 def open(self
, fullurl
, data
=None):
150 """Use URLopener().open(file) instead of open(file, 'r')."""
151 fullurl
= unwrap(toBytes(fullurl
))
152 if self
.tempcache
and self
.tempcache
.has_key(fullurl
):
153 filename
, headers
= self
.tempcache
[fullurl
]
154 fp
= open(filename
, 'rb')
155 return addinfourl(fp
, headers
, fullurl
)
156 urltype
, url
= splittype(fullurl
)
159 if self
.proxies
.has_key(urltype
):
160 proxy
= self
.proxies
[urltype
]
161 urltype
, proxyhost
= splittype(proxy
)
162 host
, selector
= splithost(proxyhost
)
163 url
= (host
, fullurl
) # Signal special case to open_*()
166 name
= 'open_' + urltype
170 name
= '_'.join(name
.split('-'))
171 if not hasattr(self
, name
):
173 return self
.open_unknown_proxy(proxy
, fullurl
, data
)
175 return self
.open_unknown(fullurl
, data
)
178 return getattr(self
, name
)(url
)
180 return getattr(self
, name
)(url
, data
)
181 except socket
.error
, msg
:
182 raise IOError, ('socket error', msg
), sys
.exc_info()[2]
184 def open_unknown(self
, fullurl
, data
=None):
185 """Overridable interface to open unknown URL type."""
186 type, url
= splittype(fullurl
)
187 raise IOError, ('url error', 'unknown url type', type)
189 def open_unknown_proxy(self
, proxy
, fullurl
, data
=None):
190 """Overridable interface to open unknown URL type."""
191 type, url
= splittype(fullurl
)
192 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy
)
195 def retrieve(self
, url
, filename
=None, reporthook
=None, data
=None):
196 """retrieve(url) returns (filename, None) for a local object
197 or (tempfilename, headers) for a remote object."""
198 url
= unwrap(toBytes(url
))
199 if self
.tempcache
and self
.tempcache
.has_key(url
):
200 return self
.tempcache
[url
]
201 type, url1
= splittype(url
)
202 if not filename
and (not type or type == 'file'):
204 fp
= self
.open_local_file(url1
)
207 return url2pathname(splithost(url1
)[1]), hdrs
210 fp
= self
.open(url
, data
)
214 garbage
, path
= splittype(url
)
215 garbage
, path
= splithost(path
or "")
216 path
, garbage
= splitquery(path
or "")
217 path
, garbage
= splitattr(path
or "")
218 suffix
= os
.path
.splitext(path
)[1]
219 filename
= tempfile
.mktemp(suffix
)
220 self
.__tempfiles
.append(filename
)
221 result
= filename
, headers
222 if self
.tempcache
is not None:
223 self
.tempcache
[url
] = result
224 tfp
= open(filename
, 'wb')
229 if headers
.has_key("content-length"):
230 size
= int(headers
["Content-Length"])
231 reporthook(0, bs
, size
)
234 reporthook(1, bs
, size
)
238 blocknum
= blocknum
+ 1
240 reporthook(blocknum
, bs
, size
)
247 # Each method named open_<type> knows how to open that type of URL
249 def open_http(self
, url
, data
=None):
250 """Use HTTP protocol."""
253 if type(url
) is types
.StringType
:
254 host
, selector
= splithost(url
)
256 user_passwd
, host
= splituser(host
)
261 urltype
, rest
= splittype(selector
)
264 if urltype
.lower() != 'http':
267 realhost
, rest
= splithost(rest
)
269 user_passwd
, realhost
= splituser(realhost
)
271 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
272 if proxy_bypass(realhost
):
275 #print "proxy via http:", host, selector
276 if not host
: raise IOError, ('http error', 'no host given')
279 auth
= base64
.encodestring(user_passwd
).strip()
282 h
= httplib
.HTTP(host
)
284 h
.putrequest('POST', selector
)
285 h
.putheader('Content-type', 'application/x-www-form-urlencoded')
286 h
.putheader('Content-length', '%d' % len(data
))
288 h
.putrequest('GET', selector
)
289 if auth
: h
.putheader('Authorization', 'Basic %s' % auth
)
290 if realhost
: h
.putheader('Host', realhost
)
291 for args
in self
.addheaders
: apply(h
.putheader
, args
)
295 errcode
, errmsg
, headers
= h
.getreply()
298 return addinfourl(fp
, headers
, "http:" + url
)
301 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
303 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
, data
)
305 def http_error(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
306 """Handle http errors.
307 Derived class can override this, or provide specific handlers
308 named http_error_DDD where DDD is the 3-digit error code."""
309 # First check if there's a specific handler for this error
310 name
= 'http_error_%d' % errcode
311 if hasattr(self
, name
):
312 method
= getattr(self
, name
)
314 result
= method(url
, fp
, errcode
, errmsg
, headers
)
316 result
= method(url
, fp
, errcode
, errmsg
, headers
, data
)
317 if result
: return result
318 return self
.http_error_default(url
, fp
, errcode
, errmsg
, headers
)
320 def http_error_default(self
, url
, fp
, errcode
, errmsg
, headers
):
321 """Default error handler: close the connection and raise IOError."""
324 raise IOError, ('http error', errcode
, errmsg
, headers
)
326 if hasattr(socket
, "ssl"):
327 def open_https(self
, url
, data
=None):
328 """Use HTTPS protocol."""
331 if type(url
) is types
.StringType
:
332 host
, selector
= splithost(url
)
334 user_passwd
, host
= splituser(host
)
339 urltype
, rest
= splittype(selector
)
342 if urltype
.lower() != 'https':
345 realhost
, rest
= splithost(rest
)
347 user_passwd
, realhost
= splituser(realhost
)
349 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
350 #print "proxy via https:", host, selector
351 if not host
: raise IOError, ('https error', 'no host given')
354 auth
= base64
.encodestring(user_passwd
).strip()
357 h
= httplib
.HTTPS(host
, 0,
358 key_file
=self
.key_file
,
359 cert_file
=self
.cert_file
)
361 h
.putrequest('POST', selector
)
362 h
.putheader('Content-type',
363 'application/x-www-form-urlencoded')
364 h
.putheader('Content-length', '%d' % len(data
))
366 h
.putrequest('GET', selector
)
367 if auth
: h
.putheader('Authorization: Basic %s' % auth
)
368 if realhost
: h
.putheader('Host', realhost
)
369 for args
in self
.addheaders
: apply(h
.putheader
, args
)
373 errcode
, errmsg
, headers
= h
.getreply()
376 return addinfourl(fp
, headers
, "https:" + url
)
379 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
381 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
,
384 def open_gopher(self
, url
):
385 """Use Gopher protocol."""
387 host
, selector
= splithost(url
)
388 if not host
: raise IOError, ('gopher error', 'no host given')
390 type, selector
= splitgophertype(selector
)
391 selector
, query
= splitquery(selector
)
392 selector
= unquote(selector
)
394 query
= unquote(query
)
395 fp
= gopherlib
.send_query(selector
, query
, host
)
397 fp
= gopherlib
.send_selector(selector
, host
)
398 return addinfourl(fp
, noheaders(), "gopher:" + url
)
400 def open_file(self
, url
):
401 """Use local file or FTP depending on form of URL."""
402 if url
[:2] == '//' and url
[2:3] != '/':
403 return self
.open_ftp(url
)
405 return self
.open_local_file(url
)
407 def open_local_file(self
, url
):
408 """Use local file."""
409 import mimetypes
, mimetools
, rfc822
, StringIO
410 host
, file = splithost(url
)
411 localname
= url2pathname(file)
412 stats
= os
.stat(localname
)
413 size
= stats
[stat
.ST_SIZE
]
414 modified
= rfc822
.formatdate(stats
[stat
.ST_MTIME
])
415 mtype
= mimetypes
.guess_type(url
)[0]
416 headers
= mimetools
.Message(StringIO
.StringIO(
417 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
418 (mtype
or 'text/plain', size
, modified
)))
422 urlfile
= 'file://' + file
423 return addinfourl(open(localname
, 'rb'),
425 host
, port
= splitport(host
)
427 and socket
.gethostbyname(host
) in (localhost(), thishost()):
430 urlfile
= 'file://' + file
431 return addinfourl(open(localname
, 'rb'),
433 raise IOError, ('local file error', 'not on local host')
435 def open_ftp(self
, url
):
436 """Use FTP protocol."""
437 import mimetypes
, mimetools
, StringIO
438 host
, path
= splithost(url
)
439 if not host
: raise IOError, ('ftp error', 'no host given')
440 host
, port
= splitport(host
)
441 user
, host
= splituser(host
)
442 if user
: user
, passwd
= splitpasswd(user
)
445 user
= unquote(user
or '')
446 passwd
= unquote(passwd
or '')
447 host
= socket
.gethostbyname(host
)
450 port
= ftplib
.FTP_PORT
453 path
, attrs
= splitattr(path
)
455 dirs
= path
.split('/')
456 dirs
, file = dirs
[:-1], dirs
[-1]
457 if dirs
and not dirs
[0]: dirs
= dirs
[1:]
458 if dirs
and not dirs
[0]: dirs
[0] = '/'
459 key
= user
, host
, port
, '/'.join(dirs
)
461 if len(self
.ftpcache
) > MAXFTPCACHE
:
462 # Prune the cache, rather arbitrarily
463 for k
in self
.ftpcache
.keys():
469 if not self
.ftpcache
.has_key(key
):
470 self
.ftpcache
[key
] = \
471 ftpwrapper(user
, passwd
, host
, port
, dirs
)
472 if not file: type = 'D'
475 attr
, value
= splitvalue(attr
)
476 if attr
.lower() == 'type' and \
477 value
in ('a', 'A', 'i', 'I', 'd', 'D'):
479 (fp
, retrlen
) = self
.ftpcache
[key
].retrfile(file, type)
480 mtype
= mimetypes
.guess_type("ftp:" + url
)[0]
483 headers
+= "Content-Type: %s\n" % mtype
484 if retrlen
is not None and retrlen
>= 0:
485 headers
+= "Content-Length: %d\n" % retrlen
486 headers
= mimetools
.Message(StringIO
.StringIO(headers
))
487 return addinfourl(fp
, headers
, "ftp:" + url
)
488 except ftperrors(), msg
:
489 raise IOError, ('ftp error', msg
), sys
.exc_info()[2]
491 def open_data(self
, url
, data
=None):
492 """Use "data" URL."""
495 # syntax of data URLs:
496 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
497 # mediatype := [ type "/" subtype ] *( ";" parameter )
499 # parameter := attribute "=" value
500 import StringIO
, mimetools
, time
502 [type, data
] = url
.split(',', 1)
504 raise IOError, ('data error', 'bad data URL')
506 type = 'text/plain;charset=US-ASCII'
507 semi
= type.rfind(';')
508 if semi
>= 0 and '=' not in type[semi
:]:
509 encoding
= type[semi
+1:]
514 msg
.append('Date: %s'%time
.strftime('%a, %d %b %Y %T GMT',
515 time
.gmtime(time
.time())))
516 msg
.append('Content-type: %s' % type)
517 if encoding
== 'base64':
519 data
= base64
.decodestring(data
)
522 msg
.append('Content-length: %d' % len(data
))
526 f
= StringIO
.StringIO(msg
)
527 headers
= mimetools
.Message(f
, 0)
528 f
.fileno
= None # needed for addinfourl
529 return addinfourl(f
, headers
, url
)
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args):
        apply(URLopener.__init__, (self,) + args)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10
541 def http_error_default(self
, url
, fp
, errcode
, errmsg
, headers
):
542 """Default error handling -- don't raise an exception."""
543 return addinfourl(fp
, headers
, "http:" + url
)
545 def http_error_302(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
546 """Error 302 -- relocated (temporarily)."""
548 if self
.maxtries
and self
.tries
>= self
.maxtries
:
549 if hasattr(self
, "http_error_500"):
550 meth
= self
.http_error_500
552 meth
= self
.http_error_default
554 return meth(url
, fp
, 500,
555 "Internal Server Error: Redirect Recursion", headers
)
556 result
= self
.redirect_internal(url
, fp
, errcode
, errmsg
, headers
,
561 def redirect_internal(self
, url
, fp
, errcode
, errmsg
, headers
, data
):
562 if headers
.has_key('location'):
563 newurl
= headers
['location']
564 elif headers
.has_key('uri'):
565 newurl
= headers
['uri']
570 # In case the server sent a relative URL, join with original:
571 newurl
= basejoin(self
.type + ":" + url
, newurl
)
573 return self
.open(newurl
)
575 return self
.open(newurl
, data
)
577 def http_error_301(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
578 """Error 301 -- also relocated (permanently)."""
579 return self
.http_error_302(url
, fp
, errcode
, errmsg
, headers
, data
)
581 def http_error_401(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
582 """Error 401 -- authentication required.
583 See this URL for a description of the basic authentication scheme:
584 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
585 if not headers
.has_key('www-authenticate'):
586 URLopener
.http_error_default(self
, url
, fp
,
587 errcode
, errmsg
, headers
)
588 stuff
= headers
['www-authenticate']
590 match
= re
.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff
)
592 URLopener
.http_error_default(self
, url
, fp
,
593 errcode
, errmsg
, headers
)
594 scheme
, realm
= match
.groups()
595 if scheme
.lower() != 'basic':
596 URLopener
.http_error_default(self
, url
, fp
,
597 errcode
, errmsg
, headers
)
598 name
= 'retry_' + self
.type + '_basic_auth'
600 return getattr(self
,name
)(url
, realm
)
602 return getattr(self
,name
)(url
, realm
, data
)
604 def retry_http_basic_auth(self
, url
, realm
, data
=None):
605 host
, selector
= splithost(url
)
606 i
= host
.find('@') + 1
608 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
609 if not (user
or passwd
): return None
610 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
611 newurl
= 'http://' + host
+ selector
613 return self
.open(newurl
)
615 return self
.open(newurl
, data
)
617 def retry_https_basic_auth(self
, url
, realm
, data
=None):
618 host
, selector
= splithost(url
)
619 i
= host
.find('@') + 1
621 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
622 if not (user
or passwd
): return None
623 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
624 newurl
= '//' + host
+ selector
625 return self
.open_https(newurl
, data
)
627 def get_user_passwd(self
, host
, realm
, clear_cache
= 0):
628 key
= realm
+ '@' + host
.lower()
629 if self
.auth_cache
.has_key(key
):
631 del self
.auth_cache
[key
]
633 return self
.auth_cache
[key
]
634 user
, passwd
= self
.prompt_user_passwd(host
, realm
)
635 if user
or passwd
: self
.auth_cache
[key
] = (user
, passwd
)
638 def prompt_user_passwd(self
, host
, realm
):
639 """Override this in a GUI environment!"""
642 user
= raw_input("Enter username for %s at %s: " % (realm
,
644 passwd
= getpass
.getpass("Enter password for %s in %s at %s: " %
647 except KeyboardInterrupt:
656 """Return the IP address of the magic hostname 'localhost'."""
659 _localhost
= socket
.gethostbyname('localhost')
664 """Return the IP address of the current host."""
667 _thishost
= socket
.gethostbyname(socket
.gethostname())
672 """Return the set of errors raised by the FTP class."""
676 _ftperrors
= ftplib
.all_errors
681 """Return an empty mimetools.Message object."""
686 _noheaders
= mimetools
.Message(StringIO
.StringIO(), 0)
687 _noheaders
.fp
.close() # Recycle file descriptor
694 """Class used by open_ftp() for cache of open FTP connections."""
696 def __init__(self
, user
, passwd
, host
, port
, dirs
):
707 self
.ftp
= ftplib
.FTP()
708 self
.ftp
.connect(self
.host
, self
.port
)
709 self
.ftp
.login(self
.user
, self
.passwd
)
710 for dir in self
.dirs
:
713 def retrfile(self
, file, type):
716 if type in ('d', 'D'): cmd
= 'TYPE A'; isdir
= 1
717 else: cmd
= 'TYPE ' + type; isdir
= 0
719 self
.ftp
.voidcmd(cmd
)
720 except ftplib
.all_errors
:
722 self
.ftp
.voidcmd(cmd
)
724 if file and not isdir
:
725 # Use nlst to see if the file exists at all
728 except ftplib
.error_perm
, reason
:
729 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
730 # Restore the transfer mode!
731 self
.ftp
.voidcmd(cmd
)
732 # Try to retrieve as a file
735 conn
= self
.ftp
.ntransfercmd(cmd
)
736 except ftplib
.error_perm
, reason
:
737 if str(reason
)[:3] != '550':
738 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
740 # Set transfer mode to ASCII!
741 self
.ftp
.voidcmd('TYPE A')
742 # Try a directory listing
743 if file: cmd
= 'LIST ' + file
745 conn
= self
.ftp
.ntransfercmd(cmd
)
747 # Pass back both a suitably decorated object and a retrieval length
748 return (addclosehook(conn
[0].makefile('rb'),
749 self
.endtransfer
), conn
[1])
750 def endtransfer(self
):
767 """Base class for addinfo and addclosehook."""
769 def __init__(self
, fp
):
771 self
.read
= self
.fp
.read
772 self
.readline
= self
.fp
.readline
773 if hasattr(self
.fp
, "readlines"): self
.readlines
= self
.fp
.readlines
774 if hasattr(self
.fp
, "fileno"): self
.fileno
= self
.fp
.fileno
777 return '<%s at %s whose fp = %s>' % (self
.__class
__.__name
__,
778 `
id(self
)`
, `self
.fp`
)
783 self
.readlines
= None
785 if self
.fp
: self
.fp
.close()
class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        # Invoke the hook exactly once, then drop the references.
        if self.closehook:
            apply(self.closehook, self.hookargs)
            self.closehook = None
            self.hookargs = None
class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        return self.headers
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        return self.headers

    def geturl(self):
        return self.url
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL."""
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = basepath.rfind('/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            i = basepath[:-1].rfind('/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if host and path and path[0] != '/':
        path = '/' + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path
886 # Utilities to parse URLs (most of these return None for missing parts):
887 # unwrap('<URL:type://host/path>') --> 'type://host/path'
888 # splittype('type:opaquestring') --> 'type', 'opaquestring'
889 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
890 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
891 # splitpasswd('user:passwd') -> 'user', 'passwd'
892 # splitport('host:port') --> 'host', 'port'
893 # splitquery('/path?query') --> '/path', 'query'
894 # splittag('/path#tag') --> '/path', 'tag'
895 # splitattr('/path;attr1=value1;attr2=value2;...') ->
896 # '/path', ['attr1=value1', 'attr2=value2', ...]
897 # splitvalue('attr=value') --> 'attr', 'value'
898 # splitgophertype('/Xselector') --> 'X', 'selector'
899 # unquote('abc%20def') -> 'abc def'
900 # quote('abc def') -> 'abc%20def')
903 """toBytes(u"URL") --> 'URL'."""
904 # Most URL schemes require ASCII. If that changes, the conversion
906 if type(url
) is types
.UnicodeType
:
908 url
= url
.encode("ASCII")
910 raise UnicodeError("URL " + repr(url
) +
911 " contains non-ASCII characters")
915 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
917 if url
[:1] == '<' and url
[-1:] == '>':
918 url
= url
[1:-1].strip()
919 if url
[:4] == 'URL:': url
= url
[4:].strip()
924 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
926 if _typeprog
is None:
928 _typeprog
= re
.compile('^([^/:]+):')
930 match
= _typeprog
.match(url
)
932 scheme
= match
.group(1)
933 return scheme
.lower(), url
[len(scheme
) + 1:]
938 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
940 if _hostprog
is None:
942 _hostprog
= re
.compile('^//([^/]*)(.*)$')
944 match
= _hostprog
.match(url
)
945 if match
: return match
.group(1, 2)
950 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
952 if _userprog
is None:
954 _userprog
= re
.compile('^([^@]*)@(.*)$')
956 match
= _userprog
.match(host
)
957 if match
: return map(unquote
, match
.group(1, 2))
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None
972 # splittag('/path#tag') --> '/path', 'tag'
975 """splitport('host:port') --> 'host', 'port'."""
977 if _portprog
is None:
979 _portprog
= re
.compile('^(.*):([0-9]+)$')
981 match
= _portprog
.match(host
)
982 if match
: return match
.group(1, 2)
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number are found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError("no digits")
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None
1021 """splittag('/path#tag') --> '/path', 'tag'."""
1023 if _tagprog
is None:
1025 _tagprog
= re
.compile('^(.*)#([^#]*)$')
1027 match
= _tagprog
.match(url
)
1028 if match
: return match
.group(1, 2)
1032 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1033 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1034 words
= url
.split(';')
1035 return words
[0], words
[1:]
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector
1056 """unquote('abc%20def') -> 'abc def'."""
1061 myappend
= res
.append
1066 myappend(mychr(myatoi(item
[:2], 16))
1069 myappend('%' + item
)
1071 myappend('%' + item
)
def unquote_plus(s):
    """unquote('%7e/abc+def') -> '~/abc def'"""
    # replace '+' with ' '
    s = ' '.join(s.split('+'))
    return unquote(s)
# Characters never quoted by quote()/quote_plus().
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

# Safe set served by the cached fast path (_fast_quote).
_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    """Quote s using the precomputed default safe set."""
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for c in _fast_safe_test:
            _fast_safe[c] = 1
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if not _fast_safe.has_key(c):
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)
def quote(s, safe='/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        return _fast_quote(s)
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if c not in safe:
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)
def quote_plus(s, safe=''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        l = s.split(' ')
        for i in range(len(l)):
            l[i] = quote(l[i], safe)
        return '+'.join(l)
    else:
        return quote(s, safe)
1142 def urlencode(query
,doseq
=0):
1143 """Encode a sequence of two-element tuples or dictionary into a URL query string.
1145 If any values in the query arg are sequences and doseq is true, each
1146 sequence element is converted to a separate parameter.
1148 If the query arg is a sequence of two-element tuples, the order of the
1149 parameters in the output will match the order of parameters in the
1153 if hasattr(query
,"items"):
1155 query
= query
.items()
1157 # it's a bother at times that strings and string-like objects are
1160 # non-sequence items should not work with len()
1162 # non-empty strings will fail this
1163 if len(query
) and type(query
[0]) != types
.TupleType
:
1165 # zero-length sequences of all types will get here and succeed,
1166 # but that's a minor nit - since the original implementation
1167 # allowed empty dicts that type of behavior probably should be
1168 # preserved for consistency
1170 ty
,va
,tb
= sys
.exc_info()
1171 raise TypeError, "not a valid non-string sequence or mapping object", tb
1175 # preserve old behavior
1177 k
= quote_plus(str(k
))
1178 v
= quote_plus(str(v
))
1179 l
.append(k
+ '=' + v
)
1182 k
= quote_plus(str(k
))
1183 if type(v
) == types
.StringType
:
1185 l
.append(k
+ '=' + v
)
1186 elif type(v
) == types
.UnicodeType
:
1187 # is there a reasonable way to convert to ASCII?
1188 # encode generates a string, but "replace" or "ignore"
1189 # lose information and "strict" can raise UnicodeError
1190 v
= quote_plus(v
.encode("ASCII","replace"))
1191 l
.append(k
+ '=' + v
)
1194 # is this a sufficient test for sequence-ness?
1198 v
= quote_plus(str(v
))
1199 l
.append(k
+ '=' + v
)
1201 # loop over the sequence
1203 l
.append(k
+ '=' + quote_plus(str(elt
)))
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
1223 if os
.name
== 'mac':
1225 """Return a dictionary of scheme -> proxy server URL mappings.
1227 By convention the mac uses Internet Config to store
1228 proxies. An HTTP proxy, for instance, is stored under
1243 if config
.has_key('UseHTTPProxy') and config
['UseHTTPProxy']:
1245 value
= config
['HTTPProxyHost']
1249 proxies
['http'] = 'http://%s' % value
1250 # FTP: XXXX To be done.
1251 # Gopher: XXXX To be done.
1254 def proxy_bypass(x
):
1257 elif os
.name
== 'nt':
1258 def getproxies_registry():
1259 """Return a dictionary of scheme -> proxy server URL mappings.
1261 Win32 uses the registry to store proxies.
1268 # Std module, so should be around - but you never know!
1271 internetSettings
= _winreg
.OpenKey(_winreg
.HKEY_CURRENT_USER
,
1272 r
'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1273 proxyEnable
= _winreg
.QueryValueEx(internetSettings
,
1276 # Returned as Unicode but problems if not converted to ASCII
1277 proxyServer
= str(_winreg
.QueryValueEx(internetSettings
,
1279 if '=' in proxyServer
:
1280 # Per-protocol settings
1281 for p
in proxyServer
.split(';'):
1282 protocol
, address
= p
.split('=', 1)
1283 proxies
[protocol
] = '%s://%s' % (protocol
, address
)
1285 # Use one setting for all protocols
1286 if proxyServer
[:5] == 'http:':
1287 proxies
['http'] = proxyServer
1289 proxies
['http'] = 'http://%s' % proxyServer
1290 proxies
['ftp'] = 'ftp://%s' % proxyServer
1291 internetSettings
.Close()
1292 except (WindowsError, ValueError, TypeError):
1293 # Either registry key not found etc, or the value in an
1294 # unexpected format.
1295 # proxies already set up to be empty so nothing to do
1300 """Return a dictionary of scheme -> proxy server URL mappings.
1302 Returns settings gathered from the environment, if specified,
1306 return getproxies_environment() or getproxies_registry()
1308 def proxy_bypass(host
):
1314 # Std modules, so should be around - but you never know!
1317 internetSettings
= _winreg
.OpenKey(_winreg
.HKEY_CURRENT_USER
,
1318 r
'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1319 proxyEnable
= _winreg
.QueryValueEx(internetSettings
,
1321 proxyOverride
= str(_winreg
.QueryValueEx(internetSettings
,
1322 'ProxyOverride')[0])
1323 # ^^^^ Returned as Unicode but problems if not converted to ASCII
1324 except WindowsError:
1326 if not proxyEnable
or not proxyOverride
:
1328 # try to make a host list from name and IP address.
1331 addr
= socket
.gethostbyname(host
[0])
1334 except socket
.error
:
1336 # make a check value list from the registry entry: replace the
1337 # '<local>' string by the localhost entry and the corresponding
1339 proxyOverride
= proxyOverride
.split(';')
1341 while i
< len(proxyOverride
):
1342 if proxyOverride
[i
] == '<local>':
1343 proxyOverride
[i
:i
+1] = ['localhost',
1345 socket
.gethostname(),
1346 socket
.gethostbyname(
1347 socket
.gethostname())]
1349 # print proxyOverride
1350 # now check if we match one of the registry values.
1351 for test
in proxyOverride
:
1352 test
= test
.replace(".", r
"\.") # mask dots
1353 test
= test
.replace("*", r
".*") # change glob sequence
1354 test
= test
.replace("?", r
".") # change glob char
1356 # print "%s <--> %s" %( test, val )
1357 if re
.match(test
, val
, re
.I
):
1362 # By default use environment variables
1363 getproxies
= getproxies_environment
1365 def proxy_bypass(host
):
1368 # Test and time quote() and unquote()
1372 for i
in range(256): s
= s
+ chr(i
)
1383 print round(t1
- t0
, 3), 'sec'
1386 def reporthook(blocknum
, blocksize
, totalsize
):
1387 # Report during remote transfers
1388 print "Block number: %d, Block size: %d, Total size: %d" % (
1389 blocknum
, blocksize
, totalsize
)
1397 'file://localhost/etc/passwd',
1398 'ftp://ftp.python.org/etc/passwd',
1399 ## 'gopher://gopher.micro.umn.edu/1/',
1400 'http://www.python.org/index.html',
1402 if hasattr(URLopener
, "open_https"):
1403 args
.append('https://synergy.as.cmu.edu/~geek/')
1406 print '-'*10, url
, '-'*10
1407 fn
, h
= urlretrieve(url
, None, reporthook
)
1411 for k
in h
.keys(): print k
+ ':', h
[k
]
1417 table
= string
.maketrans("", "")
1418 data
= data
.translate(table
, "\r")
1428 opts
, args
= getopt
.getopt(sys
.argv
[1:], "th")
1429 except getopt
.error
, msg
:
1431 print "Use -h for help"
1438 print "Usage: python urllib.py [-t] [url ...]"
1439 print "-t runs self-test;",
1440 print "otherwise, contents of urls are printed"
1448 print "Use -h for help"
1450 print urlopen(url
).read(),
1452 # Run test program when run as a script
1453 if __name__
== '__main__':