1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that is has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

__version__ = '1.15'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        return unquote(pathname)
    def pathname2url(pathname):
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object"""
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    if data is None:
        return _urlopener.open(url)
    else:
        return _urlopener.open(url, data)

def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()

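# Illustrative sketch, not part of the original module: a hypothetical
# helper showing how the shortcut functions above are typically used.
# It is never called here; the url argument is supplied by the caller.
def _example_fetch(url):
    # urlopen() returns a file-like object; info() gives a
    # mimetools.Message holding the response headers (for HTTP).
    f = urlopen(url)
    headers = f.info()
    content_type = headers.getheader('Content-Type')
    data = f.read()
    f.close()
    # urlretrieve() instead copies the resource to a (temporary) file
    # and returns its name together with the headers.
    filename, headers = urlretrieve(url)
    return content_type, data, filename
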
86 """Class to open URLs.
87 This is a class rather than just a subroutine because we may need
88 more than one set of global protocol-specific options.
89 Note -- this is a base class for those who don't want the
90 automatic handling of errors type 302 (relocated) and 401
91 (authorization needed)."""
95 version
= "Python-urllib/%s" % __version__
98 def __init__(self
, proxies
=None, **x509
):
100 proxies
= getproxies()
101 assert hasattr(proxies
, 'has_key'), "proxies must be a mapping"
102 self
.proxies
= proxies
103 self
.key_file
= x509
.get('key_file')
104 self
.cert_file
= x509
.get('cert_file')
105 self
.addheaders
= [('User-agent', self
.version
)]
106 self
.__tempfiles
= []
107 self
.__unlink
= os
.unlink
# See cleanup()
108 self
.tempcache
= None
109 # Undocumented feature: if you assign {} to tempcache,
110 # it is used to cache files retrieved with
111 # self.retrieve(). This is not enabled by default
112 # since it does not work for changing documents (and I
113 # haven't got the logic to check expiration headers
115 self
.ftpcache
= ftpcache
116 # Undocumented feature: you can use a different
117 # ftp cache by assigning to the .ftpcache member;
118 # in case you want logically independent URL openers
119 # XXX This is not threadsafe. Bah.
128 # This code sometimes runs when the rest of this module
129 # has already been deleted, so it can't use any globals
130 # or import anything.
132 for file in self
.__tempfiles
:
137 del self
.__tempfiles
[:]
139 self
.tempcache
.clear()
141 def addheader(self
, *args
):
142 """Add a header to be used by the HTTP interface only
143 e.g. u.addheader('Accept', 'sound/basic')"""
144 self
.addheaders
.append(args
)
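    # Illustrative sketch, not part of the original module: addheader()
    # stores (name, value) tuples in self.addheaders, which the HTTP
    # handlers replay on every request.  A hypothetical session:
    #
    #     opener = FancyURLopener()
    #     opener.addheader('Accept', 'text/html')
    #     opener.addheader('User-agent', 'MyApp/0.1')  # appended, not replaced
    #     f = opener.open('http://www.python.org/')
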
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        if self.tempcache and self.tempcache.has_key(fullurl):
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if self.proxies.has_key(urltype):
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        if '-' in name:
            # replace - with _
            name = '_'.join(name.split('-'))
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)

    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, None) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(toBytes(url))
        if self.tempcache and self.tempcache.has_key(url):
            return self.tempcache[url]
        type, url1 = splittype(url)
        if not filename and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url, data)
        headers = fp.info()
        if not filename:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            filename = tempfile.mktemp(suffix)
            self.__tempfiles.append(filename)
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        tfp = open(filename, 'wb')
        bs = 1024*8
        size = -1
        blocknum = 1
        if reporthook:
            if headers.has_key("content-length"):
                size = int(headers["Content-Length"])
            reporthook(0, bs, size)
        block = fp.read(bs)
        if reporthook:
            reporthook(1, bs, size)
        while block:
            tfp.write(block)
            block = fp.read(bs)
            blocknum = blocknum + 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp
        return result

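    # Illustrative sketch, not part of the original module: the reporthook
    # passed to retrieve()/urlretrieve() is called as
    # reporthook(blocknum, blocksize, totalsize), with totalsize -1 when
    # the server sent no Content-Length.  A hypothetical hook:
    #
    #     def show_progress(blocknum, blocksize, totalsize):
    #         if totalsize > 0:
    #             done = min(blocknum * blocksize, totalsize)
    #             print "%d of %d bytes" % (done, totalsize)
    #         else:
    #             print "%d blocks read" % blocknum
    #
    #     urlretrieve('http://www.python.org/', 'python-home.html',
    #                 show_progress)
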
    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        if type(url) is types.StringType:
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')
        if user_passwd:
            import base64
            auth = base64.encodestring(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: apply(h.putheader, args)
        h.endheaders()
        if data is not None:
            h.send(data + '\r\n')
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            return addinfourl(fp, headers, "http:" + url)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)

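    # Illustrative sketch, not part of the original module: a subclass can
    # hook a specific status code simply by defining http_error_DDD, e.g.
    #
    #     class StrictOpener(FancyURLopener):
    #         def http_error_404(self, url, fp, errcode, errmsg,
    #                            headers, data=None):
    #             raise IOError, ('http error', errcode, errmsg, headers)
    #
    # Returning a false value from such a handler makes http_error() fall
    # back to http_error_default(); returning an object makes that object
    # the result of open().
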
    if hasattr(socket, "ssl"):
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            import httplib
            user_passwd = None
            if type(url) is types.StringType:
                host, selector = splithost(url)
                if host:
                    user_passwd, host = splituser(host)
                    host = unquote(host)
                realhost = host
            else:
                host, selector = url
                urltype, rest = splittype(selector)
                url = rest
                user_passwd = None
                if urltype.lower() != 'https':
                    realhost = None
                else:
                    realhost, rest = splithost(rest)
                    if realhost:
                        user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
            if user_passwd:
                import base64
                auth = base64.encodestring(user_passwd).strip()
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            if realhost: h.putheader('Host', realhost)
            for args in self.addheaders: apply(h.putheader, args)
            h.endheaders()
            if data is not None:
                h.send(data + '\r\n')
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == 200:
                return addinfourl(fp, headers, url)
            else:
                if data is None:
                    return self.http_error(url, fp, errcode, errmsg, headers)
                else:
                    return self.http_error(url, fp, errcode, errmsg, headers,
                                           data)

    def open_gopher(self, url):
        """Use Gopher protocol."""
        import gopherlib
        host, selector = splithost(url)
        if not host: raise IOError, ('gopher error', 'no host given')
        host = unquote(host)
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), "gopher:" + url)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if url[:2] == '//' and url[2:3] != '/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, StringIO
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO.StringIO(
            'Content-Type: %s\n' % (mtype or 'text/plain')))
        host, file = splithost(url)
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(url2pathname(file), 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(url2pathname(file), 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not self.ftpcache.has_key(key):
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            if retrlen is not None and retrlen >= 0:
                import mimetools, StringIO
                headers = mimetools.Message(StringIO.StringIO(
                    'Content-Length: %d\n' % retrlen))
            else:
                headers = noheaders()
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def open_data(self, url, data=None):
        """Use "data" URL."""
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import StringIO, mimetools, time
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        f = StringIO.StringIO(msg)
        headers = mimetools.Message(f, 0)
        f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)

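# Illustrative sketch, not part of the original module: a hypothetical helper
# showing the "data" URL form handled by open_data() above.  quote() is used
# so the payload survives URL syntax; the opener hands back the decoded bytes.
def _example_data_url(text):
    url = 'data:text/plain;charset=US-ASCII,' + quote(text)
    f = urlopen(url)
    headers = f.info()          # carries Content-type and Content-length
    body = f.read()             # equals text for plain ASCII input
    f.close()
    return headers, body
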
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args):
        apply(URLopener.__init__, (self,) + args)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries = self.tries + 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        if not headers.has_key('www-authenticate'):
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        key = realm + '@' + host.lower()
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None

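# Illustrative sketch, not part of the original module: a GUI or batch
# program would subclass FancyURLopener and override prompt_user_passwd()
# so that basic-auth retries do not block on the terminal.  The class name
# and stored credentials below are hypothetical placeholders.
class _ExampleNonInteractiveOpener(FancyURLopener):
    def __init__(self, user, passwd, *args):
        FancyURLopener.__init__(self, *args)
        self.__user = user
        self.__passwd = passwd
    def prompt_user_passwd(self, host, realm):
        # Called by retry_*_basic_auth() via get_user_passwd()
        return self.__user, self.__passwd
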
644 """Return the IP address of the magic hostname 'localhost'."""
647 _localhost
= socket
.gethostbyname('localhost')
652 """Return the IP address of the current host."""
655 _thishost
= socket
.gethostbyname(socket
.gethostname())
660 """Return the set of errors raised by the FTP class."""
664 _ftperrors
= ftplib
.all_errors
669 """Return an empty mimetools.Message object."""
674 _noheaders
= mimetools
.Message(StringIO
.StringIO(), 0)
675 _noheaders
.fp
.close() # Recycle file descriptor
682 """Class used by open_ftp() for cache of open FTP connections."""
684 def __init__(self
, user
, passwd
, host
, port
, dirs
):
695 self
.ftp
= ftplib
.FTP()
696 self
.ftp
.connect(self
.host
, self
.port
)
697 self
.ftp
.login(self
.user
, self
.passwd
)
698 for dir in self
.dirs
:
701 def retrfile(self
, file, type):
704 if type in ('d', 'D'): cmd
= 'TYPE A'; isdir
= 1
705 else: cmd
= 'TYPE ' + type; isdir
= 0
707 self
.ftp
.voidcmd(cmd
)
708 except ftplib
.all_errors
:
710 self
.ftp
.voidcmd(cmd
)
712 if file and not isdir
:
713 # Use nlst to see if the file exists at all
716 except ftplib
.error_perm
, reason
:
717 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
718 # Restore the transfer mode!
719 self
.ftp
.voidcmd(cmd
)
720 # Try to retrieve as a file
723 conn
= self
.ftp
.ntransfercmd(cmd
)
724 except ftplib
.error_perm
, reason
:
725 if str(reason
)[:3] != '550':
726 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
728 # Set transfer mode to ASCII!
729 self
.ftp
.voidcmd('TYPE A')
730 # Try a directory listing
731 if file: cmd
= 'LIST ' + file
733 conn
= self
.ftp
.ntransfercmd(cmd
)
735 # Pass back both a suitably decorated object and a retrieval length
736 return (addclosehook(conn
[0].makefile('rb'),
737 self
.endtransfer
), conn
[1])
738 def endtransfer(self
):
755 """Base class for addinfo and addclosehook."""
757 def __init__(self
, fp
):
759 self
.read
= self
.fp
.read
760 self
.readline
= self
.fp
.readline
761 if hasattr(self
.fp
, "readlines"): self
.readlines
= self
.fp
.readlines
762 if hasattr(self
.fp
, "fileno"): self
.fileno
= self
.fp
.fileno
765 return '<%s at %s whose fp = %s>' % (self
.__class
__.__name
__,
766 `
id(self
)`
, `self
.fp`
)
771 self
.readlines
= None
773 if self
.fp
: self
.fp
.close()
776 class addclosehook(addbase
):
777 """Class to add a close hook to an open file."""
779 def __init__(self
, fp
, closehook
, *hookargs
):
780 addbase
.__init
__(self
, fp
)
781 self
.closehook
= closehook
782 self
.hookargs
= hookargs
787 apply(self
.closehook
, self
.hookargs
)
788 self
.closehook
= None
791 class addinfo(addbase
):
792 """class to add an info() method to an open file."""
794 def __init__(self
, fp
, headers
):
795 addbase
.__init
__(self
, fp
)
796 self
.headers
= headers
801 class addinfourl(addbase
):
802 """class to add info() and geturl() methods to an open file."""
804 def __init__(self
, fp
, headers
, url
):
805 addbase
.__init
__(self
, fp
)
806 self
.headers
= headers
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL."""
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = basepath.rfind('/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            i = basepath[:-1].rfind('/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                basepath = '/'
                break
            else:
                basepath = ''
        path = basepath + path
    if host and path and path[0] != '/':
        path = '/' + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path

# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'

def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    if type(url) is types.UnicodeType:
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

912 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
914 if _typeprog
is None:
916 _typeprog
= re
.compile('^([^/:]+):')
918 match
= _typeprog
.match(url
)
920 scheme
= match
.group(1)
921 return scheme
.lower(), url
[len(scheme
) + 1:]
926 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
928 if _hostprog
is None:
930 _hostprog
= re
.compile('^//([^/]*)(.*)$')
932 match
= _hostprog
.match(url
)
933 if match
: return match
.group(1, 2)
938 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
940 if _userprog
is None:
942 _userprog
= re
.compile('^([^@]*)@(.*)$')
944 match
= _userprog
.match(host
)
945 if match
: return map(unquote
, match
.group(1, 2))
949 def splitpasswd(user
):
950 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
952 if _passwdprog
is None:
954 _passwdprog
= re
.compile('^([^:]*):(.*)$')
956 match
= _passwdprog
.match(user
)
957 if match
: return match
.group(1, 2)
960 # splittag('/path#tag') --> '/path', 'tag'
963 """splitport('host:port') --> 'host', 'port'."""
965 if _portprog
is None:
967 _portprog
= re
.compile('^(.*):([0-9]+)$')
969 match
= _portprog
.match(host
)
970 if match
: return match
.group(1, 2)
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError, "no digits"
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector

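# Illustrative examples, not part of the original module, of the parsing
# helpers above applied to a typical URL (results worked out by hand):
#
#     splittype('http://www.python.org:80/doc?x=1#top')
#         -> ('http', '//www.python.org:80/doc?x=1#top')
#     splithost('//www.python.org:80/doc?x=1#top')
#         -> ('www.python.org:80', '/doc?x=1#top')
#     splitport('www.python.org:80')   -> ('www.python.org', '80')
#     splitquery('/doc?x=1')           -> ('/doc', 'x=1')
#     splittag('/doc#top')             -> ('/doc', 'top')
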
1044 """unquote('abc%20def') -> 'abc def'."""
1049 myappend
= res
.append
1054 myappend(mychr(myatoi(item
[:2], 16))
1057 myappend('%' + item
)
1059 myappend('%' + item
)
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    if '+' in s:
        # replace '+' with ' '
        s = ' '.join(s.split('+'))
    return unquote(s)

always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for c in _fast_safe_test:
            _fast_safe[c] = c
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if not _fast_safe.has_key(c):
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        return _fast_quote(s)
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if c not in safe:
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        l = s.split(' ')
        for i in range(len(l)):
            l[i] = quote(l[i], safe)
        return '+'.join(l)
    else:
        return quote(s, safe)

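# Illustrative examples, not part of the original module (values follow
# directly from the definitions above):
#
#     quote('/~user/file name')      -> '/%7Euser/file%20name'
#     quote_plus('a&b c')            -> 'a%26b+c'
#     unquote('abc%20def')           -> 'abc def'
#     unquote_plus('abc+def%21')     -> 'abc def!'
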
def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            x = len(query)
            # non-empty strings will fail this
            if len(query) and type(query[0]) != types.TupleType:
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if type(v) == types.StringType:
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif type(v) == types.UnicodeType:
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII", "replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)

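# Illustrative examples, not part of the original module:
#
#     urlencode([('q', 'python urllib'), ('page', 1)])
#         -> 'q=python+urllib&page=1'
#     urlencode({'tag': ['a', 'b']}, doseq=1)
#         -> 'tag=a&tag=b'
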
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies

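# Illustrative sketch, not part of the original module: with the environment
# variable http_proxy set to e.g. 'http://proxy.example.com:3128/' (a
# placeholder host), getproxies_environment() returns
# {'http': 'http://proxy.example.com:3128/'} and URLopener routes http
# requests through that host.  An explicit mapping can also be passed in:
#
#     opener = FancyURLopener({'http': 'http://proxy.example.com:3128/'})
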
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HTTPProxyHost entry.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        proxies[protocol] = '%s://%s' % (protocol, address)
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

else:
    # By default use environment variables
    getproxies = getproxies_environment

# Test and time quote() and unquote()
def test1():
    import time
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print `s`
    print `qs`
    print `uqs`
    print round(t1 - t0, 3), 'sec'


def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)

# Test program
def test(args=[]):
    if not args:
        args = [
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
        print '-'*40
    finally:
        urlcleanup()

def main():
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
            return
        for url in args:
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()