1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that it has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
31 __version__
= '1.12' # XXX This version is not always updated :-(
33 MAXFTPCACHE
= 10 # Trim the ftp cache beyond this size
35 # Helper for non-unix systems
37 from macurl2path
import url2pathname
, pathname2url
39 from nturl2path
import url2pathname
, pathname2url
def url2pathname(pathname):
    """Turn the path part of a URL into a local pathname.

    This variant only reverses %XX escaping: ``pathname`` is handed to
    unquote() and the decoded string is returned unchanged otherwise.
    """
    decoded = unquote(pathname)
    return decoded
def pathname2url(pathname):
    """Turn a local pathname into the path part of a URL.

    This variant only applies %XX escaping: ``pathname`` is handed to
    quote() (with its default set of safe characters) and the encoded
    string is returned.
    """
    encoded = quote(pathname)
    return encoded
46 # This really consists of two pieces:
47 # (1) a class which handles opening of all sorts of URLs
48 # (plus assorted utilities etc.)
49 # (2) a set of functions for parsing URLs
50 # XXX Should these be separated out into different modules?
53 # Shortcut for basic usage
55 def urlopen(url
, data
=None):
58 _urlopener
= FancyURLopener()
60 return _urlopener
.open(url
)
62 return _urlopener
.open(url
, data
)
63 def urlretrieve(url
, filename
=None, reporthook
=None):
66 _urlopener
= FancyURLopener()
67 return _urlopener
.retrieve(url
, filename
, reporthook
)
75 """Class to open URLs.
76 This is a class rather than just a subroutine because we may need
77 more than one set of global protocol-specific options.
78 Note -- this is a base class for those who don't want the
79 automatic handling of errors type 302 (relocated) and 401
80 (authorization needed)."""
85 def __init__(self
, proxies
=None, **x509
):
87 proxies
= getproxies()
88 assert hasattr(proxies
, 'has_key'), "proxies must be a mapping"
89 self
.proxies
= proxies
90 self
.key_file
= x509
.get('key_file')
91 self
.cert_file
= x509
.get('cert_file')
92 server_version
= "Python-urllib/%s" % __version__
93 self
.addheaders
= [('User-agent', server_version
)]
95 self
.__unlink
= os
.unlink
# See cleanup()
97 # Undocumented feature: if you assign {} to tempcache,
98 # it is used to cache files retrieved with
99 # self.retrieve(). This is not enabled by default
100 # since it does not work for changing documents (and I
101 # haven't got the logic to check expiration headers
103 self
.ftpcache
= ftpcache
104 # Undocumented feature: you can use a different
105 # ftp cache by assigning to the .ftpcache member;
106 # in case you want logically independent URL openers
107 # XXX This is not threadsafe. Bah.
116 # This code sometimes runs when the rest of this module
117 # has already been deleted, so it can't use any globals
118 # or import anything.
120 for file in self
.__tempfiles
:
125 del self
.__tempfiles
[:]
127 self
.tempcache
.clear()
def addheader(self, *args):
    """Register an extra header for requests made through the HTTP interface.

    The positional arguments are kept together as a single tuple and
    appended to ``self.addheaders``, e.g.

        u.addheader('Accept', 'sound/basic')
    """
    header_list = self.addheaders
    # Mutate the existing list in place so other references to it see the
    # new entry as well.
    header_list.append(args)
135 def open(self
, fullurl
, data
=None):
136 """Use URLopener().open(file) instead of open(file, 'r')."""
137 fullurl
= unwrap(fullurl
)
138 if self
.tempcache
and self
.tempcache
.has_key(fullurl
):
139 filename
, headers
= self
.tempcache
[fullurl
]
140 fp
= open(filename
, 'rb')
141 return addinfourl(fp
, headers
, fullurl
)
142 type, url
= splittype(fullurl
)
143 if not type: type = 'file'
144 if self
.proxies
.has_key(type):
145 proxy
= self
.proxies
[type]
146 type, proxy
= splittype(proxy
)
147 host
, selector
= splithost(proxy
)
148 url
= (host
, fullurl
) # Signal special case to open_*()
149 name
= 'open_' + type
153 name
= string
.join(string
.split(name
, '-'), '_')
154 if not hasattr(self
, name
):
156 return self
.open_unknown(fullurl
)
158 return self
.open_unknown(fullurl
, data
)
161 return getattr(self
, name
)(url
)
163 return getattr(self
, name
)(url
, data
)
164 except socket
.error
, msg
:
165 raise IOError, ('socket error', msg
), sys
.exc_info()[2]
def open_unknown(self, fullurl, data=None):
    """Overridable hook for URLs whose type has no opener.

    The base implementation splits the scheme off ``fullurl`` with
    splittype() and always raises
    IOError('url error', 'unknown url type', <scheme>).
    ``data`` is accepted but not used here.
    """
    scheme, _remainder = splittype(fullurl)
    # Call form of the raise; the exception carries the same three args.
    raise IOError('url error', 'unknown url type', scheme)
173 def retrieve(self
, url
, filename
=None, reporthook
=None):
174 """retrieve(url) returns (filename, None) for a local object
175 or (tempfilename, headers) for a remote object."""
177 if self
.tempcache
and self
.tempcache
.has_key(url
):
178 return self
.tempcache
[url
]
179 type, url1
= splittype(url
)
180 if not filename
and (not type or type == 'file'):
182 fp
= self
.open_local_file(url1
)
185 return url2pathname(splithost(url1
)[1]), hdrs
192 garbage
, path
= splittype(url
)
193 garbage
, path
= splithost(path
or "")
194 path
, garbage
= splitquery(path
or "")
195 path
, garbage
= splitattr(path
or "")
196 suffix
= os
.path
.splitext(path
)[1]
197 filename
= tempfile
.mktemp(suffix
)
198 self
.__tempfiles
.append(filename
)
199 result
= filename
, headers
200 if self
.tempcache
is not None:
201 self
.tempcache
[url
] = result
202 tfp
= open(filename
, 'wb')
207 if headers
.has_key("content-length"):
208 size
= int(headers
["Content-Length"])
209 reporthook(0, bs
, size
)
212 reporthook(1, bs
, size
)
216 blocknum
= blocknum
+ 1
218 reporthook(blocknum
, bs
, size
)
225 # Each method named open_<type> knows how to open that type of URL
227 def open_http(self
, url
, data
=None):
228 """Use HTTP protocol."""
231 if type(url
) is type(""):
232 host
, selector
= splithost(url
)
234 user_passwd
, host
= splituser(host
)
239 urltype
, rest
= splittype(selector
)
242 if string
.lower(urltype
) != 'http':
245 realhost
, rest
= splithost(rest
)
247 user_passwd
, realhost
= splituser(realhost
)
249 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
250 #print "proxy via http:", host, selector
251 if not host
: raise IOError, ('http error', 'no host given')
254 auth
= string
.strip(base64
.encodestring(user_passwd
))
257 h
= httplib
.HTTP(host
)
259 h
.putrequest('POST', selector
)
260 h
.putheader('Content-type', 'application/x-www-form-urlencoded')
261 h
.putheader('Content-length', '%d' % len(data
))
263 h
.putrequest('GET', selector
)
264 if auth
: h
.putheader('Authorization', 'Basic %s' % auth
)
265 if realhost
: h
.putheader('Host', realhost
)
266 for args
in self
.addheaders
: apply(h
.putheader
, args
)
269 h
.send(data
+ '\r\n')
270 errcode
, errmsg
, headers
= h
.getreply()
273 return addinfourl(fp
, headers
, "http:" + url
)
276 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
278 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
, data
)
280 def http_error(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
281 """Handle http errors.
282 Derived class can override this, or provide specific handlers
283 named http_error_DDD where DDD is the 3-digit error code."""
284 # First check if there's a specific handler for this error
285 name
= 'http_error_%d' % errcode
286 if hasattr(self
, name
):
287 method
= getattr(self
, name
)
289 result
= method(url
, fp
, errcode
, errmsg
, headers
)
291 result
= method(url
, fp
, errcode
, errmsg
, headers
, data
)
292 if result
: return result
293 return self
.http_error_default(url
, fp
, errcode
, errmsg
, headers
)
295 def http_error_default(self
, url
, fp
, errcode
, errmsg
, headers
):
296 """Default error handler: close the connection and raise IOError."""
299 raise IOError, ('http error', errcode
, errmsg
, headers
)
301 if hasattr(socket
, "ssl"):
302 def open_https(self
, url
, data
=None):
303 """Use HTTPS protocol."""
305 if type(url
) is type(""):
306 host
, selector
= splithost(url
)
307 user_passwd
, host
= splituser(host
)
310 urltype
, rest
= splittype(selector
)
311 if string
.lower(urltype
) == 'https':
312 realhost
, rest
= splithost(rest
)
313 user_passwd
, realhost
= splituser(realhost
)
315 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
316 #print "proxy via https:", host, selector
317 if not host
: raise IOError, ('https error', 'no host given')
320 auth
= string
.strip(base64
.encodestring(user_passwd
))
323 h
= httplib
.HTTPS(host
, 0,
324 key_file
=self
.key_file
,
325 cert_file
=self
.cert_file
)
327 h
.putrequest('POST', selector
)
328 h
.putheader('Content-type',
329 'application/x-www-form-urlencoded')
330 h
.putheader('Content-length', '%d' % len(data
))
332 h
.putrequest('GET', selector
)
333 if auth
: h
.putheader('Authorization: Basic %s' % auth
)
334 for args
in self
.addheaders
: apply(h
.putheader
, args
)
337 h
.send(data
+ '\r\n')
338 errcode
, errmsg
, headers
= h
.getreply()
341 return addinfourl(fp
, headers
, url
)
343 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
345 def open_gopher(self
, url
):
346 """Use Gopher protocol."""
348 host
, selector
= splithost(url
)
349 if not host
: raise IOError, ('gopher error', 'no host given')
351 type, selector
= splitgophertype(selector
)
352 selector
, query
= splitquery(selector
)
353 selector
= unquote(selector
)
355 query
= unquote(query
)
356 fp
= gopherlib
.send_query(selector
, query
, host
)
358 fp
= gopherlib
.send_selector(selector
, host
)
359 return addinfourl(fp
, noheaders(), "gopher:" + url
)
361 def open_file(self
, url
):
362 """Use local file or FTP depending on form of URL."""
363 if url
[:2] == '//' and url
[2:3] != '/':
364 return self
.open_ftp(url
)
366 return self
.open_local_file(url
)
368 def open_local_file(self
, url
):
369 """Use local file."""
370 import mimetypes
, mimetools
, StringIO
371 mtype
= mimetypes
.guess_type(url
)[0]
372 headers
= mimetools
.Message(StringIO
.StringIO(
373 'Content-Type: %s\n' % (mtype
or 'text/plain')))
374 host
, file = splithost(url
)
378 urlfile
= 'file://' + file
379 return addinfourl(open(url2pathname(file), 'rb'),
381 host
, port
= splitport(host
)
383 and socket
.gethostbyname(host
) in (localhost(), thishost()):
386 urlfile
= 'file://' + file
387 return addinfourl(open(url2pathname(file), 'rb'),
389 raise IOError, ('local file error', 'not on local host')
391 def open_ftp(self
, url
):
392 """Use FTP protocol."""
393 host
, path
= splithost(url
)
394 if not host
: raise IOError, ('ftp error', 'no host given')
395 host
, port
= splitport(host
)
396 user
, host
= splituser(host
)
397 if user
: user
, passwd
= splitpasswd(user
)
400 user
= unquote(user
or '')
401 passwd
= unquote(passwd
or '')
402 host
= socket
.gethostbyname(host
)
405 port
= ftplib
.FTP_PORT
408 path
, attrs
= splitattr(path
)
410 dirs
= string
.splitfields(path
, '/')
411 dirs
, file = dirs
[:-1], dirs
[-1]
412 if dirs
and not dirs
[0]: dirs
= dirs
[1:]
413 if dirs
and not dirs
[0]: dirs
[0] = '/'
414 key
= (user
, host
, port
, string
.joinfields(dirs
, '/'))
416 if len(self
.ftpcache
) > MAXFTPCACHE
:
417 # Prune the cache, rather arbitrarily
418 for k
in self
.ftpcache
.keys():
424 if not self
.ftpcache
.has_key(key
):
425 self
.ftpcache
[key
] = \
426 ftpwrapper(user
, passwd
, host
, port
, dirs
)
427 if not file: type = 'D'
430 attr
, value
= splitvalue(attr
)
431 if string
.lower(attr
) == 'type' and \
432 value
in ('a', 'A', 'i', 'I', 'd', 'D'):
433 type = string
.upper(value
)
434 (fp
, retrlen
) = self
.ftpcache
[key
].retrfile(file, type)
435 if retrlen
is not None and retrlen
>= 0:
436 import mimetools
, StringIO
437 headers
= mimetools
.Message(StringIO
.StringIO(
438 'Content-Length: %d\n' % retrlen
))
440 headers
= noheaders()
441 return addinfourl(fp
, headers
, "ftp:" + url
)
442 except ftperrors(), msg
:
443 raise IOError, ('ftp error', msg
), sys
.exc_info()[2]
445 def open_data(self
, url
, data
=None):
446 """Use "data" URL."""
449 # syntax of data URLs:
450 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
451 # mediatype := [ type "/" subtype ] *( ";" parameter )
453 # parameter := attribute "=" value
454 import StringIO
, mimetools
, time
456 [type, data
] = string
.split(url
, ',', 1)
458 raise IOError, ('data error', 'bad data URL')
460 type = 'text/plain;charset=US-ASCII'
461 semi
= string
.rfind(type, ';')
462 if semi
>= 0 and '=' not in type[semi
:]:
463 encoding
= type[semi
+1:]
468 msg
.append('Date: %s'%time
.strftime('%a, %d %b %Y %T GMT',
469 time
.gmtime(time
.time())))
470 msg
.append('Content-type: %s' % type)
471 if encoding
== 'base64':
473 data
= base64
.decodestring(data
)
476 msg
.append('Content-length: %d' % len(data
))
479 msg
= string
.join(msg
, '\n')
480 f
= StringIO
.StringIO(msg
)
481 headers
= mimetools
.Message(f
, 0)
482 f
.fileno
= None # needed for addinfourl
483 return addinfourl(f
, headers
, url
)
486 class FancyURLopener(URLopener
):
487 """Derived class with handlers for errors we can handle (perhaps)."""
489 def __init__(self
, *args
):
490 apply(URLopener
.__init
__, (self
,) + args
)
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Fallback HTTP error handler that returns the response instead of raising.

    The reply body ``fp`` and its ``headers`` are wrapped in an
    addinfourl object labelled with "http:" + url; ``errcode`` and
    ``errmsg`` are not used.
    """
    labelled_url = "http:" + url
    return addinfourl(fp, headers, labelled_url)
497 def http_error_302(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
498 """Error 302 -- relocated (temporarily)."""
499 # XXX The server can force infinite recursion here!
500 if headers
.has_key('location'):
501 newurl
= headers
['location']
502 elif headers
.has_key('uri'):
503 newurl
= headers
['uri']
508 # In case the server sent a relative URL, join with original:
509 newurl
= basejoin("http:" + url
, newurl
)
511 return self
.open(newurl
)
513 return self
.open(newurl
, data
)
def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 301 -- relocated (permanently).

    Treated exactly like a 302: every argument is forwarded unchanged to
    http_error_302() and its result is returned.
    """
    handle_redirect = self.http_error_302
    return handle_redirect(url, fp, errcode, errmsg, headers, data)
519 def http_error_401(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
520 """Error 401 -- authentication required.
521 See this URL for a description of the basic authentication scheme:
522 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
523 if headers
.has_key('www-authenticate'):
524 stuff
= headers
['www-authenticate']
526 match
= re
.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff
)
528 scheme
, realm
= match
.groups()
529 if string
.lower(scheme
) == 'basic':
530 name
= 'retry_' + self
.type + '_basic_auth'
532 return getattr(self
,name
)(url
, realm
)
534 return getattr(self
,name
)(url
, realm
, data
)
536 def retry_http_basic_auth(self
, url
, realm
, data
=None):
537 host
, selector
= splithost(url
)
538 i
= string
.find(host
, '@') + 1
540 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
541 if not (user
or passwd
): return None
542 host
= user
+ ':' + passwd
+ '@' + host
543 newurl
= 'http://' + host
+ selector
545 return self
.open(newurl
)
547 return self
.open(newurl
, data
)
549 def retry_https_basic_auth(self
, url
, realm
, data
=None):
550 host
, selector
= splithost(url
)
551 i
= string
.find(host
, '@') + 1
553 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
554 if not (user
or passwd
): return None
555 host
= user
+ ':' + passwd
+ '@' + host
556 newurl
= '//' + host
+ selector
557 return self
.open_https(newurl
)
559 def get_user_passwd(self
, host
, realm
, clear_cache
= 0):
560 key
= realm
+ '@' + string
.lower(host
)
561 if self
.auth_cache
.has_key(key
):
563 del self
.auth_cache
[key
]
565 return self
.auth_cache
[key
]
566 user
, passwd
= self
.prompt_user_passwd(host
, realm
)
567 if user
or passwd
: self
.auth_cache
[key
] = (user
, passwd
)
570 def prompt_user_passwd(self
, host
, realm
):
571 """Override this in a GUI environment!"""
574 user
= raw_input("Enter username for %s at %s: " % (realm
,
576 passwd
= getpass
.getpass("Enter password for %s in %s at %s: " %
579 except KeyboardInterrupt:
588 """Return the IP address of the magic hostname 'localhost'."""
591 _localhost
= socket
.gethostbyname('localhost')
596 """Return the IP address of the current host."""
599 _thishost
= socket
.gethostbyname(socket
.gethostname())
604 """Return the set of errors raised by the FTP class."""
608 _ftperrors
= ftplib
.all_errors
613 """Return an empty mimetools.Message object."""
618 _noheaders
= mimetools
.Message(StringIO
.StringIO(), 0)
619 _noheaders
.fp
.close() # Recycle file descriptor
626 """Class used by open_ftp() for cache of open FTP connections."""
628 def __init__(self
, user
, passwd
, host
, port
, dirs
):
639 self
.ftp
= ftplib
.FTP()
640 self
.ftp
.connect(self
.host
, self
.port
)
641 self
.ftp
.login(self
.user
, self
.passwd
)
642 for dir in self
.dirs
:
645 def retrfile(self
, file, type):
648 if type in ('d', 'D'): cmd
= 'TYPE A'; isdir
= 1
649 else: cmd
= 'TYPE ' + type; isdir
= 0
651 self
.ftp
.voidcmd(cmd
)
652 except ftplib
.all_errors
:
654 self
.ftp
.voidcmd(cmd
)
656 if file and not isdir
:
657 # Use nlst to see if the file exists at all
660 except ftplib
.error_perm
, reason
:
661 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
662 # Restore the transfer mode!
663 self
.ftp
.voidcmd(cmd
)
664 # Try to retrieve as a file
667 conn
= self
.ftp
.ntransfercmd(cmd
)
668 except ftplib
.error_perm
, reason
:
669 if reason
[:3] != '550':
670 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
672 # Set transfer mode to ASCII!
673 self
.ftp
.voidcmd('TYPE A')
674 # Try a directory listing
675 if file: cmd
= 'LIST ' + file
677 conn
= self
.ftp
.ntransfercmd(cmd
)
679 # Pass back both a suitably decorated object and a retrieval length
680 return (addclosehook(conn
[0].makefile('rb'),
681 self
.endtransfer
), conn
[1])
682 def endtransfer(self
):
699 """Base class for addinfo and addclosehook."""
701 def __init__(self
, fp
):
703 self
.read
= self
.fp
.read
704 self
.readline
= self
.fp
.readline
705 if hasattr(self
.fp
, "readlines"): self
.readlines
= self
.fp
.readlines
706 if hasattr(self
.fp
, "fileno"): self
.fileno
= self
.fp
.fileno
709 return '<%s at %s whose fp = %s>' % (self
.__class
__.__name
__,
710 `
id(self
)`
, `self
.fp`
)
715 self
.readlines
= None
717 if self
.fp
: self
.fp
.close()
720 class addclosehook(addbase
):
721 """Class to add a close hook to an open file."""
723 def __init__(self
, fp
, closehook
, *hookargs
):
724 addbase
.__init
__(self
, fp
)
725 self
.closehook
= closehook
726 self
.hookargs
= hookargs
731 apply(self
.closehook
, self
.hookargs
)
732 self
.closehook
= None
735 class addinfo(addbase
):
736 """class to add an info() method to an open file."""
738 def __init__(self
, fp
, headers
):
739 addbase
.__init
__(self
, fp
)
740 self
.headers
= headers
745 class addinfourl(addbase
):
746 """class to add info() and geturl() methods to an open file."""
748 def __init__(self
, fp
, headers
, url
):
749 addbase
.__init
__(self
, fp
)
750 self
.headers
= headers
760 def basejoin(base
, url
):
761 """Utility to combine a URL with a base URL to form a new URL."""
762 type, path
= splittype(url
)
764 # if url is complete (i.e., it contains a type), return it
766 host
, path
= splithost(path
)
767 type, basepath
= splittype(base
) # inherit type from base
769 # if url contains host, just inherit type
770 if type: return type + '://' + host
+ path
772 # no type inherited, so url must have started with //
775 host
, basepath
= splithost(basepath
) # inherit host
776 basepath
, basetag
= splittag(basepath
) # remove extraneous cruft
777 basepath
, basequery
= splitquery(basepath
) # idem
779 # non-absolute path name
780 if path
[:1] in ('#', '?'):
781 # path is just a tag or query, attach to basepath
784 # else replace last component
785 i
= string
.rfind(basepath
, '/')
787 # basepath not absolute
789 # host present, make absolute
792 # else keep non-absolute
795 # remove last file component
796 basepath
= basepath
[:i
+1]
797 # Interpret ../ (important because of symlinks)
798 while basepath
and path
[:3] == '../':
800 i
= string
.rfind(basepath
[:-1], '/')
802 basepath
= basepath
[:i
+1]
809 path
= basepath
+ path
810 if type and host
: return type + '://' + host
+ path
811 elif type: return type + ':' + path
812 elif host
: return '//' + host
+ path
# don't know what this means
816 # Utilities to parse URLs (most of these return None for missing parts):
817 # unwrap('<URL:type://host/path>') --> 'type://host/path'
818 # splittype('type:opaquestring') --> 'type', 'opaquestring'
819 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
820 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
821 # splitpasswd('user:passwd') -> 'user', 'passwd'
822 # splitport('host:port') --> 'host', 'port'
823 # splitquery('/path?query') --> '/path', 'query'
824 # splittag('/path#tag') --> '/path', 'tag'
825 # splitattr('/path;attr1=value1;attr2=value2;...') ->
826 # '/path', ['attr1=value1', 'attr2=value2', ...]
827 # splitvalue('attr=value') --> 'attr', 'value'
828 # splitgophertype('/Xselector') --> 'X', 'selector'
829 # unquote('abc%20def') -> 'abc def'
830 # quote('abc def') -> 'abc%20def'
833 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
834 url
= string
.strip(url
)
835 if url
[:1] == '<' and url
[-1:] == '>':
836 url
= string
.strip(url
[1:-1])
837 if url
[:4] == 'URL:': url
= string
.strip(url
[4:])
842 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
844 if _typeprog
is None:
846 _typeprog
= re
.compile('^([^/:]+):')
848 match
= _typeprog
.match(url
)
850 scheme
= match
.group(1)
851 return scheme
.lower(), url
[len(scheme
) + 1:]
856 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
858 if _hostprog
is None:
860 _hostprog
= re
.compile('^//([^/]*)(.*)$')
862 match
= _hostprog
.match(url
)
863 if match
: return match
.group(1, 2)
868 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
870 if _userprog
is None:
872 _userprog
= re
.compile('^([^@]*)@(.*)$')
874 match
= _userprog
.match(host
)
875 if match
: return match
.group(1, 2)
879 def splitpasswd(user
):
880 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
882 if _passwdprog
is None:
884 _passwdprog
= re
.compile('^([^:]*):(.*)$')
886 match
= _passwdprog
.match(user
)
887 if match
: return match
.group(1, 2)
890 # splitport('host:port') --> 'host', 'port'
893 """splitport('host:port') --> 'host', 'port'."""
895 if _portprog
is None:
897 _portprog
= re
.compile('^(.*):([0-9]+)$')
899 match
= _portprog
.match(host
)
900 if match
: return match
.group(1, 2)
904 def splitnport(host
, defport
=-1):
905 """Split host and port, returning numeric port.
906 Return given default port if no ':' found; defaults to -1.
907 Return numerical port if a valid number are found after ':'.
908 Return None if ':' but not a valid number."""
910 if _nportprog
is None:
912 _nportprog
= re
.compile('^(.*):(.*)$')
914 match
= _nportprog
.match(host
)
916 host
, port
= match
.group(1, 2)
918 if not port
: raise string
.atoi_error
, "no digits"
919 nport
= string
.atoi(port
)
920 except string
.atoi_error
:
927 """splitquery('/path?query') --> '/path', 'query'."""
929 if _queryprog
is None:
931 _queryprog
= re
.compile('^(.*)\?([^?]*)$')
933 match
= _queryprog
.match(url
)
934 if match
: return match
.group(1, 2)
939 """splittag('/path#tag') --> '/path', 'tag'."""
943 _tagprog
= re
.compile('^(.*)#([^#]*)$')
945 match
= _tagprog
.match(url
)
946 if match
: return match
.group(1, 2)
950 """splitattr('/path;attr1=value1;attr2=value2;...') ->
951 '/path', ['attr1=value1', 'attr2=value2', ...]."""
952 words
= string
.splitfields(url
, ';')
953 return words
[0], words
[1:]
956 def splitvalue(attr
):
957 """splitvalue('attr=value') --> 'attr', 'value'."""
959 if _valueprog
is None:
961 _valueprog
= re
.compile('^([^=]*)=(.*)$')
963 match
= _valueprog
.match(attr
)
964 if match
: return match
.group(1, 2)
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    When ``selector`` does not start with '/' followed by at least one
    more character, the result is (None, selector) with the selector
    returned untouched.
    """
    carries_type = selector[:1] == '/' and selector[1:2]
    if not carries_type:
        return None, selector
    # The single character after the leading slash is the gopher type;
    # everything after it is the real selector.
    gopher_type = selector[1]
    remainder = selector[2:]
    return gopher_type, remainder
974 """unquote('abc%20def') -> 'abc def'."""
977 list = string
.split(s
, '%')
979 myappend
= res
.append
984 myappend(mychr(myatoi(item
[:2], 16))
990 return string
.join(res
, "")
994 # replace '+' with ' '
995 s
= string
.join(string
.split(s
, '+'), ' ')
998 always_safe
= string
.letters
+ string
.digits
+ '_,.-'
999 def quote(s
, safe
= '/'):
1000 """quote('abc def') -> 'abc%20def')."""
1001 # XXX Can speed this up an order of magnitude
1002 safe
= always_safe
+ safe
1004 for i
in range(len(res
)):
1007 res
[i
] = '%%%02x' % ord(c
)
1008 return string
.joinfields(res
, '')
1010 def quote_plus(s
, safe
= '/'):
1011 # XXX Can speed this up an order of magnitude
1013 # replace ' ' with '+'
1014 l
= string
.split(s
, ' ')
1015 for i
in range(len(l
)):
1016 l
[i
] = quote(l
[i
], safe
)
1017 return string
.join(l
, '+')
1019 return quote(s
, safe
)
1021 def urlencode(dict):
1022 """Encode a dictionary of form entries into a URL query string."""
1024 for k
, v
in dict.items():
1025 k
= quote_plus(str(k
))
1026 v
= quote_plus(str(v
))
1027 l
.append(k
+ '=' + v
)
1028 return string
.join(l
, '&')
1032 def getproxies_environment():
1033 """Return a dictionary of scheme -> proxy server URL mappings.
1035 Scan the environment for variables named <scheme>_proxy;
1036 this seems to be the standard convention. If you need a
1037 different way, you can pass a proxies dictionary to the
1038 [Fancy]URLopener constructor.
1042 for name
, value
in os
.environ
.items():
1043 name
= string
.lower(name
)
1044 if value
and name
[-6:] == '_proxy':
1045 proxies
[name
[:-6]] = value
1048 if os
.name
== 'mac':
1050 """Return a dictionary of scheme -> proxy server URL mappings.
1052 By convention the mac uses Internet Config to store
1053 proxies. An HTTP proxy, for instance, is stored under
1068 if config
.has_key('UseHTTPProxy') and config
['UseHTTPProxy']:
1070 value
= config
['HTTPProxyHost']
1074 proxies
['http'] = 'http://%s' % value
1075 # FTP: XXXX To be done.
1076 # Gopher: XXXX To be done.
1079 elif os
.name
== 'nt':
1080 def getproxies_registry():
1081 """Return a dictionary of scheme -> proxy server URL mappings.
1083 Win32 uses the registry to store proxies.
1090 # Std module, so should be around - but you never know!
1093 internetSettings
= _winreg
.OpenKey(_winreg
.HKEY_CURRENT_USER
,
1094 'Software\\Microsoft\\Windows\\CurrentVersion\\Internet Settings')
1095 proxyEnable
= _winreg
.QueryValueEx(internetSettings
,
1098 # Returned as Unicode but problems if not converted to ASCII
1099 proxyServer
= str(_winreg
.QueryValueEx(internetSettings
,
1101 if ';' in proxyServer
: # Per-protocol settings
1102 for p
in proxyServer
.split(';'):
1103 protocol
, address
= p
.split('=')
1104 proxies
[protocol
] = '%s://%s' % (protocol
, address
)
1105 else: # Use one setting for all protocols
1106 proxies
['http'] = 'http://%s' % proxyServer
1107 proxies
['ftp'] = 'ftp://%s' % proxyServer
1108 internetSettings
.Close()
1109 except (WindowsError, ValueError, TypeError):
1110 # Either registry key not found etc, or the value in an
1111 # unexpected format.
1112 # proxies already set up to be empty so nothing to do
1117 """Return a dictionary of scheme -> proxy server URL mappings.
1119 Returns settings gathered from the environment, if specified,
1123 return getproxies_environment() or getproxies_registry()
1125 # By default use environment variables
1126 getproxies
= getproxies_environment
1129 # Test and time quote() and unquote()
1133 for i
in range(256): s
= s
+ chr(i
)
1144 print round(t1
- t0
, 3), 'sec'
def reporthook(blocknum, blocksize, totalsize):
    """Progress callback: print one line describing the current block.

    Writes the block number, block size and total size to standard
    output, formatted as integers.
    """
    progress = (blocknum, blocksize, totalsize)
    message = "Block number: %d, Block size: %d, Total size: %d" % progress
    # A single parenthesised expression prints identically under the
    # print statement used throughout this file.
    print(message)
1157 'file://localhost/etc/passwd',
1158 'ftp://ftp.python.org/etc/passwd',
1159 ## 'gopher://gopher.micro.umn.edu/1/',
1160 'http://www.python.org/index.html',
1162 if hasattr(URLopener
, "open_https"):
1163 args
.append('https://synergy.as.cmu.edu/~geek/')
1166 print '-'*10, url
, '-'*10
1167 fn
, h
= urlretrieve(url
, None, reporthook
)
1171 for k
in h
.keys(): print k
+ ':', h
[k
]
1177 table
= string
.maketrans("", "")
1178 data
= string
.translate(data
, table
, "\r")
1188 opts
, args
= getopt
.getopt(sys
.argv
[1:], "th")
1189 except getopt
.error
, msg
:
1191 print "Use -h for help"
1198 print "Usage: python urllib.py [-t] [url ...]"
1199 print "-t runs self-test;",
1200 print "otherwise, contents of urls are printed"
1208 print "Use -h for help"
1210 print urlopen(url
).read(),
1212 # Run test program when run as a script
1213 if __name__
== '__main__':