1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
5 below). It opens the URL and returns the results as file-like
6 object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15 deals with digest authentication.
17 urlopen(url, data=None) -- basic usage is the same as original
18 urllib. pass the url and optionally data to post to an HTTP URL, and
19 get a file-like object back. One difference is that you can also pass
20 a Request instance instead of URL. Raises a URLError (subclass of
21 IOError); for HTTP errors, raises an HTTPError, which can also be
22 treated as a valid response.
24 build_opener -- function that creates a new OpenerDirector instance.
25 will install the default handlers. accepts one or more Handlers as
26 arguments, either instances or Handler classes that it will
27 instantiate. if one of the arguments is a subclass of the default
28 handler, the argument will be installed instead of the default.
30 install_opener -- installs a new opener as the default opener.
35 Request -- an object that encapsulates the state of a request. the
36 state can be as simple as the URL. it can also include extra HTTP
37 headers, e.g. a User-Agent.
42 URLError-- a subclass of IOError, individual protocols have their own
45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46 as an exceptional event or valid response
49 BaseHandler and parent
50 _call_chain conventions
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password('realm', 'host', 'username', 'password')
60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
62 # build a new opener that adds authentication and caching FTP handlers
63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
66 urllib2.install_opener(opener)
68 f = urllib2.urlopen('http://www.python.org/')
74 # If an authentication error handler tries to perform
75 # authentication but fails for some reason, how should the error be
76 # signalled? The client needs to know the HTTP error code. But if
77 # the handler knows that the problem was, e.g., that it didn't know
78 # the hash algo that was requested in the challenge, it would be good to
79 # pass that information along to the client, too.
83 # documentation (getting there)
85 # abstract factory for opener
86 # ftp errors aren't handled cleanly
87 # gopher can return a socket.error
88 # check digest against correct (i.e. non-apache) implementation
110 from cStringIO
import StringIO
112 from StringIO
import StringIO
114 # not sure how many of these need to be gotten rid of
115 from urllib
import unwrap
, unquote
, splittype
, splithost
, \
116 addinfourl
, splitport
, splitgophertype
, splitquery
, \
117 splitattr
, ftpwrapper
, noheaders
, splituser
, splitpasswd
119 # support for FileHandler, proxies via environment variables
120 from urllib
import localhost
, url2pathname
, getproxies
125 def urlopen(url
, data
=None):
128 _opener
= build_opener()
129 return _opener
.open(url
, data
)
131 def install_opener(opener
):
135 # do these error classes make sense?
136 # make sure all of the IOError stuff is overridden. we just want to be
139 class URLError(IOError):
140 # URLError is a sub-type of IOError, but it doesn't share any of
141 # the implementation. need to override __init__ and __str__
142 def __init__(self
, reason
):
146 return '<urlopen error %s>' % self
.reason
148 class HTTPError(URLError
, addinfourl
):
149 """Raised when HTTP error occurs, but also acts like non-error return"""
150 __super_init
= addinfourl
.__init
__
152 def __init__(self
, url
, code
, msg
, hdrs
, fp
):
158 # The addinfourl classes depend on fp being a valid file
159 # object. In some cases, the HTTPError may not have a valid
160 # file object. If this happens, the simplest workaround is to
161 # not initialize the base classes.
163 self
.__super
_init
(fp
, hdrs
, url
)
166 return 'HTTP Error %s: %s' % (self
.code
, self
.msg
)
169 # XXX is this safe? what if user catches exception, then
170 # extracts fp and discards exception?
174 class GopherError(URLError
):
180 def __init__(self
, url
, data
=None, headers
={}):
181 # unwrap('<URL:type://host/path>') --> 'type://host/path'
182 self
.__original
= unwrap(url
)
184 # self.__r_type is what's left after doing the splittype
189 for key
, value
in headers
.items():
190 self
.add_header(key
, value
)
192 def __getattr__(self
, attr
):
193 # XXX this is a fallback mechanism to guard against these
194 # methods getting called in a non-standard order. this may be
195 # too complicated and/or unnecessary.
196 # XXX should the __r_XXX attributes be public?
197 if attr
[:12] == '_Request__r_':
199 if hasattr(Request
, 'get_' + name
):
200 getattr(self
, 'get_' + name
)()
201 return getattr(self
, attr
)
202 raise AttributeError, attr
204 def get_method(self
):
210 def add_data(self
, data
):
214 return self
.data
is not None
def get_full_url(self):
    """Return the URL held in this request's ``__original`` attribute.

    The stored value is handed back as-is; nothing is parsed or
    normalised here.
    """
    original_url = self.__original
    return original_url
223 if self
.type is None:
224 self
.type, self
.__r
_type
= splittype(self
.__original
)
225 if self
.type is None:
226 raise ValueError, "unknown url type: %s" % self
.__original
230 if self
.host
is None:
231 self
.host
, self
.__r
_host
= splithost(self
.__r
_type
)
233 self
.host
= unquote(self
.host
)
236 def get_selector(self
):
def set_proxy(self, host, type):
    """Redirect this request to the given proxy host and URL type.

    host and type overwrite the request's own ``host`` and ``type``
    attributes, and ``__r_host`` is overwritten with the complete
    original URL.
    """
    self.host = host
    self.type = type
    # From here on the request carries the whole original URL in
    # __r_host instead of whatever was stored there before.
    self.__r_host = self.__original
def add_header(self, key, val):
    """Store a header on this request (useful for e.g. authentication).

    The header name is passed through ``capitalize()`` before being used
    as the dictionary key, so 'user-agent' and 'User-Agent' land in the
    same slot and the later value wins.
    """
    normalised = key.capitalize()
    self.headers[normalised] = val
247 class OpenerDirector
:
249 server_version
= "Python-urllib/%s" % __version__
250 self
.addheaders
= [('User-agent', server_version
)]
251 # manage the individual handlers
253 self
.handle_open
= {}
254 self
.handle_error
= {}
256 def add_handler(self
, handler
):
258 for meth
in dir(handler
):
259 if meth
[-5:] == '_open':
261 if protocol
in self
.handle_open
:
262 self
.handle_open
[protocol
].append(handler
)
263 self
.handle_open
[protocol
].sort()
265 self
.handle_open
[protocol
] = [handler
]
269 j
= meth
[i
+1:].find('_') + i
+ 1
270 if j
!= -1 and meth
[i
+1:j
] == 'error':
277 dict = self
.handle_error
.get(proto
, {})
279 dict[kind
].append(handler
)
282 dict[kind
] = [handler
]
283 self
.handle_error
[proto
] = dict
287 self
.handlers
.append(handler
)
289 handler
.add_parent(self
)
295 for handler
in self
.handlers
:
299 def _call_chain(self
, chain
, kind
, meth_name
, *args
):
300 # XXX raise an exception if no one else should try to handle
301 # this url. return None if you can't but someone else could.
302 handlers
= chain
.get(kind
, ())
303 for handler
in handlers
:
304 func
= getattr(handler
, meth_name
)
307 if result
is not None:
310 def open(self
, fullurl
, data
=None):
311 # accept a URL or a Request object
312 if isinstance(fullurl
, basestring
):
313 req
= Request(fullurl
, data
)
319 result
= self
._call
_chain
(self
.handle_open
, 'default',
324 type_
= req
.get_type()
325 result
= self
._call
_chain
(self
.handle_open
, type_
, type_
+ \
330 return self
._call
_chain
(self
.handle_open
, 'unknown',
333 def error(self
, proto
, *args
):
334 if proto
in ['http', 'https']:
335 # XXX http[s] protocols are special-cased
336 dict = self
.handle_error
['http'] # https is not different than http
337 proto
= args
[2] # YUCK!
338 meth_name
= 'http_error_%d' % proto
342 dict = self
.handle_error
343 meth_name
= proto
+ '_error'
345 args
= (dict, proto
, meth_name
) + args
346 result
= self
._call
_chain
(*args
)
351 args
= (dict, 'default', 'http_error_default') + orig_args
352 return self
._call
_chain
(*args
)
354 # XXX probably also want an abstract factory that knows when it makes
355 # sense to skip a superclass in favor of a subclass and when it might
356 # make sense to include both
358 def build_opener(*handlers
):
359 """Create an opener object from a list of handlers.
361 The opener will use several default handlers, including support
364 If any of the handlers passed as arguments are subclasses of the
365 default handlers, the default handlers will not be used.
368 opener
= OpenerDirector()
369 default_classes
= [ProxyHandler
, UnknownHandler
, HTTPHandler
,
370 HTTPDefaultErrorHandler
, HTTPRedirectHandler
,
371 FTPHandler
, FileHandler
]
372 if hasattr(httplib
, 'HTTPS'):
373 default_classes
.append(HTTPSHandler
)
375 for klass
in default_classes
:
376 for check
in handlers
:
377 if inspect
.isclass(check
):
378 if issubclass(check
, klass
):
380 elif isinstance(check
, klass
):
383 default_classes
.remove(klass
)
385 for klass
in default_classes
:
386 opener
.add_handler(klass())
389 if inspect
.isclass(h
):
391 opener
.add_handler(h
)
397 def add_parent(self
, parent
):
401 def __lt__(self
, other
):
402 if not hasattr(other
, "handler_order"):
403 # Try to preserve the old behavior of having custom classes
404 # inserted after default ones (works only for custom user
405 # classes which are not aware of handler_order).
407 return self
.handler_order
< other
.handler_order
class HTTPDefaultErrorHandler(BaseHandler):
    """Handler whose default HTTP error action is to raise HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        """Raise HTTPError describing the failed response.

        The exception is built from the request's full URL plus the
        status code, message, headers and file object passed in.
        """
        failed_url = req.get_full_url()
        raise HTTPError(failed_url, code, msg, hdrs, fp)
414 class HTTPRedirectHandler(BaseHandler
):
415 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
416 """Return a Request or None in response to a redirect.
418 This is called by the http_error_30x methods when a
419 redirection response is received. If a redirection should
420 take place, return a new Request to allow http_error_30x to
421 perform the redirect. Otherwise, raise HTTPError if no-one
422 else should try to handle this url. Return None if you can't
423 but another Handler might.
426 if (code
in (301, 302, 303, 307) and m
in ("GET", "HEAD")
427 or code
in (301, 302, 303) and m
== "POST"):
428 # Strictly (according to RFC 2616), 301 or 302 in response
429 # to a POST MUST NOT cause a redirection without confirmation
430 # from the user (of urllib2, in this case). In practice,
431 # essentially all clients do redirect in this case, so we
433 return Request(newurl
, headers
=req
.headers
)
435 raise HTTPError(req
.get_full_url(), code
, msg
, headers
, fp
)
437 # Implementation note: To avoid the server sending us into an
438 # infinite loop, the request object needs to track what URLs we
439 # have already seen. Do this by adding a handler-specific
440 # attribute to the Request object.
441 def http_error_302(self
, req
, fp
, code
, msg
, headers
):
442 if 'location' in headers
:
443 newurl
= headers
['location']
444 elif 'uri' in headers
:
445 newurl
= headers
['uri']
448 newurl
= urlparse
.urljoin(req
.get_full_url(), newurl
)
450 # XXX Probably want to forget about the state of the current
451 # request, although that might interact poorly with other
452 # handlers that also use handler-specific request attributes
453 new
= self
.redirect_request(req
, fp
, code
, msg
, headers
, newurl
)
458 new
.error_302_dict
= {}
459 if hasattr(req
, 'error_302_dict'):
460 if len(req
.error_302_dict
)>10 or \
461 newurl
in req
.error_302_dict
:
462 raise HTTPError(req
.get_full_url(), code
,
463 self
.inf_msg
+ msg
, headers
, fp
)
464 new
.error_302_dict
.update(req
.error_302_dict
)
465 new
.error_302_dict
[newurl
] = newurl
467 # Don't close the fp until we are sure that we won't use it
472 return self
.parent
.open(new
)
474 http_error_301
= http_error_303
= http_error_307
= http_error_302
476 inf_msg
= "The HTTP server returned a redirect error that would " \
477 "lead to an infinite loop.\n" \
478 "The last 30x error message was:\n"
480 class ProxyHandler(BaseHandler
):
481 # Proxies must be in front
484 def __init__(self
, proxies
=None):
486 proxies
= getproxies()
487 assert hasattr(proxies
, 'has_key'), "proxies must be a mapping"
488 self
.proxies
= proxies
489 for type, url
in proxies
.items():
490 setattr(self
, '%s_open' % type,
491 lambda r
, proxy
=url
, type=type, meth
=self
.proxy_open
: \
492 meth(r
, proxy
, type))
494 def proxy_open(self
, req
, proxy
, type):
495 orig_type
= req
.get_type()
496 type, r_type
= splittype(proxy
)
497 host
, XXX
= splithost(r_type
)
499 user_pass
, host
= host
.split('@', 1)
501 user
, password
= user_pass
.split(':', 1)
502 user_pass
= base64
.encodestring('%s:%s' % (unquote(user
),
504 req
.add_header('Proxy-authorization', 'Basic ' + user_pass
)
506 req
.set_proxy(host
, type)
507 if orig_type
== type:
508 # let other handlers take care of it
509 # XXX this only makes sense if the proxy is before the
513 # need to start over, because the other handlers don't
514 # grok the proxy's URL type
515 return self
.parent
.open(req
)
517 # feature suggested by Duncan Booth
518 # XXX custom is not a good name
520 # either pass a function to the constructor or override handle
521 def __init__(self
, proto
, func
=None, proxy_addr
=None):
524 self
.addr
= proxy_addr
526 def handle(self
, req
):
527 if self
.func
and self
.func(req
):
533 class CustomProxyHandler(BaseHandler
):
534 # Proxies must be in front
537 def __init__(self
, *proxies
):
540 def proxy_open(self
, req
):
541 proto
= req
.get_type()
543 proxies
= self
.proxies
[proto
]
548 req
.set_proxy(p
.get_proxy())
549 return self
.parent
.open(req
)
def do_proxy(self, p, req):
    """Open req through the parent opener and return its result.

    p is accepted but not used by this method.
    """
    opener = self.parent
    return opener.open(req)
555 def add_proxy(self
, cpo
):
556 if cpo
.proto
in self
.proxies
:
557 self
.proxies
[cpo
.proto
].append(cpo
)
559 self
.proxies
[cpo
.proto
] = [cpo
]
561 class HTTPPasswordMgr
:
565 def add_password(self
, realm
, uri
, user
, passwd
):
566 # uri could be a single URI or a sequence
567 if isinstance(uri
, basestring
):
569 uri
= tuple(map(self
.reduce_uri
, uri
))
570 if not realm
in self
.passwd
:
571 self
.passwd
[realm
] = {}
572 self
.passwd
[realm
][uri
] = (user
, passwd
)
574 def find_user_password(self
, realm
, authuri
):
575 domains
= self
.passwd
.get(realm
, {})
576 authuri
= self
.reduce_uri(authuri
)
577 for uris
, authinfo
in domains
.iteritems():
579 if self
.is_suburi(uri
, authuri
):
583 def reduce_uri(self
, uri
):
584 """Accept netloc or URI and extract only the netloc and path"""
585 parts
= urlparse
.urlparse(uri
)
587 return parts
[1], parts
[2] or '/'
591 def is_suburi(self
, base
, test
):
592 """Check if test is below base in a URI tree
594 Both args must be URIs in reduced form.
598 if base
[0] != test
[0]:
600 common
= posixpath
.commonprefix((base
[1], test
[1]))
601 if len(common
) == len(base
[1]):
606 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr
):
608 def find_user_password(self
, realm
, authuri
):
609 user
, password
= HTTPPasswordMgr
.find_user_password(self
, realm
,
612 return user
, password
613 return HTTPPasswordMgr
.find_user_password(self
, None, authuri
)
616 class AbstractBasicAuthHandler
:
618 rx
= re
.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re
.I
)
620 # XXX there can actually be multiple auth-schemes in a
621 # www-authenticate header. should probably be a lot more careful
622 # in parsing them to extract multiple alternatives
def __init__(self, password_mgr=None):
    """Attach a password manager to this handler.

    When password_mgr is None a fresh HTTPPasswordMgr is created.  The
    manager's add_password method is also exposed directly on the
    handler as ``self.add_password``.
    """
    if password_mgr is not None:
        manager = password_mgr
    else:
        manager = HTTPPasswordMgr()
    self.passwd = manager
    self.add_password = manager.add_password
630 def http_error_auth_reqed(self
, authreq
, host
, req
, headers
):
631 # XXX could be multiple headers
632 authreq
= headers
.get(authreq
, None)
634 mo
= AbstractBasicAuthHandler
.rx
.match(authreq
)
636 scheme
, realm
= mo
.groups()
637 if scheme
.lower() == 'basic':
638 return self
.retry_http_basic_auth(host
, req
, realm
)
640 def retry_http_basic_auth(self
, host
, req
, realm
):
641 user
,pw
= self
.passwd
.find_user_password(realm
, host
)
643 raw
= "%s:%s" % (user
, pw
)
644 auth
= 'Basic %s' % base64
.encodestring(raw
).strip()
645 if req
.headers
.get(self
.auth_header
, None) == auth
:
647 req
.add_header(self
.auth_header
, auth
)
648 return self
.parent
.open(req
)
652 class HTTPBasicAuthHandler(AbstractBasicAuthHandler
, BaseHandler
):
654 auth_header
= 'Authorization'
656 def http_error_401(self
, req
, fp
, code
, msg
, headers
):
657 host
= urlparse
.urlparse(req
.get_full_url())[1]
658 return self
.http_error_auth_reqed('www-authenticate',
662 class ProxyBasicAuthHandler(AbstractBasicAuthHandler
, BaseHandler
):
664 auth_header
= 'Proxy-authorization'
666 def http_error_407(self
, req
, fp
, code
, msg
, headers
):
667 host
= req
.get_host()
668 return self
.http_error_auth_reqed('proxy-authenticate',
673 """Return n random bytes."""
674 # Use /dev/urandom if it is available. Fall back to random module
675 # if not. It might be worthwhile to extend this function to use
676 # other platform-specific mechanisms for getting random bytes.
677 if os
.path
.exists("/dev/urandom"):
678 f
= open("/dev/urandom")
683 L
= [chr(random
.randrange(0, 256)) for i
in range(n
)]
686 class AbstractDigestAuthHandler
:
687 # Digest authentication is specified in RFC 2617.
689 # XXX The client does not inspect the Authentication-Info header
690 # in a successful response.
692 # XXX It should be possible to test this implementation against
693 # a mock server that just generates a static set of challenges.
695 # XXX qop="auth-int" support is shaky
697 def __init__(self
, passwd
=None):
699 passwd
= HTTPPasswordMgr()
701 self
.add_password
= self
.passwd
.add_password
705 def reset_retry_count(self
):
708 def http_error_auth_reqed(self
, auth_header
, host
, req
, headers
):
709 authreq
= headers
.get(auth_header
, None)
711 # Don't fail endlessly - if we failed once, we'll probably
712 # fail a second time. Hm. Unless the Password Manager is
713 # prompting for the information. Crap. This isn't great
714 # but it's better than the current 'repeat until recursion
715 # depth exceeded' approach <wink>
716 raise HTTPError(req
.get_full_url(), 401, "digest auth failed",
721 scheme
= authreq
.split()[0]
722 if scheme
.lower() == 'digest':
723 return self
.retry_http_digest_auth(req
, authreq
)
725 raise ValueError("AbstractDigestAuthHandler doesn't know "
728 def retry_http_digest_auth(self
, req
, auth
):
729 token
, challenge
= auth
.split(' ', 1)
730 chal
= parse_keqv_list(parse_http_list(challenge
))
731 auth
= self
.get_authorization(req
, chal
)
733 auth_val
= 'Digest %s' % auth
734 if req
.headers
.get(self
.auth_header
, None) == auth_val
:
736 req
.add_header(self
.auth_header
, auth_val
)
737 resp
= self
.parent
.open(req
)
740 def get_cnonce(self
, nonce
):
741 # The cnonce-value is an opaque
742 # quoted string value provided by the client and used by both client
743 # and server to avoid chosen plaintext attacks, to provide mutual
744 # authentication, and to provide some message integrity protection.
745 # This isn't a fabulous effort, but it's probably Good Enough.
746 dig
= sha
.new("%s:%s:%s:%s" % (self
.nonce_count
, nonce
, time
.ctime(),
747 randombytes(8))).hexdigest()
750 def get_authorization(self
, req
, chal
):
752 realm
= chal
['realm']
753 nonce
= chal
['nonce']
754 qop
= chal
.get('qop')
755 algorithm
= chal
.get('algorithm', 'MD5')
756 # mod_digest doesn't send an opaque, even though it isn't
757 # supposed to be optional
758 opaque
= chal
.get('opaque', None)
762 H
, KD
= self
.get_algorithm_impls(algorithm
)
766 user
, pw
= self
.passwd
.find_user_password(realm
, req
.get_full_url())
770 # XXX not implemented yet
772 entdig
= self
.get_entity_digest(req
.get_data(), chal
)
776 A1
= "%s:%s:%s" % (user
, realm
, pw
)
777 A2
= "%s:%s" % (req
.has_data() and 'POST' or 'GET',
778 # XXX selector: what about proxies and full urls
781 self
.nonce_count
+= 1
782 ncvalue
= '%08x' % self
.nonce_count
783 cnonce
= self
.get_cnonce(nonce
)
784 noncebit
= "%s:%s:%s:%s:%s" % (nonce
, ncvalue
, cnonce
, qop
, H(A2
))
785 respdig
= KD(H(A1
), noncebit
)
787 respdig
= KD(H(A1
), "%s:%s" % (nonce
, H(A2
)))
789 # XXX handle auth-int.
792 # XXX should the partial digests be encoded too?
794 base
= 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
795 'response="%s"' % (user
, realm
, nonce
, req
.get_selector(),
798 base
= base
+ ', opaque="%s"' % opaque
800 base
= base
+ ', digest="%s"' % entdig
801 if algorithm
!= 'MD5':
802 base
= base
+ ', algorithm="%s"' % algorithm
804 base
= base
+ ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue
, cnonce
)
807 def get_algorithm_impls(self
, algorithm
):
808 # lambdas assume digest modules are imported at the top level
809 if algorithm
== 'MD5':
810 H
= lambda x
: md5
.new(x
).hexdigest()
811 elif algorithm
== 'SHA':
812 H
= lambda x
: sha
.new(x
).hexdigest()
814 KD
= lambda s
, d
: H("%s:%s" % (s
, d
))
817 def get_entity_digest(self
, data
, chal
):
818 # XXX not implemented yet
822 class HTTPDigestAuthHandler(BaseHandler
, AbstractDigestAuthHandler
):
823 """An authentication protocol defined by RFC 2069
825 Digest authentication improves on basic authentication because it
826 does not transmit passwords in the clear.
829 auth_header
= 'Authorization'
831 def http_error_401(self
, req
, fp
, code
, msg
, headers
):
832 host
= urlparse
.urlparse(req
.get_full_url())[1]
833 retry
= self
.http_error_auth_reqed('www-authenticate',
835 self
.reset_retry_count()
839 class ProxyDigestAuthHandler(BaseHandler
, AbstractDigestAuthHandler
):
841 auth_header
= 'Proxy-Authorization'
843 def http_error_407(self
, req
, fp
, code
, msg
, headers
):
844 host
= req
.get_host()
845 retry
= self
.http_error_auth_reqed('proxy-authenticate',
847 self
.reset_retry_count()
850 class AbstractHTTPHandler(BaseHandler
):
852 # XXX Should rewrite do_open() to use the new httplib interface,
853 # would be a little simpler.
855 def do_open(self
, http_class
, req
):
856 host
= req
.get_host()
858 raise URLError('no host given')
860 h
= http_class(host
) # will parse host:port
862 data
= req
.get_data()
863 h
.putrequest('POST', req
.get_selector())
864 if not 'Content-type' in req
.headers
:
865 h
.putheader('Content-type',
866 'application/x-www-form-urlencoded')
867 if not 'Content-length' in req
.headers
:
868 h
.putheader('Content-length', '%d' % len(data
))
870 h
.putrequest('GET', req
.get_selector())
872 scheme
, sel
= splittype(req
.get_selector())
873 sel_host
, sel_path
= splithost(sel
)
874 h
.putheader('Host', sel_host
or host
)
875 for name
, value
in self
.parent
.addheaders
:
876 name
= name
.capitalize()
877 if name
not in req
.headers
:
878 h
.putheader(name
, value
)
879 for k
, v
in req
.headers
.items():
881 # httplib will attempt to connect() here. be prepared
882 # to convert a socket error to a URLError.
885 except socket
.error
, err
:
890 code
, msg
, hdrs
= h
.getreply()
893 return addinfourl(fp
, hdrs
, req
.get_full_url())
895 return self
.parent
.error('http', req
, fp
, code
, msg
, hdrs
)
class HTTPHandler(AbstractHTTPHandler):
    """Handler providing http_open on top of AbstractHTTPHandler.do_open."""

    def http_open(self, req):
        """Open req by delegating to do_open() with httplib.HTTP."""
        connection_class = httplib.HTTP
        return self.do_open(connection_class, req)
# HTTPSHandler is only defined when this httplib exposes an HTTPS class.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Handler providing https_open on top of do_open."""

        def https_open(self, req):
            """Open req by delegating to do_open() with httplib.HTTPS."""
            connection_class = httplib.HTTPS
            return self.do_open(connection_class, req)
class UnknownHandler(BaseHandler):
    """Handler whose unknown_open always fails with URLError."""

    def unknown_open(self, req):
        """Raise URLError naming the URL type reported by req.get_type()."""
        url_type = req.get_type()
        raise URLError('unknown url type: %s' % url_type)
916 def parse_keqv_list(l
):
917 """Parse list of key=value strings where keys are not duplicated."""
920 k
, v
= elt
.split('=', 1)
921 if v
[0] == '"' and v
[-1] == '"':
926 def parse_http_list(s
):
927 """Parse lists as described by RFC 2068 Section 2.
929 In particular, parse comma-separated lists where the elements of
930 the list may include quoted-strings. A quoted-string could
933 # XXX this function could probably use more testing
945 list.append(s
[start
:])
949 raise ValueError, "unbalanced quotes"
951 list.append(s
[start
:i
+c
])
956 list.append(s
[start
:i
+c
])
964 list.append(s
[start
:i
+c
])
970 return map(lambda x
: x
.strip(), list)
972 class FileHandler(BaseHandler
):
973 # Use local file or FTP depending on form of URL
974 def file_open(self
, req
):
975 url
= req
.get_selector()
976 if url
[:2] == '//' and url
[2:3] != '/':
978 return self
.parent
.open(req
)
980 return self
.open_local_file(req
)
982 # names for the localhost
985 if FileHandler
.names
is None:
986 FileHandler
.names
= (socket
.gethostbyname('localhost'),
987 socket
.gethostbyname(socket
.gethostname()))
988 return FileHandler
.names
990 # not entirely sure what the rules are here
991 def open_local_file(self
, req
):
992 host
= req
.get_host()
993 file = req
.get_selector()
994 localfile
= url2pathname(file)
995 stats
= os
.stat(localfile
)
997 modified
= rfc822
.formatdate(stats
.st_mtime
)
998 mtype
= mimetypes
.guess_type(file)[0]
999 headers
= mimetools
.Message(StringIO(
1000 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
1001 (mtype
or 'text/plain', size
, modified
)))
1003 host
, port
= splitport(host
)
1005 (not port
and socket
.gethostbyname(host
) in self
.get_names()):
1006 return addinfourl(open(localfile
, 'rb'),
1007 headers
, 'file:'+file)
1008 raise URLError('file not on local host')
1010 class FTPHandler(BaseHandler
):
1011 def ftp_open(self
, req
):
1012 host
= req
.get_host()
1014 raise IOError, ('ftp error', 'no host given')
1015 host
, port
= splitport(host
)
1017 port
= ftplib
.FTP_PORT
1019 # username/password handling
1020 user
, host
= splituser(host
)
1022 user
, passwd
= splitpasswd(user
)
1025 host
= unquote(host
)
1026 user
= unquote(user
or '')
1027 passwd
= unquote(passwd
or '')
1030 host
= socket
.gethostbyname(host
)
1031 except socket
.error
, msg
:
1033 path
, attrs
= splitattr(req
.get_selector())
1034 dirs
= path
.split('/')
1035 dirs
= map(unquote
, dirs
)
1036 dirs
, file = dirs
[:-1], dirs
[-1]
1037 if dirs
and not dirs
[0]:
1040 fw
= self
.connect_ftp(user
, passwd
, host
, port
, dirs
)
1041 type = file and 'I' or 'D'
1043 attr
, value
= splitattr(attr
)
1044 if attr
.lower() == 'type' and \
1045 value
in ('a', 'A', 'i', 'I', 'd', 'D'):
1046 type = value
.upper()
1047 fp
, retrlen
= fw
.retrfile(file, type)
1049 mtype
= mimetypes
.guess_type(req
.get_full_url())[0]
1051 headers
+= "Content-type: %s\n" % mtype
1052 if retrlen
is not None and retrlen
>= 0:
1053 headers
+= "Content-length: %d\n" % retrlen
1054 sf
= StringIO(headers
)
1055 headers
= mimetools
.Message(sf
)
1056 return addinfourl(fp
, headers
, req
.get_full_url())
1057 except ftplib
.all_errors
, msg
:
1058 raise IOError, ('ftp error', msg
), sys
.exc_info()[2]
1060 def connect_ftp(self
, user
, passwd
, host
, port
, dirs
):
1061 fw
= ftpwrapper(user
, passwd
, host
, port
, dirs
)
1062 ## fw.ftp.set_debuglevel(1)
1065 class CacheFTPHandler(FTPHandler
):
1066 # XXX would be nice to have pluggable cache strategies
1067 # XXX this stuff is definitely not thread safe
1075 def setTimeout(self
, t
):
1078 def setMaxConns(self
, m
):
1081 def connect_ftp(self
, user
, passwd
, host
, port
, dirs
):
1082 key
= user
, host
, port
, '/'.join(dirs
)
1083 if key
in self
.cache
:
1084 self
.timeout
[key
] = time
.time() + self
.delay
1086 self
.cache
[key
] = ftpwrapper(user
, passwd
, host
, port
, dirs
)
1087 self
.timeout
[key
] = time
.time() + self
.delay
1089 return self
.cache
[key
]
1091 def check_cache(self
):
1092 # first check for old ones
1094 if self
.soonest
<= t
:
1095 for k
, v
in self
.timeout
.items():
1097 self
.cache
[k
].close()
1100 self
.soonest
= min(self
.timeout
.values())
1102 # then check the size
1103 if len(self
.cache
) == self
.max_conns
:
1104 for k
, v
in self
.timeout
.items():
1105 if v
== self
.soonest
:
1109 self
.soonest
= min(self
.timeout
.values())
1111 class GopherHandler(BaseHandler
):
1112 def gopher_open(self
, req
):
1113 host
= req
.get_host()
1115 raise GopherError('no host given')
1116 host
= unquote(host
)
1117 selector
= req
.get_selector()
1118 type, selector
= splitgophertype(selector
)
1119 selector
, query
= splitquery(selector
)
1120 selector
= unquote(selector
)
1122 query
= unquote(query
)
1123 fp
= gopherlib
.send_query(selector
, query
, host
)
1125 fp
= gopherlib
.send_selector(selector
, host
)
1126 return addinfourl(fp
, noheaders(), req
.get_full_url())
1128 #bleck! don't use this yet
1129 class OpenerFactory
:
1131 default_handlers
= [UnknownHandler
, HTTPHandler
,
1132 HTTPDefaultErrorHandler
, HTTPRedirectHandler
,
1133 FTPHandler
, FileHandler
]
1135 replacement_handlers
= []
def add_handler(self, h):
    """Add h to the end of this object's handlers.

    A new list is built and assigned to ``self.handlers``; the list that
    was there before is not modified in place.
    """
    extended = self.handlers + [h]
    self.handlers = extended
1140 def replace_handler(self
, h
):
1143 def build_opener(self
):
1144 opener
= OpenerDirector()
1145 for ph
in self
.default_handlers
:
1146 if inspect
.isclass(ph
):
1148 opener
.add_handler(ph
)
1150 if __name__
== "__main__":
1151 # XXX some of the test code depends on machine configurations that
1152 # are internal to CNRI. Need to set up a public server with the
1153 # right authentication configuration for test purposes.
1154 if socket
.gethostname() == 'bitdiddle':
1155 localhost
= 'bitdiddle.cnri.reston.va.us'
1156 elif socket
.gethostname() == 'bitdiddle.concentric.net':
1157 localhost
= 'localhost'
1161 # Thanks to Fred for finding these!
1162 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1163 'gopher://gopher.vt.edu:10010/10/33',
1166 'file://nonsensename/etc/passwd',
1167 'ftp://www.python.org/pub/python/misc/sousa.au',
1168 'ftp://www.python.org/pub/tmp/blat',
1169 'http://www.espn.com/', # redirect
1170 'http://www.python.org/Spanish/Inquistion/',
1171 ('http://www.python.org/cgi-bin/faqw.py',
1172 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1173 'http://www.python.org/',
1174 'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
1177 ## if localhost is not None:
1179 ## 'file://%s/etc/passwd' % localhost,
1180 ## 'http://%s/simple/' % localhost,
1181 ## 'http://%s/digest/' % localhost,
1182 ## 'http://%s/not/found.h' % localhost,
1185 ## bauth = HTTPBasicAuthHandler()
1186 ## bauth.add_password('basic_test_realm', localhost, 'jhylton',
1188 ## dauth = HTTPDigestAuthHandler()
1189 ## dauth.add_password('digest_test_realm', localhost, 'jhylton',
1193 cfh
= CacheFTPHandler()
1196 ## # XXX try out some custom proxy objects too!
1197 ## def at_cnri(req):
1198 ## host = req.get_host()
1200 ## if host[-18:] == '.cnri.reston.va.us':
1202 ## p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1203 ## ph = CustomProxyHandler(p)
1205 ## install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1206 install_opener(build_opener(cfh
, GopherHandler
))
1209 if isinstance(url
, tuple):
1215 f
= urlopen(url
, req
)
1216 except IOError, err
:
1217 print "IOError:", err
1218 except socket
.error
, err
:
1219 print "socket.error:", err
1223 print "read %d bytes" % len(buf
)