1 """Integration with Python standard library module urllib2: OpenerDirector
4 Copyright 2004-2006 John J Lee <jjl@pobox.com>
6 This code is free software; you can redistribute it and/or modify it
7 under the terms of the BSD or ZPL 2.1 licenses (see the file
8 COPYING.txt included with the distribution).
12 import os
, urllib2
, bisect
, urllib
, httplib
, types
, tempfile
14 import threading
as _threading
16 import dummy_threading
as _threading
27 from _util
import isstringlike
28 from _request
import Request
31 class ContentTooShortError(urllib2
.URLError
):
32 def __init__(self
, reason
, result
):
33 urllib2
.URLError
.__init
__(self
, reason
)
37 class OpenerDirector(urllib2
.OpenerDirector
):
39 urllib2
.OpenerDirector
.__init
__(self
)
40 # really none of these are (sanely) public -- the lack of initial
41 # underscore on some is just due to following urllib2
42 self
.process_response
= {}
43 self
.process_request
= {}
44 self
._any
_request
= {}
45 self
._any
_response
= {}
46 self
._handler
_index
_valid
= True
def add_handler(self, handler):
    """Register handler with this opener, keeping self.handlers sorted.

    Adding a handler that is already present in self.handlers is a no-op.
    Otherwise the handler is inserted at its sorted position, given this
    opener as its parent, and the handler index is flagged as stale so
    that _maybe_reindex_handlers() rebuilds it on next use.
    """
    if handler in self.handlers:
        # Already registered: nothing to insert, and the index stays valid.
        return
    # XXX why does self.handlers need to be sorted?
    bisect.insort(self.handlers, handler)
    handler.add_parent(self)
    self._handler_index_valid = False
57 def _maybe_reindex_handlers(self
):
58 if self
._handler
_index
_valid
:
69 for handler
in self
.handlers
:
71 for meth
in dir(handler
):
72 if meth
in ["redirect_request", "do_open", "proxy_open"]:
73 # oops, coincidental match
76 if meth
== "any_request":
77 any_request
.add(handler
)
80 elif meth
== "any_response":
81 any_response
.add(handler
)
87 condition
= meth
[ii
+1:]
89 if condition
.startswith("error"):
90 jj
= meth
[ii
+1:].find("_") + ii
+ 1
96 lookup
= handle_error
.setdefault(scheme
, {})
97 elif condition
== "open":
100 elif condition
== "request":
102 lookup
= process_request
103 elif condition
== "response":
105 lookup
= process_response
109 lookup
.setdefault(kind
, set()).add(handler
)
113 unwanted
.append(handler
)
115 for handler
in unwanted
:
116 self
.handlers
.remove(handler
)
118 # sort indexed methods
119 # XXX could be cleaned up
120 for lookup
in [process_request
, process_response
]:
121 for scheme
, handlers
in lookup
.iteritems():
122 lookup
[scheme
] = handlers
123 for scheme
, lookup
in handle_error
.iteritems():
124 for code
, handlers
in lookup
.iteritems():
125 handlers
= list(handlers
)
127 lookup
[code
] = handlers
128 for scheme
, handlers
in handle_open
.iteritems():
129 handlers
= list(handlers
)
131 handle_open
[scheme
] = handlers
134 self
.handle_error
= handle_error
135 self
.handle_open
= handle_open
136 self
.process_request
= process_request
137 self
.process_response
= process_response
138 self
._any
_request
= any_request
139 self
._any
_response
= any_response
141 def _request(self
, url_or_req
, data
, visit
):
142 if isstringlike(url_or_req
):
143 req
= Request(url_or_req
, data
, visit
=visit
)
145 # already a urllib2.Request or mechanize.Request instance
149 # XXX yuck, give request a .visit attribute if it doesn't have one
152 except AttributeError:
154 if visit
is not None:
158 def open(self
, fullurl
, data
=None):
159 req
= self
._request
(fullurl
, data
, None)
160 req_scheme
= req
.get_type()
162 self
._maybe
_reindex
_handlers
()
164 # pre-process request
165 # XXX should we allow a Processor to change the URL scheme
167 request_processors
= set(self
.process_request
.get(req_scheme
, []))
168 request_processors
.update(self
._any
_request
)
169 request_processors
= list(request_processors
)
170 request_processors
.sort()
171 for processor
in request_processors
:
172 for meth_name
in ["any_request", req_scheme
+"_request"]:
173 meth
= getattr(processor
, meth_name
, None)
177 # In Python >= 2.4, .open() supports processors already, so we must
178 # call ._open() instead.
179 urlopen
= getattr(urllib2
.OpenerDirector
, "_open",
180 urllib2
.OpenerDirector
.open)
181 response
= urlopen(self
, req
, data
)
183 # post-process response
184 response_processors
= set(self
.process_response
.get(req_scheme
, []))
185 response_processors
.update(self
._any
_response
)
186 response_processors
= list(response_processors
)
187 response_processors
.sort()
188 for processor
in response_processors
:
189 for meth_name
in ["any_response", req_scheme
+"_response"]:
190 meth
= getattr(processor
, meth_name
, None)
192 response
= meth(req
, response
)
def error(self, proto, *args):
    """Dispatch an error through self._call_chain to the error handlers.

    proto -- protocol name; for 'http' and 'https' the handlers registered
             under 'http' are used and the chain is keyed on args[2]
    args -- passed through unchanged to self._call_chain

    Returns the chain's result when it is true.  For http[s], when the
    first chain yields nothing, the 'default' chain is tried with
    http_error_default and its result is returned.  Otherwise returns None.
    """
    if proto in ['http', 'https']:
        # XXX http[s] protocols are special-cased
        lookup = self.handle_error['http']  # https is not different than http
        proto = args[2]  # YUCK!
        meth_name = 'http_error_%s' % proto
        is_http = True
        # Keep the caller's arguments for the default-handler fallback below.
        orig_args = args
    else:
        lookup = self.handle_error
        meth_name = proto + '_error'
        is_http = False

    result = self._call_chain(*((lookup, proto, meth_name) + args))
    if result:
        return result

    if is_http:
        fallback_args = (lookup, 'default', 'http_error_default') + orig_args
        return self._call_chain(*fallback_args)
218 def retrieve(self
, fullurl
, filename
=None, reporthook
=None, data
=None):
219 """Returns (filename, headers).
221 For remote objects, the default filename will refer to a temporary
222 file. Temporary files are removed when the OpenerDirector.close()
225 For file: URLs, at present the returned filename is None. This may
228 If the actual number of bytes read is less than indicated by the
229 Content-Length header, raises ContentTooShortError (a URLError
230 subclass). The exception's .result attribute contains the (filename,
231 headers) that would have been returned.
234 req
= self
._request
(fullurl
, data
, False)
235 scheme
= req
.get_type()
238 if filename
is None and scheme
== 'file':
239 # XXX req.get_selector() seems broken here, return None,
242 #return urllib.url2pathname(req.get_selector()), headers
244 tfp
= open(filename
, 'wb')
246 path
= _rfc3986
.urlsplit(fullurl
)[2]
247 suffix
= os
.path
.splitext(path
)[1]
248 fd
, filename
= tempfile
.mkstemp(suffix
)
249 self
._tempfiles
.append(filename
)
250 tfp
= os
.fdopen(fd
, 'wb')
252 result
= filename
, headers
258 if "content-length" in headers
:
259 size
= int(headers
["Content-Length"])
260 reporthook(blocknum
, bs
, size
)
269 reporthook(blocknum
, bs
, size
)
275 # raise exception if actual size does not match content-length header
276 if size
>= 0 and read
< size
:
277 raise ContentTooShortError(
278 "retrieval incomplete: "
279 "got only %i out of %i bytes" % (read
, size
),
286 urllib2
.OpenerDirector
.close(self
)
288 # make it very obvious this object is no longer supposed to be used
289 self
.open = self
.error
= self
.retrieve
= self
.add_handler
= None
292 for filename
in self
._tempfiles
:
297 del self
._tempfiles
[:]
300 def wrapped_open(urlopen
, process_response_object
, fullurl
, data
=None):
303 response
= urlopen(fullurl
, data
)
304 except urllib2
.HTTPError
, error
:
306 if error
.fp
is None: # not a response
310 if response
is not None:
311 response
= process_response_object(response
)
class ResponseProcessingOpener(OpenerDirector):
    """Opener that routes responses through process_response_object().

    open() delegates to wrapped_open(), handing it OpenerDirector.open bound
    to this instance together with the process_response_object hook.
    Subclasses override the hook to substitute their own response object.
    """

    def open(self, fullurl, data=None):
        """Open fullurl via wrapped_open() and return what it returns."""
        def bound_open(fullurl, data=None):
            # Call the base implementation explicitly so the wrapping in
            # this class is not applied recursively.
            return OpenerDirector.open(self, fullurl, data)
        return wrapped_open(
            bound_open, self.process_response_object, fullurl, data)

    def process_response_object(self, response):
        """Hook applied to responses; the base class returns it unchanged."""
        return response
class SeekableResponseOpener(ResponseProcessingOpener):
    """Opener whose response hook applies _response.seek_wrapped_response."""

    def process_response_object(self, response):
        """Return response as wrapped by _response.seek_wrapped_response()."""
        wrapped = _response.seek_wrapped_response(response)
        return wrapped
335 """This class's interface is quite likely to change."""
339 urllib2
.ProxyHandler
,
340 urllib2
.UnknownHandler
,
341 _http
.HTTPHandler
, # derived from new AbstractHTTPHandler
342 _http
.HTTPDefaultErrorHandler
,
343 _http
.HTTPRedirectHandler
, # bugfixed
347 _upgrade
.HTTPRequestUpgradeProcessor
,
348 _http
.HTTPCookieProcessor
,
349 _http
.HTTPErrorProcessor
,
351 if hasattr(httplib
, 'HTTPS'):
352 default_classes
.append(_http
.HTTPSHandler
)
354 replacement_handlers
= []
356 def __init__(self
, klass
=OpenerDirector
):
359 def build_opener(self
, *handlers
):
360 """Create an opener object from a list of handlers and processors.
362 The opener will use several default handlers and processors, including
363 support for HTTP and FTP.
365 If any of the handlers passed as arguments are subclasses of the
366 default handlers, the default handlers will not be used.
369 opener
= self
.klass()
370 default_classes
= list(self
.default_classes
)
372 for klass
in default_classes
:
373 for check
in handlers
:
374 if type(check
) == types
.ClassType
:
375 if issubclass(check
, klass
):
377 elif type(check
) == types
.InstanceType
:
378 if isinstance(check
, klass
):
381 default_classes
.remove(klass
)
383 for klass
in default_classes
:
384 opener
.add_handler(klass())
386 if type(h
) == types
.ClassType
:
388 opener
.add_handler(h
)
393 build_opener
= OpenerFactory().build_opener
396 urlopen_lock
= _threading
.Lock()
397 def urlopen(url
, data
=None):
400 urlopen_lock
.acquire()
403 _opener
= build_opener()
405 urlopen_lock
.release()
406 return _opener
.open(url
, data
)
408 def urlretrieve(url
, filename
=None, reporthook
=None, data
=None):
411 urlopen_lock
.acquire()
414 _opener
= build_opener()
416 urlopen_lock
.release()
417 return _opener
.retrieve(url
, filename
, reporthook
, data
)
419 def install_opener(opener
):