# -*- test-case-name: openid.test.test_fetchers -*-
"""
This module contains the HTTP fetcher interface and several implementations.
"""

__all__ = ['fetch', 'getDefaultFetcher', 'setDefaultFetcher', 'HTTPResponse',
           'HTTPFetcher', 'createHTTPFetcher', 'HTTPFetchingError',
           'HTTPError']

import urllib2
import time
import cStringIO
import sys

import openid
import openid.urinorm

# Try to import httplib2 for caching support
# http://bitworking.org/projects/httplib2/
try:
    import httplib2
except ImportError:
    # httplib2 not available
    httplib2 = None

# try to import pycurl, which will let us use CurlHTTPFetcher
try:
    import pycurl
except ImportError:
    pycurl = None

USER_AGENT = "python-openid/%s (%s)" % (openid.__version__, sys.platform)
MAX_RESPONSE_KB = 1024
def fetch(url, body=None, headers=None):
    """Invoke the fetch method on the default fetcher. Most users
    should need only this method.

    @raises Exception: any exceptions that may be raised by the default fetcher
    """
    fetcher = getDefaultFetcher()
    return fetcher.fetch(url, body, headers)
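# Illustrative sketch (not part of the module): a typical call to the
# module-level fetch() helper. The URL is a placeholder.
#
#     response = fetch('http://example.com/')
#     if response.status == 200:
#         print response.final_url, len(response.body)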
def createHTTPFetcher():
    """Create a default HTTP fetcher instance

    prefers Curl to urllib2."""
    if pycurl is None:
        fetcher = Urllib2Fetcher()
    else:
        fetcher = CurlHTTPFetcher()

    return fetcher
# Contains the currently set HTTP fetcher. If it is set to None, the
# library will call createHTTPFetcher() to set it. Do not access this
# variable outside of this module.
_default_fetcher = None

def getDefaultFetcher():
    """Return the default fetcher instance.
    If no fetcher has been set, a default fetcher will be created.

    @return: the default fetcher
    @rtype: HTTPFetcher
    """
    global _default_fetcher

    if _default_fetcher is None:
        setDefaultFetcher(createHTTPFetcher())

    return _default_fetcher
def setDefaultFetcher(fetcher, wrap_exceptions=True):
    """Set the default fetcher

    @param fetcher: The fetcher to use as the default HTTP fetcher
    @type fetcher: HTTPFetcher

    @param wrap_exceptions: Whether to wrap exceptions thrown by the
        fetcher with HTTPFetchingError so that they may be caught
        more easily. By default, exceptions will be wrapped. In general,
        unwrapped fetchers are useful for debugging of fetching errors
        or if your fetcher raises well-known exceptions that you would
        like to catch.
    @type wrap_exceptions: bool
    """
    global _default_fetcher
    if fetcher is None or not wrap_exceptions:
        _default_fetcher = fetcher
    else:
        _default_fetcher = ExceptionWrappingFetcher(fetcher)
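# Illustrative sketch: installing a specific fetcher as the default, with and
# without exception wrapping. Passing None reverts to the lazily created
# default on the next getDefaultFetcher() call.
#
#     setDefaultFetcher(Urllib2Fetcher())                          # wrapped
#     setDefaultFetcher(Urllib2Fetcher(), wrap_exceptions=False)   # unwrapped
#     setDefaultFetcher(None)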
def usingCurl():
    """Whether the currently set HTTP fetcher is a Curl HTTP fetcher."""
    return isinstance(getDefaultFetcher(), CurlHTTPFetcher)
class HTTPResponse(object):
    """A simple value object representing an HTTP response.

    @ivar final_url: The URL that was ultimately fetched, after any redirects
    @ivar status: The HTTP status code of the response
    @ivar headers: A dictionary of response headers
    @ivar body: The body of the response
    """
    headers = None
    status = None
    body = None
    final_url = None

    def __init__(self, final_url=None, status=None, headers=None, body=None):
        self.final_url = final_url
        self.status = status
        self.headers = headers
        self.body = body

    def __repr__(self):
        return "<%s status %s for %s>" % (self.__class__.__name__,
                                          self.status,
                                          self.final_url)
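# Illustrative sketch: constructing and inspecting an HTTPResponse by hand
# (e.g. in a test). All values shown are placeholders.
#
#     resp = HTTPResponse(final_url='http://example.com/', status=200,
#                         headers={'content-type': 'text/plain'}, body='hi')
#     repr(resp)   # "<HTTPResponse status 200 for http://example.com/>"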
class HTTPFetcher(object):
    """
    This class is the interface for openid HTTP fetchers. This
    interface is only important if you need to write a new fetcher for
    some reason.
    """

    def fetch(self, url, body=None, headers=None):
        """
        This performs an HTTP POST or GET, following redirects along
        the way. If a body is specified, then the request will be a
        POST. Otherwise, it will be a GET.

        @param headers: HTTP headers to include with the request
        @type headers: {str:str}

        @return: An object representing the server's HTTP response. If
            there are network or protocol errors, an exception will be
            raised. HTTP error responses, like 404 or 500, do not
            cause exceptions.

        @rtype: L{HTTPResponse}

        @raise Exception: Different implementations will raise
            different errors based on the underlying HTTP library.
        """
        raise NotImplementedError
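# Illustrative sketch of the subclassing contract described above; the class
# name and its internals are hypothetical, not part of this module.
#
#     class MyFetcher(HTTPFetcher):
#         def fetch(self, url, body=None, headers=None):
#             if not _allowedURL(url):
#                 raise ValueError('Bad URL scheme: %r' % (url,))
#             # ... perform the request with the HTTP library of your choice,
#             # following redirects, then translate the result:
#             return HTTPResponse(final_url=url, status=200,
#                                 headers={}, body='')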
def _allowedURL(url):
    return url.startswith('http://') or url.startswith('https://')
class HTTPFetchingError(Exception):
    """Exception that is wrapped around all exceptions that are raised
    by the underlying fetcher when using the ExceptionWrappingFetcher

    @ivar why: The exception that caused this exception
    """
    def __init__(self, why=None):
        Exception.__init__(self, why)
        self.why = why
class ExceptionWrappingFetcher(HTTPFetcher):
    """Fetcher that wraps another fetcher, causing all exceptions it raises
    to be wrapped in L{HTTPFetchingError}.

    @cvar uncaught_exceptions: Exceptions that should be exposed to the
        user if they are raised by the fetch call
    """

    uncaught_exceptions = (SystemExit, KeyboardInterrupt, MemoryError)

    def __init__(self, fetcher):
        self.fetcher = fetcher

    def fetch(self, *args, **kwargs):
        try:
            return self.fetcher.fetch(*args, **kwargs)
        except self.uncaught_exceptions:
            raise
        except:
            exc_cls, exc_inst = sys.exc_info()[:2]
            if exc_inst is None:
                # string exceptions
                exc_inst = exc_cls

            raise HTTPFetchingError(why=exc_inst)
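# Illustrative sketch: with a wrapped default fetcher (the default behaviour
# of setDefaultFetcher), callers can catch a single exception type and
# inspect the underlying error via the 'why' attribute. The URL is a
# placeholder.
#
#     try:
#         response = fetch('http://example.com/')
#     except HTTPFetchingError, why:
#         print 'fetch failed:', why.why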
class Urllib2Fetcher(HTTPFetcher):
    """An C{L{HTTPFetcher}} that uses urllib2.
    """

    # Parameterized for the benefit of testing frameworks, see
    # http://trac.openidenabled.com/trac/ticket/85
    urlopen = staticmethod(urllib2.urlopen)

    def fetch(self, url, body=None, headers=None):
        if not _allowedURL(url):
            raise ValueError('Bad URL scheme: %r' % (url,))

        if headers is None:
            headers = {}

        headers.setdefault(
            'User-Agent',
            "%s Python-urllib/%s" % (USER_AGENT, urllib2.__version__,))
        headers.setdefault(
            'Range',
            '0-%s' % (1024*MAX_RESPONSE_KB,))

        req = urllib2.Request(url, data=body, headers=headers)
        try:
            f = self.urlopen(req)
            try:
                return self._makeResponse(f)
            finally:
                f.close()
        except urllib2.HTTPError, why:
            try:
                return self._makeResponse(why)
            finally:
                why.close()

    def _makeResponse(self, urllib2_response):
        resp = HTTPResponse()
        resp.body = urllib2_response.read(MAX_RESPONSE_KB * 1024)
        resp.final_url = urllib2_response.geturl()
        resp.headers = dict(urllib2_response.info().items())

        if hasattr(urllib2_response, 'code'):
            resp.status = urllib2_response.code
        else:
            resp.status = 200

        return resp
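# Illustrative sketch: using Urllib2Fetcher directly instead of the module
# default. The URL and header value are placeholders.
#
#     fetcher = Urllib2Fetcher()
#     response = fetcher.fetch('http://example.com/',
#                              headers={'Accept': 'application/xrds+xml'})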
class HTTPError(HTTPFetchingError):
    """
    This exception is raised by the C{L{CurlHTTPFetcher}} when it
    encounters an exceptional situation fetching a URL.
    """
    pass
# XXX: define what we mean by paranoid, and make sure it is.
class CurlHTTPFetcher(HTTPFetcher):
    """
    An C{L{HTTPFetcher}} that uses pycurl for fetching.
    See U{http://pycurl.sourceforge.net/}.
    """
    ALLOWED_TIME = 20 # seconds

    def __init__(self):
        HTTPFetcher.__init__(self)
        if pycurl is None:
            raise RuntimeError('Cannot find pycurl library')
    def _parseHeaders(self, header_file):
        header_file.seek(0)

        # Remove the status line from the beginning of the input
        unused_http_status_line = header_file.readline()
        lines = [line.strip() for line in header_file]

        # and the blank line from the end
        empty_line = lines.pop()
        if empty_line:
            raise HTTPError(
                "No blank line at end of headers: %r" % (empty_line,))

        headers = {}
        for line in lines:
            try:
                name, value = line.split(':', 1)
            except ValueError:
                raise HTTPError(
                    "Malformed HTTP header line in response: %r" % (line,))

            value = value.strip()

            # HTTP headers are case-insensitive
            name = name.lower()
            headers[name] = value

        return headers
    def _checkURL(self, url):
        # XXX: document that this can be overridden to match desired policy
        # XXX: make sure url is well-formed and routeable
        return _allowedURL(url)
    def fetch(self, url, body=None, headers=None):
        stop = int(time.time()) + self.ALLOWED_TIME
        off = self.ALLOWED_TIME

        if headers is None:
            headers = {}

        headers.setdefault('User-Agent',
                           "%s %s" % (USER_AGENT, pycurl.version,))

        header_list = []
        if headers is not None:
            for header_name, header_value in headers.iteritems():
                header_list.append('%s: %s' % (header_name, header_value))

        c = pycurl.Curl()
        try:
            c.setopt(pycurl.NOSIGNAL, 1)

            if header_list:
                c.setopt(pycurl.HTTPHEADER, header_list)

            # Presence of a body indicates that we should do a POST
            if body is not None:
                c.setopt(pycurl.POST, 1)
                c.setopt(pycurl.POSTFIELDS, body)

            while off > 0:
                if not self._checkURL(url):
                    raise HTTPError("Fetching URL not allowed: %r" % (url,))

                data = cStringIO.StringIO()
                def write_data(chunk):
                    if data.tell() > 1024*MAX_RESPONSE_KB:
                        return 0
                    else:
                        return data.write(chunk)

                response_header_data = cStringIO.StringIO()
                c.setopt(pycurl.WRITEFUNCTION, write_data)
                c.setopt(pycurl.HEADERFUNCTION, response_header_data.write)
                c.setopt(pycurl.TIMEOUT, off)
                c.setopt(pycurl.URL, openid.urinorm.urinorm(url))
                c.setopt(pycurl.RANGE, '0-%s' % (MAX_RESPONSE_KB*1024))

                c.perform()

                response_headers = self._parseHeaders(response_header_data)
                code = c.getinfo(pycurl.RESPONSE_CODE)
                if code in [301, 302, 303, 307]:
                    url = response_headers.get('location')
                    if url is None:
                        raise HTTPError(
                            'Redirect (%s) returned without a location' % code)

                    # Redirects are always GETs
                    c.setopt(pycurl.POST, 0)

                    # There is no way to reset POSTFIELDS to empty and
                    # reuse the connection, but we only use it once.
                else:
                    resp = HTTPResponse()
                    resp.headers = response_headers
                    resp.status = code
                    resp.final_url = url
                    resp.body = data.getvalue()
                    return resp

                off = stop - int(time.time())

            raise HTTPError("Timed out fetching: %r" % (url,))
        finally:
            c.close()
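# Illustrative sketch, assuming pycurl is importable: installing the Curl
# fetcher explicitly. Note that usingCurl() only returns True when the
# fetcher is installed unwrapped, since wrapping hides the concrete type.
#
#     setDefaultFetcher(CurlHTTPFetcher(), wrap_exceptions=False)
#     assert usingCurl()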
class HTTPLib2Fetcher(HTTPFetcher):
    """A fetcher that uses C{httplib2} for performing HTTP
    requests. This implementation supports HTTP caching.

    @see: http://bitworking.org/projects/httplib2/
    """

    def __init__(self, cache=None):
        """@param cache: An object suitable for use as an C{httplib2}
            cache. If a string is passed, it is assumed to be a
            directory name.
        """
        if httplib2 is None:
            raise RuntimeError('Cannot find httplib2 library. '
                               'See http://bitworking.org/projects/httplib2/')

        super(HTTPLib2Fetcher, self).__init__()

        # An instance of the httplib2 object that performs HTTP requests
        self.httplib2 = httplib2.Http(cache)

        # We want httplib2 to raise exceptions for errors, just like
        # the other fetchers.
        self.httplib2.force_exception_to_status_code = False
    def fetch(self, url, body=None, headers=None):
        """Perform an HTTP request

        @raises Exception: Any exception that can be raised by httplib2

        @see: C{L{HTTPFetcher.fetch}}
        """
        if body:
            method = 'POST'
        else:
            method = 'GET'

        if headers is None:
            headers = {}
        headers.setdefault(
            'Range',
            '0-%s' % (1024*MAX_RESPONSE_KB,))

        # httplib2 doesn't check to make sure that the URL's scheme is
        # 'http' so we do it here.
        if not (url.startswith('http://') or url.startswith('https://')):
            raise ValueError('URL is not an HTTP URL: %r' % (url,))

        httplib2_response, content = self.httplib2.request(
            url, method, body=body, headers=headers)

        # Translate the httplib2 response to our HTTP response abstraction

        # When a 400 is returned, there is no "content-location"
        # header set. This seems like a bug to me. I can't think of a
        # case where we really care about the final URL when it is an
        # error response, but being careful about it can't hurt.
        try:
            final_url = httplib2_response['content-location']
        except KeyError:
            # We're assuming that no redirects occurred
            assert not httplib2_response.previous

            # And this should never happen for a successful response
            assert httplib2_response.status != 200
            final_url = url

        return HTTPResponse(
            body=content,
            final_url=final_url,
            headers=dict(httplib2_response.items()),
            status=httplib2_response.status,
            )
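# Illustrative sketch, assuming httplib2 is importable: enabling HTTP caching
# by passing a cache directory name (the path is a placeholder) and
# installing the fetcher as the default.
#
#     fetcher = HTTPLib2Fetcher(cache='/tmp/openid-http-cache')
#     setDefaultFetcher(fetcher)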