# -*- test-case-name: openid.test.test_fetchers -*-
"""
This module contains the HTTP fetcher interface and several implementations.
"""

__all__ = ['fetch', 'getDefaultFetcher', 'setDefaultFetcher', 'HTTPResponse',
           'HTTPFetcher', 'createHTTPFetcher', 'HTTPFetchingError',
           'HTTPError']

import urllib2
import time
import cStringIO
import sys

import openid
import openid.urinorm

# Try to import httplib2 for caching support
# http://bitworking.org/projects/httplib2/
try:
    import httplib2
except ImportError:
    # httplib2 not available
    httplib2 = None

# try to import pycurl, which will let us use CurlHTTPFetcher
try:
    import pycurl
except ImportError:
    pycurl = None

USER_AGENT = "python-openid/%s (%s)" % (openid.__version__, sys.platform)
MAX_RESPONSE_KB = 1024
def fetch(url, body=None, headers=None):
    """Invoke the fetch method on the default fetcher. Most users
    should need only this method.

    @raises Exception: any exceptions that may be raised by the default fetcher
    """
    fetcher = getDefaultFetcher()
    return fetcher.fetch(url, body, headers)
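# Illustrative sketch (not part of the module): a typical call to the
# module-level fetch() helper. The URL is a placeholder.
#
#     response = fetch('http://example.com/')
#     if response.status == 200:
#         print response.final_url, len(response.body)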
def createHTTPFetcher():
    """Create a default HTTP fetcher instance

    prefers Curl to urllib2."""
    if pycurl is None:
        fetcher = Urllib2Fetcher()
    else:
        fetcher = CurlHTTPFetcher()

    return fetcher
# Contains the currently set HTTP fetcher. If it is set to None, the
# library will call createHTTPFetcher() to set it. Do not access this
# variable outside of this module.
_default_fetcher = None

def getDefaultFetcher():
    """Return the default fetcher instance.
    If no fetcher has been set, a default fetcher will be created.

    @return: the default fetcher
    @rtype: HTTPFetcher
    """
    global _default_fetcher

    if _default_fetcher is None:
        setDefaultFetcher(createHTTPFetcher())

    return _default_fetcher
def setDefaultFetcher(fetcher, wrap_exceptions=True):
    """Set the default fetcher

    @param fetcher: The fetcher to use as the default HTTP fetcher
    @type fetcher: HTTPFetcher

    @param wrap_exceptions: Whether to wrap exceptions thrown by the
        fetcher with HTTPFetchingError so that they may be caught
        more easily. By default, exceptions will be wrapped. In general,
        unwrapped fetchers are useful for debugging of fetching errors
        or if your fetcher raises well-known exceptions that you would
        like to catch.
    @type wrap_exceptions: bool
    """
    global _default_fetcher
    if fetcher is None or not wrap_exceptions:
        _default_fetcher = fetcher
    else:
        _default_fetcher = ExceptionWrappingFetcher(fetcher)
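# Illustrative sketch: installing a specific fetcher as the default, with and
# without exception wrapping. Passing None reverts to the lazily created
# default on the next getDefaultFetcher() call.
#
#     setDefaultFetcher(Urllib2Fetcher())                          # wrapped
#     setDefaultFetcher(Urllib2Fetcher(), wrap_exceptions=False)   # unwrapped
#     setDefaultFetcher(None)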
def usingCurl():
    """Whether the currently set HTTP fetcher is a Curl HTTP fetcher."""
    return isinstance(getDefaultFetcher(), CurlHTTPFetcher)
class HTTPResponse(object):
    """A simple value object representing an HTTP response.

    @ivar final_url: The URL that was ultimately fetched, after any redirects
    @ivar status: The HTTP status code of the response
    @ivar headers: A dictionary of response headers
    @ivar body: The body of the response
    """
    headers = None
    status = None
    body = None
    final_url = None

    def __init__(self, final_url=None, status=None, headers=None, body=None):
        self.final_url = final_url
        self.status = status
        self.headers = headers
        self.body = body

    def __repr__(self):
        return "<%s status %s for %s>" % (self.__class__.__name__,
                                          self.status,
                                          self.final_url)
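# Illustrative sketch: constructing and inspecting an HTTPResponse by hand
# (e.g. in a test). All values shown are placeholders.
#
#     resp = HTTPResponse(final_url='http://example.com/', status=200,
#                         headers={'content-type': 'text/plain'}, body='hi')
#     repr(resp)   # "<HTTPResponse status 200 for http://example.com/>"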
class HTTPFetcher(object):
    """
    This class is the interface for openid HTTP fetchers. This
    interface is only important if you need to write a new fetcher for
    some reason.
    """

    def fetch(self, url, body=None, headers=None):
        """
        This performs an HTTP POST or GET, following redirects along
        the way. If a body is specified, then the request will be a
        POST. Otherwise, it will be a GET.

        @param headers: HTTP headers to include with the request
        @type headers: {str:str}

        @return: An object representing the server's HTTP response. If
            there are network or protocol errors, an exception will be
            raised. HTTP error responses, like 404 or 500, do not
            cause exceptions.

        @rtype: L{HTTPResponse}

        @raise Exception: Different implementations will raise
            different errors based on the underlying HTTP library.
        """
        raise NotImplementedError
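# Illustrative sketch of the subclassing contract described above; the class
# name and its internals are hypothetical, not part of this module.
#
#     class MyFetcher(HTTPFetcher):
#         def fetch(self, url, body=None, headers=None):
#             if not _allowedURL(url):
#                 raise ValueError('Bad URL scheme: %r' % (url,))
#             # ... perform the request with the HTTP library of your choice,
#             # following redirects, then translate the result:
#             return HTTPResponse(final_url=url, status=200,
#                                 headers={}, body='')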
def _allowedURL(url):
    return url.startswith('http://') or url.startswith('https://')
class HTTPFetchingError(Exception):
    """Exception that is wrapped around all exceptions that are raised
    by the underlying fetcher when using the ExceptionWrappingFetcher

    @ivar why: The exception that caused this exception
    """
    def __init__(self, why=None):
        Exception.__init__(self, why)
        self.why = why
class ExceptionWrappingFetcher(HTTPFetcher):
    """Fetcher that wraps another fetcher, causing all exceptions it raises
    to be wrapped in L{HTTPFetchingError}.

    @cvar uncaught_exceptions: Exceptions that should be exposed to the
        user if they are raised by the fetch call
    """

    uncaught_exceptions = (SystemExit, KeyboardInterrupt, MemoryError)

    def __init__(self, fetcher):
        self.fetcher = fetcher

    def fetch(self, *args, **kwargs):
        try:
            return self.fetcher.fetch(*args, **kwargs)
        except self.uncaught_exceptions:
            raise
        except:
            exc_cls, exc_inst = sys.exc_info()[:2]
            if exc_inst is None:
                # string exceptions
                exc_inst = exc_cls

            raise HTTPFetchingError(why=exc_inst)
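# Illustrative sketch: with a wrapped default fetcher (the default behaviour
# of setDefaultFetcher), callers can catch a single exception type and
# inspect the underlying error via the 'why' attribute. The URL is a
# placeholder.
#
#     try:
#         response = fetch('http://example.com/')
#     except HTTPFetchingError, why:
#         print 'fetch failed:', why.why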
class Urllib2Fetcher(HTTPFetcher):
    """An C{L{HTTPFetcher}} that uses urllib2.
    """

    # Parameterized for the benefit of testing frameworks, see
    # http://trac.openidenabled.com/trac/ticket/85
    urlopen = staticmethod(urllib2.urlopen)

    def fetch(self, url, body=None, headers=None):
        if not _allowedURL(url):
            raise ValueError('Bad URL scheme: %r' % (url,))

        if headers is None:
            headers = {}

        headers.setdefault(
            'User-Agent',
            "%s Python-urllib/%s" % (USER_AGENT, urllib2.__version__,))
        headers.setdefault(
            'Range',
            '0-%s' % (1024*MAX_RESPONSE_KB,))

        req = urllib2.Request(url, data=body, headers=headers)
        try:
            f = self.urlopen(req)
            try:
                return self._makeResponse(f)
            finally:
                f.close()
        except urllib2.HTTPError, why:
            try:
                return self._makeResponse(why)
            finally:
                why.close()

    def _makeResponse(self, urllib2_response):
        resp = HTTPResponse()
        resp.body = urllib2_response.read(MAX_RESPONSE_KB * 1024)
        resp.final_url = urllib2_response.geturl()
        resp.headers = dict(urllib2_response.info().items())

        if hasattr(urllib2_response, 'code'):
            resp.status = urllib2_response.code
        else:
            resp.status = 200

        return resp
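# Illustrative sketch: using Urllib2Fetcher directly instead of the module
# default. The URL and header value are placeholders.
#
#     fetcher = Urllib2Fetcher()
#     response = fetcher.fetch('http://example.com/',
#                              headers={'Accept': 'application/xrds+xml'})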
class HTTPError(HTTPFetchingError):
    """
    This exception is raised by the C{L{CurlHTTPFetcher}} when it
    encounters an exceptional situation fetching a URL.
    """
    pass
# XXX: define what we mean by paranoid, and make sure it is.
class CurlHTTPFetcher(HTTPFetcher):
    """
    An C{L{HTTPFetcher}} that uses pycurl for fetching.
    See U{http://pycurl.sourceforge.net/}.
    """
    ALLOWED_TIME = 20 # seconds

    def __init__(self):
        HTTPFetcher.__init__(self)
        if pycurl is None:
            raise RuntimeError('Cannot find pycurl library')
    def _parseHeaders(self, header_file):
        header_file.seek(0)

        # Remove the status line from the beginning of the input
        unused_http_status_line = header_file.readline()
        lines = [line.strip() for line in header_file]

        # and the blank line from the end
        empty_line = lines.pop()
        if empty_line:
            raise HTTPError(
                "No blank line at end of headers: %r" % (empty_line,))

        headers = {}
        for line in lines:
            try:
                name, value = line.split(':', 1)
            except ValueError:
                raise HTTPError(
                    "Malformed HTTP header line in response: %r" % (line,))

            value = value.strip()

            # HTTP headers are case-insensitive
            name = name.lower()
            headers[name] = value

        return headers
    def _checkURL(self, url):
        # XXX: document that this can be overridden to match desired policy
        # XXX: make sure url is well-formed and routeable
        return _allowedURL(url)
    def fetch(self, url, body=None, headers=None):
        stop = int(time.time()) + self.ALLOWED_TIME
        off = self.ALLOWED_TIME

        if headers is None:
            headers = {}

        headers.setdefault('User-Agent',
                           "%s %s" % (USER_AGENT, pycurl.version,))

        header_list = []
        if headers is not None:
            for header_name, header_value in headers.iteritems():
                header_list.append('%s: %s' % (header_name, header_value))

        c = pycurl.Curl()
        try:
            c.setopt(pycurl.NOSIGNAL, 1)

            if header_list:
                c.setopt(pycurl.HTTPHEADER, header_list)

            # Presence of a body indicates that we should do a POST
            if body is not None:
                c.setopt(pycurl.POST, 1)
                c.setopt(pycurl.POSTFIELDS, body)

            while off > 0:
                if not self._checkURL(url):
                    raise HTTPError("Fetching URL not allowed: %r" % (url,))

                data = cStringIO.StringIO()
                def write_data(chunk):
                    if data.tell() > 1024*MAX_RESPONSE_KB:
                        return 0
                    else:
                        return data.write(chunk)

                response_header_data = cStringIO.StringIO()
                c.setopt(pycurl.WRITEFUNCTION, write_data)
                c.setopt(pycurl.HEADERFUNCTION, response_header_data.write)
                c.setopt(pycurl.TIMEOUT, off)
                c.setopt(pycurl.URL, openid.urinorm.urinorm(url))
                c.setopt(pycurl.RANGE, '0-%s' % (MAX_RESPONSE_KB*1024))

                c.perform()

                response_headers = self._parseHeaders(response_header_data)
                code = c.getinfo(pycurl.RESPONSE_CODE)
                if code in [301, 302, 303, 307]:
                    url = response_headers.get('location')
                    if url is None:
                        raise HTTPError(
                            'Redirect (%s) returned without a location' % code)

                    # Redirects are always GETs
                    c.setopt(pycurl.POST, 0)

                    # There is no way to reset POSTFIELDS to empty and
                    # reuse the connection, but we only use it once.
                else:
                    resp = HTTPResponse()
                    resp.headers = response_headers
                    resp.status = code
                    resp.final_url = url
                    resp.body = data.getvalue()
                    return resp

                off = stop - int(time.time())

            raise HTTPError("Timed out fetching: %r" % (url,))
        finally:
            c.close()
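# Illustrative sketch, assuming pycurl is importable: installing the Curl
# fetcher explicitly. Note that usingCurl() only returns True when the
# fetcher is installed unwrapped, since wrapping hides the concrete type.
#
#     setDefaultFetcher(CurlHTTPFetcher(), wrap_exceptions=False)
#     assert usingCurl()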
class HTTPLib2Fetcher(HTTPFetcher):
    """A fetcher that uses C{httplib2} for performing HTTP
    requests. This implementation supports HTTP caching.

    @see: http://bitworking.org/projects/httplib2/
    """

    def __init__(self, cache=None):
        """@param cache: An object suitable for use as an C{httplib2}
            cache. If a string is passed, it is assumed to be a
            directory name.
        """
        if httplib2 is None:
            raise RuntimeError('Cannot find httplib2 library. '
                               'See http://bitworking.org/projects/httplib2/')

        super(HTTPLib2Fetcher, self).__init__()

        # An instance of the httplib2 object that performs HTTP requests
        self.httplib2 = httplib2.Http(cache)

        # We want httplib2 to raise exceptions for errors, just like
        # the other fetchers.
        self.httplib2.force_exception_to_status_code = False
    def fetch(self, url, body=None, headers=None):
        """Perform an HTTP request

        @raises Exception: Any exception that can be raised by httplib2

        @see: C{L{HTTPFetcher.fetch}}
        """
        if body:
            method = 'POST'
        else:
            method = 'GET'

        if headers is None:
            headers = {}
        headers.setdefault(
            'Range',
            '0-%s' % (1024*MAX_RESPONSE_KB,))

        # httplib2 doesn't check to make sure that the URL's scheme is
        # 'http' so we do it here.
        if not (url.startswith('http://') or url.startswith('https://')):
            raise ValueError('URL is not an HTTP URL: %r' % (url,))

        httplib2_response, content = self.httplib2.request(
            url, method, body=body, headers=headers)

        # Translate the httplib2 response to our HTTP response abstraction

        # When a 400 is returned, there is no "content-location"
        # header set. This seems like a bug to me. I can't think of a
        # case where we really care about the final URL when it is an
        # error response, but being careful about it can't hurt.
        try:
            final_url = httplib2_response['content-location']
        except KeyError:
            # We're assuming that no redirects occurred
            assert not httplib2_response.previous

            # And this should never happen for a successful response
            assert httplib2_response.status != 200
            final_url = url

        return HTTPResponse(
            body=content,
            final_url=final_url,
            headers=dict(httplib2_response.items()),
            status=httplib2_response.status,
            )
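# Illustrative sketch, assuming httplib2 is importable: enabling HTTP caching
# by passing a cache directory name (the path is a placeholder) and
# installing the fetcher as the default.
#
#     fetcher = HTTPLib2Fetcher(cache='/tmp/openid-http-cache')
#     setDefaultFetcher(fetcher)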