Getting the file size for all dict files to be downloaded; it comes to about 400 MB.
[worddb.git] / libs / httplib2 / __init__.py
blob982bf8a02af7a87cb260b63a99792492853ef316
1 from __future__ import generators
2 """
3 httplib2
5 A caching http interface that supports ETags and gzip
6 to conserve bandwidth.
8 Requires Python 2.3 or later
10 Changelog:
11 2007-08-18, Rick: Modified so it's able to use a socks proxy if needed.
13 """
15 __author__ = "Joe Gregorio (joe@bitworking.org)"
16 __copyright__ = "Copyright 2006, Joe Gregorio"
17 __contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
18 "James Antill",
19 "Xavier Verges Farrero",
20 "Jonathan Feinberg",
21 "Blair Zajac",
22 "Sam Ruby",
23 "Louis Nyffenegger"]
24 __license__ = "MIT"
25 __version__ = "$Rev: 259 $"
27 import re
28 import sys
29 import md5
30 import email
31 import email.Utils
32 import email.Message
33 import StringIO
34 import gzip
35 import zlib
36 import httplib
37 import urlparse
38 import base64
39 import os
40 import copy
41 import calendar
42 import time
43 import random
44 import sha
45 import hmac
46 from gettext import gettext as _
47 import socket
49 try:
50 import socks
51 except ImportError:
52 socks = None
54 if sys.version_info >= (2,3):
55 from iri2uri import iri2uri
56 else:
57 def iri2uri(uri):
58 return uri
60 __all__ = ['Http', 'Response', 'ProxyInfo', 'HttpLib2Error',
61 'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
62 'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
63 'debuglevel']
66 # The httplib debug level, set to a non-zero value to get debug output
67 debuglevel = 0
69 # Python 2.3 support
if not sys.version_info >= (2, 4):
    def sorted(seq):
        """Stand-in for ``sorted`` on interpreters older than Python 2.4.

        Sorts the list ``seq`` in place and returns that same list object,
        so the caller's list is modified.
        """
        seq.sort()
        return seq
75 # Python 2.3 support
def HTTPResponse__getheaders(self):
    """Return the response headers as a list of (header, value) tuples.

    Raises httplib.ResponseNotReady when no message has been read yet
    (``self.msg`` is None).
    """
    message = self.msg
    if message is not None:
        return message.items()
    raise httplib.ResponseNotReady()

# Install the fallback only when httplib's HTTPResponse lacks getheaders().
try:
    httplib.HTTPResponse.getheaders
except AttributeError:
    httplib.HTTPResponse.getheaders = HTTPResponse__getheaders
85 # All exceptions raised here derive from HttpLib2Error
class HttpLib2Error(Exception):
    """Base class of every exception raised by this module."""


class HttpLib2ErrorWithResponse(HttpLib2Error):
    """An error that carries the response and content it was raised for.

    Keeping both on the exception lets a caller catch it and optionally
    turn it back into a response.
    """
    def __init__(self, desc, response, content):
        HttpLib2Error.__init__(self, desc)
        self.response = response
        self.content = content


class RedirectMissingLocation(HttpLib2ErrorWithResponse):
    pass


class RedirectLimit(HttpLib2ErrorWithResponse):
    pass


class FailedToDecompressContent(HttpLib2ErrorWithResponse):
    pass


class UnimplementedDigestAuthOptionError(HttpLib2ErrorWithResponse):
    pass


class UnimplementedHmacDigestAuthOptionError(HttpLib2ErrorWithResponse):
    pass


class RelativeURIError(HttpLib2Error):
    pass


class ServerNotFoundError(HttpLib2Error):
    pass
105 # Open Items:
106 # -----------
107 # Proxy support
109 # Are we removing the cached content too soon on PUT (only delete on 200 Maybe?)
111 # Pluggable cache storage (supports storing the cache in
112 # flat files by default. We need a plug-in architecture
113 # that can support Berkeley DB and Squid)
115 # == Known Issues ==
116 # Does not handle a resource that uses conneg and Last-Modified but no ETag as a cache validator.
117 # Does not handle Cache-Control: max-stale
118 # Does not use Age: headers when calculating cache freshness.
121 # The number of redirections to follow before giving up.
122 # Note that only GET redirects are automatically followed.
123 # Will also honor 301 requests by saving that info and never
124 # requesting that URI again.
125 DEFAULT_MAX_REDIRECTS = 5
127 # Which headers are hop-by-hop headers by default
128 HOP_BY_HOP = ['connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'te', 'trailers', 'transfer-encoding', 'upgrade']
130 def _get_end2end_headers(response):
131 hopbyhop = list(HOP_BY_HOP)
132 hopbyhop.extend([x.strip() for x in response.get('connection', '').split(',')])
133 return [header for header in response.keys() if header not in hopbyhop]
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")

def parse_uri(uri):
    """Split a URI with the regex given in Appendix B of RFC 3986.

        (scheme, authority, path, query, fragment) = parse_uri(uri)

    Components that are absent from ``uri`` come back as None, except
    the path, which is then the empty string.
    """
    # Groups 2, 4, 5, 7 and 9 are the bare components; the other groups
    # include the delimiters around them.
    return URI.match(uri).group(2, 4, 5, 7, 9)
def urlnorm(uri):
    """Normalize an absolute URI and split it for request handling.

    Returns (scheme, authority, request_uri, defrag_uri): scheme and
    authority are lowercased, request_uri is the path (defaulting to "/")
    plus any query, and defrag_uri is the whole URI without its fragment.

    Raises RelativeURIError when the scheme or authority is missing.
    """
    (scheme, authority, path, query, fragment) = parse_uri(uri)
    if not (scheme and authority):
        raise RelativeURIError("Only absolute URIs are allowed. uri = %s" % uri)
    scheme = scheme.lower()
    authority = authority.lower()
    # Could do syntax based normalization of the URI before
    # computing the digest. See Section 6.2.2 of Std 66.
    if query:
        request_uri = "?".join([path or "/", query])
    else:
        request_uri = path or "/"
    defrag_uri = scheme + "://" + authority + request_uri
    return scheme, authority, request_uri, defrag_uri
161 # Cache filename construction (original borrowed from Venus http://intertwingly.net/code/venus/)
re_url_scheme = re.compile(r'^\w+://')
re_slash = re.compile(r'[?/:|]+')

def safename(filename):
    """Return a filename suitable for the cache.

    The result is the input with its URL scheme removed and every run of
    '?', '/', ':' or '|' replaced by ',', cut to at most 200 characters,
    followed by ',' and the MD5 hex digest of the (encoded) input.
    """
    try:
        if re_url_scheme.match(filename):
            # Byte strings are decoded as UTF-8 first so that the IDNA
            # encoding below applies to text in both cases.
            if isinstance(filename, str):
                filename = filename.decode('utf-8')
            filename = filename.encode('idna')
    except UnicodeError:
        # Best effort: keep whatever form the name reached.
        pass
    if isinstance(filename, unicode):
        filename = filename.encode('utf-8')
    digest = md5.new(filename).hexdigest()
    stem = re_slash.sub(",", re_url_scheme.sub("", filename))

    # limit length of filename
    return ",".join((stem[:200], digest))
192 NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+')
193 def _normalize_headers(headers):
194 return dict([ (key.lower(), NORMALIZE_SPACE.sub(value, ' ').strip()) for (key, value) in headers.iteritems()])
196 def _parse_cache_control(headers):
197 retval = {}
198 if headers.has_key('cache-control'):
199 parts = headers['cache-control'].split(',')
200 parts_with_args = [tuple([x.strip() for x in part.split("=")]) for part in parts if -1 != part.find("=")]
201 parts_wo_args = [(name.strip(), 1) for name in parts if -1 == name.find("=")]
202 retval = dict(parts_with_args + parts_wo_args)
203 return retval
205 # Whether to use a strict mode to parse WWW-Authenticate headers
206 # Might lead to bad results in case of ill-formed header value,
207 # so disabled by default, falling back to relaxed parsing.
208 # Set to true to turn on, useful for testing servers.
209 USE_WWW_AUTH_STRICT_PARSING = 0
211 # In regex below:
212 # [^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+ matches a "token" as defined by HTTP
213 # "(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?" matches a "quoted-string" as defined by HTTP, when LWS have already been replaced by a single space
214 # Actually, as an auth-param value can be either a token or a quoted-string, they are combined in a single pattern which matches both:
215 # \"?((?<=\")(?:[^\0-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x08\x0A-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?
216 WWW_AUTH_STRICT = re.compile(r"^(?:\s*(?:,\s*)?([^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+)\s*=\s*\"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?)(.*)$")
217 WWW_AUTH_RELAXED = re.compile(r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(?<!\")[^ \t\r\n,]+(?!\"))\"?)(.*)$")
218 UNQUOTE_PAIRS = re.compile(r'\\(.)')
def _parse_www_authenticate(headers, headername='www-authenticate'):
    """Returns a dictionary of dictionaries, one dict
    per auth_scheme.

    The outer keys are lowercased scheme names; each inner dict maps the
    lowercased auth-param names to their unquoted values. For
    headername == 'authentication-info' the header carries no scheme and
    everything is filed under 'digest'. Returns an empty dict when the
    header is absent.
    """
    retval = {}
    if headername in headers:
        authenticate = headers[headername].strip()
        if USE_WWW_AUTH_STRICT_PARSING:
            www_auth = WWW_AUTH_STRICT
        else:
            www_auth = WWW_AUTH_RELAXED
        while authenticate:
            # Break off the scheme at the beginning of the line
            if headername == 'authentication-info':
                (auth_scheme, the_rest) = ('digest', authenticate)
            else:
                # A challenge may be a bare scheme with no parameters;
                # treat it as that scheme with an empty parameter set
                # rather than failing to unpack the split.
                pieces = authenticate.split(" ", 1)
                auth_scheme = pieces[0]
                if len(pieces) > 1:
                    the_rest = pieces[1]
                else:
                    the_rest = ''
            # Now loop over all the key value pairs that come after the scheme,
            # being careful not to roll into the next scheme
            match = www_auth.search(the_rest)
            auth_params = {}
            while match:
                (key, value, the_rest) = match.groups()
                auth_params[key.lower()] = UNQUOTE_PAIRS.sub(r'\1', value)
                match = www_auth.search(the_rest)
            retval[auth_scheme.lower()] = auth_params
            if headername == 'authentication-info':
                # No scheme token is consumed for this header, so an
                # unparseable remainder would never shrink; stop after
                # one pass instead of looping on it forever.
                break
            authenticate = the_rest.strip()
    return retval
def _entry_disposition(response_headers, request_headers):
    """Determine freshness from the Date, Expires and Cache-Control headers.

    Returns "TRANSPARENT" when the request carries Pragma: no-cache or
    Cache-Control: no-cache, "FRESH" when the request says only-if-cached
    or the computed freshness lifetime exceeds the current age, and
    "STALE" otherwise.

    Side effect: when the request has Pragma: no-cache but no
    cache-control header, 'cache-control: no-cache' is added to
    request_headers.

    We don't handle the following:

    1. Cache-Control: max-stale
    2. Age: headers are not used in the calculations.

    Note that this algorithm is simpler than you might think
    because we are operating as a private (non-shared) cache.
    This lets us ignore 's-maxage'. We can also ignore
    'proxy-invalidate' since we aren't a proxy.
    We will never return a stale document as
    fresh as a design decision, and thus the non-implementation
    of 'max-stale'. This also lets us safely ignore 'must-revalidate'
    since we operate as if every server has sent 'must-revalidate'.
    Since we are private we get to ignore both 'public' and
    'private' parameters. We also ignore 'no-transform' since
    we don't do any transformations.
    The 'no-store' parameter is handled at a higher level.
    So the only Cache-Control parameters we look at are:

    no-cache
    only-if-cached
    max-age
    min-fresh
    """

    retval = "STALE"
    cc = _parse_cache_control(request_headers)
    cc_response = _parse_cache_control(response_headers)

    if 'pragma' in request_headers and request_headers['pragma'].lower().find('no-cache') != -1:
        retval = "TRANSPARENT"
        if 'cache-control' not in request_headers:
            request_headers['cache-control'] = 'no-cache'
    elif 'no-cache' in cc:
        retval = "TRANSPARENT"
    elif 'no-cache' in cc_response:
        retval = "STALE"
    elif 'only-if-cached' in cc:
        retval = "FRESH"
    elif 'date' in response_headers:
        parsed_date = email.Utils.parsedate_tz(response_headers['date'])
        if parsed_date is None:
            # An unparseable Date gives no basis for an age calculation;
            # report the entry as stale instead of failing in timegm().
            return retval
        date = calendar.timegm(parsed_date)
        now = time.time()
        current_age = max(0, now - date)
        if 'max-age' in cc_response:
            try:
                freshness_lifetime = int(cc_response['max-age'])
            except ValueError:
                freshness_lifetime = 0
        elif 'expires' in response_headers:
            expires = email.Utils.parsedate_tz(response_headers['expires'])
            if expires is None:
                freshness_lifetime = 0
            else:
                freshness_lifetime = max(0, calendar.timegm(expires) - date)
        else:
            freshness_lifetime = 0
        # A max-age on the request overrides whatever the response allowed.
        if 'max-age' in cc:
            try:
                freshness_lifetime = int(cc['max-age'])
            except ValueError:
                freshness_lifetime = 0
        # min-fresh is applied by ageing the entry by that many seconds.
        if 'min-fresh' in cc:
            try:
                min_fresh = int(cc['min-fresh'])
            except ValueError:
                min_fresh = 0
            current_age += min_fresh
        if freshness_lifetime > current_age:
            retval = "FRESH"
    return retval
def _decompressContent(response, new_content):
    """Return ``new_content`` decoded per the response's content-encoding.

    For 'gzip' and 'deflate' the body is decompressed, 'content-length'
    in ``response`` is rewritten to the decompressed length and the
    'content-encoding' header is removed. Any other encoding (or none)
    returns the content unchanged.

    Raises FailedToDecompressContent when decompression fails.
    """
    content = new_content
    try:
        encoding = response.get('content-encoding', None)
        if encoding in ['gzip', 'deflate']:
            if encoding == 'gzip':
                content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
            if encoding == 'deflate':
                content = zlib.decompress(content)
            response['content-length'] = str(len(content))
            del response['content-encoding']
    except (IOError, zlib.error):
        # zlib.decompress signals bad input with zlib.error, which is
        # not an IOError; catch both so a corrupt deflate body is
        # reported the same way as a corrupt gzip body.
        content = ""
        raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content)
    return content
def _updateCache(request_headers, response_headers, content, cache, cachekey):
    """Store or evict the cache entry for ``cachekey``.

    Does nothing when ``cachekey`` is false. When either the request or
    the response carries Cache-Control: no-store the entry is deleted;
    otherwise the entry is written as a 'status:' line, the response
    headers (minus status, content-encoding and transfer-encoding) and
    the content, concatenated into one string.
    """
    if cachekey:
        cc = _parse_cache_control(request_headers)
        cc_response = _parse_cache_control(response_headers)
        if 'no-store' in cc or 'no-store' in cc_response:
            cache.delete(cachekey)
        else:
            info = email.Message.Message()
            for key, value in response_headers.items():
                if key not in ['status', 'content-encoding', 'transfer-encoding']:
                    info[key] = value

            # A 304 is recorded as 200. The adjusted value is the one
            # written out; previously it was computed and then ignored
            # in favour of the raw response status.
            status = response_headers.status
            if status == 304:
                status = 200

            status_header = 'status: %d\r\n' % status

            header_str = info.as_string()

            # Turn every bare CR or bare LF into CRLF.
            header_str = re.sub("\r(?!\n)|(?<!\r)\n", "\r\n", header_str)
            text = "".join([status_header, header_str, content])

            cache.set(cachekey, text)
def _cnonce():
    """Return a 16-character hex client nonce.

    The value is the first 16 hex digits of the MD5 of the current
    time string and 20 random decimal digits.
    """
    # random.choice draws from all ten digits; randrange(0, 9) could
    # never produce '9'. The digits are joined into a plain string
    # rather than formatted as a list repr.
    digits = "".join([random.choice("0123456789") for i in range(20)])
    dig = md5.new("%s:%s" % (time.ctime(), digits)).hexdigest()
    return dig[:16]
def _wsse_username_token(cnonce, iso_now, password):
    """Return the base64 text of SHA-1(cnonce + iso_now + password)."""
    token_material = "%s%s%s" % (cnonce, iso_now, password)
    raw_digest = sha.new(token_material).digest()
    return base64.encodestring(raw_digest).strip()
369 # For credentials we need two things, first
370 # a pool of credentials to try (not necessarily tied to Basic, Digest, etc.)
371 # Then we also need a list of URIs that have already demanded authentication
372 # That list is tricky since sub-URIs can take the same auth, or the
373 # auth scheme may change as you descend the tree.
374 # So we also need each Auth instance to be able to tell us
375 # how close to the 'top' it is.
class Authentication(object):
    """Base class for the per-scheme authentication handlers.

    An instance remembers the host and the path of the URI it was
    created for, plus the credentials and the Http object passed in.
    """
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        # Only the path component of the request URI is kept.
        self.path = parse_uri(request_uri)[2]
        self.host = host
        self.credentials = credentials
        self.http = http

    def depth(self, request_uri):
        """Count the '/' characters in request_uri past the stored path's length."""
        # The URI is parsed as before, but the parsed components are not
        # consulted: the count is taken over the raw request_uri.
        parse_uri(request_uri)
        remainder = request_uri[len(self.path):]
        return remainder.count("/")

    def inscope(self, host, request_uri):
        """Tell whether host matches and request_uri's path starts with the stored path."""
        # XXX Should we normalize the request_uri?
        if not (host == self.host):
            return False
        return parse_uri(request_uri)[2].startswith(self.path)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header. Override this in sub-classes."""
        pass

    def response(self, response, content):
        """Gives us a chance to update with new nonces
        or such returned from the last authorized response.
        Override this in sub-classes if necessary.

        Return True if the request is to be retried, for
        example Digest may return stale=true.
        """
        return False
class BasicAuthentication(Authentication):
    """HTTP Basic authentication: sends the credentials base64-encoded."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        # encodestring() breaks its output into lines, so a long
        # "name:password" pair would put newlines inside the header
        # value. Remove every newline, not just the trailing one.
        token = base64.encodestring("%s:%s" % self.credentials).replace("\n", "")
        headers['authorization'] = 'Basic ' + token
class DigestAuthentication(Authentication):
    """Only do qop='auth' and MD5, since that
    is all Apache currently implements"""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        """Read the Digest challenge from ``response`` and prepare A1.

        Raises UnimplementedDigestAuthOptionError when the challenge
        does not offer qop 'auth' or asks for an algorithm other than MD5.
        """
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        self.challenge = challenge['digest']
        # qop is optional in a challenge; default to 'auth' so a missing
        # value no longer fails on None.split().
        qop = self.challenge.get('qop', 'auth')
        # The offered qop values are a list that may be separated by
        # commas as well as spaces (e.g. "auth,auth-int").
        offered = qop.replace(',', ' ').split()
        if 'auth' in offered:
            self.challenge['qop'] = 'auth'
        else:
            self.challenge['qop'] = None
        if self.challenge['qop'] is None:
            # The exception type requires the response and content as
            # well as the message.
            raise UnimplementedDigestAuthOptionError(_("Unsupported value for qop: %s.") % qop, response, content)
        self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5').upper()
        if self.challenge['algorithm'] != 'MD5':
            raise UnimplementedDigestAuthOptionError(_("Unsupported value for algorithm: %s.") % self.challenge['algorithm'], response, content)
        self.A1 = "".join([self.credentials[0], ":", self.challenge['realm'], ":", self.credentials[1]])
        self.challenge['nc'] = 1

    def request(self, method, request_uri, headers, content, cnonce = None):
        """Modify the request headers"""
        H = lambda x: md5.new(x).hexdigest()
        KD = lambda s, d: H("%s:%s" % (s, d))
        A2 = "".join([method, ":", request_uri])
        # A caller-supplied cnonce is used as is; otherwise make one.
        self.challenge['cnonce'] = cnonce or _cnonce()
        request_digest = '"%s"' % KD(H(self.A1), "%s:%s:%s:%s:%s" % (
            self.challenge['nonce'],
            '%08x' % self.challenge['nc'],
            self.challenge['cnonce'],
            self.challenge['qop'],
            H(A2),
            ))
        headers['Authorization'] = 'Digest username="%s", realm="%s", nonce="%s", uri="%s", algorithm=%s, response=%s, qop=%s, nc=%08x, cnonce="%s"' % (
            self.credentials[0],
            self.challenge['realm'],
            self.challenge['nonce'],
            request_uri,
            self.challenge['algorithm'],
            request_digest,
            self.challenge['qop'],
            self.challenge['nc'],
            self.challenge['cnonce'],
            )
        # The nonce count goes up with every request made under this nonce.
        self.challenge['nc'] += 1

    def response(self, response, content):
        """Pick up a new nonce from the response.

        Returns True (retry the request) only when the server sent a
        fresh challenge marked stale=true.
        """
        if 'authentication-info' not in response:
            challenge = _parse_www_authenticate(response, 'www-authenticate').get('digest', {})
            if 'true' == challenge.get('stale'):
                self.challenge['nonce'] = challenge['nonce']
                self.challenge['nc'] = 1
                return True
        else:
            updated_challenge = _parse_www_authenticate(response, 'authentication-info').get('digest', {})

            if 'nextnonce' in updated_challenge:
                self.challenge['nonce'] = updated_challenge['nextnonce']
                self.challenge['nc'] = 1
        return False
class HmacDigestAuthentication(Authentication):
    """Adapted from Robert Sayre's code and DigestAuthentication above."""
    __author__ = "Thomas Broyer (t.broyer@ltgt.net)"

    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        """Read the HMACDigest challenge from ``response`` and derive the key.

        Raises UnimplementedHmacDigestAuthOptionError when the challenge
        has no server nonce or names an unsupported algorithm or
        pw-algorithm.
        """
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        self.challenge = challenge['hmacdigest']
        # TODO: self.challenge['domain']
        self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
        if self.challenge['reason'] not in ['unauthorized', 'integrity']:
            self.challenge['reason'] = 'unauthorized'
        self.challenge['salt'] = self.challenge.get('salt', '')
        # The exception type requires the response and content as well
        # as the message, so all three are passed at each raise below.
        if not self.challenge.get('snonce'):
            raise UnimplementedHmacDigestAuthOptionError(_("The challenge doesn't contain a server nonce, or this one is empty."), response, content)
        self.challenge['algorithm'] = self.challenge.get('algorithm', 'HMAC-SHA-1')
        if self.challenge['algorithm'] not in ['HMAC-SHA-1', 'HMAC-MD5']:
            raise UnimplementedHmacDigestAuthOptionError(_("Unsupported value for algorithm: %s.") % self.challenge['algorithm'], response, content)
        self.challenge['pw-algorithm'] = self.challenge.get('pw-algorithm', 'SHA-1')
        if self.challenge['pw-algorithm'] not in ['SHA-1', 'MD5']:
            raise UnimplementedHmacDigestAuthOptionError(_("Unsupported value for pw-algorithm: %s.") % self.challenge['pw-algorithm'], response, content)
        if self.challenge['algorithm'] == 'HMAC-MD5':
            self.hashmod = md5
        else:
            self.hashmod = sha
        if self.challenge['pw-algorithm'] == 'MD5':
            self.pwhashmod = md5
        else:
            self.pwhashmod = sha
        # key = hash(username ":" hash(password + salt) ":" realm),
        # both hashes taken with the pw-algorithm module.
        self.key = "".join([self.credentials[0], ":",
                    self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
                    ":", self.challenge['realm']
                    ])
        self.key = self.pwhashmod.new(self.key).hexdigest().lower()

    def request(self, method, request_uri, headers, content):
        """Modify the request headers"""
        # Only the end-to-end headers take part in the signature.
        keys = _get_end2end_headers(headers)
        keylist = "".join(["%s " % k for k in keys])
        headers_val = "".join([headers[k] for k in keys])
        created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime())
        cnonce = _cnonce()
        request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
        request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
        headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
            self.credentials[0],
            self.challenge['realm'],
            self.challenge['snonce'],
            cnonce,
            request_uri,
            created,
            request_digest,
            keylist,
            )

    def response(self, response, content):
        """Return True (retry) when the new challenge's reason is 'integrity' or 'stale'."""
        challenge = _parse_www_authenticate(response, 'www-authenticate').get('hmacdigest', {})
        if challenge.get('reason') in ['integrity', 'stale']:
            return True
        return False
class WsseAuthentication(Authentication):
    """WSSE UsernameToken authentication.

    This is thinly tested and should not be relied upon.
    At this time there isn't any third party server to test against.
    Blogger and TypePad implemented this algorithm at one point
    but Blogger has since switched to Basic over HTTPS and
    TypePad has implemented it wrong, by never issuing a 401
    challenge but instead requiring your client to telepathically know that
    their endpoint is expecting WSSE profile="UsernameToken"."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Set the Authorization and X-WSSE headers for this request."""
        headers['Authorization'] = 'WSSE profile="UsernameToken"'
        created = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        nonce = _cnonce()
        digest = _wsse_username_token(nonce, created, self.credentials[1])
        token_fields = (self.credentials[0], digest, nonce, created)
        headers['X-WSSE'] = 'UsernameToken Username="%s", PasswordDigest="%s", Nonce="%s", Created="%s"' % token_fields
class GoogleLoginAuthentication(Authentication):
    """GoogleLogin authentication: obtains an Auth token at construction time."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        """POST the credentials to the ClientLogin URL and keep the token.

        ``self.Auth`` is the empty string when that request answers 403,
        otherwise the value of the 'Auth' line of the reply.
        """
        from urllib import urlencode
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        service = challenge['googlelogin'].get('service', 'xapi')
        # Blogger actually returns the service in the challenge
        # For the rest we guess based on the URI
        if service == 'xapi' and request_uri.find("calendar") > 0:
            service = "cl"
        # No point in guessing Base or Spreadsheet
        #elif request_uri.find("spreadsheets") > 0:
        #    service = "wise"

        auth = dict(Email=credentials[0], Passwd=credentials[1], service=service, source=headers['user-agent'])
        resp, content = self.http.request("https://www.google.com/accounts/ClientLogin", method="POST", body=urlencode(auth), headers={'Content-Type': 'application/x-www-form-urlencoded'})
        # The reply body is read as one key=value pair per non-empty line.
        d = {}
        for line in content.split('\n'):
            if line:
                (field, field_value) = line.split("=", 1)
                d[field] = field_value
        if resp.status == 403:
            self.Auth = ""
        else:
            self.Auth = d['Auth']

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        headers['authorization'] = 'GoogleLogin Auth=' + self.Auth
# Maps a lowercased auth-scheme name to the class that implements it.
AUTH_SCHEME_CLASSES = {
    "hmacdigest": HmacDigestAuthentication,
    "googlelogin": GoogleLoginAuthentication,
    "digest": DigestAuthentication,
    "wsse": WsseAuthentication,
    "basic": BasicAuthentication,
}

# The order in which offered schemes are tried.
AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"]
603 def _md5(s):
604 return
class FileCache(object):
    """Uses a local directory as a store for cached files.
    Not really safe to use if multiple threads or processes are going to
    be running on the same cache.
    """
    def __init__(self, cache, safe=safename): # use safe=lambda x: md5.new(x).hexdigest() for the old behavior
        """Remember the directory ``cache`` (created if missing) and the
        ``safe`` function that turns a key into a filename."""
        self.cache = cache
        self.safe = safe
        if not os.path.exists(self.cache):
            os.makedirs(self.cache)

    def get(self, key):
        """Return the stored value for ``key``, or None if it cannot be read."""
        retval = None
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        try:
            # Binary mode: values are returned exactly as stored, with
            # no line-ending translation.
            f = open(cacheFullPath, "rb")
            try:
                retval = f.read()
            finally:
                # Close the file even when the read fails.
                f.close()
        except IOError:
            # A missing or unreadable entry is simply a cache miss.
            pass
        return retval

    def set(self, key, value):
        """Write ``value`` as the entry for ``key``, replacing any old one."""
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        f = open(cacheFullPath, "wb")
        try:
            f.write(value)
        finally:
            f.close()

    def delete(self, key):
        """Remove the entry for ``key`` if it exists."""
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        if os.path.exists(cacheFullPath):
            os.remove(cacheFullPath)
class Credentials(object):
    """A list of (domain, name, password) entries that can be looked up by domain."""
    def __init__(self):
        self.credentials = []

    def add(self, name, password, domain=""):
        """Append an entry; the domain is stored lowercased."""
        entry = (domain.lower(), name, password)
        self.credentials.append(entry)

    def clear(self):
        """Forget every stored entry."""
        self.credentials = []

    def iter(self, domain):
        """Yield (name, password) for entries stored with an empty
        domain or with exactly ``domain``."""
        for (cdomain, name, password) in self.credentials:
            if cdomain not in ("", domain):
                continue
            yield (name, password)
class KeyCerts(Credentials):
    """Same behaviour as Credentials; the name/password slots
    hold a key and a cert instead."""
class ProxyInfo(object):
    """Collect information required to use a proxy."""
    def __init__(self, proxy_type, proxy_host, proxy_port, proxy_rdns=None, proxy_user=None, proxy_pass=None):
        """The parameter proxy_type must be set to one of socks.PROXY_TYPE_XXX
        constants. For example:

        p = ProxyInfo(proxy_type=socks.PROXY_TYPE_HTTP, proxy_host='localhost', proxy_port=8000)
        """
        self.proxy_type = proxy_type
        self.proxy_host = proxy_host
        self.proxy_port = proxy_port
        self.proxy_rdns = proxy_rdns
        self.proxy_user = proxy_user
        self.proxy_pass = proxy_pass

    def astuple(self):
        """Return the six settings as a tuple, in constructor order."""
        return (self.proxy_type, self.proxy_host, self.proxy_port,
                self.proxy_rdns, self.proxy_user, self.proxy_pass)

    def isgood(self):
        """Truthy only when the socks module is available and both a
        host and a port are set."""
        if not socks:
            return socks
        if not (self.proxy_host != None):
            return False
        return self.proxy_port != None
678 class HTTPConnectionWithTimeout(httplib.HTTPConnection):
679 """HTTPConnection subclass that supports timeouts"""
681 def __init__(self, host, port=None, strict=None, timeout=None, proxy_info=None):
682 httplib.HTTPConnection.__init__(self, host, port, strict)
683 self.timeout = timeout
684 self.proxy_info = proxy_info
686 def connect(self):
687 """Connect to the host and port specified in __init__."""
688 # Mostly verbatim from httplib.py.
689 msg = "getaddrinfo returns an empty list"
690 for res in socket.getaddrinfo(self.host, self.port, 0,
691 socket.SOCK_STREAM):
692 af, socktype, proto, canonname, sa = res
693 try:
694 if self.proxy_info and self.proxy_info.isgood():
695 self.sock = socks.socksocket(af, socktype, proto)
696 self.sock.setproxy(*self.proxy_info.astuple())
697 else:
698 self.sock = socket.socket(af, socktype, proto)
699 # Different from httplib: support timeouts.
700 if self.timeout is not None:
701 self.sock.settimeout(self.timeout)
702 # End of difference from httplib.
703 if self.debuglevel > 0:
704 print "connect: (%s, %s)" % (self.host, self.port)
705 self.sock.connect(sa)
706 except socket.error, msg:
707 if self.debuglevel > 0:
708 print 'connect fail:', (self.host, self.port)
709 if self.sock:
710 self.sock.close()
711 self.sock = None
712 continue
713 break
714 if not self.sock:
715 raise socket.error, msg
class HTTPSConnectionWithTimeout(httplib.HTTPSConnection):
    "This class allows communication via SSL."

    def __init__(self, host, port=None, key_file=None, cert_file=None,
                 strict=None, timeout=None, proxy_info=None):
        self.timeout = timeout
        self.proxy_info = proxy_info
        httplib.HTTPSConnection.__init__(self, host, port=port, key_file=key_file,
                cert_file=cert_file, strict=strict)

    def connect(self):
        "Connect to a host on a given (SSL) port."

        if self.proxy_info and self.proxy_info.isgood():
            # Create the proxy-capable socket before configuring it; the
            # previous code called setproxy() on sockets that did not
            # exist yet, so the proxy branch could never succeed.
            sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
            sock.setproxy(*self.proxy_info.astuple())
        else:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        if self.timeout is not None:
            sock.settimeout(self.timeout)
        sock.connect((self.host, self.port))
        ssl = socket.ssl(sock, self.key_file, self.cert_file)
        self.sock = httplib.FakeSocket(sock, ssl)
743 class Http(object):
744 """An HTTP client that handles:
745 - all methods
746 - caching
747 - ETags
748 - compression,
749 - HTTPS
750 - Basic
751 - Digest
752 - WSSE
754 and more.
def __init__(self, cache=None, timeout=None, proxy_info=None):
    """The value of proxy_info is a ProxyInfo instance.

    If 'cache' is a string then it is used as a directory name
    for a disk cache. Otherwise it must be an object that supports
    the same interface as FileCache."""
    self.proxy_info = proxy_info
    # Map domain name to an httplib connection
    self.connections = {}
    # The location of the cache, for now a directory
    # where cached responses are held.
    # basestring covers unicode directory names as well as str ones;
    # with a plain str check a unicode path was stored as if it were a
    # cache object.
    if cache and isinstance(cache, basestring):
        self.cache = FileCache(cache)
    else:
        self.cache = cache

    # Name/password
    self.credentials = Credentials()

    # Key/cert
    self.certificates = KeyCerts()

    # authorization objects
    self.authorizations = []

    # If set to False then no redirects are followed, even safe ones.
    self.follow_redirects = True

    # If 'follow_redirects' is True, and this is set to True then
    # all redirects are followed, including unsafe ones.
    self.follow_all_redirects = False

    self.ignore_etag = False

    self.force_exception_to_status_code = False

    self.timeout = timeout
def _auth_from_challenge(self, host, request_uri, headers, response, content):
    """A generator that creates Authorization objects
    that can be applied to requests.

    One object is yielded per stored credential for ``host`` and per
    scheme in AUTH_SCHEME_ORDER that the response's challenge offers.
    """
    challenges = _parse_www_authenticate(response, 'www-authenticate')
    offered = [scheme for scheme in AUTH_SCHEME_ORDER if scheme in challenges]
    for cred in self.credentials.iter(host):
        for scheme in offered:
            handler_class = AUTH_SCHEME_CLASSES[scheme]
            yield handler_class(cred, host, request_uri, headers, response, content, self)
def add_credentials(self, name, password, domain=""):
    """Register a name and password to be offered whenever
    a request requires authentication.

    ``domain`` is passed through to the credential store.
    """
    store = self.credentials
    store.add(name, password, domain)
def add_certificate(self, key, cert, domain):
    """Register a key and cert to be used whenever
    a request requires authentication."""
    store = self.certificates
    store.add(key, cert, domain)
def clear_credentials(self):
    """Forget all stored names and passwords, along with the
    authorization objects built so far."""
    self.credentials.clear()
    self.authorizations = []
def _conn_request(self, conn, request_uri, method, body, headers):
    """Send one request over ``conn`` and return (response, content).

    An httplib.HTTPException on the first attempt closes and reopens
    the connection and tries once more; on the second attempt it is
    re-raised. A socket.gaierror is turned into ServerNotFoundError.
    """
    for attempt in range(2):
        try:
            conn.request(method, request_uri, body, headers)
            response = conn.getresponse()
        except socket.gaierror:
            conn.close()
            raise ServerNotFoundError("Unable to find the server at %s" % conn.host)
        except httplib.HTTPException:
            if attempt != 0:
                raise
            conn.close()
            conn.connect()
            continue
        else:
            content = response.read()
            response = Response(response)
            # The content of a HEAD response is not run through decompression.
            if method != "HEAD":
                content = _decompressContent(response, content)

        break
    return (response, content)
def _request(self, conn, host, absolute_uri, request_uri, method, body, headers, redirections, cachekey):
    """Issue the request on ``conn``, handling authentication and redirects.

    Steps, in order:
      1. If a stored authorization is in scope for ``host``/``request_uri``,
         let it sign the request; resend once if its response() hook asks
         for that.
      2. On a 401, try each authorization produced by _auth_from_challenge
         until one yields a non-401 response, and remember that one.
      3. If the response is a redirect that may be followed, hand the new
         location back to self.request with the redirect budget reduced
         by one.  Otherwise, store 200/203 responses to GET in the cache.

    Returns a ``(response, content)`` tuple.  Raises RedirectLimit when a
    redirect arrives with ``redirections`` exhausted, and
    RedirectMissingLocation when a redirect other than 300 carries no
    Location header.
    """
    safe_method = method in ["GET", "HEAD"]

    # Of the stored authorizations that are in scope, use the one whose
    # (depth, auth) pair sorts first.
    candidates = [(a.depth(request_uri), a) for a in self.authorizations if a.inscope(host, request_uri)]
    auth = None
    if candidates:
        auth = sorted(candidates)[0][1]
    if auth:
        auth.request(method, request_uri, headers, body)

    (response, content) = self._conn_request(conn, request_uri, method, body, headers)

    if auth and auth.response(response, body):
        # The authorization asked for a retry: sign again and resend.
        auth.request(method, request_uri, headers, body)
        (response, content) = self._conn_request(conn, request_uri, method, body, headers)
        response._stale_digest = 1

    if response.status == 401:
        for authorization in self._auth_from_challenge(host, request_uri, headers, response, content):
            authorization.request(method, request_uri, headers, body)
            (response, content) = self._conn_request(conn, request_uri, method, body, headers)
            if response.status != 401:
                # This one worked; keep it for later requests.
                self.authorizations.append(authorization)
                authorization.response(response, body)
                break

    # Redirect handling and caching only apply when redirects are followed
    # for every method, the method is GET/HEAD, or the status is 303.
    if not (self.follow_all_redirects or safe_method or response.status == 303):
        return (response, content)

    if not (self.follow_redirects and response.status in [300, 301, 302, 303, 307]):
        if response.status in [200, 203] and method == "GET":
            # Don't cache 206's since we aren't going to handle byte range requests
            if 'content-location' not in response:
                response['content-location'] = absolute_uri
            _updateCache(headers, response, content, self.cache, cachekey)
        return (response, content)

    # From here on the response is a redirect we are willing to follow:
    # pick out the Location header and basically start from the beginning,
    # first stripping the cache validators and decrementing our 'depth'.
    if not redirections:
        raise RedirectLimit( _("Redirected more times than rediection_limit allows."), response, content)

    if 'location' not in response and response.status != 300:
        raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header."), response, content)

    # Fix-up relative redirects (which violate an RFC 2616 MUST)
    if 'location' in response:
        location = response['location']
        (scheme, authority, path, query, fragment) = parse_uri(location)
        if authority is None:
            response['location'] = urlparse.urljoin(absolute_uri, location)

    if response.status == 301 and safe_method:
        # Record the permanent redirect in the cache entry for this URI.
        response['-x-permanent-redirect-url'] = response['location']
        if 'content-location' not in response:
            response['content-location'] = absolute_uri
        _updateCache(headers, response, content, self.cache, cachekey)

    # The validators belonged to the original URI; drop them before moving on.
    for validator in ('if-none-match', 'if-modified-since'):
        if validator in headers:
            del headers[validator]

    if 'location' in response:
        location = response['location']
        old_response = copy.deepcopy(response)
        if 'content-location' not in old_response:
            old_response['content-location'] = absolute_uri
        # A 303 answer to a non-GET/HEAD request is re-issued as GET.
        if response.status == 303 and not safe_method:
            redirect_method = "GET"
        else:
            redirect_method = method
        (response, content) = self.request(location, redirect_method, body=body, headers=headers, redirections=redirections - 1)
        response.previous = old_response

    return (response, content)
912 # Need to catch and rebrand some exceptions
913 # Then need to optionally turn all exceptions into status codes
914 # including all socket.* and httplib.* exceptions.
def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS, connection_type=None):
    """Performs a single HTTP request.

    The 'uri' is the URI of the HTTP resource and can begin
    with either 'http' or 'https'. The value of 'uri' must be an absolute URI.

    The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc.
    There is no restriction on the methods allowed.

    The 'body' is the entity body to be sent with the request. It is a string
    object.

    Any extra headers that are to be sent with the request should be provided in the
    'headers' dictionary.

    The maximum number of redirects to follow before raising an
    exception is 'redirections'. The default is DEFAULT_MAX_REDIRECTS.

    The 'connection_type', if given, is called to create the connection
    when none is cached yet for this scheme and authority. Otherwise
    HTTPSConnectionWithTimeout is used for 'https' and
    HTTPConnectionWithTimeout for everything else.

    The return value is a tuple of (response, content), the first
    being an instance of the 'Response' class, the second being
    a string that contains the response entity body.

    If self.force_exception_to_status_code is true, exceptions raised while
    handling the request are not propagated; they are converted into a
    (response, content) pair with status 500, 408 or 400 instead.
    """
    try:
        if headers is None:
            headers = {}
        else:
            headers = _normalize_headers(headers)

        # Supply a default User-Agent only when the caller did not set one.
        if not headers.has_key('user-agent'):
            headers['user-agent'] = "Python-httplib2/%s" % __version__

        uri = iri2uri(uri)

        (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)

        # Connections are stored per scheme+authority and reused on later calls.
        conn_key = scheme+":"+authority
        if conn_key in self.connections:
            conn = self.connections[conn_key]
        else:
            if not connection_type:
                connection_type = (scheme == 'https') and HTTPSConnectionWithTimeout or HTTPConnectionWithTimeout
            certs = list(self.certificates.iter(authority))
            if scheme == 'https' and certs:
                # Only the first registered (key, cert) pair is used.
                conn = self.connections[conn_key] = connection_type(authority, key_file=certs[0][0],
                    cert_file=certs[0][1], timeout=self.timeout, proxy_info=self.proxy_info)
            else:
                conn = self.connections[conn_key] = connection_type(authority, timeout=self.timeout, proxy_info=self.proxy_info)
            conn.set_debuglevel(debuglevel)

        # Ask for compressed content on GET/HEAD unless a byte range was requested.
        if method in ["GET", "HEAD"] and 'range' not in headers:
            headers['accept-encoding'] = 'compress, gzip'

        # 'info' holds the cached headers (empty when there is no cache hit),
        # 'content' the cached body.
        info = email.Message.Message()
        cached_value = None
        if self.cache:
            cachekey = defrag_uri
            cached_value = self.cache.get(cachekey)
            if cached_value:
                info = email.message_from_string(cached_value)
                try:
                    # The body is whatever follows the first blank line.
                    content = cached_value.split('\r\n\r\n', 1)[1]
                except IndexError:
                    # No header/body separator: discard the entry and
                    # continue as if nothing had been cached.
                    self.cache.delete(cachekey)
                    cachekey = None
                    cached_value = None
        else:
            cachekey = None

        if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
            # http://www.w3.org/1999/04/Editing/
            headers['if-match'] = info['etag']

        if method not in ["GET", "HEAD"] and self.cache and cachekey:
            # RFC 2616 Section 13.10
            self.cache.delete(cachekey)

        if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
            if info.has_key('-x-permanent-redirect-url'):
                # Should cached permanent redirects be counted in our redirection count? For now, yes.
                (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1)
                response.previous = Response(info)
                response.previous.fromcache = True
            else:
                # Determine our course of action:
                #   Is the cached entry fresh or stale?
                #   Has the client requested a non-cached response?
                #
                # There seems to be three possible answers:
                # 1. [FRESH] Return the cache entry w/o doing a GET
                # 2. [STALE] Do the GET (but add in cache validators if available)
                # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
                entry_disposition = _entry_disposition(info, headers)

                if entry_disposition == "FRESH":
                    # NOTE(review): cached_value is already required to be
                    # truthy by the enclosing condition, so the 504 branch
                    # below looks unreachable -- confirm.
                    if not cached_value:
                        info['status'] = '504'
                        content = ""
                    response = Response(info)
                    if cached_value:
                        response.fromcache = True
                    return (response, content)

                if entry_disposition == "STALE":
                    if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
                        headers['if-none-match'] = info['etag']
                    # NOTE(review): this tests the request headers for
                    # 'last-modified' rather than 'if-modified-since' --
                    # confirm that is intended.
                    if info.has_key('last-modified') and not 'last-modified' in headers:
                        headers['if-modified-since'] = info['last-modified']
                elif entry_disposition == "TRANSPARENT":
                    pass

                (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)

            if response.status == 304 and method == "GET":
                # Rewrite the cache entry with the new end-to-end headers
                # Take all headers that are in response
                # and overwrite their values in info.
                # unless they are hop-by-hop, or are listed in the connection header.

                for key in _get_end2end_headers(response):
                    info[key] = response[key]
                merged_response = Response(info)
                if hasattr(response, "_stale_digest"):
                    merged_response._stale_digest = response._stale_digest
                # 'content' is still the cached body read above.
                _updateCache(headers, merged_response, content, self.cache, cachekey)
                response = merged_response
                # The caller sees the revalidated entry as a 200 served from cache.
                response.status = 200
                response.fromcache = True

            elif response.status == 200:
                content = new_content
            else:
                # Any other status: drop the cached entry and return what
                # the server sent.
                self.cache.delete(cachekey)
                content = new_content
        else:
            # No usable cache entry (or not a cacheable request): go to the network.
            (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
    except Exception, e:
        if self.force_exception_to_status_code:
            if isinstance(e, HttpLib2ErrorWithResponse):
                # The exception carries the response that triggered it;
                # report it with status 500.
                response = e.response
                content = e.content
                response.status = 500
                response.reason = str(e)
            elif isinstance(e, socket.timeout):
                content = "Request Timeout"
                response = Response( {
                        "content-type": "text/plain",
                        "status": "408",
                        "content-length": len(content)
                        })
                response.reason = "Request Timeout"
            else:
                # Everything else is reported as a 400 whose body is the
                # exception text.
                content = str(e)
                response = Response( {
                        "content-type": "text/plain",
                        "status": "400",
                        "content-length": len(content)
                        })
                response.reason = "Bad Request"
        else:
            raise


    return (response, content)
class Response(dict):
    """Response headers as a dict, with status details as attributes.

    Closer in shape to email.Message than to httplib.HTTPResponse.
    """

    # True when this response was served from the local cache.
    fromcache = False

    # HTTP protocol version used by the server: 10 for HTTP/1.0, 11 for HTTP/1.1.
    version = 11

    # Status code returned by the server.
    status = 200

    # Reason phrase returned by the server.
    reason = "Ok"

    # An earlier response attached by the request code (for example the
    # redirect that led here); None by default.
    previous = None

    def __init__(self, info):
        """Fill the mapping from ``info``.

        ``info`` is an httplib.HTTPResponse, an email.Message.Message, or a
        plain mapping of header names to values.

        * HTTPResponse: headers, status, reason and version are copied, and
          the status is also stored under the 'status' key as a string.
        * email Message: headers are copied; a 'status' header is required
          and becomes the integer ``status`` attribute.
        * anything else: items are copied; 'status' is optional and falls
          back to the class default.
        """
        is_live = isinstance(info, httplib.HTTPResponse)
        if is_live:
            pairs = info.getheaders()
        else:
            pairs = info.items()
        for name, value in pairs:
            self[name] = value

        if is_live:
            self.status = info.status
            self['status'] = str(self.status)
            self.reason = info.reason
            self.version = info.version
        elif isinstance(info, email.Message.Message):
            self.status = int(self['status'])
        else:
            self.status = int(self.get('status', self.status))

    def __getattr__(self, name):
        """Expose the mapping itself as ``.dict``; any other unknown name fails."""
        if name != 'dict':
            raise AttributeError(name)
        return self