# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Retrieve web resources over http."""
import copy
import httplib
import logging
import random
import StringIO

import platformsettings
import script_injector
# PIL isn't always available, but we still want to be able to run without
# the image scrambling functionality in this case.
# Clock function used for connection/response delay measurements; resolved
# via platformsettings so each OS can supply its highest-resolution timer.
TIMER = platformsettings.timer
class HttpClientException(Exception):
  """Base class for all exceptions in httpclient."""
  pass
44 def _InjectScripts(response
, inject_script
):
45 """Injects |inject_script| immediately after <head> or <html>.
47 Copies |response| if it is modified.
50 response: an ArchivedHttpResponse
51 inject_script: JavaScript string (e.g. "Math.random = function(){...}")
53 an ArchivedHttpResponse
55 if type(response
) == tuple:
56 logging
.warn('tuple response: %s', response
)
57 content_type
= response
.get_header('content-type')
58 if content_type
and content_type
.startswith('text/html'):
59 text
= response
.get_data_as_text()
60 text
, already_injected
= script_injector
.InjectScript(
61 text
, 'text/html', inject_script
)
62 if not already_injected
:
63 response
= copy
.deepcopy(response
)
64 response
.set_data(text
)
def _ScrambleImages(response):
  """If the |response| is an image, attempt to scramble it.

  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
  Returns:
    an ArchivedHttpResponse
  """
  assert Image, '--scramble_images requires the PIL module to be installed.'

  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('image/'):
    try:
      image_data = response.response_data[0]
      # NOTE(review): the result of .decode() is discarded here (and
      # .encode() below); presumably the stored bytes are already in the
      # form PIL expects — confirm before changing.
      image_data.decode(encoding='base64')
      im = Image.open(StringIO.StringIO(image_data))

      pixel_data = list(im.getdata())
      random.shuffle(pixel_data)

      scrambled_image = im.copy()
      scrambled_image.putdata(pixel_data)

      output_image_io = StringIO.StringIO()
      scrambled_image.save(output_image_io, im.format)
      output_image_data = output_image_io.getvalue()
      output_image_data.encode(encoding='base64')

      # Copy before mutating so the archive's stored response stays intact.
      response = copy.deepcopy(response)
      response.set_data(output_image_data)
    except Exception:
      # Scrambling is best-effort: if the image cannot be parsed or
      # re-encoded, serve the original response unmodified.
      logging.warning('Failed to scramble %s response', content_type)
  return response
class DetailedHTTPResponse(httplib.HTTPResponse):
  """Preserve details relevant to replaying responses.

  WARNING: This code uses attributes and methods of HTTPResponse
  that are not part of the public interface.
  """

  def read_chunks(self):
    """Return the response body content and timing data.

    The returned chunks have the chunk size and CRLFs stripped off.
    If the response was compressed, the returned data is still compressed.

    Returns:
      (chunks, delays)
        chunks:
          [response_body]                  # non-chunked responses
          [chunk_1, chunk_2, ...]          # chunked responses
        delays:
          [0]                              # non-chunked responses
          [chunk_1_first_byte_delay, ...]  # chunked responses

      The delay for the first body item should be recorded by the caller.
    """
    # NOTE(review): the chunked/non-chunked branching and cleanup below were
    # reconstructed from partial source — verify against version control.
    chunks = []
    delays = []
    if not self.chunked:
      chunks.append(self.read())
      delays.append(0)
    else:
      start = TIMER()
      try:
        while True:
          line = self.fp.readline()
          chunk_size = self._read_chunk_size(line)
          if chunk_size is None:
            raise httplib.IncompleteRead(''.join(chunks))
          if chunk_size == 0:
            break  # terminating zero-length chunk
          delays.append(TIMER() - start)
          chunks.append(self._safe_read(chunk_size))
          self._safe_read(2)  # skip the CRLF at the end of the chunk
          start = TIMER()

        # Ignore any trailers.
        while True:
          line = self.fp.readline()
          if not line or line == '\r\n':
            break
      finally:
        self.close()
    return chunks, delays

  @classmethod
  def _read_chunk_size(cls, line):
    """Parse a chunk-size line; return the size as int, or None if invalid."""
    chunk_extensions_pos = line.find(';')
    if chunk_extensions_pos != -1:
      line = line[:chunk_extensions_pos]  # strip chunk-extensions
    try:
      chunk_size = int(line, 16)
    except ValueError:
      return None
    return chunk_size
class DetailedHTTPConnection(httplib.HTTPConnection):
  """Preserve details relevant to replaying connections."""
  response_class = DetailedHTTPResponse
class DetailedHTTPSResponse(DetailedHTTPResponse):
  """Preserve details relevant to replaying SSL responses."""
  pass
class DetailedHTTPSConnection(httplib.HTTPSConnection):
  """Preserve details relevant to replaying SSL connections."""
  response_class = DetailedHTTPSResponse
class RealHttpFetch(object):
  """Fetch requests over the network, recording connection timing details."""

  def __init__(self, real_dns_lookup):
    """Initialize RealHttpFetch.

    Args:
      real_dns_lookup: a function that resolves a host to an IP.
    """
    self._real_dns_lookup = real_dns_lookup

  @staticmethod
  def _GetHeaderNameValue(header):
    """Parse the header line and return a name/value tuple.

    Args:
      header: a string for a header such as "Content-Length: 314".
    Returns:
      A tuple (header_name, header_value) on success or None if the header
      is not in expected format. header_name is in lowercase.
    """
    i = header.find(':')
    if i > 0:
      return (header[:i].lower(), header[i+1:].strip())
    return None

  @staticmethod
  def _ToTuples(headers):
    """Parse headers and save them to a list of tuples.

    This method takes HttpResponse.msg.headers as input and convert it
    to a list of (header_name, header_value) tuples.
    HttpResponse.msg.headers is a list of strings where each string
    represents either a header or a continuation line of a header.
    1. a normal header consists of two parts which are separated by colon :
       "header_name:header_value..."
    2. a continuation line is a string starting with whitespace
       "[whitespace]continued_header_value..."
    If a header is not in good shape or an unexpected continuation line is
    seen, it will be ignored.

    Should avoid using response.getheaders() directly
    because response.getheaders() can't handle multiple headers
    with the same name properly. Instead, parse the
    response.msg.headers using this method to get all headers.

    Args:
      headers: an instance of HttpResponse.msg.headers.
    Returns:
      A list of tuples which looks like:
      [(header_name, header_value), (header_name2, header_value2)...]
    """
    all_headers = []
    for line in headers:
      if line[0] in '\t ':
        # Continuation line: fold it into the previous header's value.
        if not all_headers:
          logging.warning(
              'Unexpected response header continuation line [%s]', line)
          continue
        name, value = all_headers.pop()
        value += '\n ' + line.strip()
      else:
        name_value = RealHttpFetch._GetHeaderNameValue(line)
        if not name_value:
          logging.warning(
              'Response header in wrong format [%s]', line)
          continue
        name, value = name_value  # pylint: disable=unpacking-non-sequence
      all_headers.append((name, value))
    return all_headers

  @staticmethod
  def _get_request_host_port(request):
    """Return (host, port) parsed from |request|.host; port may be None."""
    host_parts = request.host.split(':')
    host = host_parts[0]
    port = int(host_parts[1]) if len(host_parts) == 2 else None
    return host, port

  @staticmethod
  def _get_system_proxy(is_ssl):
    """Return the system proxy settings for http/https, or a falsy value."""
    return platformsettings.get_system_proxy(is_ssl)

  def _get_connection(self, request_host, request_port, is_ssl):
    """Return a detailed connection object for host/port pair.

    If a system proxy is defined (see platformsettings.py), it will be used.

    Args:
      request_host: a host string (e.g. "www.example.com").
      request_port: a port integer (e.g. 8080) or None (for the default port).
      is_ssl: True if HTTPS connection is needed.
    Returns:
      A DetailedHTTPSConnection or DetailedHTTPConnection instance.
    """
    connection_host = request_host
    connection_port = request_port
    system_proxy = self._get_system_proxy(is_ssl)
    if system_proxy:
      connection_host = system_proxy.host
      connection_port = system_proxy.port

    # Use an IP address because WPR may override DNS settings.
    connection_ip = self._real_dns_lookup(connection_host)
    if not connection_ip:
      logging.critical('Unable to find host ip for name: %s', connection_host)
      return None
    if is_ssl:
      connection = DetailedHTTPSConnection(connection_ip, connection_port)
      # NOTE(review): tunnel setup reconstructed from partial source;
      # presumably only needed when going through a proxy — confirm.
      if system_proxy:
        connection.set_tunnel(request_host, request_port)
    else:
      connection = DetailedHTTPConnection(connection_ip, connection_port)
    return connection

  def __call__(self, request):
    """Fetch an HTTP request.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      an ArchivedHttpResponse
    """
    logging.debug('RealHttpFetch: %s %s', request.host, request.full_path)
    request_host, request_port = self._get_request_host_port(request)
    # NOTE(review): the retry loop below was reconstructed from partial
    # source — verify the retry count against version control.
    retries = 3
    while True:
      try:
        connection = self._get_connection(
            request_host, request_port, request.is_ssl)
        connect_start = TIMER()
        connection.connect()
        connect_delay = int((TIMER() - connect_start) * 1000)
        start = TIMER()
        connection.request(
            request.command,
            request.full_path,
            request.request_body,
            request.headers)
        response = connection.getresponse()
        headers_delay = int((TIMER() - start) * 1000)

        chunks, chunk_delays = response.read_chunks()
        delays = {
            'connect': connect_delay,
            'headers': headers_delay,
            'data': chunk_delays,
            }
        archived_http_response = httparchive.ArchivedHttpResponse(
            response.version,
            response.status,
            response.reason,
            RealHttpFetch._ToTuples(response.msg.headers),
            chunks,
            delays)
        return archived_http_response
      except Exception as e:
        if retries:
          retries -= 1
          logging.warning('Retrying fetch %s: %s', request, e)
          continue
        logging.critical('Could not fetch %s: %s', request, e)
        return None
class RecordHttpArchiveFetch(object):
  """Make real HTTP fetches and save responses in the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script):
    """Initialize RecordHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
    """
    self.http_archive = http_archive
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)
    self.inject_script = inject_script

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an ArchivedHttpRequest.
    Returns:
      an ArchivedHttpResponse
    """
    # If request is already in the archive, return the archived response.
    if request in self.http_archive:
      logging.debug('Repeated request found: %s', request)
      response = self.http_archive[request]
    else:
      response = self.real_http_fetch(request)
      # NOTE(review): the fetch-failure guard was reconstructed from partial
      # source — a real fetch can fail and return None (see RealHttpFetch).
      if response is None:
        return None
      self.http_archive[request] = response
    if self.inject_script:
      response = _InjectScripts(response, self.inject_script)
    logging.debug('Recorded: %s', request)
    return response
class ReplayHttpArchiveFetch(object):
  """Serve responses from the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script,
               use_diff_on_unknown_requests=False,
               use_closest_match=False, scramble_images=False):
    """Initialize ReplayHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses before serving.
    """
    self.http_archive = http_archive
    self.inject_script = inject_script
    self.use_diff_on_unknown_requests = use_diff_on_unknown_requests
    self.use_closest_match = use_closest_match
    self.scramble_images = scramble_images
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an instance of an ArchivedHttpRequest.
    Returns:
      Instance of ArchivedHttpResponse (if found) or None
    """
    # Requests to the local replay server itself are always fetched live.
    if request.host.startswith('127.0.0.1:'):
      return self.real_http_fetch(request)

    response = self.http_archive.get(request)

    # NOTE(review): the nested guards below were reconstructed from partial
    # source — verify against version control.
    if self.use_closest_match and not response:
      closest_request = self.http_archive.find_closest_request(
          request, use_path=True)
      if closest_request:
        response = self.http_archive.get(closest_request)
        if response:
          logging.info('Request not found: %s\nUsing closest match: %s',
                       request, closest_request)

    if not response:
      reason = str(request)
      if self.use_diff_on_unknown_requests:
        diff = self.http_archive.diff(request)
        if diff:
          reason += (
              "\nNearest request diff "
              "('-' for archived request, '+' for current request):\n%s" % diff)
      logging.warning('Could not replay: %s', reason)
    else:
      if self.inject_script:
        response = _InjectScripts(response, self.inject_script)
      if self.scramble_images:
        response = _ScrambleImages(response)
    return response
class ControllableHttpArchiveFetch(object):
  """Controllable fetch function that can swap between record and replay."""

  def __init__(self, http_archive, real_dns_lookup,
               inject_script, use_diff_on_unknown_requests,
               use_record_mode, use_closest_match, scramble_images):
    """Initialize HttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages.
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_record_mode: If True, start in server in record mode.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses on replay.
    """
    self.http_archive = http_archive
    self.record_fetch = RecordHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script)
    self.replay_fetch = ReplayHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script,
        use_diff_on_unknown_requests, use_closest_match, scramble_images)
    # NOTE(review): initial mode selection reconstructed from partial source;
    # use_record_mode is otherwise unused, so it must drive this choice.
    if use_record_mode:
      self.SetRecordMode()
    else:
      self.SetReplayMode()

  def SetRecordMode(self):
    """Route subsequent calls to the recording fetcher."""
    self.fetch = self.record_fetch
    self.is_record_mode = True

  def SetReplayMode(self):
    """Route subsequent calls to the replaying fetcher."""
    self.fetch = self.replay_fetch
    self.is_record_mode = False

  def __call__(self, *args, **kwargs):
    """Forward calls to Replay/Record fetch functions depending on mode."""
    return self.fetch(*args, **kwargs)