tools/telemetry/third_party/webpagereplay/httparchive.py
#!/usr/bin/env python
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""View and edit HTTP Archives.

To list all URLs in an archive:
  $ ./httparchive.py ls archive.wpr

To view the content of all URLs from example.com:
  $ ./httparchive.py cat --host example.com archive.wpr

To view the content of a particular URL:
  $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr

To view the content of all URLs:
  $ ./httparchive.py cat archive.wpr

To edit a particular URL:
  $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr

To print statistics of an archive:
  $ ./httparchive.py stats archive.wpr

To print statistics of a set of URLs:
  $ ./httparchive.py stats --host www.example.com archive.wpr

To merge multiple archives:
  $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
"""

import calendar
import certutils
import cPickle
import difflib
import email.utils
import httplib
import httpzlib
import json
import logging
import optparse
import os
import StringIO
import subprocess
import sys
import tempfile
import time
import urlparse
from collections import defaultdict


def LogRunTime(fn):
  """Annotation which logs the run time of the function."""
  def wrapped(self, *args, **kwargs):
    start_time = time.time()
    try:
      return fn(self, *args, **kwargs)
    finally:
      run_time = (time.time() - start_time) * 1000.0
      logging.debug('%s: %dms', fn.__name__, run_time)
  return wrapped
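
# Illustrative sketch (not part of the original module): LogRunTime is meant
# to wrap instance methods so their elapsed time is reported at debug level,
# e.g. on a hypothetical class:
#
#   class Foo(object):
#     @LogRunTime
#     def bar(self):
#       pass  # calling foo.bar() logs "bar: <n>ms" via logging.debug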


class HttpArchiveException(Exception):
  """Base class for all exceptions in httparchive."""
  pass


class HttpArchive(dict):
  """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.

  Attributes:
    responses_by_host: dict of {hostname: {request: response}}. This must
        remain in sync with the underlying dict of self. It is used as an
        optimization so that get_requests() doesn't have to linearly search
        all requests in the archive to find potential matches.
  """

  def __init__(self):  # pylint: disable=super-init-not-called
    self.responses_by_host = defaultdict(dict)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    self.__dict__.update(state)
    self.responses_by_host = defaultdict(dict)
    for request in self:
      self.responses_by_host[request.host][request] = self[request]

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['responses_by_host']
    return state

  def __setitem__(self, key, value):
    super(HttpArchive, self).__setitem__(key, value)
    if hasattr(self, 'responses_by_host'):
      self.responses_by_host[key.host][key] = value

  def __delitem__(self, key):
    super(HttpArchive, self).__delitem__(key)
    del self.responses_by_host[key.host][key]

  def get(self, request, default=None):
    """Return the archived response for a given request.

    Does extra checking for handling some HTTP request headers.

    Args:
      request: instance of ArchivedHttpRequest
      default: default value to return if request is not found

    Returns:
      Instance of ArchivedHttpResponse or default if no matching
      response is found
    """
    if request in self:
      return self[request]
    return self.get_conditional_response(request, default)

  def get_conditional_response(self, request, default):
    """Get the response based on the conditional HTTP request headers.

    Args:
      request: an ArchivedHttpRequest representing the original request.
      default: the ArchivedHttpResponse to return if the archive holds no
          response for the original request with the conditional headers
          removed.

    Returns:
      an ArchivedHttpResponse with a status of 200, 304 (not modified), or
      412 (precondition failed)
    """
    response = default
    if request.is_conditional():
      stripped_request = request.create_request_without_conditions()
      if stripped_request in self:
        response = self[stripped_request]
        if response.status == 200:
          status = self.get_conditional_status(request, response)
          if status != 200:
            response = create_response(status)
    return response

  def get_conditional_status(self, request, response):
    status = 200
    last_modified = email.utils.parsedate(
        response.update_date(response.get_header('last-modified')))
    response_etag = response.get_header('etag')
    is_get_or_head = request.command.upper() in ('GET', 'HEAD')

    match_value = request.headers.get('if-match', None)
    if match_value:
      if self.is_etag_match(match_value, response_etag):
        status = 200
      else:
        status = 412  # precondition failed
    none_match_value = request.headers.get('if-none-match', None)
    if none_match_value:
      if self.is_etag_match(none_match_value, response_etag):
        status = 304
      elif is_get_or_head:
        status = 200
      else:
        status = 412
    if is_get_or_head and last_modified:
      for header in ('if-modified-since', 'if-unmodified-since'):
        date = email.utils.parsedate(request.headers.get(header, None))
        if date:
          if ((header == 'if-modified-since' and last_modified > date) or
              (header == 'if-unmodified-since' and last_modified < date)):
            if status != 412:
              status = 200
          else:
            status = 304  # not modified
    return status

  @staticmethod
  def is_etag_match(request_etag, response_etag):
    """Determines whether the entity tags of the request/response match.

    Args:
      request_etag: the value string of the "if-(none)-match:"
                    portion of the request header
      response_etag: the etag value of the response

    Returns:
      True on match, False otherwise
    """
    response_etag = response_etag.strip('" ')
    for etag in request_etag.split(','):
      etag = etag.strip('" ')
      if etag in ('*', response_etag):
        return True
    return False
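
  # Illustrative examples (not part of the original module) of how the etag
  # comparison above behaves, using hypothetical tag values:
  #   is_etag_match('"xyz"', 'xyz')         -> True   (quotes are stripped)
  #   is_etag_match('*', 'anything')        -> True   ('*' matches any etag)
  #   is_etag_match('"abc", "xyz"', 'xyz')  -> True   (any listed etag may match)
  #   is_etag_match('"abc"', 'xyz')         -> False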

  def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
                   use_query=True):
    """Return a list of requests that match the given args."""
    if host:
      return [r for r in self.responses_by_host[host]
              if r.matches(command, None, full_path, is_ssl,
                           use_query=use_query)]
    else:
      return [r for r in self
              if r.matches(command, host, full_path, is_ssl,
                           use_query=use_query)]

  def ls(self, command=None, host=None, full_path=None):
    """List all URLs that match given params."""
    return ''.join(sorted(
        '%s\n' % r for r in self.get_requests(command, host, full_path)))
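
  # Illustrative sketch (not part of the original module): each line of ls()
  # output is the str() form of an ArchivedHttpRequest, e.g. (hypothetical):
  #   GET http://www.example.com/index.html [('accept-encoding', 'gzip')]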

  def cat(self, command=None, host=None, full_path=None):
    """Print the contents of all URLs that match given params."""
    out = StringIO.StringIO()
    for request in self.get_requests(command, host, full_path):
      print >>out, str(request)
      print >>out, 'Untrimmed request headers:'
      for k in request.headers:
        print >>out, '    %s: %s' % (k, request.headers[k])
      if request.request_body:
        print >>out, request.request_body
      print >>out, '---- Response Info', '-' * 51
      response = self[request]
      chunk_lengths = [len(x) for x in response.response_data]
      print >>out, ('Status: %s\n'
                    'Reason: %s\n'
                    'Headers delay: %s\n'
                    'Response headers:') % (
          response.status, response.reason, response.delays['headers'])
      for k, v in response.headers:
        print >>out, '    %s: %s' % (k, v)
      print >>out, ('Chunk count: %s\n'
                    'Chunk lengths: %s\n'
                    'Chunk delays: %s') % (
          len(chunk_lengths), chunk_lengths, response.delays['data'])
      body = response.get_data_as_text()
      print >>out, '---- Response Data', '-' * 51
      if body:
        print >>out, body
      else:
        print >>out, '[binary data]'
      print >>out, '=' * 70
    return out.getvalue()

  def stats(self, command=None, host=None, full_path=None):
    """Print stats about the archive for all URLs that match given params."""
    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print 'Failed to find any requests matching given command, host, path.'
      return

    out = StringIO.StringIO()
    stats = {
        'Total': len(matching_requests),
        'Domains': defaultdict(int),
        'HTTP_response_code': defaultdict(int),
        'content_type': defaultdict(int),
        'Documents': defaultdict(int),
        }

    for request in matching_requests:
      stats['Domains'][request.host] += 1
      stats['HTTP_response_code'][self[request].status] += 1

      content_type = self[request].get_header('content-type')
      # Remove content type options for readability and higher level groupings.
      str_content_type = str(content_type.split(';')[0]
                             if content_type else None)
      stats['content_type'][str_content_type] += 1

      # Documents are the main URL requested and not a referenced resource.
      if str_content_type == 'text/html' and not 'referer' in request.headers:
        stats['Documents'][request.host] += 1

    print >>out, json.dumps(stats, indent=4)
    return out.getvalue()
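
  # Illustrative sketch (not part of the original module): stats() returns a
  # JSON document shaped roughly like the following, assuming a hypothetical
  # archive containing a single page from www.example.com:
  #   {
  #       "Total": 1,
  #       "Domains": {"www.example.com": 1},
  #       "HTTP_response_code": {"200": 1},
  #       "content_type": {"text/html": 1},
  #       "Documents": {"www.example.com": 1}
  #   }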

  def merge(self, merged_archive=None, other_archives=None):
    """Merge multiple archives into merged_archive by 'chaining' resources.

    Only resources that are not already part of the accumulated archive are
    added.
    """
    if not other_archives:
      print 'No archives passed to merge'
      return

    # Note we already loaded 'replay_file'.
    print 'Loaded %d responses' % len(self)

    for archive in other_archives:
      if not os.path.exists(archive):
        print 'Error: Replay file "%s" does not exist' % archive
        return

      http_archive_other = HttpArchive.Load(archive)
      print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
      for r in http_archive_other:
        # Only resources that are not already part of the current archive
        # get added.
        if r not in self:
          print '\t %s ' % r
          self[r] = http_archive_other[r]
    self.Persist('%s' % merged_archive)

  def edit(self, command=None, host=None, full_path=None):
    """Edits the single request which matches given params."""
    editor = os.getenv('EDITOR')
    if not editor:
      print 'You must set the EDITOR environmental variable.'
      return

    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print ('Failed to find any requests matching given command, host, '
             'full_path.')
      return

    if len(matching_requests) > 1:
      print 'Found multiple matching requests. Please refine.'
      print self.ls(command, host, full_path)

    response = self[matching_requests[0]]
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(response.get_response_as_text())
    tmp_file.close()
    subprocess.check_call([editor, tmp_file.name])
    response.set_response_from_text(''.join(open(tmp_file.name).readlines()))
    os.remove(tmp_file.name)

  def find_closest_request(self, request, use_path=False):
    """Find the closest matching request in the archive to the given request.

    Args:
      request: an ArchivedHttpRequest
      use_path: If True, the closest matching request's path component must
          match. (Note: this refers to the 'path' component within the URL,
          not the 'full path' which includes the query string component.)

          If use_path=True, the candidate will NOT match in the example below,
          e.g. request   = GET www.test.com/path?aaa
               candidate = GET www.test.com/diffpath?aaa
    Returns:
      If a close match is found, return the instance of ArchivedHttpRequest.
      Otherwise, return None.
    """
    full_path = request.full_path if use_path else None
    requests = self.get_requests(request.command, request.host, full_path,
                                 is_ssl=request.is_ssl, use_query=not use_path)

    if not requests:
      return None

    if len(requests) == 1:
      return requests[0]

    matcher = difflib.SequenceMatcher(b=request.formatted_request)

    # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
    # quick_ratio() on all requests, sort them descending, and then loop
    # through until we find a candidate whose ratio() is >= the next
    # quick_ratio(). This works because quick_ratio() is guaranteed to be an
    # upper bound on ratio().
    candidates = []
    for candidate in requests:
      matcher.set_seq1(candidate.formatted_request)
      candidates.append((matcher.quick_ratio(), candidate))

    candidates.sort(reverse=True, key=lambda c: c[0])

    best_match = (0, None)
    for i in xrange(len(candidates)):
      matcher.set_seq1(candidates[i][1].formatted_request)
      best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
      if i + 1 < len(candidates) and best_match[0] >= candidates[i + 1][0]:
        break
    return best_match[1]
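
  # Illustrative note (not part of the original module): the early exit above
  # relies on a documented difflib property, namely that quick_ratio() never
  # underestimates ratio() for the same pair of sequences. For example, with
  # hypothetical request strings:
  #   m = difflib.SequenceMatcher(a='GET a.com/x?q=1', b='GET a.com/x?q=2')
  #   m.quick_ratio() >= m.ratio()   # always True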

  def diff(self, request):
    """Diff the given request to the closest matching request in the archive.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      If a close match is found, return a textual diff between the requests.
      Otherwise, return None.
    """
    request_lines = request.formatted_request.split('\n')
    closest_request = self.find_closest_request(request)
    if closest_request:
      closest_request_lines = closest_request.formatted_request.split('\n')
      return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
    return None

  def get_server_cert(self, host):
    """Gets the certificate from the server and stores it in the archive."""
    request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=certutils.get_host_cert(host))
    return self[request].response_data[0]

  def get_certificate(self, host):
    request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=self._generate_cert(host))
    return self[request].response_data[0]

  @classmethod
  def AssertWritable(cls, filename):
    """Raises an IOError if filename is not writable."""
    persist_dir = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(persist_dir):
      raise IOError('Directory does not exist: %s' % persist_dir)
    if os.path.exists(filename):
      if not os.access(filename, os.W_OK):
        raise IOError('Need write permission on file: %s' % filename)
    elif not os.access(persist_dir, os.W_OK):
      raise IOError('Need write permission on directory: %s' % persist_dir)

  @classmethod
  def Load(cls, filename):
    """Load an instance from filename."""
    return cPickle.load(open(filename, 'rb'))

  def Persist(self, filename):
    """Persist all state to filename."""
    try:
      original_checkinterval = sys.getcheckinterval()
      sys.setcheckinterval(2**31 - 1)  # Lock out other threads so nothing can
                                       # modify |self| during pickling.
      pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
    finally:
      sys.setcheckinterval(original_checkinterval)
    with open(filename, 'wb') as f:
      f.write(pickled_self)


class ArchivedHttpRequest(object):
  """Record all the state that goes into a request.

  ArchivedHttpRequest instances are considered immutable so they can
  serve as keys for HttpArchive instances.
  (The immutability is not enforced.)

  Upon creation, the headers are "trimmed" (i.e. edited or dropped)
  and saved to self.trimmed_headers to allow requests to match in a wider
  variety of playback situations (e.g. using different user agents).

  For unpickling, 'trimmed_headers' is recreated from 'headers'. That
  allows for changes to the trim function and can help with debugging.
  """
  CONDITIONAL_HEADERS = [
      'if-none-match', 'if-match',
      'if-modified-since', 'if-unmodified-since']

  def __init__(self, command, host, full_path, request_body, headers,
               is_ssl=False):
    """Initialize an ArchivedHttpRequest.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path. Includes everything after the host & port in
          the URL (e.g. '/search?q=dogs').
      request_body: a request body string for a POST or None.
      headers: {key: value, ...} where key and value are strings.
      is_ssl: a boolean which is True iff the request is made via SSL.
    """
    self.command = command
    self.host = host
    self.full_path = full_path
    self.path = urlparse.urlparse(full_path).path if full_path else None
    self.request_body = request_body
    self.headers = headers
    self.is_ssl = is_ssl
    self.trimmed_headers = self._TrimHeaders(headers)
    self.formatted_request = self._GetFormattedRequest()

  def __str__(self):
    scheme = 'https' if self.is_ssl else 'http'
    return '%s %s://%s%s %s' % (
        self.command, scheme, self.host, self.full_path, self.trimmed_headers)

  def __repr__(self):
    return repr((self.command, self.host, self.full_path, self.request_body,
                 self.trimmed_headers, self.is_ssl))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "headers" are the original request headers.
    "trimmed_headers" are the trimmed headers used for matching requests
    during replay.

    Args:
      state: a dictionary for __dict__
    """
    if 'full_headers' in state:
      # Fix older version of archive.
      state['headers'] = state['full_headers']
      del state['full_headers']
    if 'headers' not in state:
      raise HttpArchiveException(
          'Archived HTTP request is missing "headers". The HTTP archive is'
          ' likely from a previous version and must be re-recorded.')
    if 'path' in state:
      # Before, 'path' and 'path_without_query' were used and 'path' was
      # pickled. Now, 'path' has been renamed to 'full_path' and
      # 'path_without_query' has been renamed to 'path'. 'full_path' is
      # pickled, but 'path' is not. If we see 'path' here it means we are
      # dealing with an older archive.
      state['full_path'] = state['path']
      del state['path']
    state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
    if 'is_ssl' not in state:
      state['is_ssl'] = False
    self.__dict__.update(state)
    self.path = urlparse.urlparse(self.full_path).path
    self.formatted_request = self._GetFormattedRequest()

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['trimmed_headers']
    del state['path']
    del state['formatted_request']
    return state

  def _GetFormattedRequest(self):
    """Format the request to make diffs easier to read.

    Returns:
      A string consisting of the request. Example:
      'GET www.example.com/path\nHeader-Key: header value\n'
    """
    parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
    if self.request_body:
      parts.append('%s\n' % self.request_body)
    for k, v in self.trimmed_headers:
      k = '-'.join(x.capitalize() for x in k.split('-'))
      parts.append('%s: %s\n' % (k, v))
    return ''.join(parts)

  def matches(self, command=None, host=None, full_path=None, is_ssl=None,
              use_query=True):
    """Returns true iff the request matches all parameters.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path with query string (e.g. '/search?q=dogs')
      is_ssl: whether the request is secure.
      use_query:
        If use_query is True, request matching uses both the hierarchical path
        and query string component.
        If use_query is False, request matching only uses the hierarchical
        path.

        e.g. req1 = GET www.test.com/index?aaaa
             req2 = GET www.test.com/index?bbbb

        If use_query is True, req1.matches(req2) evaluates to False.
        If use_query is False, req1.matches(req2) evaluates to True.

    Returns:
      True iff the request matches all parameters
    """
    if command is not None and command != self.command:
      return False
    if is_ssl is not None and is_ssl != self.is_ssl:
      return False
    if host is not None and host != self.host:
      return False
    if full_path is None:
      return True
    if use_query:
      return full_path == self.full_path
    else:
      return self.path == urlparse.urlparse(full_path).path

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - accept: Causes problems with www.bing.com. During record, CSS is fetched
        with *. During replay, it's text/css.
    - accept-charset, accept-language, referer: vary between clients.
    - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
    - connection, method, scheme, url, version: Cause problems with spdy.
    - cookie: Extremely sensitive to request/response order.
    - keep-alive: Doesn't affect the content of the request, only some
        transient state of the transport layer.
    - user-agent: Changes with every Chrome version.
    - proxy-connection: Sent for proxy requests.

    Another variant to consider is dropping only the value from the header.
    However, this is particularly bad for the cookie header, because the
    presence of the cookie depends on the responses we've seen when the
    request is made.

    Args:
      headers: {header_key: header_value, ...}

    Returns:
      [(header_key, header_value), ...]  # (with undesirable headers removed)
    """
    # TODO(tonyg): Strip sdch from the request headers because we can't
    # guarantee that the dictionary will be recorded, so replay may not work.
    if 'accept-encoding' in headers:
      accept_encoding = headers['accept-encoding']
      accept_encoding = accept_encoding.replace('sdch', '')
      stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
      accept_encoding = ','.join(filter(bool, stripped_encodings))
      headers['accept-encoding'] = accept_encoding
    undesirable_keys = [
        'accept', 'accept-charset', 'accept-language', 'cache-control',
        'connection', 'cookie', 'keep-alive', 'method',
        'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
        'x-chrome-variations']
    return sorted([(k, v) for k, v in headers.items()
                   if k.lower() not in undesirable_keys])
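
  # Illustrative example (not part of the original module) of the trimming
  # performed above, using hypothetical header values:
  #   _TrimHeaders({'accept-encoding': 'gzip, sdch',
  #                 'user-agent': 'Mozilla/5.0',
  #                 'host': 'www.example.com'})
  #   -> [('accept-encoding', 'gzip'), ('host', 'www.example.com')]
  # 'user-agent' is dropped entirely, while only the 'sdch' token is stripped
  # from 'accept-encoding'.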

  def is_conditional(self):
    """Return True if the request contains any conditional headers."""
    for header in self.CONDITIONAL_HEADERS:
      if header in self.headers:
        return True
    return False

  def create_request_without_conditions(self):
    stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
                            if k.lower() not in self.CONDITIONAL_HEADERS)
    return ArchivedHttpRequest(
        self.command, self.host, self.full_path, self.request_body,
        stripped_headers, self.is_ssl)


class ArchivedHttpResponse(object):
  """All the data needed to recreate an HTTP response."""

  # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
  # It is not sent in responses. It is added by get_data_as_text()
  # and removed by set_data().
  CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

  # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
  DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
                          'Delays are above. Response content is below.]\n')

  def __init__(self, version, status, reason, headers, response_data,
               delays=None):
    """Initialize an ArchivedHttpResponse.

    Args:
      version: HTTP protocol version used by server.
          10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
      status: Status code returned by server (e.g. 200).
      reason: Reason phrase returned by server (e.g. "OK").
      headers: list of (header, value) tuples.
      response_data: list of content chunks.
          Concatenating the chunks gives the complete contents
          (i.e. the chunks do not have any lengths or delimiters).
          Do not include the final, zero-length chunk that marks the end.
      delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
          e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
          connect - The time to connect to the server.
              Each resource has a value because Replay's record mode
              captures it. This includes the time for the SYN and
              SYN/ACK (1 rtt).
          headers - The time elapsed between the TCP connect and the headers.
              This typically includes all the server-time to generate a
              response.
          data - If the response is chunked, these are the times for each
              chunk.
    """
    self.version = version
    self.status = status
    self.reason = reason
    self.headers = headers
    self.response_data = response_data
    self.delays = delays
    self.fix_delays()

  def fix_delays(self):
    """Initialize delays, or check the number of data delays."""
    expected_num_delays = len(self.response_data)
    if not self.delays:
      self.delays = {
          'connect': 0,
          'headers': 0,
          'data': [0] * expected_num_delays
          }
    else:
      num_delays = len(self.delays['data'])
      if num_delays != expected_num_delays:
        raise HttpArchiveException(
            'Server delay length mismatch: %d (expected %d): %s' % (
                num_delays, expected_num_delays, self.delays['data']))

  def __repr__(self):
    return repr((self.version, self.status, self.reason, sorted(self.headers),
                 self.response_data))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    if 'server_delays' in state:
      state['delays'] = {
          'connect': 0,
          'headers': 0,
          'data': state['server_delays']
          }
      del state['server_delays']
    elif 'delays' not in state:
      state['delays'] = None
    self.__dict__.update(state)
    self.fix_delays()

  def get_header(self, key, default=None):
    for k, v in self.headers:
      if key.lower() == k.lower():
        return v
    return default

  def set_header(self, key, value):
    for i, (k, v) in enumerate(self.headers):
      if key == k:
        self.headers[i] = (key, value)
        return
    self.headers.append((key, value))

  def remove_header(self, key):
    for i, (k, v) in enumerate(self.headers):
      if key.lower() == k.lower():
        self.headers.pop(i)
        return

  @staticmethod
  def _get_epoch_seconds(date_str):
    """Return the epoch seconds of a date header.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
    Returns:
      epoch seconds as a float
    """
    date_tuple = email.utils.parsedate(date_str)
    if date_tuple:
      return calendar.timegm(date_tuple)
    return None

  def update_date(self, date_str, now=None):
    """Return an updated date based on its delta from the "Date" header.

    For example, if |date_str| is one week later than the "Date" header,
    then the returned date string is one week later than the current date.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
    Returns:
      a date string
    """
    date_seconds = self._get_epoch_seconds(self.get_header('date'))
    header_seconds = self._get_epoch_seconds(date_str)
    if date_seconds and header_seconds:
      updated_seconds = header_seconds + (now or time.time()) - date_seconds
      return email.utils.formatdate(updated_seconds, usegmt=True)
    return date_str
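
  # Illustrative example (not part of the original module), with hypothetical
  # header values: if the archived response carries
  #   Date:          Thu, 01 Jan 2015 00:00:00 GMT
  #   Last-Modified: Thu, 25 Dec 2014 00:00:00 GMT   (7 days earlier)
  # then update_date() applied to the Last-Modified value returns a date
  # 7 days before |now| (the current time by default), preserving the
  # original delta between the two headers.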

  def is_gzip(self):
    return self.get_header('content-encoding') == 'gzip'

  def is_compressed(self):
    return self.get_header('content-encoding') in ('gzip', 'deflate')

  def is_chunked(self):
    return self.get_header('transfer-encoding') == 'chunked'

  def get_data_as_text(self):
    """Return content as a single string.

    Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
    """
    content_type = self.get_header('content-type')
    if (not content_type or
        not (content_type.startswith('text/') or
             content_type == 'application/x-javascript' or
             content_type.startswith('application/json'))):
      return None
    if self.is_compressed():
      uncompressed_chunks = httpzlib.uncompress_chunks(
          self.response_data, self.is_gzip())
    else:
      uncompressed_chunks = self.response_data
    return self.CHUNK_EDIT_SEPARATOR.join(uncompressed_chunks)

  def get_delays_as_text(self):
    """Return delays as editable text."""
    return json.dumps(self.delays, indent=2)

  def get_response_as_text(self):
    """Returns response content as a single string.

    Server delays are separated on a per-chunk basis. Delays are in
    milliseconds. Response content begins after DELAY_EDIT_SEPARATOR.
    """
    data = self.get_data_as_text()
    if data is None:
      logging.warning('Data can not be represented as text.')
      data = ''
    delays = self.get_delays_as_text()
    return self.DELAY_EDIT_SEPARATOR.join((delays, data))

  def set_data(self, text):
    """Inverse of get_data_as_text().

    Split on CHUNK_EDIT_SEPARATOR and compress if needed.
    """
    text_chunks = text.split(self.CHUNK_EDIT_SEPARATOR)
    if self.is_compressed():
      self.response_data = httpzlib.compress_chunks(text_chunks,
                                                    self.is_gzip())
    else:
      self.response_data = text_chunks
    if not self.is_chunked():
      content_length = sum(len(c) for c in self.response_data)
      self.set_header('content-length', str(content_length))

  def set_delays(self, delays_text):
    """Inverse of get_delays_as_text().

    Args:
      delays_text: JSON encoded text such as the following:
          {
            connect: 80,
            headers: 80,
            data: [6, 55, 0]
          }
          Times are in milliseconds.
          Each data delay corresponds with one response_data value.
    """
    try:
      self.delays = json.loads(delays_text)
    except (ValueError, KeyError) as e:
      logging.critical('Unable to parse delays %s: %s', delays_text, e)
    self.fix_delays()

  def set_response_from_text(self, text):
    """Inverse of get_response_as_text().

    Modifies the state of the archive according to the textual representation.
    """
    try:
      delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
    except ValueError:
      logging.critical(
          'Error parsing text representation. Skipping edits.')
      return
    self.set_delays(delays)
    self.set_data(data)


def create_response(status, reason=None, headers=None, body=None):
  """Convenience method for creating simple ArchivedHttpResponse objects."""
  if reason is None:
    reason = httplib.responses.get(status, 'Unknown')
  if headers is None:
    headers = [('content-type', 'text/plain')]
  if body is None:
    body = "%s %s" % (status, reason)
  return ArchivedHttpResponse(11, status, reason, headers, [body])
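
# Illustrative examples (not part of the original module) of the defaults
# filled in by create_response():
#   create_response(404)
#     -> status 404, reason 'Not Found' (from httplib.responses),
#        headers [('content-type', 'text/plain')], body '404 Not Found'
#   create_response(200, body='<html></html>')
#     -> status 200, reason 'OK', default headers, body '<html></html>'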


def main():
  class PlainHelpFormatter(optparse.IndentedHelpFormatter):
    def format_description(self, description):
      if description:
        return description + '\n'
      else:
        return ''

  option_parser = optparse.OptionParser(
      usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
      formatter=PlainHelpFormatter(),
      description=__doc__,
      epilog='http://code.google.com/p/web-page-replay/')

  option_parser.add_option('-c', '--command', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this command.')
  option_parser.add_option('-o', '--host', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this host.')
  option_parser.add_option('-p', '--full_path', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this full path.')
  option_parser.add_option('-f', '--merged_file', default=None,
      action='store',
      type='string',
      help='The output file to use when using the merge command.')

  options, args = option_parser.parse_args()

  # The merge command expects an unlimited number of archives.
  if len(args) < 2:
    print 'args: %s' % args
    option_parser.error('Must specify a command and replay_file')

  command = args[0]
  replay_file = args[1]

  if not os.path.exists(replay_file):
    option_parser.error('Replay file "%s" does not exist' % replay_file)

  http_archive = HttpArchive.Load(replay_file)
  if command == 'ls':
    print http_archive.ls(options.command, options.host, options.full_path)
  elif command == 'cat':
    print http_archive.cat(options.command, options.host, options.full_path)
  elif command == 'stats':
    print http_archive.stats(options.command, options.host, options.full_path)
  elif command == 'merge':
    if not options.merged_file:
      print 'Error: Must specify a merged file name (use --merged_file)'
      return
    http_archive.merge(options.merged_file, args[2:])
  elif command == 'edit':
    http_archive.edit(options.command, options.host, options.full_path)
    http_archive.Persist(replay_file)
  else:
    option_parser.error('Unknown command "%s"' % command)
  return 0


if __name__ == '__main__':
  sys.exit(main())