tools/telemetry/third_party/webpagereplay/httparchive.py
#!/usr/bin/env python
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""View and edit HTTP Archives.

To list all URLs in an archive:
  $ ./httparchive.py ls archive.wpr

To view the content of all URLs from example.com:
  $ ./httparchive.py cat --host example.com archive.wpr

To view the content of a particular URL:
  $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr

To view the content of all URLs:
  $ ./httparchive.py cat archive.wpr

To edit a particular URL:
  $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr

To print statistics of an archive:
  $ ./httparchive.py stats archive.wpr

To print statistics of a set of URLs:
  $ ./httparchive.py stats --host www.example.com archive.wpr

To merge multiple archives:
  $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
"""

import calendar
import certutils
import cPickle
import difflib
import email.utils
import httplib
import httpzlib
import json
import logging
import optparse
import os
import StringIO
import subprocess
import sys
import tempfile
import time
import urlparse
from collections import defaultdict


def LogRunTime(fn):
  """Annotation which logs the run time of the function."""
  def wrapped(self, *args, **kwargs):
    start_time = time.time()
    try:
      return fn(self, *args, **kwargs)
    finally:
      run_time = (time.time() - start_time) * 1000.0
      logging.debug('%s: %dms', fn.__name__, run_time)
  return wrapped
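
# Illustrative sketch (not part of the original module): LogRunTime is meant
# to wrap instance methods so their elapsed time is reported at debug level,
# e.g. on a hypothetical class:
#
#   class Foo(object):
#     @LogRunTime
#     def bar(self):
#       pass  # calling foo.bar() logs "bar: <n>ms" via logging.debug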


class HttpArchiveException(Exception):
  """Base class for all exceptions in httparchive."""
  pass


class HttpArchive(dict):
  """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.

  Attributes:
    responses_by_host: dict of {hostname: {request: response}}. This must
        remain in sync with the underlying dict of self. It is used as an
        optimization so that get_requests() doesn't have to linearly search
        all requests in the archive to find potential matches.
  """

  def __init__(self):  # pylint: disable=super-init-not-called
    self.responses_by_host = defaultdict(dict)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    self.__dict__.update(state)
    self.responses_by_host = defaultdict(dict)
    for request in self:
      self.responses_by_host[request.host][request] = self[request]

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['responses_by_host']
    return state

  def __setitem__(self, key, value):
    super(HttpArchive, self).__setitem__(key, value)
    if hasattr(self, 'responses_by_host'):
      self.responses_by_host[key.host][key] = value

  def __delitem__(self, key):
    super(HttpArchive, self).__delitem__(key)
    del self.responses_by_host[key.host][key]

  def get(self, request, default=None):
    """Return the archived response for a given request.

    Does extra checking for handling some HTTP request headers.

    Args:
      request: instance of ArchivedHttpRequest
      default: default value to return if request is not found

    Returns:
      Instance of ArchivedHttpResponse or default if no matching
      response is found
    """
    if request in self:
      return self[request]
    return self.get_conditional_response(request, default)

  def get_conditional_response(self, request, default):
    """Get the response based on the conditional HTTP request headers.

    Args:
      request: an ArchivedHttpRequest representing the original request.
      default: the ArchivedHttpResponse to return if the archive holds no
          response for the original request with the conditional headers
          removed.

    Returns:
      an ArchivedHttpResponse with a status of 200, 304 (not modified), or
      412 (precondition failed)
    """
    response = default
    if request.is_conditional():
      stripped_request = request.create_request_without_conditions()
      if stripped_request in self:
        response = self[stripped_request]
        if response.status == 200:
          status = self.get_conditional_status(request, response)
          if status != 200:
            response = create_response(status)
    return response

  def get_conditional_status(self, request, response):
    status = 200
    last_modified = email.utils.parsedate(
        response.update_date(response.get_header('last-modified')))
    response_etag = response.get_header('etag')
    is_get_or_head = request.command.upper() in ('GET', 'HEAD')

    match_value = request.headers.get('if-match', None)
    if match_value:
      if self.is_etag_match(match_value, response_etag):
        status = 200
      else:
        status = 412  # precondition failed
    none_match_value = request.headers.get('if-none-match', None)
    if none_match_value:
      if self.is_etag_match(none_match_value, response_etag):
        status = 304
      elif is_get_or_head:
        status = 200
      else:
        status = 412
    if is_get_or_head and last_modified:
      for header in ('if-modified-since', 'if-unmodified-since'):
        date = email.utils.parsedate(request.headers.get(header, None))
        if date:
          if ((header == 'if-modified-since' and last_modified > date) or
              (header == 'if-unmodified-since' and last_modified < date)):
            if status != 412:
              status = 200
          else:
            status = 304  # not modified
    return status

  @staticmethod
  def is_etag_match(request_etag, response_etag):
    """Determines whether the entity tags of the request/response match.

    Args:
      request_etag: the value string of the "if-(none)-match:"
                    portion of the request header
      response_etag: the etag value of the response

    Returns:
      True on match, False otherwise
    """
    response_etag = response_etag.strip('" ')
    for etag in request_etag.split(','):
      etag = etag.strip('" ')
      if etag in ('*', response_etag):
        return True
    return False
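
  # Illustrative examples (not part of the original module) of how the etag
  # comparison above behaves, using hypothetical tag values:
  #   is_etag_match('"xyz"', 'xyz')         -> True   (quotes are stripped)
  #   is_etag_match('*', 'anything')        -> True   ('*' matches any etag)
  #   is_etag_match('"abc", "xyz"', 'xyz')  -> True   (any listed etag may match)
  #   is_etag_match('"abc"', 'xyz')         -> False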

  def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
                   use_query=True):
    """Return a list of requests that match the given args."""
    if host:
      return [r for r in self.responses_by_host[host]
              if r.matches(command, None, full_path, is_ssl,
                           use_query=use_query)]
    else:
      return [r for r in self
              if r.matches(command, host, full_path, is_ssl,
                           use_query=use_query)]

  def ls(self, command=None, host=None, full_path=None):
    """List all URLs that match given params."""
    return ''.join(sorted(
        '%s\n' % r for r in self.get_requests(command, host, full_path)))
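
  # Illustrative sketch (not part of the original module): each line of ls()
  # output is the str() form of an ArchivedHttpRequest, e.g. (hypothetical):
  #   GET http://www.example.com/index.html [('accept-encoding', 'gzip')]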

  def cat(self, command=None, host=None, full_path=None):
    """Print the contents of all URLs that match given params."""
    out = StringIO.StringIO()
    for request in self.get_requests(command, host, full_path):
      print >>out, str(request)
      print >>out, 'Untrimmed request headers:'
      for k in request.headers:
        print >>out, '    %s: %s' % (k, request.headers[k])
      if request.request_body:
        print >>out, request.request_body
      print >>out, '---- Response Info', '-' * 51
      response = self[request]
      chunk_lengths = [len(x) for x in response.response_data]
      print >>out, ('Status: %s\n'
                    'Reason: %s\n'
                    'Headers delay: %s\n'
                    'Response headers:') % (
          response.status, response.reason, response.delays['headers'])
      for k, v in response.headers:
        print >>out, '    %s: %s' % (k, v)
      print >>out, ('Chunk count: %s\n'
                    'Chunk lengths: %s\n'
                    'Chunk delays: %s') % (
          len(chunk_lengths), chunk_lengths, response.delays['data'])
      body = response.get_data_as_text()
      print >>out, '---- Response Data', '-' * 51
      if body:
        print >>out, body
      else:
        print >>out, '[binary data]'
      print >>out, '=' * 70
    return out.getvalue()

  def stats(self, command=None, host=None, full_path=None):
    """Print stats about the archive for all URLs that match given params."""
    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print 'Failed to find any requests matching given command, host, path.'
      return

    out = StringIO.StringIO()
    stats = {
        'Total': len(matching_requests),
        'Domains': defaultdict(int),
        'HTTP_response_code': defaultdict(int),
        'content_type': defaultdict(int),
        'Documents': defaultdict(int),
        }

    for request in matching_requests:
      stats['Domains'][request.host] += 1
      stats['HTTP_response_code'][self[request].status] += 1

      content_type = self[request].get_header('content-type')
      # Remove content type options for readability and higher level groupings.
      str_content_type = str(content_type.split(';')[0]
                             if content_type else None)
      stats['content_type'][str_content_type] += 1

      # Documents are the main URL requested and not a referenced resource.
      if str_content_type == 'text/html' and not 'referer' in request.headers:
        stats['Documents'][request.host] += 1

    print >>out, json.dumps(stats, indent=4)
    return out.getvalue()
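
  # Illustrative sketch (not part of the original module): stats() returns a
  # JSON document shaped roughly like the following, assuming a hypothetical
  # archive containing a single page from www.example.com:
  #   {
  #       "Total": 1,
  #       "Domains": {"www.example.com": 1},
  #       "HTTP_response_code": {"200": 1},
  #       "content_type": {"text/html": 1},
  #       "Documents": {"www.example.com": 1}
  #   }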

  def merge(self, merged_archive=None, other_archives=None):
    """Merge multiple archives into merged_archive by 'chaining' resources.

    Only resources that are not already part of the accumulated archive are
    added.
    """
    if not other_archives:
      print 'No archives passed to merge'
      return

    # Note we already loaded 'replay_file'.
    print 'Loaded %d responses' % len(self)

    for archive in other_archives:
      if not os.path.exists(archive):
        print 'Error: Replay file "%s" does not exist' % archive
        return

      http_archive_other = HttpArchive.Load(archive)
      print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
      for r in http_archive_other:
        # Only resources that are not already part of the current archive
        # get added.
        if r not in self:
          print '\t %s ' % r
          self[r] = http_archive_other[r]
    self.Persist('%s' % merged_archive)

  def edit(self, command=None, host=None, full_path=None):
    """Edits the single request which matches given params."""
    editor = os.getenv('EDITOR')
    if not editor:
      print 'You must set the EDITOR environmental variable.'
      return

    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print ('Failed to find any requests matching given command, host, '
             'full_path.')
      return

    if len(matching_requests) > 1:
      print 'Found multiple matching requests. Please refine.'
      print self.ls(command, host, full_path)

    response = self[matching_requests[0]]
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(response.get_response_as_text())
    tmp_file.close()
    subprocess.check_call([editor, tmp_file.name])
    response.set_response_from_text(''.join(open(tmp_file.name).readlines()))
    os.remove(tmp_file.name)

  def find_closest_request(self, request, use_path=False):
    """Find the closest matching request in the archive to the given request.

    Args:
      request: an ArchivedHttpRequest
      use_path: If True, the closest matching request's path component must
          match. (Note: this refers to the 'path' component within the URL,
          not the 'full path' which includes the query string component.)

          If use_path=True, the candidate will NOT match in the example below,
          e.g. request   = GET www.test.com/path?aaa
               candidate = GET www.test.com/diffpath?aaa
    Returns:
      If a close match is found, return the instance of ArchivedHttpRequest.
      Otherwise, return None.
    """
    full_path = request.full_path if use_path else None
    requests = self.get_requests(request.command, request.host, full_path,
                                 is_ssl=request.is_ssl, use_query=not use_path)

    if not requests:
      return None

    if len(requests) == 1:
      return requests[0]

    matcher = difflib.SequenceMatcher(b=request.formatted_request)

    # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
    # quick_ratio() on all requests, sort them descending, and then loop
    # through until we find a candidate whose ratio() is >= the next
    # quick_ratio(). This works because quick_ratio() is guaranteed to be an
    # upper bound on ratio().
    candidates = []
    for candidate in requests:
      matcher.set_seq1(candidate.formatted_request)
      candidates.append((matcher.quick_ratio(), candidate))

    candidates.sort(reverse=True, key=lambda c: c[0])

    best_match = (0, None)
    for i in xrange(len(candidates)):
      matcher.set_seq1(candidates[i][1].formatted_request)
      best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
      if i + 1 < len(candidates) and best_match[0] >= candidates[i + 1][0]:
        break
    return best_match[1]
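
  # Illustrative note (not part of the original module): the early exit above
  # relies on a documented difflib property, namely that quick_ratio() never
  # underestimates ratio() for the same pair of sequences. For example, with
  # hypothetical request strings:
  #   m = difflib.SequenceMatcher(a='GET a.com/x?q=1', b='GET a.com/x?q=2')
  #   m.quick_ratio() >= m.ratio()   # always True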

  def diff(self, request):
    """Diff the given request to the closest matching request in the archive.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      If a close match is found, return a textual diff between the requests.
      Otherwise, return None.
    """
    request_lines = request.formatted_request.split('\n')
    closest_request = self.find_closest_request(request)
    if closest_request:
      closest_request_lines = closest_request.formatted_request.split('\n')
      return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
    return None

  def get_server_cert(self, host):
    """Gets the certificate from the server and stores it in the archive."""
    request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=certutils.get_host_cert(host))
    return self[request].response_data[0]

  def get_certificate(self, host):
    request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=self._generate_cert(host))
    return self[request].response_data[0]

  @classmethod
  def AssertWritable(cls, filename):
    """Raises an IOError if filename is not writable."""
    persist_dir = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(persist_dir):
      raise IOError('Directory does not exist: %s' % persist_dir)
    if os.path.exists(filename):
      if not os.access(filename, os.W_OK):
        raise IOError('Need write permission on file: %s' % filename)
    elif not os.access(persist_dir, os.W_OK):
      raise IOError('Need write permission on directory: %s' % persist_dir)

  @classmethod
  def Load(cls, filename):
    """Load an instance from filename."""
    return cPickle.load(open(filename, 'rb'))

  def Persist(self, filename):
    """Persist all state to filename."""
    try:
      original_checkinterval = sys.getcheckinterval()
      sys.setcheckinterval(2**31 - 1)  # Lock out other threads so nothing can
                                       # modify |self| during pickling.
      pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
    finally:
      sys.setcheckinterval(original_checkinterval)
    with open(filename, 'wb') as f:
      f.write(pickled_self)


class ArchivedHttpRequest(object):
  """Record all the state that goes into a request.

  ArchivedHttpRequest instances are considered immutable so they can
  serve as keys for HttpArchive instances.
  (The immutability is not enforced.)

  Upon creation, the headers are "trimmed" (i.e. edited or dropped)
  and saved to self.trimmed_headers to allow requests to match in a wider
  variety of playback situations (e.g. using different user agents).

  For unpickling, 'trimmed_headers' is recreated from 'headers'. That
  allows for changes to the trim function and can help with debugging.
  """
  CONDITIONAL_HEADERS = [
      'if-none-match', 'if-match',
      'if-modified-since', 'if-unmodified-since']

  def __init__(self, command, host, full_path, request_body, headers,
               is_ssl=False):
    """Initialize an ArchivedHttpRequest.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path. Includes everything after the host & port in
          the URL (e.g. '/search?q=dogs').
      request_body: a request body string for a POST or None.
      headers: {key: value, ...} where key and value are strings.
      is_ssl: a boolean which is True iff the request is made via SSL.
    """
    self.command = command
    self.host = host
    self.full_path = full_path
    self.path = urlparse.urlparse(full_path).path if full_path else None
    self.request_body = request_body
    self.headers = headers
    self.is_ssl = is_ssl
    self.trimmed_headers = self._TrimHeaders(headers)
    self.formatted_request = self._GetFormattedRequest()

  def __str__(self):
    scheme = 'https' if self.is_ssl else 'http'
    return '%s %s://%s%s %s' % (
        self.command, scheme, self.host, self.full_path, self.trimmed_headers)

  def __repr__(self):
    return repr((self.command, self.host, self.full_path, self.request_body,
                 self.trimmed_headers, self.is_ssl))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "headers" are the original request headers.
    "trimmed_headers" are the trimmed headers used for matching requests
    during replay.

    Args:
      state: a dictionary for __dict__
    """
    if 'full_headers' in state:
      # Fix older version of archive.
      state['headers'] = state['full_headers']
      del state['full_headers']
    if 'headers' not in state:
      raise HttpArchiveException(
          'Archived HTTP request is missing "headers". The HTTP archive is'
          ' likely from a previous version and must be re-recorded.')
    if 'path' in state:
      # Before, 'path' and 'path_without_query' were used and 'path' was
      # pickled. Now, 'path' has been renamed to 'full_path' and
      # 'path_without_query' has been renamed to 'path'. 'full_path' is
      # pickled, but 'path' is not. If we see 'path' here it means we are
      # dealing with an older archive.
      state['full_path'] = state['path']
      del state['path']
    state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
    if 'is_ssl' not in state:
      state['is_ssl'] = False
    self.__dict__.update(state)
    self.path = urlparse.urlparse(self.full_path).path
    self.formatted_request = self._GetFormattedRequest()

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['trimmed_headers']
    del state['path']
    del state['formatted_request']
    return state

  def _GetFormattedRequest(self):
    """Format the request to make diffs easier to read.

    Returns:
      A string consisting of the request. Example:
      'GET www.example.com/path\nHeader-Key: header value\n'
    """
    parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
    if self.request_body:
      parts.append('%s\n' % self.request_body)
    for k, v in self.trimmed_headers:
      k = '-'.join(x.capitalize() for x in k.split('-'))
      parts.append('%s: %s\n' % (k, v))
    return ''.join(parts)

  def matches(self, command=None, host=None, full_path=None, is_ssl=None,
              use_query=True):
    """Returns true iff the request matches all parameters.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path with query string (e.g. '/search?q=dogs')
      is_ssl: whether the request is secure.
      use_query:
        If use_query is True, request matching uses both the hierarchical path
        and query string component.
        If use_query is False, request matching only uses the hierarchical
        path.

        e.g. req1 = GET www.test.com/index?aaaa
             req2 = GET www.test.com/index?bbbb

        If use_query is True, req1.matches(req2) evaluates to False.
        If use_query is False, req1.matches(req2) evaluates to True.

    Returns:
      True iff the request matches all parameters
    """
    if command is not None and command != self.command:
      return False
    if is_ssl is not None and is_ssl != self.is_ssl:
      return False
    if host is not None and host != self.host:
      return False
    if full_path is None:
      return True
    if use_query:
      return full_path == self.full_path
    else:
      return self.path == urlparse.urlparse(full_path).path

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - accept: Causes problems with www.bing.com. During record, CSS is fetched
        with *. During replay, it's text/css.
    - accept-charset, accept-language, referer: vary between clients.
    - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
    - connection, method, scheme, url, version: Cause problems with spdy.
    - cookie: Extremely sensitive to request/response order.
    - keep-alive: Doesn't affect the content of the request, only some
        transient state of the transport layer.
    - user-agent: Changes with every Chrome version.
    - proxy-connection: Sent for proxy requests.

    Another variant to consider is dropping only the value from the header.
    However, this is particularly bad for the cookie header, because the
    presence of the cookie depends on the responses we've seen when the
    request is made.

    Args:
      headers: {header_key: header_value, ...}

    Returns:
      [(header_key, header_value), ...]  # (with undesirable headers removed)
    """
    # TODO(tonyg): Strip sdch from the request headers because we can't
    # guarantee that the dictionary will be recorded, so replay may not work.
    if 'accept-encoding' in headers:
      accept_encoding = headers['accept-encoding']
      accept_encoding = accept_encoding.replace('sdch', '')
      stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
      accept_encoding = ','.join(filter(bool, stripped_encodings))
      headers['accept-encoding'] = accept_encoding
    undesirable_keys = [
        'accept', 'accept-charset', 'accept-language', 'cache-control',
        'connection', 'cookie', 'keep-alive', 'method',
        'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
        'x-chrome-variations']
    return sorted([(k, v) for k, v in headers.items()
                   if k.lower() not in undesirable_keys])
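
  # Illustrative example (not part of the original module) of the trimming
  # performed above, using hypothetical header values:
  #   _TrimHeaders({'accept-encoding': 'gzip, sdch',
  #                 'user-agent': 'Mozilla/5.0',
  #                 'host': 'www.example.com'})
  #   -> [('accept-encoding', 'gzip'), ('host', 'www.example.com')]
  # 'user-agent' is dropped entirely, while only the 'sdch' token is stripped
  # from 'accept-encoding'.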

  def is_conditional(self):
    """Return True if the request contains any conditional headers."""
    for header in self.CONDITIONAL_HEADERS:
      if header in self.headers:
        return True
    return False

  def create_request_without_conditions(self):
    stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
                            if k.lower() not in self.CONDITIONAL_HEADERS)
    return ArchivedHttpRequest(
        self.command, self.host, self.full_path, self.request_body,
        stripped_headers, self.is_ssl)


class ArchivedHttpResponse(object):
  """All the data needed to recreate an HTTP response."""

  # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
  # It is not sent in responses. It is added by get_data_as_text()
  # and removed by set_data().
  CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

  # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
  DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
                          'Delays are above. Response content is below.]\n')

  def __init__(self, version, status, reason, headers, response_data,
               delays=None):
    """Initialize an ArchivedHttpResponse.

    Args:
      version: HTTP protocol version used by server.
          10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
      status: Status code returned by server (e.g. 200).
      reason: Reason phrase returned by server (e.g. "OK").
      headers: list of (header, value) tuples.
      response_data: list of content chunks.
          Concatenating the chunks gives the complete contents
          (i.e. the chunks do not have any lengths or delimiters).
          Do not include the final, zero-length chunk that marks the end.
      delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
          e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
          connect - The time to connect to the server.
              Each resource has a value because Replay's record mode
              captures it. This includes the time for the SYN and
              SYN/ACK (1 rtt).
          headers - The time elapsed between the TCP connect and the headers.
              This typically includes all the server-time to generate a
              response.
          data - If the response is chunked, these are the times for each
              chunk.
    """
    self.version = version
    self.status = status
    self.reason = reason
    self.headers = headers
    self.response_data = response_data
    self.delays = delays
    self.fix_delays()

  def fix_delays(self):
    """Initialize delays, or check the number of data delays."""
    expected_num_delays = len(self.response_data)
    if not self.delays:
      self.delays = {
          'connect': 0,
          'headers': 0,
          'data': [0] * expected_num_delays
          }
    else:
      num_delays = len(self.delays['data'])
      if num_delays != expected_num_delays:
        raise HttpArchiveException(
            'Server delay length mismatch: %d (expected %d): %s' % (
                num_delays, expected_num_delays, self.delays['data']))

  def __repr__(self):
    return repr((self.version, self.status, self.reason, sorted(self.headers),
                 self.response_data))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    if 'server_delays' in state:
      state['delays'] = {
          'connect': 0,
          'headers': 0,
          'data': state['server_delays']
          }
      del state['server_delays']
    elif 'delays' not in state:
      state['delays'] = None
    self.__dict__.update(state)
    self.fix_delays()

  def get_header(self, key, default=None):
    for k, v in self.headers:
      if key.lower() == k.lower():
        return v
    return default

  def set_header(self, key, value):
    for i, (k, v) in enumerate(self.headers):
      if key == k:
        self.headers[i] = (key, value)
        return
    self.headers.append((key, value))

  def remove_header(self, key):
    for i, (k, v) in enumerate(self.headers):
      if key.lower() == k.lower():
        self.headers.pop(i)
        return

  @staticmethod
  def _get_epoch_seconds(date_str):
    """Return the epoch seconds of a date header.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
    Returns:
      epoch seconds as a float
    """
    date_tuple = email.utils.parsedate(date_str)
    if date_tuple:
      return calendar.timegm(date_tuple)
    return None

  def update_date(self, date_str, now=None):
    """Return an updated date based on its delta from the "Date" header.

    For example, if |date_str| is one week later than the "Date" header,
    then the returned date string is one week later than the current date.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
    Returns:
      a date string
    """
    date_seconds = self._get_epoch_seconds(self.get_header('date'))
    header_seconds = self._get_epoch_seconds(date_str)
    if date_seconds and header_seconds:
      updated_seconds = header_seconds + (now or time.time()) - date_seconds
      return email.utils.formatdate(updated_seconds, usegmt=True)
    return date_str
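
  # Illustrative example (not part of the original module), with hypothetical
  # header values: if the archived response carries
  #   Date:          Thu, 01 Jan 2015 00:00:00 GMT
  #   Last-Modified: Thu, 25 Dec 2014 00:00:00 GMT   (7 days earlier)
  # then update_date() applied to the Last-Modified value returns a date
  # 7 days before |now| (the current time by default), preserving the
  # original delta between the two headers.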

  def is_gzip(self):
    return self.get_header('content-encoding') == 'gzip'

  def is_compressed(self):
    return self.get_header('content-encoding') in ('gzip', 'deflate')

  def is_chunked(self):
    return self.get_header('transfer-encoding') == 'chunked'

  def get_data_as_text(self):
    """Return content as a single string.

    Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
    """
    content_type = self.get_header('content-type')
    if (not content_type or
        not (content_type.startswith('text/') or
             content_type == 'application/x-javascript' or
             content_type.startswith('application/json'))):
      return None
    if self.is_compressed():
      uncompressed_chunks = httpzlib.uncompress_chunks(
          self.response_data, self.is_gzip())
    else:
      uncompressed_chunks = self.response_data
    return self.CHUNK_EDIT_SEPARATOR.join(uncompressed_chunks)

  def get_delays_as_text(self):
    """Return delays as editable text."""
    return json.dumps(self.delays, indent=2)

  def get_response_as_text(self):
    """Returns response content as a single string.

    Server delays are separated on a per-chunk basis. Delays are in
    milliseconds. Response content begins after DELAY_EDIT_SEPARATOR.
    """
    data = self.get_data_as_text()
    if data is None:
      logging.warning('Data can not be represented as text.')
      data = ''
    delays = self.get_delays_as_text()
    return self.DELAY_EDIT_SEPARATOR.join((delays, data))

  def set_data(self, text):
    """Inverse of get_data_as_text().

    Split on CHUNK_EDIT_SEPARATOR and compress if needed.
    """
    text_chunks = text.split(self.CHUNK_EDIT_SEPARATOR)
    if self.is_compressed():
      self.response_data = httpzlib.compress_chunks(text_chunks,
                                                    self.is_gzip())
    else:
      self.response_data = text_chunks
    if not self.is_chunked():
      content_length = sum(len(c) for c in self.response_data)
      self.set_header('content-length', str(content_length))

  def set_delays(self, delays_text):
    """Inverse of get_delays_as_text().

    Args:
      delays_text: JSON encoded text such as the following:
          {
            connect: 80,
            headers: 80,
            data: [6, 55, 0]
          }
          Times are in milliseconds.
          Each data delay corresponds with one response_data value.
    """
    try:
      self.delays = json.loads(delays_text)
    except (ValueError, KeyError) as e:
      logging.critical('Unable to parse delays %s: %s', delays_text, e)
    self.fix_delays()

  def set_response_from_text(self, text):
    """Inverse of get_response_as_text().

    Modifies the state of the archive according to the textual representation.
    """
    try:
      delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
    except ValueError:
      logging.critical(
          'Error parsing text representation. Skipping edits.')
      return
    self.set_delays(delays)
    self.set_data(data)


def create_response(status, reason=None, headers=None, body=None):
  """Convenience method for creating simple ArchivedHttpResponse objects."""
  if reason is None:
    reason = httplib.responses.get(status, 'Unknown')
  if headers is None:
    headers = [('content-type', 'text/plain')]
  if body is None:
    body = "%s %s" % (status, reason)
  return ArchivedHttpResponse(11, status, reason, headers, [body])
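
# Illustrative examples (not part of the original module) of the defaults
# filled in by create_response():
#   create_response(404)
#     -> status 404, reason 'Not Found' (from httplib.responses),
#        headers [('content-type', 'text/plain')], body '404 Not Found'
#   create_response(200, body='<html></html>')
#     -> status 200, reason 'OK', default headers, body '<html></html>'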


def main():
  class PlainHelpFormatter(optparse.IndentedHelpFormatter):
    def format_description(self, description):
      if description:
        return description + '\n'
      else:
        return ''

  option_parser = optparse.OptionParser(
      usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
      formatter=PlainHelpFormatter(),
      description=__doc__,
      epilog='http://code.google.com/p/web-page-replay/')

  option_parser.add_option('-c', '--command', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this command.')
  option_parser.add_option('-o', '--host', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this host.')
  option_parser.add_option('-p', '--full_path', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this full path.')
  option_parser.add_option('-f', '--merged_file', default=None,
      action='store',
      type='string',
      help='The output file to use when using the merge command.')

  options, args = option_parser.parse_args()

  # The merge command expects an unlimited number of archives.
  if len(args) < 2:
    print 'args: %s' % args
    option_parser.error('Must specify a command and replay_file')

  command = args[0]
  replay_file = args[1]

  if not os.path.exists(replay_file):
    option_parser.error('Replay file "%s" does not exist' % replay_file)

  http_archive = HttpArchive.Load(replay_file)
  if command == 'ls':
    print http_archive.ls(options.command, options.host, options.full_path)
  elif command == 'cat':
    print http_archive.cat(options.command, options.host, options.full_path)
  elif command == 'stats':
    print http_archive.stats(options.command, options.host, options.full_path)
  elif command == 'merge':
    if not options.merged_file:
      print 'Error: Must specify a merged file name (use --merged_file)'
      return
    http_archive.merge(options.merged_file, args[2:])
  elif command == 'edit':
    http_archive.edit(options.command, options.host, options.full_path)
    http_archive.Persist(replay_file)
  else:
    option_parser.error('Unknown command "%s"' % command)
  return 0


if __name__ == '__main__':
  sys.exit(main())