# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""View and edit HTTP Archives.

To list all URLs in an archive:
  $ ./httparchive.py ls archive.wpr

To view the content of all URLs from example.com:
  $ ./httparchive.py cat --host example.com archive.wpr

To view the content of a particular URL:
  $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr

To view the content of all URLs:
  $ ./httparchive.py cat archive.wpr

To edit a particular URL:
  $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr

To print statistics of an archive:
  $ ./httparchive.py stats archive.wpr

To print statistics of a set of URLs:
  $ ./httparchive.py stats --host www.example.com archive.wpr

To merge multiple archives:
  $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
"""
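
# A minimal sketch of programmatic use (hypothetical file and host names;
# the CLI commands above wrap these same calls):
#
#   archive = HttpArchive.Load('archive.wpr')
#   print archive.ls(host='www.example.com')
#   for request in archive.get_requests(host='www.example.com'):
#     print request, archive[request].status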

import calendar
import cPickle
import difflib
import email.utils
import httplib
import json
import logging
import optparse
import os
import StringIO
import subprocess
import sys
import tempfile
import time
import urlparse

from collections import defaultdict

# Local web-page-replay modules used below.
import certutils
import httpzlib


def LogRunTime(fn):
  """Annotation which logs the run time of the function."""
  def wrapped(self, *args, **kwargs):
    start_time = time.time()
    try:
      return fn(self, *args, **kwargs)
    finally:
      run_time = (time.time() - start_time) * 1000.0
      logging.debug('%s: %dms', fn.__name__, run_time)
  return wrapped


class HttpArchiveException(Exception):
  """Base class for all exceptions in httparchive."""
  pass


class HttpArchive(dict):
  """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.

  Attributes:
    responses_by_host: dict of {hostname, {request: response}}. This must remain
        in sync with the underlying dict of self. It is used as an optimization
        so that get_requests() doesn't have to linearly search all requests in
        the archive to find potential matches.
  """

  def __init__(self):  # pylint: disable=super-init-not-called
    self.responses_by_host = defaultdict(dict)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    self.__dict__.update(state)
    self.responses_by_host = defaultdict(dict)
    for request in self:
      self.responses_by_host[request.host][request] = self[request]

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['responses_by_host']
    return state

  def __setitem__(self, key, value):
    super(HttpArchive, self).__setitem__(key, value)
    if hasattr(self, 'responses_by_host'):
      self.responses_by_host[key.host][key] = value

  def __delitem__(self, key):
    super(HttpArchive, self).__delitem__(key)
    del self.responses_by_host[key.host][key]

  def get(self, request, default=None):
    """Return the archived response for a given request.

    Does extra checking for handling some HTTP request headers.

    Args:
      request: instance of ArchivedHttpRequest
      default: default value to return if request is not found

    Returns:
      Instance of ArchivedHttpResponse or default if no matching
      response is found.
    """
    if request in self:
      return self[request]
    return self.get_conditional_response(request, default)
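
  # For example (hypothetical request), a GET carrying 'if-none-match' whose
  # etag matches the archived 200 response is answered with a synthesized 304
  # here, provided the request minus its conditional headers is in the archive.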

  def get_conditional_response(self, request, default):
    """Get the response based on the conditional HTTP request headers.

    Args:
      request: an ArchivedHttpRequest representing the original request.
      default: default ArchivedHttpResponse to return if no archived response
          matches the original request with the conditional headers removed.

    Returns:
      an ArchivedHttpResponse with a status of 200, 304 (not modified), or
      412 (precondition failed)
    """
    response = default
    if request.is_conditional():
      stripped_request = request.create_request_without_conditions()
      if stripped_request in self:
        response = self[stripped_request]
        if response.status == 200:
          status = self.get_conditional_status(request, response)
          if status != 200:
            response = create_response(status)
    return response

  def get_conditional_status(self, request, response):
    status = 200
    last_modified = email.utils.parsedate(
        response.update_date(response.get_header('last-modified')))
    response_etag = response.get_header('etag')
    is_get_or_head = request.command.upper() in ('GET', 'HEAD')

    match_value = request.headers.get('if-match', None)
    if match_value:
      if self.is_etag_match(match_value, response_etag):
        status = 200
      else:
        status = 412  # precondition failed

    none_match_value = request.headers.get('if-none-match', None)
    if none_match_value:
      if self.is_etag_match(none_match_value, response_etag):
        status = 304
      elif is_get_or_head:
        status = 200
      else:
        status = 412

    if is_get_or_head and last_modified:
      for header in ('if-modified-since', 'if-unmodified-since'):
        date = email.utils.parsedate(request.headers.get(header, None))
        if date:
          if ((header == 'if-modified-since' and last_modified > date) or
              (header == 'if-unmodified-since' and last_modified < date)):
            if status != 412:
              status = 304  # not modified
    return status

  @staticmethod
  def is_etag_match(request_etag, response_etag):
    """Determines whether the entity tags of the request/response match.

    Args:
      request_etag: the value string of the "if-(none)-match:"
                    portion of the request header
      response_etag: the etag value of the response

    Returns:
      True on match, False otherwise
    """
    response_etag = response_etag.strip('" ')
    for etag in request_etag.split(','):
      etag = etag.strip('" ')
      if etag in ('*', response_etag):
        return True
    return False
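
  # For instance, is_etag_match('"abc", "xyz"', 'xyz') is True because one of
  # the comma-separated request etags matches once quotes are stripped, and a
  # request etag of '*' matches any response etag.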

  def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
                   use_query=True):
    """Return a list of requests that match the given args."""
    if host:
      return [r for r in self.responses_by_host[host]
              if r.matches(command, None, full_path, is_ssl,
                           use_query=use_query)]
    else:
      return [r for r in self
              if r.matches(command, host, full_path, is_ssl,
                           use_query=use_query)]
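
  # e.g. (hypothetical archive) get_requests(host='www.example.com') only
  # scans that host's bucket in responses_by_host, while
  # get_requests(command='GET') walks every request in the archive.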

  def ls(self, command=None, host=None, full_path=None):
    """List all URLs that match given params."""
    return ''.join(sorted(
        '%s\n' % r for r in self.get_requests(command, host, full_path)))

  def cat(self, command=None, host=None, full_path=None):
    """Print the contents of all URLs that match given params."""
    out = StringIO.StringIO()
    for request in self.get_requests(command, host, full_path):
      print >>out, str(request)
      print >>out, 'Untrimmed request headers:'
      for k in request.headers:
        print >>out, '    %s: %s' % (k, request.headers[k])
      if request.request_body:
        print >>out, request.request_body
      print >>out, '---- Response Info', '-' * 51
      response = self[request]
      chunk_lengths = [len(x) for x in response.response_data]
      print >>out, ('Status: %s\n'
                    'Reason: %s\n'
                    'Headers delay: %s\n'
                    'Response headers:') % (
          response.status, response.reason, response.delays['headers'])
      for k, v in response.headers:
        print >>out, '    %s: %s' % (k, v)
      print >>out, ('Chunk count: %s\n'
                    'Chunk lengths: %s\n'
                    'Chunk delays: %s') % (
          len(chunk_lengths), chunk_lengths, response.delays['data'])
      body = response.get_data_as_text()
      print >>out, '---- Response Data', '-' * 51
      if body:
        print >>out, body
      else:
        print >>out, '[binary data]'
      print >>out, '=' * 70
    return out.getvalue()

  def stats(self, command=None, host=None, full_path=None):
    """Print stats about the archive for all URLs that match given params."""
    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print 'Failed to find any requests matching given command, host, path.'
      return

    out = StringIO.StringIO()
    stats = {
        'Total': len(matching_requests),
        'Domains': defaultdict(int),
        'HTTP_response_code': defaultdict(int),
        'content_type': defaultdict(int),
        'Documents': defaultdict(int),
        }

    for request in matching_requests:
      stats['Domains'][request.host] += 1
      stats['HTTP_response_code'][self[request].status] += 1

      content_type = self[request].get_header('content-type')
      # Remove content type options for readability and higher level groupings.
      str_content_type = str(content_type.split(';')[0]
                             if content_type else None)
      stats['content_type'][str_content_type] += 1

      # Documents are the main URL requested and not a referenced resource.
      if str_content_type == 'text/html' and not 'referer' in request.headers:
        stats['Documents'][request.host] += 1

    print >>out, json.dumps(stats, indent=4)
    return out.getvalue()

  def merge(self, merged_archive=None, other_archives=None):
    """Merge multiple archives into merged_archive by 'chaining' resources;
    only resources that are not part of the accumulated archive are added."""
    if not other_archives:
      print 'No archives passed to merge'
      return

    # Note we already loaded 'replay_file'.
    print 'Loaded %d responses' % len(self)

    for archive in other_archives:
      if not os.path.exists(archive):
        print 'Error: Replay file "%s" does not exist' % archive
        return

      http_archive_other = HttpArchive.Load(archive)
      print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
      for r in http_archive_other:
        # Only resources that are not already part of the current archive
        # are added.
        if r not in self:
          self[r] = http_archive_other[r]
    self.Persist('%s' % merged_archive)

  def edit(self, command=None, host=None, full_path=None):
    """Edits the single request which matches given params."""
    editor = os.getenv('EDITOR')
    if not editor:
      print 'You must set the EDITOR environment variable.'
      return

    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print ('Failed to find any requests matching given command, host, '
             'full_path.')
      return

    if len(matching_requests) > 1:
      print 'Found multiple matching requests. Please refine.'
      print self.ls(command, host, full_path)

    response = self[matching_requests[0]]
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(response.get_response_as_text())
    tmp_file.close()
    subprocess.check_call([editor, tmp_file.name])
    response.set_response_from_text(''.join(open(tmp_file.name).readlines()))
    os.remove(tmp_file.name)

  def find_closest_request(self, request, use_path=False):
    """Find the closest matching request in the archive to the given request.

    Args:
      request: an ArchivedHttpRequest
      use_path: If True, closest matching request's path component must match.
        (Note: this refers to the 'path' component within the URL, not the
         'full path' which includes the query string component.)

        If use_path=True, the candidate will NOT match in the example below,
        e.g. request   = GET www.test.com/path?aaa
             candidate = GET www.test.com/diffpath?aaa

    Returns:
      If a close match is found, return the instance of ArchivedHttpRequest.
      Otherwise, return None.
    """
    full_path = request.full_path if use_path else None
    requests = self.get_requests(request.command, request.host, full_path,
                                 is_ssl=request.is_ssl, use_query=not use_path)
    if not requests:
      return None
    if len(requests) == 1:
      return requests[0]

    matcher = difflib.SequenceMatcher(b=request.formatted_request)

    # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
    # quick_ratio() on all requests, sort them descending, and then loop through
    # until we find a candidate whose ratio() is >= the next quick_ratio().
    # This works because quick_ratio() is guaranteed to be an upper bound on
    # ratio().
    candidates = []
    for candidate in requests:
      matcher.set_seq1(candidate.formatted_request)
      candidates.append((matcher.quick_ratio(), candidate))

    candidates.sort(reverse=True, key=lambda c: c[0])

    best_match = (0, None)
    for i in xrange(len(candidates)):
      matcher.set_seq1(candidates[i][1].formatted_request)
      best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
      if i + 1 < len(candidates) and best_match[0] >= candidates[i+1][0]:
        break
    return best_match[1]
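
  # Illustration of the pruning above with difflib directly (hypothetical
  # strings): quick_ratio() never underestimates ratio(), so once the best
  # ratio() seen is >= the next candidate's quick_ratio(), no later candidate
  # can do better.
  #
  #   m = difflib.SequenceMatcher(b='GET www.test.com/path?aaa\n')
  #   m.set_seq1('GET www.test.com/path?aab\n')
  #   assert m.quick_ratio() >= m.ratio()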

  def diff(self, request):
    """Diff the given request to the closest matching request in the archive.

    Args:
      request: an ArchivedHttpRequest

    Returns:
      If a close match is found, return a textual diff between the requests.
      Otherwise, return None.
    """
    request_lines = request.formatted_request.split('\n')
    closest_request = self.find_closest_request(request)
    if closest_request:
      closest_request_lines = closest_request.formatted_request.split('\n')
      return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
    return None

  def get_server_cert(self, host):
    """Gets certificate from the server and stores it in archive."""
    request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=certutils.get_host_cert(host))
    return self[request].response_data[0]

  def get_certificate(self, host):
    request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=self._generate_cert(host))
    return self[request].response_data[0]

  @classmethod
  def AssertWritable(cls, filename):
    """Raises an IOError if filename is not writable."""
    persist_dir = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(persist_dir):
      raise IOError('Directory does not exist: %s' % persist_dir)
    if os.path.exists(filename):
      if not os.access(filename, os.W_OK):
        raise IOError('Need write permission on file: %s' % filename)
    elif not os.access(persist_dir, os.W_OK):
      raise IOError('Need write permission on directory: %s' % persist_dir)

  @classmethod
  def Load(cls, filename):
    """Load an instance from filename."""
    return cPickle.load(open(filename, 'rb'))

  def Persist(self, filename):
    """Persist all state to filename."""
    try:
      original_checkinterval = sys.getcheckinterval()
      sys.setcheckinterval(2**31-1)  # Lock out other threads so nothing can
                                     # modify |self| during pickling.
      pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
    finally:
      sys.setcheckinterval(original_checkinterval)
    with open(filename, 'wb') as f:
      f.write(pickled_self)


class ArchivedHttpRequest(object):
  """Record all the state that goes into a request.

  ArchivedHttpRequest instances are considered immutable so they can
  serve as keys for HttpArchive instances.
  (The immutability is not enforced.)

  Upon creation, the headers are "trimmed" (i.e. edited or dropped)
  and saved to self.trimmed_headers to allow requests to match in a wider
  variety of playback situations (e.g. using different user agents).

  For unpickling, 'trimmed_headers' is recreated from 'headers'. That
  allows for changes to the trim function and can help with debugging.
  """
  CONDITIONAL_HEADERS = [
      'if-none-match', 'if-match',
      'if-modified-since', 'if-unmodified-since']

  def __init__(self, command, host, full_path, request_body, headers,
               is_ssl=False):
    """Initialize an ArchivedHttpRequest.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path. Includes everything after the host & port in
          the URL (e.g. '/search?q=dogs').
      request_body: a request body string for a POST or None.
      headers: {key: value, ...} where key and value are strings.
      is_ssl: a boolean which is True iff request is made via SSL.
    """
    self.command = command
    self.host = host
    self.full_path = full_path
    self.path = urlparse.urlparse(full_path).path if full_path else None
    self.request_body = request_body
    self.headers = headers
    self.is_ssl = is_ssl
    self.trimmed_headers = self._TrimHeaders(headers)
    self.formatted_request = self._GetFormattedRequest()

  def __str__(self):
    scheme = 'https' if self.is_ssl else 'http'
    return '%s %s://%s%s %s' % (
        self.command, scheme, self.host, self.full_path, self.trimmed_headers)

  def __repr__(self):
    return repr((self.command, self.host, self.full_path, self.request_body,
                 self.trimmed_headers, self.is_ssl))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "headers" are the original request headers.
    "trimmed_headers" are the trimmed headers used for matching requests
    during replay.

    Args:
      state: a dictionary for __dict__
    """
    if 'full_headers' in state:
      # Fix older version of archive.
      state['headers'] = state['full_headers']
      del state['full_headers']
    if 'headers' not in state:
      raise HttpArchiveException(
          'Archived HTTP request is missing "headers". The HTTP archive is'
          ' likely from a previous version and must be re-recorded.')
    if 'path' in state:
      # Before, 'path' and 'path_without_query' were used and 'path' was
      # pickled. Now, 'path' has been renamed to 'full_path' and
      # 'path_without_query' has been renamed to 'path'. 'full_path' is
      # pickled, but 'path' is not. If we see 'path' here it means we are
      # dealing with an older archive.
      state['full_path'] = state['path']
      del state['path']
    state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
    if 'is_ssl' not in state:
      state['is_ssl'] = False
    self.__dict__.update(state)
    self.path = urlparse.urlparse(self.full_path).path
    self.formatted_request = self._GetFormattedRequest()

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['trimmed_headers']
    del state['path']
    del state['formatted_request']
    return state

  def _GetFormattedRequest(self):
    """Format request to make diffs easier to read.

    Returns:
      A string consisting of the request. Example:
      'GET www.example.com/path\nHeader-Key: header value\n'
    """
    parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
    if self.request_body:
      parts.append('%s\n' % self.request_body)
    for k, v in self.trimmed_headers:
      k = '-'.join(x.capitalize() for x in k.split('-'))
      parts.append('%s: %s\n' % (k, v))
    return ''.join(parts)

  def matches(self, command=None, host=None, full_path=None, is_ssl=None,
              use_query=True):
    """Returns true iff the request matches all parameters.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path with query string (e.g. '/search?q=dogs')
      is_ssl: whether the request is secure.
      use_query:
        If use_query is True, request matching uses both the hierarchical path
        and query string component.
        If use_query is False, request matching only uses the hierarchical
        path.

        e.g. req1 = GET www.test.com/index?aaaa
             req2 = GET www.test.com/index?bbbb

        If use_query is True, req1.matches(req2) evaluates to False.
        If use_query is False, req1.matches(req2) evaluates to True.

    Returns:
      True iff the request matches all parameters
    """
    if command is not None and command != self.command:
      return False
    if is_ssl is not None and is_ssl != self.is_ssl:
      return False
    if host is not None and host != self.host:
      return False
    if full_path is None:
      return True
    if use_query:
      return full_path == self.full_path
    else:
      return self.path == urlparse.urlparse(full_path).path
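
  # e.g. with req1 = GET www.test.com/index?aaaa (hypothetical values):
  #   req1.matches('GET', 'www.test.com', '/index?bbbb', use_query=True)   # False
  #   req1.matches('GET', 'www.test.com', '/index?bbbb', use_query=False)  # True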

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - accept: Causes problems with www.bing.com. During record, CSS is fetched
          with *. During replay, it's text/css.
    - accept-charset, accept-language, referer: vary between clients.
    - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
    - connection, method, scheme, url, version: Cause problems with spdy.
    - cookie: Extremely sensitive to request/response order.
    - keep-alive: Doesn't affect the content of the request, only some
      transient state of the transport layer.
    - user-agent: Changes with every Chrome version.
    - proxy-connection: Sent for proxy requests.

    Another variant to consider is dropping only the value from the header.
    However, this is particularly bad for the cookie header, because the
    presence of the cookie depends on the responses we've seen when the request
    was made.

    Args:
      headers: {header_key: header_value, ...}

    Returns:
      [(header_key, header_value), ...]  # (with undesirable headers removed)
    """
    # TODO(tonyg): Strip sdch from the request headers because we can't
    # guarantee that the dictionary will be recorded, so replay may not work.
    if 'accept-encoding' in headers:
      accept_encoding = headers['accept-encoding']
      accept_encoding = accept_encoding.replace('sdch', '')
      # Drop any empty entries left behind (e.g. a dangling comma).
      stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
      accept_encoding = ','.join(filter(bool, stripped_encodings))
      headers['accept-encoding'] = accept_encoding
    undesirable_keys = [
        'accept', 'accept-charset', 'accept-language', 'cache-control',
        'connection', 'cookie', 'keep-alive', 'method',
        'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
        'x-chrome-variations']
    return sorted([(k, v) for k, v in headers.items()
                   if k.lower() not in undesirable_keys])
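
  # e.g. (hypothetical headers) _TrimHeaders({'accept-encoding': 'gzip,sdch',
  # 'user-agent': 'Mozilla/5.0', 'host': 'www.example.com'}) drops user-agent,
  # rewrites accept-encoding to 'gzip', and returns
  # [('accept-encoding', 'gzip'), ('host', 'www.example.com')].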

  def is_conditional(self):
    """Return True if the request has any conditional headers."""
    for header in self.CONDITIONAL_HEADERS:
      if header in self.headers:
        return True
    return False

  def create_request_without_conditions(self):
    stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
                            if k.lower() not in self.CONDITIONAL_HEADERS)
    return ArchivedHttpRequest(
        self.command, self.host, self.full_path, self.request_body,
        stripped_headers, self.is_ssl)


class ArchivedHttpResponse(object):
  """All the data needed to recreate an HTTP response."""

  # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
  # It is not sent in responses. It is added by get_data_as_text()
  # and removed by set_data().
  CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

  # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
  DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
                          'Delays are above. Response content is below.]\n')

  def __init__(self, version, status, reason, headers, response_data,
               delays=None):
    """Initialize an ArchivedHttpResponse.

    Args:
      version: HTTP protocol version used by server.
          10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
      status: Status code returned by server (e.g. 200).
      reason: Reason phrase returned by server (e.g. "OK").
      headers: list of (header, value) tuples.
      response_data: list of content chunks.
          Concatenating the chunks gives the complete contents
          (i.e. the chunks do not have any lengths or delimiters).
          Do not include the final, zero-length chunk that marks the end.
      delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
          e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
          connect - The time to connect to the server.
            Each resource has a value because Replay's record mode captures it.
            This includes the time for the SYN and SYN/ACK (1 rtt).
          headers - The time elapsed between the TCP connect and the headers.
            This typically includes all the server-time to generate a response.
          data - If the response is chunked, these are the times for each chunk.
    """
    self.version = version
    self.status = status
    self.reason = reason
    self.headers = headers
    self.response_data = response_data
    self.delays = delays
    self.fix_delays()

  def fix_delays(self):
    """Initialize delays, or check the number of data delays."""
    expected_num_delays = len(self.response_data)
    if not self.delays:
      self.delays = {
          'connect': 0,
          'headers': 0,
          'data': [0] * expected_num_delays
          }
    else:
      num_delays = len(self.delays['data'])
      if num_delays != expected_num_delays:
        raise HttpArchiveException(
            'Server delay length mismatch: %d (expected %d): %s',
            num_delays, expected_num_delays, self.delays['data'])

  def __repr__(self):
    return repr((self.version, self.status, self.reason, sorted(self.headers),
                 self.response_data))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    if 'server_delays' in state:
      state['delays'] = {
          'connect': 0,
          'headers': 0,
          'data': state['server_delays']
          }
      del state['server_delays']
    elif 'delays' not in state:
      state['delays'] = None
    self.__dict__.update(state)
    self.fix_delays()

  def get_header(self, key, default=None):
    for k, v in self.headers:
      if key.lower() == k.lower():
        return v
    return default

  def set_header(self, key, value):
    for i, (k, v) in enumerate(self.headers):
      if key.lower() == k.lower():
        self.headers[i] = (key, value)
        return
    self.headers.append((key, value))

  def remove_header(self, key):
    for i, (k, v) in enumerate(self.headers):
      if key.lower() == k.lower():
        self.headers.pop(i)
        return

  @staticmethod
  def _get_epoch_seconds(date_str):
    """Return the epoch seconds of a date header.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")

    Returns:
      epoch seconds as a float
    """
    date_tuple = email.utils.parsedate(date_str)
    if date_tuple:
      return calendar.timegm(date_tuple)
    return None

  def update_date(self, date_str, now=None):
    """Return an updated date based on its delta from the "Date" header.

    For example, if |date_str| is one week later than the "Date" header,
    then the returned date string is one week later than the current date.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")

    Returns:
      an updated date string
    """
    date_seconds = self._get_epoch_seconds(self.get_header('date'))
    header_seconds = self._get_epoch_seconds(date_str)
    if date_seconds and header_seconds:
      updated_seconds = header_seconds + (now or time.time()) - date_seconds
      return email.utils.formatdate(updated_seconds, usegmt=True)
    return date_str
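
  # e.g. (hypothetical values) if the archived 'date' header is
  # "Thu, 01 Dec 1994 16:00:00 GMT" and date_str is one hour later, the
  # returned string is one hour past |now|, preserving the relative offset.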

  def is_gzip(self):
    return self.get_header('content-encoding') == 'gzip'

  def is_compressed(self):
    return self.get_header('content-encoding') in ('gzip', 'deflate')

  def is_chunked(self):
    return self.get_header('transfer-encoding') == 'chunked'

  def get_data_as_text(self):
    """Return content as a single string.

    Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
    """
    content_type = self.get_header('content-type')
    if (not content_type or
        not (content_type.startswith('text/') or
             content_type == 'application/x-javascript' or
             content_type.startswith('application/json'))):
      return None
    if self.is_compressed():
      uncompressed_chunks = httpzlib.uncompress_chunks(
          self.response_data, self.is_gzip())
    else:
      uncompressed_chunks = self.response_data
    return self.CHUNK_EDIT_SEPARATOR.join(uncompressed_chunks)

  def get_delays_as_text(self):
    """Return delays as editable text."""
    return json.dumps(self.delays, indent=2)

  def get_response_as_text(self):
    """Returns response content as a single string.

    Server delays are separated on a per-chunk basis. Delays are in seconds.
    Response content begins after DELAY_EDIT_SEPARATOR.
    """
    data = self.get_data_as_text()
    if data is None:
      logging.warning('Data can not be represented as text.')
      data = ''
    delays = self.get_delays_as_text()
    return self.DELAY_EDIT_SEPARATOR.join((delays, data))

  def set_data(self, text):
    """Inverse of get_data_as_text().

    Split on CHUNK_EDIT_SEPARATOR and compress if needed.
    """
    text_chunks = text.split(self.CHUNK_EDIT_SEPARATOR)
    if self.is_compressed():
      self.response_data = httpzlib.compress_chunks(text_chunks, self.is_gzip())
    else:
      self.response_data = text_chunks
    if not self.is_chunked():
      content_length = sum(len(c) for c in self.response_data)
      self.set_header('content-length', str(content_length))
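
  # Round-trip sketch (assuming a text/plain, non-chunked response whose body
  # does not contain CHUNK_EDIT_SEPARATOR): set_data(get_data_as_text())
  # leaves the chunks equivalent and refreshes the content-length header to
  # the summed chunk lengths.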

  def set_delays(self, delays_text):
    """Inverse of get_delays_as_text().

    Args:
      delays_text: JSON encoded text such as the following:
          {
            "connect": 50,
            "headers": 150,
            "data": [0, 10, 10]
          }
          Times are in milliseconds.
          Each data delay corresponds with one response_data value.
    """
    try:
      self.delays = json.loads(delays_text)
    except (ValueError, KeyError) as e:
      logging.critical('Unable to parse delays %s: %s', delays_text, e)
    self.fix_delays()

  def set_response_from_text(self, text):
    """Inverse of get_response_as_text().

    Modifies the state of the archive according to the textual representation.
    """
    try:
      delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
    except ValueError:
      logging.critical(
          'Error parsing text representation. Skipping edits.')
      return
    self.set_delays(delays)
    self.set_data(data)


def create_response(status, reason=None, headers=None, body=None):
  """Convenience method for creating simple ArchivedHttpResponse objects."""
  if reason is None:
    reason = httplib.responses.get(status, 'Unknown')
  if headers is None:
    headers = [('content-type', 'text/plain')]
  if body is None:
    body = "%s %s" % (status, reason)
  return ArchivedHttpResponse(11, status, reason, headers, [body])
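
# e.g. create_response(404) yields an ArchivedHttpResponse with reason
# 'Not Found' (from httplib.responses), a text/plain content-type header, and
# a single body chunk of '404 Not Found'.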


class PlainHelpFormatter(optparse.IndentedHelpFormatter):
  def format_description(self, description):
    if description:
      return description + '\n'
    else:
      return ''


def main():
  option_parser = optparse.OptionParser(
      usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
      formatter=PlainHelpFormatter(),
      epilog='http://code.google.com/p/web-page-replay/')
  option_parser.add_option('-c', '--command', default=None,
                           help='Only show URLs matching this command.')
  option_parser.add_option('-o', '--host', default=None,
                           help='Only show URLs matching this host.')
  option_parser.add_option('-p', '--full_path', default=None,
                           help='Only show URLs matching this full path.')
  option_parser.add_option('-f', '--merged_file', default=None,
                           help='The output file to use when using the merge '
                                'command.')

  options, args = option_parser.parse_args()

  # Merge command expects an unlimited number of archives.
  if len(args) < 2:
    print 'args: %s' % args
    option_parser.error('Must specify a command and replay_file')

  command = args[0]
  replay_file = args[1]

  if not os.path.exists(replay_file):
    option_parser.error('Replay file "%s" does not exist' % replay_file)

  http_archive = HttpArchive.Load(replay_file)
  if command == 'ls':
    print http_archive.ls(options.command, options.host, options.full_path)
  elif command == 'cat':
    print http_archive.cat(options.command, options.host, options.full_path)
  elif command == 'stats':
    print http_archive.stats(options.command, options.host, options.full_path)
  elif command == 'merge':
    if not options.merged_file:
      print 'Error: Must specify a merged file name (use --merged_file)'
      return
    http_archive.merge(options.merged_file, args[2:])
  elif command == 'edit':
    http_archive.edit(options.command, options.host, options.full_path)
    http_archive.Persist(replay_file)
  else:
    option_parser.error('Unknown command "%s"' % command)


if __name__ == '__main__':
  main()