Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / tools / telemetry / third_party / webpagereplay / httpclient.py
blob c650dc4d1114f680ab369d525ae2a00030c20b84
1 #!/usr/bin/env python
2 # Copyright 2012 Google Inc. All Rights Reserved.
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
16 """Retrieve web resources over http."""
18 import copy
19 import httplib
20 import logging
21 import random
22 import StringIO
24 import httparchive
25 import platformsettings
26 import script_injector
29 # PIL isn't always available, but we still want to be able to run without
30 # the image scrambling functionality in this case.
31 try:
32 import Image
33 except ImportError:
34 Image = None
36 TIMER = platformsettings.timer
class HttpClientException(Exception):
  """Base class for all exceptions raised by the httpclient module."""
44 def _InjectScripts(response, inject_script):
45 """Injects |inject_script| immediately after <head> or <html>.
47 Copies |response| if it is modified.
49 Args:
50 response: an ArchivedHttpResponse
51 inject_script: JavaScript string (e.g. "Math.random = function(){...}")
52 Returns:
53 an ArchivedHttpResponse
54 """
55 if type(response) == tuple:
56 logging.warn('tuple response: %s', response)
57 content_type = response.get_header('content-type')
58 if content_type and content_type.startswith('text/html'):
59 text = response.get_data_as_text()
60 text, already_injected = script_injector.InjectScript(
61 text, 'text/html', inject_script)
62 if not already_injected:
63 response = copy.deepcopy(response)
64 response.set_data(text)
65 return response
def _ScrambleImages(response):
  """If the |response| is an image, attempt to scramble it.

  The image's pixels are shuffled randomly and the result is re-encoded in
  the original format.  Scrambling is best-effort: any failure (undecodable
  data, unsupported format, ...) leaves the response unmodified.

  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
  Returns:
    an ArchivedHttpResponse
  """
  assert Image, '--scramble_images requires the PIL module to be installed.'

  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('image/'):
    try:
      # BUG FIX: str.decode()/str.encode() return new strings; the original
      # code discarded the results, leaving the data untouched.
      # NOTE(review): this assumes archived image bodies are base64-encoded
      # (matching the symmetric encode below) -- TODO confirm against
      # httparchive's serialization.
      image_data = response.response_data[0].decode(encoding='base64')
      im = Image.open(StringIO.StringIO(image_data))

      pixel_data = list(im.getdata())
      random.shuffle(pixel_data)

      scrambled_image = im.copy()
      scrambled_image.putdata(pixel_data)

      output_image_io = StringIO.StringIO()
      scrambled_image.save(output_image_io, im.format)
      output_image_data = output_image_io.getvalue().encode(encoding='base64')

      # Copy before mutating so the archived original stays pristine.
      response = copy.deepcopy(response)
      response.set_data(output_image_data)
    except Exception:
      # Deliberate best-effort: serve the original image on any failure.
      pass

  return response
class DetailedHTTPResponse(httplib.HTTPResponse):
  """Preserve details relevant to replaying responses.

  WARNING: This code uses attributes and methods of HTTPResponse
  that are not part of the public interface.
  """

  def read_chunks(self):
    """Return the response body content and timing data.

    The returned chunks have the chunk size and CRLFs stripped off.
    If the response was compressed, the returned data is still compressed.

    Returns:
      (chunks, delays)
        chunks:
          [response_body]                  # non-chunked responses
          [chunk_1, chunk_2, ...]          # chunked responses
        delays:
          [0]                              # non-chunked responses
          [chunk_1_first_byte_delay, ...]  # chunked responses
      The delay for the first body item should be recorded by the caller.
    """
    # (Removed an unused local `buf` that was never read or written.)
    chunks = []
    delays = []
    if not self.chunked:
      chunks.append(self.read())
      delays.append(0)
    else:
      start = TIMER()
      try:
        while True:
          line = self.fp.readline()
          chunk_size = self._read_chunk_size(line)
          if chunk_size is None:
            raise httplib.IncompleteRead(''.join(chunks))
          if chunk_size == 0:
            break  # a zero-length chunk marks the end of the body
          delays.append(TIMER() - start)
          chunks.append(self._safe_read(chunk_size))
          self._safe_read(2)  # skip the CRLF at the end of the chunk
          start = TIMER()

        # Ignore any trailers.
        while True:
          line = self.fp.readline()
          if not line or line == '\r\n':
            break
      finally:
        self.close()
    return chunks, delays

  @classmethod
  def _read_chunk_size(cls, line):
    """Parse a chunk-size line; return the size (int) or None if malformed."""
    chunk_extensions_pos = line.find(';')
    if chunk_extensions_pos != -1:
      line = line[:chunk_extensions_pos]  # strip chunk-extensions
    try:
      chunk_size = int(line, 16)
    except ValueError:
      return None
    return chunk_size
class DetailedHTTPConnection(httplib.HTTPConnection):
  """Preserve details relevant to replaying connections."""
  # Swap in the response subclass that captures per-chunk timing data
  # (see DetailedHTTPResponse.read_chunks).
  response_class = DetailedHTTPResponse
class DetailedHTTPSResponse(DetailedHTTPResponse):
  """Preserve details relevant to replaying SSL responses."""
  # Currently identical to DetailedHTTPResponse; kept as a distinct type
  # for HTTPS connections.
  pass
class DetailedHTTPSConnection(httplib.HTTPSConnection):
  """Preserve details relevant to replaying SSL connections."""
  # Swap in the SSL response subclass that captures per-chunk timing data.
  response_class = DetailedHTTPSResponse
class RealHttpFetch(object):
  """Fetch requests over HTTP(S) and package them as archived responses."""

  def __init__(self, real_dns_lookup):
    """Initialize RealHttpFetch.

    Args:
      real_dns_lookup: a function that resolves a host to an IP.
    """
    self._real_dns_lookup = real_dns_lookup

  @staticmethod
  def _GetHeaderNameValue(header):
    """Parse the header line and return a name/value tuple.

    Args:
      header: a string for a header such as "Content-Length: 314".
    Returns:
      A tuple (header_name, header_value) on success or None if the header
      is not in expected format. header_name is in lowercase.
    """
    i = header.find(':')
    if i > 0:
      return (header[:i].lower(), header[i+1:].strip())
    return None

  @staticmethod
  def _ToTuples(headers):
    """Parse headers and save them to a list of tuples.

    This method takes HttpResponse.msg.headers as input and convert it
    to a list of (header_name, header_value) tuples.
    HttpResponse.msg.headers is a list of strings where each string
    represents either a header or a continuation line of a header.
    1. a normal header consists of two parts which are separated by colon :
      "header_name:header_value..."
    2. a continuation line is a string starting with whitespace
      "[whitespace]continued_header_value..."
    If a header is not in good shape or an unexpected continuation line is
    seen, it will be ignored.

    Should avoid using response.getheaders() directly
    because response.getheaders() can't handle multiple headers
    with the same name properly. Instead, parse the
    response.msg.headers using this method to get all headers.

    Args:
      headers: an instance of HttpResponse.msg.headers.
    Returns:
      A list of tuples which looks like:
        [(header_name, header_value), (header_name2, header_value2)...]
    """
    all_headers = []
    for line in headers:
      if line[0] in '\t ':
        # Continuation line: fold it into the previous header's value.
        if not all_headers:
          logging.warning(
              'Unexpected response header continuation line [%s]', line)
          continue
        name, value = all_headers.pop()
        value += '\n ' + line.strip()
      else:
        name_value = RealHttpFetch._GetHeaderNameValue(line)
        if not name_value:
          logging.warning(
              'Response header in wrong format [%s]', line)
          continue
        name, value = name_value  # pylint: disable=unpacking-non-sequence
      all_headers.append((name, value))
    return all_headers

  @staticmethod
  def _get_request_host_port(request):
    """Return (host, port) from request.host; port is None when absent."""
    host_parts = request.host.split(':')
    host = host_parts[0]
    port = int(host_parts[1]) if len(host_parts) == 2 else None
    return host, port

  @staticmethod
  def _get_system_proxy(is_ssl):
    """Return the system proxy settings (or None) for the given scheme."""
    return platformsettings.get_system_proxy(is_ssl)

  def _get_connection(self, request_host, request_port, is_ssl):
    """Return a detailed connection object for host/port pair.

    If a system proxy is defined (see platformsettings.py), it will be used.

    Args:
      request_host: a host string (e.g. "www.example.com").
      request_port: a port integer (e.g. 8080) or None (for the default port).
      is_ssl: True if HTTPS connection is needed.
    Returns:
      A DetailedHTTPSConnection or DetailedHTTPConnection instance,
      or None if the host name cannot be resolved to an IP.
    """
    connection_host = request_host
    connection_port = request_port
    system_proxy = self._get_system_proxy(is_ssl)
    if system_proxy:
      connection_host = system_proxy.host
      connection_port = system_proxy.port

    # Use an IP address because WPR may override DNS settings.
    connection_ip = self._real_dns_lookup(connection_host)
    if not connection_ip:
      logging.critical('Unable to find host ip for name: %s', connection_host)
      return None

    if is_ssl:
      connection = DetailedHTTPSConnection(connection_ip, connection_port)
      if system_proxy:
        # When proxied, tunnel through to the real destination host.
        connection.set_tunnel(request_host, request_port)
    else:
      connection = DetailedHTTPConnection(connection_ip, connection_port)
    return connection

  def __call__(self, request):
    """Fetch an HTTP request.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      an ArchivedHttpResponse, or None after three failed attempts.
    """
    logging.debug('RealHttpFetch: %s %s', request.host, request.full_path)
    request_host, request_port = self._get_request_host_port(request)
    retries = 3
    while True:
      try:
        connection = self._get_connection(
            request_host, request_port, request.is_ssl)
        connect_start = TIMER()
        connection.connect()
        connect_delay = int((TIMER() - connect_start) * 1000)
        start = TIMER()
        connection.request(
            request.command,
            request.full_path,
            request.request_body,
            request.headers)
        response = connection.getresponse()
        headers_delay = int((TIMER() - start) * 1000)

        chunks, chunk_delays = response.read_chunks()
        delays = {
            'connect': connect_delay,
            'headers': headers_delay,
            'data': chunk_delays,
            }
        archived_http_response = httparchive.ArchivedHttpResponse(
            response.version,
            response.status,
            response.reason,
            RealHttpFetch._ToTuples(response.msg.headers),
            chunks,
            delays)
        return archived_http_response
      # `except Exception as e` replaces the Python-2-only `except Exception, e`
      # syntax; valid on Python 2.6+ and 3.  Broad catch is deliberate: any
      # network failure triggers a retry, then a final failure log.
      except Exception as e:  # pylint: disable=broad-except
        if retries:
          retries -= 1
          logging.warning('Retrying fetch %s: %s', request, e)
          continue
        logging.critical('Could not fetch %s: %s', request, e)
        return None
class RecordHttpArchiveFetch(object):
  """Make real HTTP fetches and save responses in the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script):
    """Initialize RecordHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
    """
    self.inject_script = inject_script
    self.http_archive = http_archive
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an ArchivedHttpRequest.
    Returns:
      an ArchivedHttpResponse
    """
    is_new = request not in self.http_archive
    if is_new:
      # Not recorded yet: fetch for real and store the raw response.
      response = self.real_http_fetch(request)
      if response is None:
        return None
      self.http_archive[request] = response
    else:
      logging.debug('Repeated request found: %s', request)
      response = self.http_archive[request]
    # Injection happens after archiving so the stored copy stays unmodified.
    if self.inject_script:
      response = _InjectScripts(response, self.inject_script)
    logging.debug('Recorded: %s', request)
    return response
class ReplayHttpArchiveFetch(object):
  """Serve responses from the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script,
               use_diff_on_unknown_requests=False,
               use_closest_match=False, scramble_images=False):
    """Initialize ReplayHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses before serving.
    """
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)
    self.http_archive = http_archive
    self.inject_script = inject_script
    self.use_diff_on_unknown_requests = use_diff_on_unknown_requests
    self.use_closest_match = use_closest_match
    self.scramble_images = scramble_images

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an instance of an ArchivedHttpRequest.
    Returns:
      Instance of ArchivedHttpResponse (if found) or None
    """
    # Requests aimed at the local machine bypass the archive entirely.
    if request.host.startswith('127.0.0.1:'):
      return self.real_http_fetch(request)

    response = self.http_archive.get(request)

    if not response and self.use_closest_match:
      closest_request = self.http_archive.find_closest_request(
          request, use_path=True)
      if closest_request:
        response = self.http_archive.get(closest_request)
        if response:
          logging.info('Request not found: %s\nUsing closest match: %s',
                       request, closest_request)

    if response:
      # Success path: post-process the archived response before serving.
      if self.inject_script:
        response = _InjectScripts(response, self.inject_script)
      if self.scramble_images:
        response = _ScrambleImages(response)
      return response

    # Archive miss: log with as much detail as configured.
    reason = str(request)
    if self.use_diff_on_unknown_requests:
      diff = self.http_archive.diff(request)
      if diff:
        reason += (
            "\nNearest request diff "
            "('-' for archived request, '+' for current request):\n%s" % diff)
    logging.warning('Could not replay: %s', reason)
    return response
class ControllableHttpArchiveFetch(object):
  """Controllable fetch function that can swap between record and replay."""

  def __init__(self, http_archive, real_dns_lookup,
               inject_script, use_diff_on_unknown_requests,
               use_record_mode, use_closest_match, scramble_images):
    """Initialize HttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages.
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_record_mode: If True, start in server in record mode.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses when replaying.
    """
    self.http_archive = http_archive
    # Build one fetcher per mode; SetRecordMode/SetReplayMode pick between
    # them without reconstructing anything.
    self.record_fetch = RecordHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script)
    self.replay_fetch = ReplayHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script,
        use_diff_on_unknown_requests, use_closest_match, scramble_images)
    (self.SetRecordMode if use_record_mode else self.SetReplayMode)()

  def SetRecordMode(self):
    """Route subsequent calls through the recording fetcher."""
    self.fetch, self.is_record_mode = self.record_fetch, True

  def SetReplayMode(self):
    """Route subsequent calls through the replaying fetcher."""
    self.fetch, self.is_record_mode = self.replay_fetch, False

  def __call__(self, *args, **kwargs):
    """Forward calls to Replay/Record fetch functions depending on mode."""
    return self.fetch(*args, **kwargs)