# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Retrieve web resources over http."""
import copy
import httplib
import logging
import random
import StringIO

import platformsettings
import script_injector
# PIL isn't always available, but we still want to be able to run without
# the image scrambling functionality in this case.
# Clock function used for connection/response delay measurements; resolved
# via platformsettings so each OS can supply its highest-resolution timer.
TIMER = platformsettings.timer
class HttpClientException(Exception):
  """Base class for all exceptions in httpclient."""
  pass
44 def _InjectScripts(response
, inject_script
):
45 """Injects |inject_script| immediately after <head> or <html>.
47 Copies |response| if it is modified.
50 response: an ArchivedHttpResponse
51 inject_script: JavaScript string (e.g. "Math.random = function(){...}")
53 an ArchivedHttpResponse
55 if type(response
) == tuple:
56 logging
.warn('tuple response: %s', response
)
57 content_type
= response
.get_header('content-type')
58 if content_type
and content_type
.startswith('text/html'):
59 text
= response
.get_data_as_text()
60 text
, already_injected
= script_injector
.InjectScript(
61 text
, 'text/html', inject_script
)
62 if not already_injected
:
63 response
= copy
.deepcopy(response
)
64 response
.set_data(text
)
def _ScrambleImages(response):
  """If the |response| is an image, attempt to scramble it.

  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
  Returns:
    an ArchivedHttpResponse
  """
  assert Image, '--scramble_images requires the PIL module to be installed.'

  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('image/'):
    try:
      image_data = response.response_data[0]
      # NOTE(review): the result of .decode() is discarded here (and
      # .encode() below); presumably the stored bytes are already in the
      # form PIL expects — confirm before changing.
      image_data.decode(encoding='base64')
      im = Image.open(StringIO.StringIO(image_data))

      pixel_data = list(im.getdata())
      random.shuffle(pixel_data)

      scrambled_image = im.copy()
      scrambled_image.putdata(pixel_data)

      output_image_io = StringIO.StringIO()
      scrambled_image.save(output_image_io, im.format)
      output_image_data = output_image_io.getvalue()
      output_image_data.encode(encoding='base64')

      # Copy before mutating so the archive's stored response stays intact.
      response = copy.deepcopy(response)
      response.set_data(output_image_data)
    except Exception:
      # Scrambling is best-effort: if the image cannot be parsed or
      # re-encoded, serve the original response unmodified.
      logging.warning('Failed to scramble %s response', content_type)
  return response
class DetailedHTTPResponse(httplib.HTTPResponse):
  """Preserve details relevant to replaying responses.

  WARNING: This code uses attributes and methods of HTTPResponse
  that are not part of the public interface.
  """

  def read_chunks(self):
    """Return the response body content and timing data.

    The returned chunks have the chunk size and CRLFs stripped off.
    If the response was compressed, the returned data is still compressed.

    Returns:
      (chunks, delays)
        chunks:
          [response_body]                  # non-chunked responses
          [chunk_1, chunk_2, ...]          # chunked responses
        delays:
          [0]                              # non-chunked responses
          [chunk_1_first_byte_delay, ...]  # chunked responses

      The delay for the first body item should be recorded by the caller.
    """
    # NOTE(review): the chunked/non-chunked branching and cleanup below were
    # reconstructed from partial source — verify against version control.
    chunks = []
    delays = []
    if not self.chunked:
      chunks.append(self.read())
      delays.append(0)
    else:
      start = TIMER()
      try:
        while True:
          line = self.fp.readline()
          chunk_size = self._read_chunk_size(line)
          if chunk_size is None:
            raise httplib.IncompleteRead(''.join(chunks))
          if chunk_size == 0:
            break  # terminating zero-length chunk
          delays.append(TIMER() - start)
          chunks.append(self._safe_read(chunk_size))
          self._safe_read(2)  # skip the CRLF at the end of the chunk
          start = TIMER()

        # Ignore any trailers.
        while True:
          line = self.fp.readline()
          if not line or line == '\r\n':
            break
      finally:
        self.close()
    return chunks, delays

  @classmethod
  def _read_chunk_size(cls, line):
    """Parse a chunk-size line; return the size as int, or None if invalid."""
    chunk_extensions_pos = line.find(';')
    if chunk_extensions_pos != -1:
      line = line[:chunk_extensions_pos]  # strip chunk-extensions
    try:
      chunk_size = int(line, 16)
    except ValueError:
      return None
    return chunk_size
class DetailedHTTPConnection(httplib.HTTPConnection):
  """Preserve details relevant to replaying connections."""
  response_class = DetailedHTTPResponse
class DetailedHTTPSResponse(DetailedHTTPResponse):
  """Preserve details relevant to replaying SSL responses."""
  pass
class DetailedHTTPSConnection(httplib.HTTPSConnection):
  """Preserve details relevant to replaying SSL connections."""
  response_class = DetailedHTTPSResponse
class RealHttpFetch(object):
  """Fetch requests over the network, recording connection timing details."""

  def __init__(self, real_dns_lookup):
    """Initialize RealHttpFetch.

    Args:
      real_dns_lookup: a function that resolves a host to an IP.
    """
    self._real_dns_lookup = real_dns_lookup

  @staticmethod
  def _GetHeaderNameValue(header):
    """Parse the header line and return a name/value tuple.

    Args:
      header: a string for a header such as "Content-Length: 314".
    Returns:
      A tuple (header_name, header_value) on success or None if the header
      is not in expected format. header_name is in lowercase.
    """
    i = header.find(':')
    if i > 0:
      return (header[:i].lower(), header[i+1:].strip())
    return None

  @staticmethod
  def _ToTuples(headers):
    """Parse headers and save them to a list of tuples.

    This method takes HttpResponse.msg.headers as input and convert it
    to a list of (header_name, header_value) tuples.
    HttpResponse.msg.headers is a list of strings where each string
    represents either a header or a continuation line of a header.
    1. a normal header consists of two parts which are separated by colon :
       "header_name:header_value..."
    2. a continuation line is a string starting with whitespace
       "[whitespace]continued_header_value..."
    If a header is not in good shape or an unexpected continuation line is
    seen, it will be ignored.

    Should avoid using response.getheaders() directly
    because response.getheaders() can't handle multiple headers
    with the same name properly. Instead, parse the
    response.msg.headers using this method to get all headers.

    Args:
      headers: an instance of HttpResponse.msg.headers.
    Returns:
      A list of tuples which looks like:
      [(header_name, header_value), (header_name2, header_value2)...]
    """
    all_headers = []
    for line in headers:
      if line[0] in '\t ':
        # Continuation line: fold it into the previous header's value.
        if not all_headers:
          logging.warning(
              'Unexpected response header continuation line [%s]', line)
          continue
        name, value = all_headers.pop()
        value += '\n ' + line.strip()
      else:
        name_value = RealHttpFetch._GetHeaderNameValue(line)
        if not name_value:
          logging.warning(
              'Response header in wrong format [%s]', line)
          continue
        name, value = name_value  # pylint: disable=unpacking-non-sequence
      all_headers.append((name, value))
    return all_headers

  @staticmethod
  def _get_request_host_port(request):
    """Return (host, port) parsed from |request|.host; port may be None."""
    host_parts = request.host.split(':')
    host = host_parts[0]
    port = int(host_parts[1]) if len(host_parts) == 2 else None
    return host, port

  @staticmethod
  def _get_system_proxy(is_ssl):
    """Return the system proxy settings for http/https, or a falsy value."""
    return platformsettings.get_system_proxy(is_ssl)

  def _get_connection(self, request_host, request_port, is_ssl):
    """Return a detailed connection object for host/port pair.

    If a system proxy is defined (see platformsettings.py), it will be used.

    Args:
      request_host: a host string (e.g. "www.example.com").
      request_port: a port integer (e.g. 8080) or None (for the default port).
      is_ssl: True if HTTPS connection is needed.
    Returns:
      A DetailedHTTPSConnection or DetailedHTTPConnection instance.
    """
    connection_host = request_host
    connection_port = request_port
    system_proxy = self._get_system_proxy(is_ssl)
    if system_proxy:
      connection_host = system_proxy.host
      connection_port = system_proxy.port

    # Use an IP address because WPR may override DNS settings.
    connection_ip = self._real_dns_lookup(connection_host)
    if not connection_ip:
      logging.critical('Unable to find host ip for name: %s', connection_host)
      return None
    if is_ssl:
      connection = DetailedHTTPSConnection(connection_ip, connection_port)
      # NOTE(review): tunnel setup reconstructed from partial source;
      # presumably only needed when going through a proxy — confirm.
      if system_proxy:
        connection.set_tunnel(request_host, request_port)
    else:
      connection = DetailedHTTPConnection(connection_ip, connection_port)
    return connection

  def __call__(self, request):
    """Fetch an HTTP request.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      an ArchivedHttpResponse
    """
    logging.debug('RealHttpFetch: %s %s', request.host, request.full_path)
    request_host, request_port = self._get_request_host_port(request)
    # NOTE(review): the retry loop below was reconstructed from partial
    # source — verify the retry count against version control.
    retries = 3
    while True:
      try:
        connection = self._get_connection(
            request_host, request_port, request.is_ssl)
        connect_start = TIMER()
        connection.connect()
        connect_delay = int((TIMER() - connect_start) * 1000)
        start = TIMER()
        connection.request(
            request.command,
            request.full_path,
            request.request_body,
            request.headers)
        response = connection.getresponse()
        headers_delay = int((TIMER() - start) * 1000)

        chunks, chunk_delays = response.read_chunks()
        delays = {
            'connect': connect_delay,
            'headers': headers_delay,
            'data': chunk_delays,
            }
        archived_http_response = httparchive.ArchivedHttpResponse(
            response.version,
            response.status,
            response.reason,
            RealHttpFetch._ToTuples(response.msg.headers),
            chunks,
            delays)
        return archived_http_response
      except Exception as e:
        if retries:
          retries -= 1
          logging.warning('Retrying fetch %s: %s', request, e)
          continue
        logging.critical('Could not fetch %s: %s', request, e)
        return None
class RecordHttpArchiveFetch(object):
  """Make real HTTP fetches and save responses in the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script):
    """Initialize RecordHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
    """
    self.http_archive = http_archive
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)
    self.inject_script = inject_script

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an ArchivedHttpRequest.
    Returns:
      an ArchivedHttpResponse
    """
    # If request is already in the archive, return the archived response.
    if request in self.http_archive:
      logging.debug('Repeated request found: %s', request)
      response = self.http_archive[request]
    else:
      response = self.real_http_fetch(request)
      # NOTE(review): the fetch-failure guard was reconstructed from partial
      # source — a real fetch can fail and return None (see RealHttpFetch).
      if response is None:
        return None
      self.http_archive[request] = response
    if self.inject_script:
      response = _InjectScripts(response, self.inject_script)
    logging.debug('Recorded: %s', request)
    return response
class ReplayHttpArchiveFetch(object):
  """Serve responses from the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script,
               use_diff_on_unknown_requests=False,
               use_closest_match=False, scramble_images=False):
    """Initialize ReplayHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses before serving.
    """
    self.http_archive = http_archive
    self.inject_script = inject_script
    self.use_diff_on_unknown_requests = use_diff_on_unknown_requests
    self.use_closest_match = use_closest_match
    self.scramble_images = scramble_images
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an instance of an ArchivedHttpRequest.
    Returns:
      Instance of ArchivedHttpResponse (if found) or None
    """
    # Requests to the local replay server itself are always fetched live.
    if request.host.startswith('127.0.0.1:'):
      return self.real_http_fetch(request)

    response = self.http_archive.get(request)

    # NOTE(review): the nested guards below were reconstructed from partial
    # source — verify against version control.
    if self.use_closest_match and not response:
      closest_request = self.http_archive.find_closest_request(
          request, use_path=True)
      if closest_request:
        response = self.http_archive.get(closest_request)
        if response:
          logging.info('Request not found: %s\nUsing closest match: %s',
                       request, closest_request)

    if not response:
      reason = str(request)
      if self.use_diff_on_unknown_requests:
        diff = self.http_archive.diff(request)
        if diff:
          reason += (
              "\nNearest request diff "
              "('-' for archived request, '+' for current request):\n%s" % diff)
      logging.warning('Could not replay: %s', reason)
    else:
      if self.inject_script:
        response = _InjectScripts(response, self.inject_script)
      if self.scramble_images:
        response = _ScrambleImages(response)
    return response
class ControllableHttpArchiveFetch(object):
  """Controllable fetch function that can swap between record and replay."""

  def __init__(self, http_archive, real_dns_lookup,
               inject_script, use_diff_on_unknown_requests,
               use_record_mode, use_closest_match, scramble_images):
    """Initialize HttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages.
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_record_mode: If True, start in server in record mode.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses on replay.
    """
    self.http_archive = http_archive
    self.record_fetch = RecordHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script)
    self.replay_fetch = ReplayHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script,
        use_diff_on_unknown_requests, use_closest_match, scramble_images)
    # NOTE(review): initial mode selection reconstructed from partial source;
    # use_record_mode is otherwise unused, so it must drive this choice.
    if use_record_mode:
      self.SetRecordMode()
    else:
      self.SetReplayMode()

  def SetRecordMode(self):
    """Route subsequent calls to the recording fetcher."""
    self.fetch = self.record_fetch
    self.is_record_mode = True

  def SetReplayMode(self):
    """Route subsequent calls to the replaying fetcher."""
    self.fetch = self.replay_fetch
    self.is_record_mode = False

  def __call__(self, *args, **kwargs):
    """Forward calls to Replay/Record fetch functions depending on mode."""
    return self.fetch(*args, **kwargs)