2 # Copyright 2014 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 # Tool for seeing the real world impact of a patch.
8 # Layout Tests can tell you whether something has changed, but this can help
9 # you determine whether a subtle/controversial change is beneficial or not.
11 # It dumps the rendering of a large number of sites, both with and without a
12 # patch being evaluated, then sorts them by greatest difference in rendering,
13 # such that a human reviewer can quickly review the most impacted sites,
14 # rather than having to manually try sites to see if anything changes.
# In the future it might be possible to extend this to other kinds of
# differences, e.g. page load times.
20 from argparse
import RawTextHelpFormatter
21 from contextlib
import closing
24 from distutils
.spawn
import find_executable
25 from operator
import itemgetter
26 import multiprocessing
29 from cStringIO
import StringIO
34 from urllib2
import urlopen
35 from urlparse
import urlparse
37 from zipfile
import ZipFile
39 from nsfw_urls
import nsfw_urls
43 additional_content_shell_flags
= ""
44 chromium_src_root
= ""
51 print_lock
= multiprocessing
.Lock()
54 def MakeDirsIfNotExist(dir):
58 if e
.errno
!= errno
.EEXIST
:
62 def SetupPathsAndOut():
63 global chromium_src_root
, chromium_out_dir
, output_dir
64 global image_diff
, content_shell
65 chromium_src_root
= os
.path
.abspath(os
.path
.join(os
.path
.dirname(__file__
),
68 # Find out directory (might be out_linux for users of cr).
69 for out_suffix
in ["_linux", ""]:
70 out_dir
= os
.path
.join(chromium_src_root
, "out" + out_suffix
)
71 if os
.path
.exists(out_dir
):
72 chromium_out_dir
= out_dir
74 if not chromium_out_dir
:
77 this_script_name
= "real_world_impact"
78 output_dir
= os
.path
.join(chromium_out_dir
,
81 MakeDirsIfNotExist(output_dir
)
83 image_diff
= os
.path
.join(chromium_out_dir
, "Release", "image_diff")
85 if sys
.platform
== 'darwin':
86 content_shell
= os
.path
.join(chromium_out_dir
, "Release",
87 "Content Shell.app/Contents/MacOS/Content Shell")
88 elif sys
.platform
.startswith('linux'):
89 content_shell
= os
.path
.join(chromium_out_dir
, "Release",
91 elif sys
.platform
.startswith('win'):
92 content_shell
= os
.path
.join(chromium_out_dir
, "Release",
97 def CheckPrerequisites():
98 if not find_executable("wget"):
99 print "wget not found! Install wget and re-run this."
101 if not os
.path
.exists(image_diff
):
102 print "image_diff not found (%s)!" % image_diff
103 print "Build the image_diff target and re-run this."
105 if not os
.path
.exists(content_shell
):
106 print "Content shell not found (%s)!" % content_shell
107 print "Build Release/content_shell and re-run this."
112 def PickSampleUrls():
114 data_dir
= os
.path
.join(output_dir
, "data")
115 MakeDirsIfNotExist(data_dir
)
117 # Download Alexa top 1,000,000 sites
118 # TODO(johnme): Should probably update this when it gets too stale...
119 csv_path
= os
.path
.join(data_dir
, "top-1m.csv")
120 if not os
.path
.exists(csv_path
):
121 print "Downloading list of top 1,000,000 sites from Alexa..."
122 csv_url
= "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
123 with
closing(urlopen(csv_url
)) as stream
:
124 ZipFile(StringIO(stream
.read())).extract("top-1m.csv", data_dir
)
126 bad_urls_path
= os
.path
.join(data_dir
, "bad_urls.txt")
127 if os
.path
.exists(bad_urls_path
):
128 with
open(bad_urls_path
) as f
:
129 bad_urls
= set(f
.read().splitlines())
133 # See if we've already selected a sample of size num_sites (this way, if you
134 # call this script with arguments "before N" then "after N", where N is the
135 # same number, we'll use the same sample, as expected!).
136 urls_path
= os
.path
.join(data_dir
, "%06d_urls.txt" % num_sites
)
137 if not os
.path
.exists(urls_path
):
138 if action
== 'compare':
139 print ("Error: you must run 'before %d' and 'after %d' before "
140 "running 'compare %d'") % (num_sites
, num_sites
, num_sites
)
142 print "Picking %d sample urls..." % num_sites
144 # TODO(johnme): For now this just gets the top num_sites entries. In future
145 # this should pick a weighted random sample. For example, it could fit a
146 # power-law distribution, which is a good model of website popularity
147 # (http://www.useit.com/alertbox/9704b.html).
149 remaining_num_sites
= num_sites
150 with
open(csv_path
) as f
:
152 if remaining_num_sites
<= 0:
154 remaining_num_sites
-= 1
155 hostname
= entry
.strip().split(',')[1]
156 if not '/' in hostname
: # Skip Alexa 1,000,000 entries that have paths.
157 url
= "http://%s/" % hostname
158 if not url
in bad_urls
:
160 # Don't write these to disk yet; we'll do that in SaveWorkingUrls below
161 # once we have tried to download them and seen which ones fail.
163 with
open(urls_path
) as f
:
164 urls
= [u
for u
in f
.read().splitlines() if not u
in bad_urls
]
def SaveWorkingUrls():
  """Writes the current url sample to disk, one url per line.

  The target is "<output_dir>/data/<num_sites zero-padded to 6 digits>_urls.txt"
  (built from the module-level output_dir and num_sites). If that file already
  exists it is left untouched; otherwise every entry of the module-level urls
  list is written to it followed by a newline.
  """
  # TODO(johnme): Update the list if a url that used to work goes offline.
  sample_file = os.path.join(output_dir, "data", "%06d_urls.txt" % num_sites)
  if os.path.exists(sample_file):
    # A sample of this size was already saved; keep it as is.
    return
  with open(sample_file, 'w') as out:
    for working_url in urls:
      out.write(working_url + '\n')
176 def PrintElapsedTime(elapsed
, detail
=""):
177 elapsed
= round(elapsed
* 10) / 10.0
180 print "Took %dm%.1fs" % (m
, s
), detail
183 def DownloadStaticCopyTask(url
):
184 url_parts
= urlparse(url
)
185 host_dir
= os
.path
.join(output_dir
, "data", url_parts
.hostname
)
  # Use wget for now, as it does a reasonable job of spidering page
  # dependencies (e.g. CSS, JS, images).
190 subprocess
.check_call(["wget",
191 "--execute", "robots=off",
192 ("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS "
193 "X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C"
194 "hrome/32.0.1700.14 Safari/537.36"),
197 "--adjust-extension",
199 "--directory-prefix=" + host_dir
,
200 "--force-directories",
201 "--default-page=index.html",
202 "--no-check-certificate",
203 "--timeout=5", # 5s timeout
207 except KeyboardInterrupt:
209 except subprocess
.CalledProcessError
:
210 # Ignoring these for now, as some sites have issues with their subresources
211 # yet still produce a renderable index.html
212 pass #success = False
214 download_path
= os
.path
.join(host_dir
, url_parts
.hostname
, "index.html")
215 if not os
.path
.exists(download_path
):
219 print "Downloaded:", url
222 print "Failed to download:", url
227 def DownloadStaticCopies():
231 url_parts
= urlparse(url
)
232 host_dir
= os
.path
.join(output_dir
, "data", url_parts
.hostname
)
233 download_path
= os
.path
.join(host_dir
, url_parts
.hostname
, "index.html")
234 if not os
.path
.exists(download_path
):
238 print "Downloading static copies of %d sites..." % len(new_urls
)
239 start_time
= time
.time()
241 results
= multiprocessing
.Pool(20).map(DownloadStaticCopyTask
, new_urls
)
242 failed_urls
= [new_urls
[i
] for i
,ret
in enumerate(results
) if not ret
]
244 bad_urls_path
= os
.path
.join(output_dir
, "data", "bad_urls.txt")
245 with
open(bad_urls_path
, 'a') as f
:
246 f
.writelines(u
+ '\n' for u
in failed_urls
)
247 failed_urls_set
= set(failed_urls
)
248 urls
= [u
for u
in urls
if u
not in failed_urls_set
]
250 PrintElapsedTime(time
.time() - start_time
)
256 url_parts
= urlparse(url
)
257 host_dir
= os
.path
.join(output_dir
, "data", url_parts
.hostname
)
258 html_path
= os
.path
.join(host_dir
, url_parts
.hostname
, "index.html")
261 nojs_path
= os
.path
.join(host_dir
, url_parts
.hostname
, "index-nojs.html")
262 if not os
.path
.exists(nojs_path
):
263 with
open(html_path
) as f
:
267 # These aren't intended to be XSS safe :)
268 block_tags
= (r
'<\s*(script|object|video|audio|iframe|frameset|frame)'
269 r
'\b.*?<\s*\/\s*\1\s*>')
270 block_attrs
= r
'\s(onload|onerror)\s*=\s*(\'[^
\']*\'|
"[^"]*|\S
*)'
271 html = re.sub(block_tags, '', html, flags=re.I|re.S)
272 html = re.sub(block_attrs, '', html, flags=re.I)
273 with open(nojs_path, 'w
') as f:
275 html_path = nojs_path
277 start_time = time.time()
279 with open(os.devnull, "w") as fnull:
280 p = subprocess.Popen([content_shell,
282 additional_content_shell_flags,
283 # The single quote is not a typo, it's a separator
!
284 html_path
+ "'--pixel-test"
287 stdout
=subprocess
.PIPE
,
289 result
= p
.stdout
.read()
290 PNG_START
= b
"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"
291 PNG_END
= b
"\x49\x45\x4E\x44\xAE\x42\x60\x82"
293 start
= result
.index(PNG_START
)
294 end
= result
.rindex(PNG_END
) + 8
298 png_path
= os
.path
.join(output_dir
, action
, url_parts
.hostname
+ ".png")
299 MakeDirsIfNotExist(os
.path
.dirname(png_path
))
300 with
open(png_path
, 'wb') as f
:
301 f
.write(result
[start
:end
])
302 elapsed_time
= (time
.time() - start_time
, url
)
307 print "Taking screenshots of %d pages..." % len(urls
)
308 start_time
= time
.time()
310 results
= multiprocessing
.Pool().map(RunDrtTask
, urls
, 1)
312 max_time
, url
= max(t
for t
in results
if t
)
313 elapsed_detail
= "(slowest: %.2fs on %s)" % (max_time
, url
)
314 PrintElapsedTime(time
.time() - start_time
, elapsed_detail
)
317 def CompareResultsTask(url
):
318 url_parts
= urlparse(url
)
319 before_path
= os
.path
.join(output_dir
, "before", url_parts
.hostname
+ ".png")
320 after_path
= os
.path
.join(output_dir
, "after", url_parts
.hostname
+ ".png")
321 diff_path
= os
.path
.join(output_dir
, "diff", url_parts
.hostname
+ ".png")
322 MakeDirsIfNotExist(os
.path
.join(output_dir
, "diff"))
324 # TODO(johnme): Don't hardcode "real_world_impact".
325 red_path
= ("data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA"
326 "ABAAEAAAICRAEAOw==")
328 before_exists
= os
.path
.exists(before_path
)
329 after_exists
= os
.path
.exists(after_path
)
330 if not before_exists
and not after_exists
:
331 # TODO(johnme): Make this more informative.
332 return (-100, url
, red_path
)
333 if before_exists
!= after_exists
:
334 # TODO(johnme): Make this more informative.
335 return (200, url
, red_path
)
337 # Get percentage difference.
338 p
= subprocess
.Popen([image_diff
, "--histogram",
339 before_path
, after_path
],
341 stdout
=subprocess
.PIPE
)
342 output
,_
= p
.communicate()
343 if p
.returncode
== 0:
344 return (0, url
, before_path
)
345 diff_match
= re
.match(r
'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
346 'exact diff: (\d+\.\d{2})% (?:passed|failed)', output
)
348 raise Exception("image_diff output format changed")
349 histogram_diff
= float(diff_match
.group(1))
350 exact_diff
= float(diff_match
.group(2))
351 combined_diff
= max(histogram_diff
+ exact_diff
/ 8, 0.001)
354 subprocess
.call([image_diff
, "--diff", before_path
, after_path
, diff_path
])
355 return (combined_diff
, url
, diff_path
)
358 def CompareResults():
359 print "Running image_diff on %d pages..." % len(urls
)
360 start_time
= time
.time()
362 results
= multiprocessing
.Pool().map(CompareResultsTask
, urls
)
363 results
.sort(key
=itemgetter(0), reverse
=True)
365 PrintElapsedTime(time
.time() - start_time
)
367 now
= datetime
.datetime
.today().strftime("%a %Y-%m-%d %H:%M")
368 html_start
= textwrap
.dedent("""\
372 <title>Real World Impact report %s</title>
374 var togglingImg = null;
375 var toggleTimer = null;
379 var newFolder = before ? "before" : "after";
380 togglingImg.src = togglingImg.src.replace(/before|after|diff/, newFolder);
382 toggleTimer = setTimeout(toggle, 300);
385 function startToggle(img) {
389 img.origSrc = img.src;
392 function stopToggle(img) {
393 clearTimeout(toggleTimer);
394 img.src = img.origSrc;
397 document.onkeydown = function(e) {
398 e = e || window.event;
399 var keyCode = e.keyCode || e.which;
403 newFolder = "before"; break;
405 newFolder = "after"; break;
407 newFolder = "diff"; break;
411 var imgs = document.getElementsByTagName("img");
412 for (var i = 0; i < imgs.length; i++) {
413 imgs[i].src = imgs[i].src.replace(/before|after|diff/, newFolder);
422 font-family: monospace;
430 border: 10px solid red;
436 body:not(.details-supported) details {
443 if ('open' in document.createElement('details'))
444 document.body.className = "details-supported";
446 <!--<div class="nsfw-spacer"></div>-->
447 <p class="nsfw-warning">Warning: sites below are taken from the Alexa top %d
449 <!--<div class="nsfw-spacer"></div>-->
450 <h1>Real World Impact report %s</h1>
451 <p class="info">Press 1, 2 and 3 to switch between before, after and diff
452 screenshots respectively; or hover over the images to rapidly alternate
453 between before and after.</p>
454 """ % (now
, num_sites
, now
))
457 <h2>No difference on <a href="%s">%s</a>.</h2>
461 <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
462 <img src="%s" width="800" height="600"
463 onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
466 html_nsfw_diff_row
= """\
467 <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
469 <summary>This site may be NSFW. Click to expand/collapse.</summary>
470 <img src="%s" width="800" height="600"
471 onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
475 html_end
= textwrap
.dedent("""\
479 html_path
= os
.path
.join(output_dir
, "diff.html")
480 with
open(html_path
, 'w') as f
:
482 for (diff_float
, url
, diff_path
) in results
:
483 diff_path
= os
.path
.relpath(diff_path
, output_dir
)
485 f
.write(html_same_row
% (url
, url
))
486 elif url
in nsfw_urls
:
487 f
.write(html_nsfw_diff_row
% (diff_float
, url
, url
, diff_path
))
489 f
.write(html_diff_row
% (diff_float
, url
, url
, diff_path
))
492 webbrowser
.open_new_tab("file://" + html_path
)
496 global num_sites
, action
, allow_js
, additional_content_shell_flags
498 parser
= argparse
.ArgumentParser(
499 formatter_class
=RawTextHelpFormatter
,
500 description
="Compare the real world impact of a content shell change.",
501 epilog
=textwrap
.dedent("""\
503 1. Build content_shell in out/Release without any changes.
504 2. Run: %s before [num sites to test (default %d)].
506 a. Apply your controversial patch and rebuild content_shell.
507 b. Pass --additional_flags="--enable_your_flag" in step 4.
508 4. Run: %s after [num sites to test (default %d)].
509 5. Run: %s compare [num sites to test (default %d)].
510 This will open the results in your web browser.
511 """ % (argv
[0], num_sites
, argv
[0], num_sites
, argv
[0], num_sites
)))
512 parser
.add_argument("--allow_js", help="Don't disable Javascript",
514 parser
.add_argument("--additional_flags",
515 help="Additional flags to pass to content shell")
516 parser
.add_argument("action",
517 help=textwrap
.dedent("""\
519 download - Just download the sites.
520 before - Run content shell and record 'before' result.
521 after - Run content shell and record 'after' result.
522 compare - Compare before and after results.
524 choices
=["download", "before", "after", "compare"])
525 parser
.add_argument("num_sites",
526 help="Number of sites (default %s)" % num_sites
,
527 type=int, default
=num_sites
, nargs
='?')
528 args
= parser
.parse_args()
533 num_sites
= args
.num_sites
536 allow_js
= args
.allow_js
538 if (args
.additional_flags
):
539 additional_content_shell_flags
= args
.additional_flags
541 if not SetupPathsAndOut() or not CheckPrerequisites() or not PickSampleUrls():
544 if action
== 'compare':
547 DownloadStaticCopies()
548 if action
!= 'download':
if __name__ == '__main__':
  # Script entry point: hand the raw command line to main() and use whatever
  # it returns as the process exit status.
  exit_status = main(sys.argv)
  sys.exit(exit_status)