tools/real_world_impact/real_world_impact.py

   1 #!/usr/bin/env python
   2 # Copyright 2014 The Chromium Authors. All rights reserved.
   3 # Use of this source code is governed by a BSD-style license that can be
   4 # found in the LICENSE file.
   5
   6 # Tool for seeing the real world impact of a patch.
   7 #
   8 # Layout Tests can tell you whether something has changed, but this can help
   9 # you determine whether a subtle/controversial change is beneficial or not.
  10 #
  11 # It dumps the rendering of a large number of sites, both with and without a
  12 # patch being evaluated, then sorts them by greatest difference in rendering,
  13 # such that a human reviewer can quickly review the most impacted sites,
  14 # rather than having to manually try sites to see if anything changes.
  15 #
  16 # In future it might be possible to extend this to other kinds of differences,
  17 # e.g. page load times.
  18
  19 import argparse
  20 from argparse import RawTextHelpFormatter
  21 from contextlib import closing
  22 import datetime
  23 import errno
  24 from distutils.spawn import find_executable
  25 from operator import itemgetter
  26 import multiprocessing
  27 import os
  28 import re
  29 from cStringIO import StringIO
  30 import subprocess
  31 import sys
  32 import textwrap
  33 import time
  34 from urllib2 import urlopen
  35 from urlparse import urlparse
  36 import webbrowser
  37 from zipfile import ZipFile
  38
  39 from nsfw_urls import nsfw_urls
  40
# Sub-command chosen on the command line ("download", "before", "after" or
# "compare"); assigned by main().
action = None
# When False, RunDrtTask renders a copy of each page with script, object,
# video, audio and frame tags stripped out.
allow_js = False
# Extra argument handed to content_shell; comes from --additional_flags.
additional_content_shell_flags = ""
# The five paths below are filled in by SetupPathsAndOut().
chromium_src_root = ""
chromium_out_dir = ""
image_diff = ""
content_shell = ""
output_dir = ""
# How many of the top-ranked sites to sample; overridden by the num_sites
# command line argument.
num_sites = 100
# Sample urls for this run, populated by PickSampleUrls().
urls = []
# Held while printing from the download worker processes.
print_lock = multiprocessing.Lock()
  52
  53
  54 def MakeDirsIfNotExist(dir):
  55   try:
  56     os.makedirs(dir)
  57   except OSError as e:
  58     if e.errno != errno.EEXIST:
  59       raise
  60
  61
  62 def SetupPathsAndOut():
  63   global chromium_src_root, chromium_out_dir, output_dir
  64   global image_diff, content_shell
  65   chromium_src_root = os.path.abspath(os.path.join(os.path.dirname(__file__),
  66                                                    os.pardir,
  67                                                    os.pardir))
  68   # Find out directory (might be out_linux for users of cr).
  69   for out_suffix in ["_linux", ""]:
  70     out_dir = os.path.join(chromium_src_root, "out" + out_suffix)
  71     if os.path.exists(out_dir):
  72       chromium_out_dir = out_dir
  73       break
  74   if not chromium_out_dir:
  75     return False
  76
  77   this_script_name = "real_world_impact"
  78   output_dir = os.path.join(chromium_out_dir,
  79                             "Release",
  80                             this_script_name)
  81   MakeDirsIfNotExist(output_dir)
  82
  83   image_diff = os.path.join(chromium_out_dir, "Release", "image_diff")
  84
  85   if sys.platform == 'darwin':
  86     content_shell = os.path.join(chromium_out_dir, "Release",
  87                     "Content Shell.app/Contents/MacOS/Content Shell")
  88   elif sys.platform.startswith('linux'):
  89     content_shell = os.path.join(chromium_out_dir, "Release",
  90                     "content_shell")
  91   elif sys.platform.startswith('win'):
  92     content_shell = os.path.join(chromium_out_dir, "Release",
  93                     "content_shell.exe")
  94   return True
  95
  96
  97 def CheckPrerequisites():
  98   if not find_executable("wget"):
  99     print "wget not found! Install wget and re-run this."
 100     return False
 101   if not os.path.exists(image_diff):
 102     print "image_diff not found (%s)!" % image_diff
 103     print "Build the image_diff target and re-run this."
 104     return False
 105   if not os.path.exists(content_shell):
 106     print "Content shell not found (%s)!" % content_shell
 107     print "Build Release/content_shell and re-run this."
 108     return False
 109   return True
 110
 111
 112 def PickSampleUrls():
 113   global urls
 114   data_dir = os.path.join(output_dir, "data")
 115   MakeDirsIfNotExist(data_dir)
 116
 117   # Download Alexa top 1,000,000 sites
 118   # TODO(johnme): Should probably update this when it gets too stale...
 119   csv_path = os.path.join(data_dir, "top-1m.csv")
 120   if not os.path.exists(csv_path):
 121     print "Downloading list of top 1,000,000 sites from Alexa..."
 122     csv_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
 123     with closing(urlopen(csv_url)) as stream:
 124       ZipFile(StringIO(stream.read())).extract("top-1m.csv", data_dir)
 125
 126   bad_urls_path = os.path.join(data_dir, "bad_urls.txt")
 127   if os.path.exists(bad_urls_path):
 128     with open(bad_urls_path) as f:
 129       bad_urls = set(f.read().splitlines())
 130   else:
 131     bad_urls = set()
 132
 133   # See if we've already selected a sample of size num_sites (this way, if you
 134   # call this script with arguments "before N" then "after N", where N is the
 135   # same number, we'll use the same sample, as expected!).
 136   urls_path = os.path.join(data_dir, "%06d_urls.txt" % num_sites)
 137   if not os.path.exists(urls_path):
 138     if action == 'compare':
 139       print ("Error: you must run 'before %d' and 'after %d' before "
 140              "running 'compare %d'") % (num_sites, num_sites, num_sites)
 141       return False
 142     print "Picking %d sample urls..." % num_sites
 143
 144     # TODO(johnme): For now this just gets the top num_sites entries. In future
 145     # this should pick a weighted random sample. For example, it could fit a
 146     # power-law distribution, which is a good model of website popularity
 147     # (http://www.useit.com/alertbox/9704b.html).
 148     urls = []
 149     remaining_num_sites = num_sites
 150     with open(csv_path) as f:
 151       for entry in f:
 152         if remaining_num_sites <= 0:
 153           break
 154         remaining_num_sites -= 1
 155         hostname = entry.strip().split(',')[1]
 156         if not '/' in hostname:  # Skip Alexa 1,000,000 entries that have paths.
 157           url = "http://%s/" % hostname
 158           if not url in bad_urls:
 159             urls.append(url)
 160     # Don't write these to disk yet; we'll do that in SaveWorkingUrls below
 161     # once we have tried to download them and seen which ones fail.
 162   else:
 163     with open(urls_path) as f:
 164       urls = [u for u in f.read().splitlines() if not u in bad_urls]
 165   return True
 166
 167
 168 def SaveWorkingUrls():
 169   # TODO(johnme): Update the list if a url that used to work goes offline.
 170   urls_path = os.path.join(output_dir, "data", "%06d_urls.txt" % num_sites)
 171   if not os.path.exists(urls_path):
 172     with open(urls_path, 'w') as f:
 173       f.writelines(u + '\n' for u in urls)
 174
 175
 176 def PrintElapsedTime(elapsed, detail=""):
 177   elapsed = round(elapsed * 10) / 10.0
 178   m = elapsed / 60
 179   s = elapsed % 60
 180   print "Took %dm%.1fs" % (m, s), detail
 181
 182
 183 def DownloadStaticCopyTask(url):
 184   url_parts = urlparse(url)
 185   host_dir = os.path.join(output_dir, "data", url_parts.hostname)
 186   # Use wget for now, as does a reasonable job of spidering page dependencies
 187   # (e.g. CSS, JS, images).
 188   success = True
 189   try:
 190     subprocess.check_call(["wget",
 191                            "--execute", "robots=off",
 192                            ("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS "
 193                             "X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C"
 194                             "hrome/32.0.1700.14 Safari/537.36"),
 195                            "--page-requisites",
 196                            "--span-hosts",
 197                            "--adjust-extension",
 198                            "--convert-links",
 199                            "--directory-prefix=" + host_dir,
 200                            "--force-directories",
 201                            "--default-page=index.html",
 202                            "--no-check-certificate",
 203                            "--timeout=5", # 5s timeout
 204                            "--tries=2",
 205                            "--quiet",
 206                            url])
 207   except KeyboardInterrupt:
 208     success = False
 209   except subprocess.CalledProcessError:
 210     # Ignoring these for now, as some sites have issues with their subresources
 211     # yet still produce a renderable index.html
 212     pass #success = False
 213   if success:
 214     download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
 215     if not os.path.exists(download_path):
 216       success = False
 217     else:
 218       with print_lock:
 219         print "Downloaded:", url
 220   if not success:
 221     with print_lock:
 222       print "Failed to download:", url
 223     return False
 224   return True
 225
 226
 227 def DownloadStaticCopies():
 228   global urls
 229   new_urls = []
 230   for url in urls:
 231     url_parts = urlparse(url)
 232     host_dir = os.path.join(output_dir, "data", url_parts.hostname)
 233     download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
 234     if not os.path.exists(download_path):
 235       new_urls.append(url)
 236
 237   if new_urls:
 238     print "Downloading static copies of %d sites..." % len(new_urls)
 239     start_time = time.time()
 240
 241     results = multiprocessing.Pool(20).map(DownloadStaticCopyTask, new_urls)
 242     failed_urls = [new_urls[i] for i,ret in enumerate(results) if not ret]
 243     if failed_urls:
 244       bad_urls_path = os.path.join(output_dir, "data", "bad_urls.txt")
 245       with open(bad_urls_path, 'a') as f:
 246         f.writelines(u + '\n' for u in failed_urls)
 247       failed_urls_set = set(failed_urls)
 248       urls = [u for u in urls if u not in failed_urls_set]
 249
 250     PrintElapsedTime(time.time() - start_time)
 251
 252   SaveWorkingUrls()
 253
 254
 255 def RunDrtTask(url):
 256   url_parts = urlparse(url)
 257   host_dir = os.path.join(output_dir, "data", url_parts.hostname)
 258   html_path = os.path.join(host_dir, url_parts.hostname, "index.html")
 259
 260   if not allow_js:
 261     nojs_path = os.path.join(host_dir, url_parts.hostname, "index-nojs.html")
 262     if not os.path.exists(nojs_path):
 263       with open(html_path) as f:
 264         html = f.read()
 265       if not html:
 266         return False
 267       # These aren't intended to be XSS safe :)
 268       block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)'
 269                     r'\b.*?<\s*\/\s*\1\s*>')
 270       block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*|\S*)'
 271       html = re.sub(block_tags, '', html, flags=re.I|re.S)
 272       html = re.sub(block_attrs, '', html, flags=re.I)
 273       with open(nojs_path, 'w') as f:
 274         f.write(html)
 275     html_path = nojs_path
 276
 277   start_time = time.time()
 278
 279   with open(os.devnull, "w") as fnull:
 280     p = subprocess.Popen([content_shell,
 281                           "--dump-render-tree",
 282                           additional_content_shell_flags,
 283                           # The single quote is not a typo, it's a separator!
 284                           html_path + "'--pixel-test"
 285                          ],
 286                          shell=False,
 287                          stdout=subprocess.PIPE,
 288                          stderr=fnull)
 289   result = p.stdout.read()
 290   PNG_START = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"
 291   PNG_END = b"\x49\x45\x4E\x44\xAE\x42\x60\x82"
 292   try:
 293     start = result.index(PNG_START)
 294     end = result.rindex(PNG_END) + 8
 295   except ValueError:
 296     return False
 297
 298   png_path = os.path.join(output_dir, action, url_parts.hostname + ".png")
 299   MakeDirsIfNotExist(os.path.dirname(png_path))
 300   with open(png_path, 'wb') as f:
 301     f.write(result[start:end])
 302   elapsed_time = (time.time() - start_time, url)
 303   return elapsed_time
 304
 305
 306 def RunDrt():
 307   print "Taking screenshots of %d pages..." % len(urls)
 308   start_time = time.time()
 309
 310   results = multiprocessing.Pool().map(RunDrtTask, urls, 1)
 311
 312   max_time, url = max(t for t in results if t)
 313   elapsed_detail = "(slowest: %.2fs on %s)" % (max_time, url)
 314   PrintElapsedTime(time.time() - start_time, elapsed_detail)
 315
 316
 317 def CompareResultsTask(url):
 318   url_parts = urlparse(url)
 319   before_path = os.path.join(output_dir, "before", url_parts.hostname + ".png")
 320   after_path = os.path.join(output_dir, "after", url_parts.hostname + ".png")
 321   diff_path = os.path.join(output_dir, "diff", url_parts.hostname + ".png")
 322   MakeDirsIfNotExist(os.path.join(output_dir, "diff"))
 323
 324   # TODO(johnme): Don't hardcode "real_world_impact".
 325   red_path = ("data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA"
 326               "ABAAEAAAICRAEAOw==")
 327
 328   before_exists = os.path.exists(before_path)
 329   after_exists = os.path.exists(after_path)
 330   if not before_exists and not after_exists:
 331     # TODO(johnme): Make this more informative.
 332     return (-100, url, red_path)
 333   if before_exists != after_exists:
 334     # TODO(johnme): Make this more informative.
 335     return (200, url, red_path)
 336
 337   # Get percentage difference.
 338   p = subprocess.Popen([image_diff, "--histogram",
 339                         before_path, after_path],
 340                         shell=False,
 341                         stdout=subprocess.PIPE)
 342   output,_ = p.communicate()
 343   if p.returncode == 0:
 344     return (0, url, before_path)
 345   diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
 346                          'exact diff: (\d+\.\d{2})% (?:passed|failed)', output)
 347   if not diff_match:
 348     raise Exception("image_diff output format changed")
 349   histogram_diff = float(diff_match.group(1))
 350   exact_diff = float(diff_match.group(2))
 351   combined_diff = max(histogram_diff + exact_diff / 8, 0.001)
 352
 353   # Produce diff PNG.
 354   subprocess.call([image_diff, "--diff", before_path, after_path, diff_path])
 355   return (combined_diff, url, diff_path)
 356
 357
 358 def CompareResults():
 359   print "Running image_diff on %d pages..." % len(urls)
 360   start_time = time.time()
 361
 362   results = multiprocessing.Pool().map(CompareResultsTask, urls)
 363   results.sort(key=itemgetter(0), reverse=True)
 364
 365   PrintElapsedTime(time.time() - start_time)
 366
 367   now = datetime.datetime.today().strftime("%a %Y-%m-%d %H:%M")
 368   html_start = textwrap.dedent("""\
 369   <!DOCTYPE html>
 370   <html>
 371   <head>
 372   <title>Real World Impact report %s</title>
 373   <script>
 374     var togglingImg = null;
 375     var toggleTimer = null;
 376
 377     var before = true;
 378     function toggle() {
 379       var newFolder = before ? "before" : "after";
 380       togglingImg.src = togglingImg.src.replace(/before|after|diff/, newFolder);
 381       before = !before;
 382       toggleTimer = setTimeout(toggle, 300);
 383     }
 384
 385     function startToggle(img) {
 386       before = true;
 387       togglingImg = img;
 388       if (!img.origSrc)
 389         img.origSrc = img.src;
 390       toggle();
 391     }
 392     function stopToggle(img) {
 393       clearTimeout(toggleTimer);
 394       img.src = img.origSrc;
 395     }
 396
 397     document.onkeydown = function(e) {
 398       e = e || window.event;
 399       var keyCode = e.keyCode || e.which;
 400       var newFolder;
 401       switch (keyCode) {
 402         case 49: //'1'
 403           newFolder = "before"; break;
 404         case 50: //'2'
 405           newFolder = "after"; break;
 406         case 51: //'3'
 407           newFolder = "diff"; break;
 408         default:
 409           return;
 410       }
 411       var imgs = document.getElementsByTagName("img");
 412       for (var i = 0; i < imgs.length; i++) {
 413         imgs[i].src = imgs[i].src.replace(/before|after|diff/, newFolder);
 414       }
 415     };
 416   </script>
 417   <style>
 418     h1 {
 419       font-family: sans;
 420     }
 421     h2 {
 422       font-family: monospace;
 423       white-space: pre;
 424     }
 425     .nsfw-spacer {
 426       height: 50vh;
 427     }
 428     .nsfw-warning {
 429       background: yellow;
 430       border: 10px solid red;
 431     }
 432     .info {
 433       font-size: 1.2em;
 434       font-style: italic;
 435     }
 436     body:not(.details-supported) details {
 437       display: none;
 438     }
 439   </style>
 440   </head>
 441   <body>
 442     <script>
 443     if ('open' in document.createElement('details'))
 444       document.body.className = "details-supported";
 445     </script>
 446     <!--<div class="nsfw-spacer"></div>-->
 447     <p class="nsfw-warning">Warning: sites below are taken from the Alexa top %d
 448     and may be NSFW.</p>
 449     <!--<div class="nsfw-spacer"></div>-->
 450     <h1>Real World Impact report %s</h1>
 451     <p class="info">Press 1, 2 and 3 to switch between before, after and diff
 452     screenshots respectively; or hover over the images to rapidly alternate
 453     between before and after.</p>
 454   """ % (now, num_sites, now))
 455
 456   html_same_row = """\
 457   <h2>No difference on <a href="%s">%s</a>.</h2>
 458   """
 459
 460   html_diff_row = """\
 461   <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
 462   <img src="%s" width="800" height="600"
 463        onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
 464   """
 465
 466   html_nsfw_diff_row = """\
 467   <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
 468   <details>
 469     <summary>This site may be NSFW. Click to expand/collapse.</summary>
 470     <img src="%s" width="800" height="600"
 471          onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
 472   </details>
 473   """
 474
 475   html_end = textwrap.dedent("""\
 476   </body>
 477   </html>""")
 478
 479   html_path = os.path.join(output_dir, "diff.html")
 480   with open(html_path, 'w') as f:
 481     f.write(html_start)
 482     for (diff_float, url, diff_path) in results:
 483       diff_path = os.path.relpath(diff_path, output_dir)
 484       if diff_float == 0:
 485         f.write(html_same_row % (url, url))
 486       elif url in nsfw_urls:
 487         f.write(html_nsfw_diff_row % (diff_float, url, url, diff_path))
 488       else:
 489         f.write(html_diff_row % (diff_float, url, url, diff_path))
 490     f.write(html_end)
 491
 492   webbrowser.open_new_tab("file://" + html_path)
 493
 494
 495 def main(argv):
 496   global num_sites, action, allow_js, additional_content_shell_flags
 497
 498   parser = argparse.ArgumentParser(
 499       formatter_class=RawTextHelpFormatter,
 500       description="Compare the real world impact of a content shell change.",
 501       epilog=textwrap.dedent("""\
 502           Example usage:
 503             1. Build content_shell in out/Release without any changes.
 504             2. Run: %s before [num sites to test (default %d)].
 505             3. Either:
 506                  a. Apply your controversial patch and rebuild content_shell.
 507                  b. Pass --additional_flags="--enable_your_flag" in step 4.
 508             4. Run: %s after [num sites to test (default %d)].
 509             5. Run: %s compare [num sites to test (default %d)].
 510                This will open the results in your web browser.
 511           """ % (argv[0], num_sites, argv[0], num_sites, argv[0], num_sites)))
 512   parser.add_argument("--allow_js", help="Don't disable Javascript",
 513                       action="store_true")
 514   parser.add_argument("--additional_flags",
 515                       help="Additional flags to pass to content shell")
 516   parser.add_argument("action",
 517                       help=textwrap.dedent("""\
 518                         Action to perform.
 519                           download - Just download the sites.
 520                           before - Run content shell and record 'before' result.
 521                           after - Run content shell and record 'after' result.
 522                           compare - Compare before and after results.
 523                       """),
 524                       choices=["download", "before", "after", "compare"])
 525   parser.add_argument("num_sites",
 526                       help="Number of sites (default %s)" % num_sites,
 527                       type=int, default=num_sites, nargs='?')
 528   args = parser.parse_args()
 529
 530   action = args.action
 531
 532   if (args.num_sites):
 533     num_sites = args.num_sites
 534
 535   if (args.allow_js):
 536     allow_js = args.allow_js
 537
 538   if (args.additional_flags):
 539     additional_content_shell_flags = args.additional_flags
 540
 541   if not SetupPathsAndOut() or not CheckPrerequisites() or not PickSampleUrls():
 542     return 1
 543
 544   if action == 'compare':
 545     CompareResults()
 546   else:
 547     DownloadStaticCopies()
 548     if action != 'download':
 549       RunDrt()
 550   return 0
 551
 552
 553 if __name__ == '__main__':
 554   sys.exit(main(sys.argv))