tools/perf/profile_creators/profile_safe_url_generator.py

   1 # Copyright 2015 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 import HTMLParser
   6 import json
   7 import logging
   8 import urllib2
   9 import urlparse
  10
  11
  12 class _HRefParser(HTMLParser.HTMLParser):
  13   def __init__(self):
  14     HTMLParser.HTMLParser.__init__(self)
  15     self.hrefs = []
  16
  17   def handle_starttag(self, tag, attrs):
  18     if tag == "a":
  19       for name, value in attrs:
  20         if name == "href":
  21           self.hrefs.append(value)
  22
  23
  24 def _AbsoluteUrlHasSaneScheme(absolute_url):
  25   if len(absolute_url) < 4:
  26     return False
  27   return absolute_url[0:4] == 'http'
  28
  29
  30 def GenerateSafeUrls():
  31   """Prints a list of safe urls.
  32
  33   Generates a safe list of urls from a seed list. Each href in the HTML
  34   fetched from the url from the seed list is placed into the safe list. The
  35   safe list contains unsanitized urls.
  36   """
  37   # A list of websites whose hrefs are unlikely to link to sites that contain
  38   # malware.
  39   seed_urls = [
  40     "http://www.cnn.com",
  41     "https://www.youtube.com",
  42     "https://www.facebook.com",
  43     "https://www.twitter.com",
  44     "https://www.yahoo.com",
  45     "https://www.amazon.com",
  46     "https://www.wikipedia.com",
  47     "https://www.bing.com",
  48     "https://www.dailymotion.com",
  49     "https://www.stackoverflow.com",
  50     "https://www.google.com/#q=dumpling",
  51     "http://www.baidu.com/s?wd=rice",
  52     "http://www.baidu.com/s?wd=cow",
  53     "https://www.google.com/#q=fox",
  54     "http://www.yahoo.co.jp/",
  55     "http://www.yandex.ru/",
  56     "https://www.imdb.com/",
  57     "http://www.huffingtonpost.com/",
  58     "https://www.deviantart.com/",
  59     "http://www.wsj.com/",
  60   ]
  61
  62   safe_urls = set()
  63
  64   for url in seed_urls:
  65     try:
  66       # Fetch and parse the HTML.
  67       response = urllib2.urlopen(url)
  68       encoding = response.headers.getparam('charset')
  69       html = response.read()
  70       if encoding:
  71         html = html.decode(encoding)
  72
  73       parser = _HRefParser()
  74       parser.feed(html)
  75     except:
  76       logging.exception("Error fetching or parsing url: %s", url)
  77       raise
  78
  79     # Looks for all hrefs.
  80     for relative_url in parser.hrefs:
  81       if not relative_url:
  82         continue
  83
  84       absolute_url = urlparse.urljoin(url, relative_url)
  85       if not _AbsoluteUrlHasSaneScheme(absolute_url):
  86         continue
  87       safe_urls.add(absolute_url)
  88
  89   # Sort the urls, to make them easier to view in bulk.
  90   safe_urls_list = list(safe_urls)
  91   safe_urls_list.sort()
  92
  93   print json.dumps(safe_urls_list, indent=2, separators=(",", ":"))
  94
  95 if __name__ == "__main__":
  96   GenerateSafeUrls()