1 # Copyright 2013 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
from itertools import groupby
from operator import itemgetter
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths
# The result of rendering one page: the HTTP status of the render, the hrefs
# of the links found on the page, the anchors (ids and names) the page
# defines, and the links that contain an anchor component.
Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
18 def _SplitAnchor(url
):
19 components
= urlsplit(url
)
20 return components
.path
, components
.fragment
def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    # A malformed page is treated as rendered-but-empty, not as an error.
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  # Top-level paths (no '/') resolve links relative to the server root.
  base = path.rsplit('/', 1)[0] if '/' in path else ''
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      # A pure fragment refers to an anchor on the current page.
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link.lstrip('/')
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if anchor:
        anchor_refs.append(link)
      edges.append(link)

  return Page(200, edges, anchors, anchor_refs)
84 class _ContentParser(HTMLParser
):
85 '''Parse an html file pulling out all links and anchor_refs, where an
86 anchor_ref is a link that contains an anchor.
90 HTMLParser
.__init
__(self
)
94 def handle_starttag(self
, tag
, raw_attrs
):
95 attrs
= dict(raw_attrs
)
98 # Handle special cases for href's that: start with a space, contain
99 # just a '.' (period), contain python templating code, are an absolute
100 # url, are a zip file, or execute javascript on the page.
101 href
= attrs
.get('href', '').strip()
102 if href
and not href
== '.' and not '{{' in href
:
103 if not urlsplit(href
).scheme
in ('http', 'https'):
104 if not href
.endswith('.zip') and not 'javascript:' in href
:
105 self
.links
.append(href
)
108 self
.anchors
.add(attrs
['id'])
109 if attrs
.get('name'):
110 self
.anchors
.add(attrs
['name'])
class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those with
  a target page that 404s or contain an anchor that doesn't exist, or pages that
  have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    # Maps a URL to the Page it rendered to; unrendered URLs look like 404s.
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # Pages with no inbound links that should never be reported as orphans.
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    # Paths that are allowed to redirect without being reported as broken.
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    top_level_directories = (
      ('docs/templates/public', ''),
      ('docs/static', 'static/'),
      ('docs/examples', 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          print(url, ', a url derived from the path', dirpath +
              ', resulted in a', self._pages[url].status)

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirection until a non-redirectable page is reached. Start at
    |starting_url| which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached including |starting_url|. If no redirection
    occurred, returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      # Give up on chains longer than |limit| (likely a redirect loop).
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        # A fragment-only link targets an anchor on the current page.
        if fragment == 'top' or fragment == '':
          continue
        if not fragment in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if not fragment in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages that are connected to the pages at
    |seed_url|s. Return the links that are valid on the server but are not
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        # Follow redirects and continue the search from the final target.
        redirected_page = self._FollowRedirections(item)[1]
        if not redirected_page is None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        [url for url, page in self._pages.iteritems() if page.status == 200])

    return [url for url in all_urls - found if url.endswith('.html')]
def StringifyBrokenLinks(broken_links):
  '''Prints out broken links in a more readable format.

  |broken_links| is a list of 4-tuples of (status, url, target, notes), as
  produced by LinkErrorDetector.GetBrokenLinks.
  '''
  def fixed_width(string, width):
    # Right-pad |string| with spaces out to |width| columns.
    return "%s%s" % (string, (width - len(string)) * ' ')

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=target), target):
    # Materialize the group; a groupby grouper cannot be sized or indexed.
    links = list(links)
    # Compress a long run of links broken to the same target into one line.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)