# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
from itertools import groupby
from operator import itemgetter
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths
from path_util import AssertIsDirectory


Page = namedtuple('Page', 'status, links, anchors, anchor_refs')


def _SplitAnchor(url):
  components = urlsplit(url)
  return components.path, components.fragment
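
# For example, _SplitAnchor('reference.html#method-get') returns
# ('reference.html', 'method-get'), and _SplitAnchor('#top') returns
# ('', 'top').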


def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  if '/' in path:
    base, _ = path.rsplit('/', 1)
  else:
    base = ''
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)
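
# As a hypothetical example, processing a page 'extensions/index.html' whose
# body contains <a href="getstarted.html#top"> and <h2 id="overview"> would
# yield something like:
#   Page(200, [], set(['overview']), ['extensions/getstarted.html#top'])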


class _ContentParser(HTMLParser):
  '''Parse an html file pulling out all links and anchor_refs, where an
  anchor_ref is a link that contains an anchor.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # Handle special cases for href's that: start with a space, contain
      # just a '.' (period), contain python templating code, are an absolute
      # url, are a zip file, or execute javascript on the page.
      href = attrs.get('href', '').strip()
      if href and not href == '.' and not '{{' in href:
        if not urlsplit(href).scheme in ('http', 'https'):
          if not href.endswith('.zip') and not 'javascript:' in href:
            self.links.append(href)

    if attrs.get('id'):
      self.anchors.add(attrs['id'])
    if attrs.get('name'):
      self.anchors.add(attrs['name'])
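
# For instance, feeding '<a href="tabs.html#method-get">' to a _ContentParser
# appends 'tabs.html#method-get' to |links|, while '<h2 id="overview">' adds
# 'overview' to |anchors|; an href of 'http://example.com' or 'code.zip' is
# skipped.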


class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those
  with a target page that 404s or contains an anchor that doesn't exist, or
  pages that have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    AssertIsDirectory(public_path)
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
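    # URLs that were never rendered default to a 404 |Page|, so a lookup of a
    # nonexistent target in self._pages reads as a broken (404) link.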
    self._root_pages = frozenset(root_pages)
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    top_level_directories = (
      ('docs/templates/public/', ''),
      ('docs/static/', 'static/'),
      ('docs/examples/', 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          print('%s, a url derived from the path %s, resulted in a %s' % (
              url, dirpath, self._pages[url].status))

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirections until a non-redirecting page is reached. Start at
    |starting_url|, which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached including |starting_url|. If no redirection
    occurred, returns (None, None, None).
    '''
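    # For example, if rendering 'apps' returned a 302 with a Location header
    # of 'apps/about_apps.html' (a hypothetical redirect), this would return
    # (302, 'apps/about_apps.html', ['apps', 'apps/about_apps.html']).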
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.
    '''
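    # Each broken link is reported as a (status, source url, link, note)
    # tuple, which is the shape StringifyBrokenLinks() expects.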
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        if fragment == 'top' or fragment == '':
          continue
        if not fragment in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page.
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if not fragment in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages that are connected to the
    |root_pages|. Returns the links that are valid on the server but are not
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached
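
    # Breadth-first search over the link graph, following redirections, to
    # mark every page reachable from the root (and always-detached) pages.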
    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        redirected_page = self._FollowRedirections(item)[1]
        if redirected_page is not None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        [url for url, page in self._pages.iteritems() if page.status == 200])

    return [url for url in all_urls - found if url.endswith('.html')]


def StringifyBrokenLinks(broken_links):
  '''Returns the list of |broken_links| formatted as a readable string.
  '''
  def fixed_width(string, width):
    return "%s%s" % (string, (width - len(string)) * ' ')

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=target), target):
    links = list(links)
    # Compress messages: when more than 50 links point at the same missing
    # target, report the group once rather than once per link.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)
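
# A minimal usage sketch (hypothetical; the real doc server wires the file
# system and renderer up elsewhere). |file_system|, |renderer|, and the root
# page URLs below are assumptions for illustration:
#
#   detector = LinkErrorDetector(file_system,
#                                renderer,
#                                'docs/templates/public/',
#                                root_pages=('extensions/index.html',
#                                            'apps/about_apps.html'))
#   print(StringifyBrokenLinks(detector.GetBrokenLinks()))
#   for orphan in detector.GetOrphanedPages():
#     print(orphan)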