Disable view source for Developer Tools.
[chromium-blink-merge.git] / chrome / common / extensions / docs / server2 / link_error_detector.py
blobb495f319e9b37a0fce8404151bd15b570792f299
1 # Copyright 2013 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 from collections import defaultdict, deque, namedtuple
6 from HTMLParser import HTMLParser, HTMLParseError
7 from itertools import groupby
8 from operator import itemgetter
9 import posixpath
10 from urlparse import urlsplit
12 from file_system_util import CreateURLsFromPaths
# The result of rendering one page: the http status plus the links, anchors
# (ids/names), and anchor-bearing links extracted from its html.
Page = namedtuple('Page', ['status', 'links', 'anchors', 'anchor_refs'])
18 def _SplitAnchor(url):
19 components = urlsplit(url)
20 return components.path, components.fragment
def _Process(path, renderer):
  '''Render the page at |path| using |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  response = renderer(path)
  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  parser = _ContentParser()
  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    # Unparseable html is treated as a page with no links or anchors.
    return Page(200, (), (), ())

  base = path.rsplit('/', 1)[0] if '/' in path else ''
  edges = []
  anchor_refs = []

  # Make each link absolute and classify it as an edge or an anchor_ref.
  for link in parser.links:
    # Files like experimental_history.html are refered to with the URL
    # experimental.history.html.
    if '/' in link:
      head, last = link.rsplit('/', 1)
    else:
      head, last = '', link
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    # A pure fragment refers to an anchor on this same page.
    if link.startswith('#'):
      anchor_refs.append(link)
      continue

    if link.startswith('/'):
      link = link[1:]
    else:
      link = posixpath.normpath('%s/%s' % (base, link))

    (anchor_refs if '#' in link else edges).append(link)

  return Page(200, edges, parser.anchors, anchor_refs)
84 class _ContentParser(HTMLParser):
85 '''Parse an html file pulling out all links and anchor_refs, where an
86 anchor_ref is a link that contains an anchor.
87 '''
89 def __init__(self):
90 HTMLParser.__init__(self)
91 self.links = []
92 self.anchors = set()
94 def handle_starttag(self, tag, raw_attrs):
95 attrs = dict(raw_attrs)
97 if tag == 'a':
98 # Handle special cases for href's that: start with a space, contain
99 # just a '.' (period), contain python templating code, are an absolute
100 # url, are a zip file, or execute javascript on the page.
101 href = attrs.get('href', '').strip()
102 if href and not href == '.' and not '{{' in href:
103 if not urlsplit(href).scheme in ('http', 'https'):
104 if not href.endswith('.zip') and not 'javascript:' in href:
105 self.links.append(href)
107 if attrs.get('id'):
108 self.anchors.add(attrs['id'])
109 if attrs.get('name'):
110 self.anchors.add(attrs['name'])
class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those with
  a target page that 404s or contain an anchor that doesn't exist, or pages that
  have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    # Maps url -> Page. Looking up a url that was never rendered yields a
    # default 404 Page with no links or anchors.
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # Pages that are deliberately not linked to from anywhere; they are
    # seeded into the orphan search so they are never reported as orphans.
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    # Link targets whose non-200 render status is acceptable without
    # following redirections.
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    top_level_directories = (
      ('docs/templates/public', ''),
      ('docs/static', 'static/'),
      ('docs/examples', 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          # NOTE(review): this file is Python 2 (HTMLParser/urlparse imports),
          # so print(...) with multiple arguments prints a tuple repr rather
          # than a space-joined message — presumably tolerated for logging.
          print(url, ', a url derived from the path', dirpath +
              ', resulted in a', self._pages[url].status)

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirection until a non-redirectable page is reached. Start at
    |starting_url| which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached including |starting_url|. If no redirection
    occurred, returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      # Give up after |limit| hops to avoid chasing redirect loops.
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    # redirect_link is still None when the first render was not a redirect.
    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.

    Returns a list of (status, url, link, note) tuples.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        # A bare fragment refers to an anchor on this same page. '#top' and
        # the empty fragment are always considered valid.
        if fragment == 'top' or fragment == '':
          continue
        if not fragment in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          # Distinguish "redirects elsewhere" from a genuine 404.
          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          # Target page rendered fine; verify the anchor exists on it.
          if not fragment in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    for url in self._pages.keys():
      page = self._pages[url]
      # Only check pages that rendered successfully.
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server find all pages that are connected to the pages at
    |seed_url|s. Return the links that are valid on the server but are not in
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    # Breadth-first search seeded with the roots plus the always-detached
    # pages (which should never be reported as orphans).
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      # NOTE: _pages is a defaultdict, so an unknown |item| yields (and
      # inserts) a 404 Page here rather than raising.
      target_page = self._pages[item]

      if target_page.status != 200:
        # The page may be reachable through a redirect; follow it and crawl
        # the final destination instead.
        redirected_page = self._FollowRedirections(item)[1]
        if not redirected_page is None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        [url for url, page in self._pages.iteritems() if page.status == 200])

    # Only report html pages; other reachable resources are not "pages".
    return [url for url in all_urls - found if url.endswith('.html')]
def StringifyBrokenLinks(broken_links):
  '''Formats |broken_links| as a readable, column-aligned string.

  |broken_links| is an iterable of (status, url, target, notes) tuples, as
  produced by LinkErrorDetector.GetBrokenLinks(). Links are grouped by their
  target; a group of more than 50 links to the same non-anchor target is
  compressed into a single summary line. Returns the newline-joined result,
  or '' if there are no broken links.
  '''
  # Materialize so the function accepts any iterable, and guard the empty
  # case: max() below raises ValueError on an empty sequence.
  broken_links = list(broken_links)
  if not broken_links:
    return ''

  def fixed_width(string, width):
    # Pad |string| with spaces out to |width|; strings already longer than
    # |width| are returned unchanged (negative padding multiplies to '').
    return "%s%s" % (string, (width - len(string)) * ' ')

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  by_target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    # |col_offset| shrinks the first column so a compressed summary line
    # stays aligned after its "Found N broken links (" prefix.
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for _, links in groupby(sorted(broken_links, key=by_target), by_target):
    links = list(links)
    # Compress messages: many links to one (non-anchor) target collapse to
    # a single representative line.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)