1 # Copyright 2013 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
from itertools import groupby
from operator import itemgetter
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths
# The result of rendering one page: the HTTP status of the render, the hrefs
# of the links found on the page, the anchors (ids and names) the page
# defines, and the links that contain an anchor component.
Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
18 def _SplitAnchor(url
):
19 components
= urlsplit(url
)
20 return components
.path
, components
.fragment
def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    # A malformed page is treated as rendered-but-empty, not as an error.
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  # Top-level paths (no '/') resolve links relative to the server root.
  base = path.rsplit('/', 1)[0] if '/' in path else ''
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      # A pure fragment refers to an anchor on the current page.
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link.lstrip('/')
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if anchor:
        anchor_refs.append(link)
      edges.append(link)

  return Page(200, edges, anchors, anchor_refs)
84 class _ContentParser(HTMLParser
):
85 '''Parse an html file pulling out all links and anchor_refs, where an
86 anchor_ref is a link that contains an anchor.
90 HTMLParser
.__init
__(self
)
94 def handle_starttag(self
, tag
, raw_attrs
):
95 attrs
= dict(raw_attrs
)
98 # Handle special cases for href's that: start with a space, contain
99 # just a '.' (period), contain python templating code, are an absolute
100 # url, are a zip file, or execute javascript on the page.
101 href
= attrs
.get('href', '').strip()
102 if href
and not href
== '.' and not '{{' in href
:
103 if not urlsplit(href
).scheme
in ('http', 'https'):
104 if not href
.endswith('.zip') and not 'javascript:' in href
:
105 self
.links
.append(href
)
108 self
.anchors
.add(attrs
['id'])
109 if attrs
.get('name'):
110 self
.anchors
.add(attrs
['name'])
class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those with
  a target page that 404s or contain an anchor that doesn't exist, or pages that
  have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    # Maps a URL to the Page it rendered to; unrendered URLs look like 404s.
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # Pages with no inbound links that should never be reported as orphans.
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    # Paths that are allowed to redirect without being reported as broken.
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    top_level_directories = (
      ('docs/templates/public', ''),
      ('docs/static', 'static/'),
      ('docs/examples', 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          print(url, ', a url derived from the path', dirpath +
              ', resulted in a', self._pages[url].status)

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirection until a non-redirectable page is reached. Start at
    |starting_url| which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached including |starting_url|. If no redirection
    occurred, returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      # Give up on chains longer than |limit| (likely a redirect loop).
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        # A fragment-only link targets an anchor on the current page.
        if fragment == 'top' or fragment == '':
          continue
        if not fragment in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if not fragment in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages that are connected to the pages at
    |seed_url|s. Return the links that are valid on the server but are not
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        # Follow redirects and continue the search from the final target.
        redirected_page = self._FollowRedirections(item)[1]
        if not redirected_page is None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        [url for url, page in self._pages.iteritems() if page.status == 200])

    return [url for url in all_urls - found if url.endswith('.html')]
def StringifyBrokenLinks(broken_links):
  '''Prints out broken links in a more readable format.

  |broken_links| is a list of 4-tuples of (status, url, target, notes), as
  produced by LinkErrorDetector.GetBrokenLinks.
  '''
  def fixed_width(string, width):
    # Right-pad |string| with spaces out to |width| columns.
    return "%s%s" % (string, (width - len(string)) * ' ')

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=target), target):
    # Materialize the group; a groupby grouper cannot be sized or indexed.
    links = list(links)
    # Compress a long run of links broken to the same target into one line.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)