1 # Copyright 2013 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 from collections
import defaultdict
, deque
, namedtuple
6 from HTMLParser
import HTMLParser
, HTMLParseError
7 from itertools
import groupby
8 from operator
import itemgetter
10 from urlparse
import urlsplit
12 from file_system_util
import CreateURLsFromPaths
13 from path_util
import AssertIsDirectory
# A processed page: the render's http status, the outgoing page links, the
# anchor targets (ids and names) defined on the page, and the links that
# reference an anchor.
Page = namedtuple('Page', ['status', 'links', 'anchors', 'anchor_refs'])
19 def _SplitAnchor(url
):
20 components
= urlsplit(url
)
21 return components
.path
, components
.fragment
def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    # Malformed html: the page rendered fine, so report 200 but give up on
    # extracting links/anchors from it.
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  # Directory part of |path|, used to absolutize relative links below. Guard
  # against top-level paths with no '/' (a bare rsplit unpack would raise).
  base = path.rsplit('/', 1)[0] if '/' in path else ''
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # (links to other pages) or anchor_refs (links containing an anchor).
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      # Anchor on the current page.
      anchor_refs.append(link)
      continue

    if link.startswith('/'):
      link = link.lstrip('/')
    else:
      link = posixpath.normpath('%s/%s' % (base, link))

    if '#' in link:
      anchor_refs.append(link)
    else:
      edges.append(link)

  return Page(200, edges, anchors, anchor_refs)
85 class _ContentParser(HTMLParser
):
86 '''Parse an html file pulling out all links and anchor_refs, where an
87 anchor_ref is a link that contains an anchor.
91 HTMLParser
.__init
__(self
)
95 def handle_starttag(self
, tag
, raw_attrs
):
96 attrs
= dict(raw_attrs
)
99 # Handle special cases for href's that: start with a space, contain
100 # just a '.' (period), contain python templating code, are an absolute
101 # url, are a zip file, or execute javascript on the page.
102 href
= attrs
.get('href', '').strip()
103 if href
and not href
== '.' and not '{{' in href
:
104 if not urlsplit(href
).scheme
in ('http', 'https'):
105 if not href
.endswith('.zip') and not 'javascript:' in href
:
106 self
.links
.append(href
)
109 self
.anchors
.add(attrs
['id'])
110 if attrs
.get('name'):
111 self
.anchors
.add(attrs
['name'])
class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those
  with a target page that 404s or contain an anchor that doesn't exist, or
  pages that have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    AssertIsDirectory(public_path)
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    # Maps a URL to its rendered |Page|; unknown URLs read as 404 pages.
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # Pages that are deliberately unlinked and must never be reported as
    # orphans.
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    # Paths that are allowed to redirect without being reported as broken.
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()
141 def _RenderAllPages(self
):
142 '''Traverses the public templates directory rendering each URL and
143 processing the resultant html to pull out all links and anchors.
145 top_level_directories
= (
146 ('docs/templates/public/', ''),
147 ('docs/static/', 'static/'),
148 ('docs/examples/', 'extensions/examples/'),
151 for dirpath
, urlprefix
in top_level_directories
:
152 files
= CreateURLsFromPaths(self
._file
_system
, dirpath
, urlprefix
)
153 for url
, path
in files
:
154 self
._pages
[url
] = _Process(url
, self
._renderer
)
156 if self
._pages
[url
].status
!= 200:
157 print(url
, ', a url derived from the path', dirpath
+
158 ', resulted in a', self
._pages
[url
].status
)
160 def _FollowRedirections(self
, starting_url
, limit
=4):
161 '''Follow redirection until a non-redirectable page is reached. Start at
162 |starting_url| which must return a 301 or 302 status code.
164 Return a tuple of: the status of rendering |staring_url|, the final url,
165 and a list of the pages reached including |starting_url|. If no redirection
166 occurred, returns (None, None, None).
168 pages_reached
= [starting_url
]
170 target_page
= self
._renderer
(starting_url
)
171 original_status
= status
= target_page
.status
174 while status
in (301, 302):
176 return None, None, None
177 redirect_link
= target_page
.headers
.get('Location')
178 target_page
= self
._renderer
(redirect_link
)
179 status
= target_page
.status
180 pages_reached
.append(redirect_link
)
183 if redirect_link
is None:
184 return None, None, None
186 return original_status
, redirect_link
, pages_reached
188 def _CategorizeBrokenLinks(self
, url
, page
, pages
):
189 '''Find all broken links on a page and create appropriate notes describing
190 why tehy are broken (broken anchor, target redirects, etc). |page| is the
191 current page being checked and is the result of rendering |url|. |pages|
192 is a callable that takes a path and returns a Page.
196 for link
in page
.links
+ page
.anchor_refs
:
197 components
= urlsplit(link
)
198 fragment
= components
.fragment
200 if components
.path
== '':
201 if fragment
== 'top' or fragment
== '':
203 if not fragment
in page
.anchors
:
204 broken_links
.append((200, url
, link
, 'target anchor not found'))
206 # Render the target page
207 target_page
= pages(components
.path
)
209 if target_page
.status
!= 200:
210 if components
.path
in self
._redirection
_whitelist
:
213 status
, relink
, _
= self
._FollowRedirections
(components
.path
)
215 broken_links
.append((
219 'redirects to %s' % relink
))
221 broken_links
.append((
222 target_page
.status
, url
, link
, 'target page not found'))
225 if not fragment
in target_page
.anchors
:
226 broken_links
.append((
227 target_page
.status
, url
, link
, 'target anchor not found'))
231 def GetBrokenLinks(self
):
232 '''Find all broken links. A broken link is a link that leads to a page
233 that does not exist (404s), redirects to another page (301 or 302), or
234 has an anchor whose target does not exist.
236 Returns a list of tuples of four elements: status, url, target_page,
241 for url
in self
._pages
.keys():
242 page
= self
._pages
[url
]
243 if page
.status
!= 200:
245 broken_links
.extend(self
._CategorizeBrokenLinks
(
246 url
, page
, lambda x
: self
._pages
[x
]))
250 def GetOrphanedPages(self
):
251 '''Crawls the server find all pages that are connected to the pages at
252 |seed_url|s. Return the links that are valid on the server but are not in
253 part of the connected component containing the |root_pages|. These pages
254 are orphans and cannot be reached simply by clicking through the server.
256 pages_to_check
= deque(self
._root
_pages
.union(self
._always
_detached
))
257 found
= set(self
._root
_pages
) | self
._always
_detached
259 while pages_to_check
:
260 item
= pages_to_check
.popleft()
261 target_page
= self
._pages
[item
]
263 if target_page
.status
!= 200:
264 redirected_page
= self
._FollowRedirections
(item
)[1]
265 if not redirected_page
is None:
266 target_page
= self
._pages
[redirected_page
]
268 for link
in target_page
.links
:
269 if link
not in found
:
271 pages_to_check
.append(link
)
274 [url
for url
, page
in self
._pages
.iteritems() if page
.status
== 200])
276 return [url
for url
in all_urls
- found
if url
.endswith('.html')]
def StringifyBrokenLinks(broken_links):
  '''Prints out broken links in a more readable format.

  |broken_links| is a list of (status, url, link, message) tuples as produced
  by LinkErrorDetector.GetBrokenLinks. Returns a single newline-joined string.
  '''
  def fixed_width(string, width):
    # Left-justify |string| in a field of |width| characters.
    return "%s%s" % (string, (width - len(string)) * ' ')

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  # Key on the link target so identical targets are grouped together. (Named
  # distinctly so the groupby loop variable cannot shadow it.)
  by_target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for _, links in groupby(sorted(broken_links, key=by_target), by_target):
    # groupby yields iterators; materialize so len() and indexing work.
    links = list(links)
    # Compress a large group of links to the same (non-anchor) target into a
    # single summary line.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)