1 # Copyright 2013 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 from HTMLParser
import HTMLParser
class ParseResult(object):
  '''The result of |ParseDocument|:
  |title|            The title of the page, as pulled from the first <h1>.
  |title_attributes| The attributes of the <h1> tag the title is derived from.
  |sections|         The list of Sections within this document.
  |warnings|         Any warnings while parsing the document.
  '''

  def __init__(self, title, title_attributes, sections, warnings):
    '''Stores the four parse outputs as-is; no copying or validation is done.
    '''
    self.title = title
    self.title_attributes = title_attributes
    self.sections = sections
    self.warnings = warnings
class DocumentSection(object):
  '''A section of the document as grouped by <section>...</section>. Any content
  not within section tags is considered an implicit section, so:
  "Foo <section>Bar</section> Baz" is 3 sections.
  |structure| A list of DocumentStructureEntry for each top-level heading.
  '''

  def __init__(self):
    # A fresh list per instance; the parser appends heading entries to it and
    # treats a section whose list is still empty as not worth recording.
    self.structure = []
class DocumentStructureEntry(object):
  '''An entry in the document structure.
  |attributes| The attributes of the header tag this entry is derived from.
  |name|       The name of this entry, as pulled from the header tag this entry
               is derived from.
  |entries|    A list of child DocumentStructureEntry items.
  '''

  def __init__(self, tag, attributes):
    '''|tag| is the header tag name (e.g. 'h2'); |attributes| is stored as-is.
    '''
    self.attributes = attributes
    # Starts empty; the parser either assigns an explicit name or accumulates
    # the header's text content into it with +=.
    self.name = ''
    self.entries = []
    # Callers shouldn't care about the tag, but we need it for sanity checking,
    # so make it private. In particular we pretend that anything but the first
    # h1 is an h2, and it'd be odd to expose that.
    self._tag = tag
    # Documents can override the name of the entry using title="".
    self._has_explicit_name = False

  def __repr__(self):
    '''Renders the entry as its header element, e.g. "<h2>Name</h2>".
    '''
    return '<%s>%s</%s>' % (self._tag, self.name, self._tag)

  def __str__(self):
    return repr(self)
def ParseDocument(document, expect_title=False):
  '''Parses the title and a document structure from |document| and returns a
  ParseResult.

  |expect_title| is forwarded to the parser, which uses it to decide whether a
  missing <h1> or an unexpected <h1> is reported as a warning.
  '''
  parser = _DocumentParser(expect_title)
  parser.feed(document)
  # close() flushes the parser and is what populates |parse_result|.
  parser.close()
  return parser.parse_result
def RemoveTitle(document):
  '''Removes the first <h1>..</h1> tag found in |document| and returns a
  (result, warning) tuple.

  If no title is found or |document| is malformed in some way, returns the
  original document and a warning message. Otherwise, returns the result of
  removing the title from |document| with a None warning message.
  '''
  def min_index(lhs, rhs):
    '''Returns the earliest position of |lhs| or |rhs| in |document|, or -1 if
    neither occurs. Used to match a tag in either lower or upper case.
    '''
    lhs_index, rhs_index = document.find(lhs), document.find(rhs)
    if lhs_index == -1:
      return rhs_index
    if rhs_index == -1:
      return lhs_index
    return min(lhs_index, rhs_index)

  # '<h1' rather than '<h1>' so that opening tags with attributes match too.
  title_start = min_index('<h1', '<H1')
  if title_start == -1:
    return document, 'No opening <h1> was found'
  title_end = min_index('/h1>', '/H1>')
  if title_end == -1:
    return document, 'No closing </h1> was found'
  if title_end < title_start:
    return document, 'The </h1> appeared before the <h1>'

  # |title_end| points at the '/' of the closing tag; skip the 4 characters of
  # '/h1>' to resume just after it.
  return (document[:title_start] + document[title_end + 4:], None)
# Header tags that produce document structure entries, ordered from outermost
# to innermost; the parser uses an entry's position in this list as its nesting
# depth. <h1> is handled separately as the page title.
_HEADER_TAGS = ['h2', 'h3', 'h4']
class _DocumentParser(HTMLParser):
  '''HTMLParser for ParseDocument.

  Feed it a document, then call close(); the outcome is then available as a
  ParseResult in |parse_result|.
  '''

  def __init__(self, expect_title):
    HTMLParser.__init__(self)
    # Public.
    self.parse_result = None
    # Private.
    self._expect_title = expect_title
    self._title_entry = None
    self._sections = []
    self._processing_section = DocumentSection()
    self._processing_entry = None
    self._warnings = []

  def handle_starttag(self, tag, attrs):
    '''Starts a new structure entry for <h1>-<h4> and a new section for
    <section>. All other tags are ignored.
    '''
    if tag == 'section':
      self._OnSectionBoundary()
      return

    if tag != 'h1' and tag not in _HEADER_TAGS:
      return

    if self._processing_entry is not None:
      self._WarnWithPosition('Found <%s> in the middle of processing a <%s>' %
                             (tag, self._processing_entry._tag))
      return

    attrs_dict = dict(attrs)
    self._processing_entry = DocumentStructureEntry(tag, attrs_dict)

    # Note: pop() also removes "title" from the entry's attributes, since the
    # entry holds a reference to |attrs_dict|.
    explicit_name = attrs_dict.pop('title', None)
    if explicit_name == '':
      # Don't create a TOC entry at all if the tag has specified title="".
      return
    if explicit_name is not None:
      self._processing_entry.name = explicit_name
      self._processing_entry._has_explicit_name = True

    if tag == 'h1' and self._title_entry is not None:
      self._WarnWithPosition('Found multiple <h1> tags. Subsequent <h1> tags '
                             'will be classified as <h2> for the purpose of '
                             'the structure')
      # Only the local |tag| changes; the entry keeps its real tag so that the
      # closing </h1> still matches it.
      tag = 'h2'

    if tag == 'h1':
      self._title_entry = self._processing_entry
    else:
      # Descend one level for each header rank above |tag|, always following
      # the most recently added entry, to find the list this entry joins.
      belongs_to = self._processing_section.structure
      for header in _HEADER_TAGS[:_HEADER_TAGS.index(tag)]:
        if not belongs_to:
          # TODO(kalman): Re-enable this warning once the reference pages have
          # their references fixed.
          #self._WarnWithPosition('Found <%s> without any preceding <%s>' %
          #                       (tag, header))
          break
        belongs_to = belongs_to[-1].entries
      belongs_to.append(self._processing_entry)

  def handle_endtag(self, tag):
    '''Finishes the entry being processed on a closing header tag, and starts
    a new section on </section>. All other tags are ignored.
    '''
    if tag == 'section':
      self._OnSectionBoundary()
      return

    if tag != 'h1' and tag not in _HEADER_TAGS:
      return

    if self._processing_entry is None:
      self._WarnWithPosition('Found closing </%s> without an opening <%s>' %
                             (tag, tag))
      return

    if self._processing_entry._tag != tag:
      self._WarnWithPosition('Found closing </%s> while processing a <%s>' %
                             (tag, self._processing_entry._tag))
      # Note: no early return, it's more likely that the mismatched header was
      # a typo rather than a misplaced closing header tag.

    self._processing_entry = None

  def handle_data(self, data):
    '''Accumulates text inside a header into the current entry's name, unless
    the name was given explicitly via the title attribute.
    '''
    if (self._processing_entry is not None and
        not self._processing_entry._has_explicit_name):
      # += is inefficient, but probably fine here because the chances of a
      # large number of nested tags within header tags is pretty low.
      self._processing_entry.name += data

  def close(self):
    '''Finishes parsing and publishes the outcome as |parse_result|.
    '''
    HTMLParser.close(self)

    self._OnSectionBoundary()

    if self._processing_entry is not None:
      # The entry lives on |self|; there is no |parser| name in this scope.
      self._warnings.append('Finished parsing while still processing a <%s>' %
                            self._processing_entry._tag)

    if self._expect_title:
      if not self._title_entry:
        self._warnings.append('Expected a title')
        title, title_attributes = '', {}
      else:
        title, title_attributes = (
            self._title_entry.name, self._title_entry.attributes)
    else:
      if self._title_entry:
        self._warnings.append('Found unexpected title "%s"' %
                              self._title_entry.name)
      title, title_attributes = None, None

    self.parse_result = ParseResult(
        title, title_attributes, self._sections, self._warnings)

  def _OnSectionBoundary(self):
    # Only start a new section if the previous section was non-empty.
    if self._processing_section.structure:
      self._sections.append(self._processing_section)
      self._processing_section = DocumentSection()

  def _WarnWithPosition(self, message):
    '''Records |message| suffixed with the parser's current position.
    '''
    line, col = self.getpos()
    # The column is reported 1-based (getpos() gives a 0-based offset).
    self._warnings.append('%s (line %s, column %s)' % (message, line, col + 1))