# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
8 from document_parser
import ParseDocument
, RemoveTitle
12 Preamble before heading.
14 <h1 id='main' class='header'>Main header</h1>
15 Some intro to the content.
17 <h2 id='banana' class='header' title=''>Bananas</h2>
18 Something about bananas.
20 <h2 id='orange' title='hello'>Oranges</h2>
21 Something about oranges.
23 <h3 id='valencia'>Valencia Oranges</h3>
24 A description of valencia oranges.
26 <h3 id='seville'>Seville Oranges</h3>
27 A description of seville oranges.
30 Grapefruit closed a h2 with a h3. This should be a warning.
32 <h1 id='not-main'>Not the main header</h1>
33 But it should still show up in the TOC as though it were an h2.
35 <h2>Not <h3>a banana</h2>
36 The embedded h3 should be ignored.
39 h4 are part of the document structure, but this is not inside a h3.
42 Now I'm just getting lazy.
45 This h4 is inside a h3 so will show up.
48 Header 5s are not parsed.
52 _WHOLE_DOCUMENT_WITHOUT_TITLE
= '''
53 Preamble before heading.
56 Some intro to the content.
58 <h2 id='banana' class='header' title=''>Bananas</h2>
59 Something about bananas.
61 <h2 id='orange' title='hello'>Oranges</h2>
62 Something about oranges.
64 <h3 id='valencia'>Valencia Oranges</h3>
65 A description of valencia oranges.
67 <h3 id='seville'>Seville Oranges</h3>
68 A description of seville oranges.
71 Grapefruit closed a h2 with a h3. This should be a warning.
73 <h1 id='not-main'>Not the main header</h1>
74 But it should still show up in the TOC as though it were an h2.
76 <h2>Not <h3>a banana</h2>
77 The embedded h3 should be ignored.
80 h4 are part of the document structure, but this is not inside a h3.
83 Now I'm just getting lazy.
86 This h4 is inside a h3 so will show up.
89 Header 5s are not parsed.
93 class DocumentParserUnittest(unittest
.TestCase
):
def testEmptyDocument(self):
  # Stripping the title from an empty string is expected to hand back the
  # (empty) input together with a message saying no <h1> was found.
  stripped = RemoveTitle('')
  self.assertEqual(('', 'No opening <h1> was found'), stripped)

  # Parsed without expecting a title: title fields are expected to be None
  # and both the section list and the warning list empty.
  untitled = ParseDocument('')
  self.assertEqual(None, untitled.title)
  self.assertEqual(None, untitled.title_attributes)
  self.assertEqual([], untitled.sections)
  self.assertEqual([], untitled.warnings)

  # Parsed while expecting a title: the title fields are expected to be empty
  # values rather than None, and the absent title is reported as a warning.
  titled = ParseDocument('', expect_title=True)
  self.assertEqual('', titled.title)
  self.assertEqual({}, titled.title_attributes)
  self.assertEqual([], titled.sections)
  self.assertEqual(['Expected a title'], titled.warnings)
def testRemoveTitle(self):
  # Inputs that RemoveTitle is expected to return untouched, alongside a
  # message describing why no title could be removed.
  unclosed = '<h1>No closing tag'
  unopened = 'No opening tag</h1>'
  swapped = '</h1>Tags in wrong order<h1>'

  # (document, expected RemoveTitle result) pairs, checked in order. The
  # second item of each expected tuple is None when removal succeeds.
  cases = (
      (unclosed, (unclosed, 'No closing </h1> was found')),
      (unopened, (unopened, 'No opening <h1> was found')),
      (swapped, (swapped, 'The </h1> appeared before the <h1>')),
      # Only the first of several titles is expected to be removed.
      ('<h1>First header</h1> and <h1>Second header</h1>',
       (' and <h1>Second header</h1>', None)),
      # Upper-case and mixed-case tags are expected to be recognised too.
      ('<H1>Upper case header tag</H1> hi', (' hi', None)),
      ('<H1>Mixed case header tag</h1> hi', (' hi', None)),
  )
  for document, expected in cases:
    self.assertEqual(expected, RemoveTitle(document))
def testOnlyTitleDocument(self):
  # The whole document is a single <h1>; removing it is expected to leave
  # an empty string and no message.
  title_only = '<h1 id="header">heading</h1>'
  self.assertEqual(('', None), RemoveTitle(title_only))

  # When no title is expected, the heading is expected to be reported as a
  # warning and the title fields left as None.
  surprised = ParseDocument(title_only)
  unexpected_title_warning = ['Found unexpected title "heading"']
  self.assertEqual(None, surprised.title)
  self.assertEqual(None, surprised.title_attributes)
  self.assertEqual([], surprised.sections)
  self.assertEqual(unexpected_title_warning, surprised.warnings)

  # When a title is expected, the heading text and its attributes are
  # expected to be captured, with no warnings.
  anticipated = ParseDocument(title_only, expect_title=True)
  self.assertEqual('heading', anticipated.title)
  self.assertEqual({'id': 'header'}, anticipated.title_attributes)
  self.assertEqual([], anticipated.sections)
  self.assertEqual([], anticipated.warnings)
148 def testWholeDocument(self
):
# Runs RemoveTitle and ParseDocument over the _WHOLE_DOCUMENT fixture. Checks
# the title-stripped text against _WHOLE_DOCUMENT_WITHOUT_TITLE, then the
# parsed title and its attributes, then the nested entries of the single
# parsed section.
149 self
.assertEqual((_WHOLE_DOCUMENT_WITHOUT_TITLE
, None),
150 RemoveTitle(_WHOLE_DOCUMENT
))
151 result
= ParseDocument(_WHOLE_DOCUMENT
, expect_title
=True)
152 self
.assertEqual('Main header', result
.title
)
153 self
.assertEqual({'id': 'main', 'class': 'header'}, result
.title_attributes
)
# NOTE(review): the warning strings below have no enclosing call in this copy
# of the file. Presumably they form the expected list compared against
# result.warnings -- confirm against the upstream source before relying on it.
155 'Found closing </h3> while processing a <h2> (line 19, column 15)',
156 'Found multiple <h1> tags. Subsequent <h1> tags will be classified as '
157 '<h2> for the purpose of the structure (line 22, column 1)',
158 'Found <h3> in the middle of processing a <h2> (line 25, column 9)',
159 # TODO(kalman): Re-enable this warning once the reference pages have
160 # their references fixed.
161 #'Found <h4> without any preceding <h3> (line 28, column 1)',
164 # The non-trivial table of contents assertions...
165 self
.assertEqual(1, len(result
.sections
))
166 entries
= result
.sections
[0].structure
168 self
.assertEqual(4, len(entries
), entries
)
# Unpack the four top-level entries; the length assertion above guards this
# and passes `entries` as the failure message.
169 entry0
, entry1
, entry2
, entry3
= entries
# entry0: expected name 'hello' (the fixture's orange <h2> carries
# title='hello'), attributes holding only the id, and two child entries.
171 self
.assertEqual('hello', entry0
.name
)
172 self
.assertEqual({'id': 'orange'}, entry0
.attributes
)
173 self
.assertEqual(2, len(entry0
.entries
))
174 entry0_0
, entry0_1
= entry0
.entries
# The two children of entry0 are expected to be leaf entries.
176 self
.assertEqual('Valencia Oranges', entry0_0
.name
)
177 self
.assertEqual({'id': 'valencia'}, entry0_0
.attributes
)
178 self
.assertEqual([], entry0_0
.entries
)
179 self
.assertEqual('Seville Oranges', entry0_1
.name
)
180 self
.assertEqual({'id': 'seville'}, entry0_1
.attributes
)
181 self
.assertEqual([], entry0_1
.entries
)
# entry1: expected to have no attributes and no children.
183 self
.assertEqual('Grapefruit', entry1
.name
)
184 self
.assertEqual({}, entry1
.attributes
)
185 self
.assertEqual([], entry1
.entries
)
# entry2: the second <h1> of the fixture, expected to appear as a top-level
# entry with no children.
187 self
.assertEqual('Not the main header', entry2
.name
)
188 self
.assertEqual({'id': 'not-main'}, entry2
.attributes
)
189 self
.assertEqual([], entry2
.entries
)
# entry3: expected name 'Not a banana' with two child entries.
191 self
.assertEqual('Not a banana', entry3
.name
)
192 self
.assertEqual({}, entry3
.attributes
)
193 self
.assertEqual(2, len(entry3
.entries
))
194 entry3_1
, entry3_2
= entry3
.entries
196 self
.assertEqual('It\'s a h4', entry3_1
.name
)
197 self
.assertEqual({}, entry3_1
.attributes
)
198 self
.assertEqual([], entry3_1
.entries
)
200 self
.assertEqual('Plantains', entry3_2
.name
)
201 self
.assertEqual({}, entry3_2
.attributes
)
202 self
.assertEqual(1, len(entry3_2
.entries
))
# Single-element unpacking: fails if entry3_2 does not have exactly one child.
203 entry3_2_1
, = entry3_2
.entries
205 self
.assertEqual('Another h4', entry3_2_1
.name
)
206 self
.assertEqual({}, entry3_2_1
.attributes
)
207 self
.assertEqual([], entry3_2_1
.entries
)
209 def testSingleExplicitSection(self
):
# Checks that a document parsed with expect_title=True yields no warnings,
# the title 'Header', and exactly one section whose structure holds a single
# entry named 'An inner header'.
# NOTE(review): `document` and `test` are used below but neither is defined
# in this copy of the file. Presumably a nested `def test(document):` helper
# wrapping these assertions was lost -- confirm against the upstream source.
211 result
= ParseDocument(document
, expect_title
=True)
212 self
.assertEqual([], result
.warnings
)
213 self
.assertEqual('Header', result
.title
)
214 self
.assertEqual(1, len(result
.sections
))
# Single-element unpacking: each fails unless there is exactly one item.
215 section0
, = result
.sections
216 entry0
, = section0
.structure
217 self
.assertEqual('An inner header', entry0
.name
)
218 # A single section, one with the title inside the section, the other out.
# NOTE(review): the string fragments below look incomplete (the test( call is
# never closed and the numbering skips lines) -- restore from upstream.
219 test('<h1>Header</h1>'
221 'Just a single section here.'
222 '<h2>An inner header</h2>'
225 'Another single section here.'
227 '<h2>An inner header</h2>'
230 def testMultipleSections(self
):
# Parses a document given as adjacent string literals and expects no
# warnings, the title 'Header', and three sections that each contain exactly
# one structure entry ('First header', 'Second header', 'Third header').
# NOTE(review): parts of the ParseDocument( argument and its closing
# parenthesis are missing from this copy (the numbering skips lines) --
# restore from the upstream source.
231 result
= ParseDocument(
233 '<h2>First header</h2>'
234 'This content outside a section is the first section.'
237 '<h2>Second header</h2>'
241 '<h2>Third header</h2>'
244 self
.assertEqual([], result
.warnings
)
245 self
.assertEqual('Header', result
.title
)
246 self
.assertEqual(3, len(result
.sections
))
247 section0
, section1
, section2
= result
.sections
# Local helper: asserts that `section` has exactly one structure entry and
# that the entry's name equals `name`.
248 def assert_single_header(section
, name
):
249 self
.assertEqual(1, len(section
.structure
))
250 self
.assertEqual(name
, section
.structure
[0].name
)
251 assert_single_header(section0
, 'First header')
252 assert_single_header(section1
, 'Second header')
253 assert_single_header(section2
, 'Third header')
256 if __name__
== '__main__':