chrome/common/extensions/docs/server2/document_parser_test.py

   1 #!/usr/bin/env python
   2 # Copyright 2013 The Chromium Authors. All rights reserved.
   3 # Use of this source code is governed by a BSD-style license that can be
   4 # found in the LICENSE file.
   5
   6 import unittest
   7
   8 from document_parser import ParseDocument, RemoveTitle
   9
  10
# Fixture document for testWholeDocument. It deliberately contains malformed
# markup (mismatched closing tag, a second <h1>, a heading nested inside
# another heading). The warnings expected by that test quote (line, column)
# positions within this text, so the layout of the string must not change.
_WHOLE_DOCUMENT = '''
Preamble before heading.

<h1 id='main' class='header'>Main header</h1>
Some intro to the content.

<h2 id='banana' class='header' title=''>Bananas</h2>
Something about bananas.

<h2 id='orange' title='hello'>Oranges</h2>
Something about oranges.

<h3 id='valencia'>Valencia Oranges</h3>
A description of valencia oranges.

<h3 id='seville'>Seville Oranges</h3>
A description of seville oranges.

<h2>Grapefruit</h3>
Grapefruit closed a h2 with a h3. This should be a warning.

<h1 id='not-main'>Not the main header</h1>
But it should still show up in the TOC as though it were an h2.

<h2>Not <h3>a banana</h2>
The embedded h3 should be ignored.

<h4>It's a h4</h4>
h4 are part of the document structure, but this is not inside a h3.

<h3>Plantains</h3>
Now I'm just getting lazy.

<h4>Another h4</h4>
This h4 is inside a h3 so will show up.

<h5>Header 5</h5>
Header 5s are not parsed.
'''
  50
  51
# The text testWholeDocument expects RemoveTitle(_WHOLE_DOCUMENT) to return:
# the same document with the first <h1>...</h1> element removed, leaving an
# empty line where it used to be. The later <h1> is kept.
_WHOLE_DOCUMENT_WITHOUT_TITLE = '''
Preamble before heading.


Some intro to the content.

<h2 id='banana' class='header' title=''>Bananas</h2>
Something about bananas.

<h2 id='orange' title='hello'>Oranges</h2>
Something about oranges.

<h3 id='valencia'>Valencia Oranges</h3>
A description of valencia oranges.

<h3 id='seville'>Seville Oranges</h3>
A description of seville oranges.

<h2>Grapefruit</h3>
Grapefruit closed a h2 with a h3. This should be a warning.

<h1 id='not-main'>Not the main header</h1>
But it should still show up in the TOC as though it were an h2.

<h2>Not <h3>a banana</h2>
The embedded h3 should be ignored.

<h4>It's a h4</h4>
h4 are part of the document structure, but this is not inside a h3.

<h3>Plantains</h3>
Now I'm just getting lazy.

<h4>Another h4</h4>
This h4 is inside a h3 so will show up.

<h5>Header 5</h5>
Header 5s are not parsed.
'''
  91
  92
  93 class DocumentParserUnittest(unittest.TestCase):
  94
  95   def testEmptyDocument(self):
  96     self.assertEqual(('', 'No opening <h1> was found'), RemoveTitle(''))
  97
  98     result = ParseDocument('')
  99     self.assertEqual(None, result.title)
 100     self.assertEqual(None, result.title_attributes)
 101     self.assertEqual([], result.sections)
 102     self.assertEqual([], result.warnings)
 103
 104     result = ParseDocument('', expect_title=True)
 105     self.assertEqual('', result.title)
 106     self.assertEqual({}, result.title_attributes)
 107     self.assertEqual([], result.sections)
 108     self.assertEqual(['Expected a title'], result.warnings)
 109
 110   def testRemoveTitle(self):
 111     no_closing_tag = '<h1>No closing tag'
 112     self.assertEqual((no_closing_tag, 'No closing </h1> was found'),
 113                      RemoveTitle(no_closing_tag))
 114
 115     no_opening_tag = 'No opening tag</h1>'
 116     self.assertEqual((no_opening_tag, 'No opening <h1> was found'),
 117                      RemoveTitle(no_opening_tag))
 118
 119     tags_wrong_order = '</h1>Tags in wrong order<h1>'
 120     self.assertEqual((tags_wrong_order, 'The </h1> appeared before the <h1>'),
 121                      RemoveTitle(tags_wrong_order))
 122
 123     multiple_titles = '<h1>First header</h1> and <h1>Second header</h1>'
 124     self.assertEqual((' and <h1>Second header</h1>', None),
 125                      RemoveTitle(multiple_titles))
 126
 127     upper_case = '<H1>Upper case header tag</H1> hi'
 128     self.assertEqual((' hi', None), RemoveTitle(upper_case))
 129     mixed_case = '<H1>Mixed case header tag</h1> hi'
 130     self.assertEqual((' hi', None), RemoveTitle(mixed_case))
 131
 132   def testOnlyTitleDocument(self):
 133     document = '<h1 id="header">heading</h1>'
 134     self.assertEqual(('', None), RemoveTitle(document))
 135
 136     result = ParseDocument(document)
 137     self.assertEqual(None, result.title)
 138     self.assertEqual(None, result.title_attributes)
 139     self.assertEqual([], result.sections)
 140     self.assertEqual(['Found unexpected title "heading"'], result.warnings)
 141
 142     result = ParseDocument(document, expect_title=True)
 143     self.assertEqual('heading', result.title)
 144     self.assertEqual({'id': 'header'}, result.title_attributes)
 145     self.assertEqual([], result.sections)
 146     self.assertEqual([], result.warnings)
 147
 148   def testWholeDocument(self):
 149     self.assertEqual((_WHOLE_DOCUMENT_WITHOUT_TITLE, None),
 150                      RemoveTitle(_WHOLE_DOCUMENT))
 151     result = ParseDocument(_WHOLE_DOCUMENT, expect_title=True)
 152     self.assertEqual('Main header', result.title)
 153     self.assertEqual({'id': 'main', 'class': 'header'}, result.title_attributes)
 154     self.assertEqual([
 155       'Found closing </h3> while processing a <h2> (line 19, column 15)',
 156       'Found multiple <h1> tags. Subsequent <h1> tags will be classified as '
 157           '<h2> for the purpose of the structure (line 22, column 1)',
 158       'Found <h3> in the middle of processing a <h2> (line 25, column 9)',
 159       # TODO(kalman): Re-enable this warning once the reference pages have
 160       # their references fixed.
 161       #'Found <h4> without any preceding <h3> (line 28, column 1)',
 162     ], result.warnings)
 163
 164     # The non-trivial table of contents assertions...
 165     self.assertEqual(1, len(result.sections))
 166     entries = result.sections[0].structure
 167
 168     self.assertEqual(4, len(entries), entries)
 169     entry0, entry1, entry2, entry3 = entries
 170
 171     self.assertEqual('hello', entry0.name)
 172     self.assertEqual({'id': 'orange'}, entry0.attributes)
 173     self.assertEqual(2, len(entry0.entries))
 174     entry0_0, entry0_1 = entry0.entries
 175
 176     self.assertEqual('Valencia Oranges', entry0_0.name)
 177     self.assertEqual({'id': 'valencia'}, entry0_0.attributes)
 178     self.assertEqual([], entry0_0.entries)
 179     self.assertEqual('Seville Oranges', entry0_1.name)
 180     self.assertEqual({'id': 'seville'}, entry0_1.attributes)
 181     self.assertEqual([], entry0_1.entries)
 182
 183     self.assertEqual('Grapefruit', entry1.name)
 184     self.assertEqual({}, entry1.attributes)
 185     self.assertEqual([], entry1.entries)
 186
 187     self.assertEqual('Not the main header', entry2.name)
 188     self.assertEqual({'id': 'not-main'}, entry2.attributes)
 189     self.assertEqual([], entry2.entries)
 190
 191     self.assertEqual('Not a banana', entry3.name)
 192     self.assertEqual({}, entry3.attributes)
 193     self.assertEqual(2, len(entry3.entries))
 194     entry3_1, entry3_2 = entry3.entries
 195
 196     self.assertEqual('It\'s a h4', entry3_1.name)
 197     self.assertEqual({}, entry3_1.attributes)
 198     self.assertEqual([], entry3_1.entries)
 199
 200     self.assertEqual('Plantains', entry3_2.name)
 201     self.assertEqual({}, entry3_2.attributes)
 202     self.assertEqual(1, len(entry3_2.entries))
 203     entry3_2_1, = entry3_2.entries
 204
 205     self.assertEqual('Another h4', entry3_2_1.name)
 206     self.assertEqual({}, entry3_2_1.attributes)
 207     self.assertEqual([], entry3_2_1.entries)
 208
 209   def testSingleExplicitSection(self):
 210     def test(document):
 211       result = ParseDocument(document, expect_title=True)
 212       self.assertEqual([], result.warnings)
 213       self.assertEqual('Header', result.title)
 214       self.assertEqual(1, len(result.sections))
 215       section0, = result.sections
 216       entry0, = section0.structure
 217       self.assertEqual('An inner header', entry0.name)
 218     # A single section, one with the title inside the section, the other out.
 219     test('<h1>Header</h1>'
 220          '<section>'
 221          'Just a single section here.'
 222          '<h2>An inner header</h2>'
 223          '</section>')
 224     test('<section>'
 225          'Another single section here.'
 226          '<h1>Header</h1>'
 227          '<h2>An inner header</h2>'
 228          '</section>')
 229
 230   def testMultipleSections(self):
 231     result = ParseDocument(
 232         '<h1>Header</h1>'
 233         '<h2>First header</h2>'
 234         'This content outside a section is the first section.'
 235         '<section>'
 236         'Second section'
 237         '<h2>Second header</h2>'
 238         '</section>'
 239         '<section>'
 240         'Third section'
 241         '<h2>Third header</h2>'
 242         '</section>',
 243         expect_title=True)
 244     self.assertEqual([], result.warnings)
 245     self.assertEqual('Header', result.title)
 246     self.assertEqual(3, len(result.sections))
 247     section0, section1, section2 = result.sections
 248     def assert_single_header(section, name):
 249       self.assertEqual(1, len(section.structure))
 250       self.assertEqual(name, section.structure[0].name)
 251     assert_single_header(section0, 'First header')
 252     assert_single_header(section1, 'Second header')
 253     assert_single_header(section2, 'Third header')
 254
 255
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
  unittest.main()