chrome/common/extensions/docs/server2/document_parser.py

   1 # Copyright 2013 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 from HTMLParser import HTMLParser
   6
   7
   8 class ParseResult(object):
   9   '''The result of |ParseDocument|:
  10   |title|             The title of the page, as pulled from the first <h1>.
  11   |title_attributes|  The attributes of the <h1> tag the title is derived from.
  12   |sections|          The list of Sections within this document.
  13   |warnings|          Any warnings while parsing the document.
  14   '''
  15
  16   def __init__(self, title, title_attributes, sections, warnings):
  17     self.title = title
  18     self.title_attributes = title_attributes
  19     self.sections = sections
  20     self.warnings = warnings
  21
  22
  23 class DocumentSection(object):
  24   '''A section of the document as grouped by <section>...</section>. Any content
  25   not within section tags is considered an implicit section, so:
  26   "Foo <section>Bar</section> Baz" is 3 sections.
  27   |structure|  A list of DocumentStructureEntry for each top-level heading.
  28   '''
  29
  30   def __init__(self):
  31     self.structure = []
  32
  33
  34 class DocumentStructureEntry(object):
  35   '''An entry in the document structure.
  36   |attributes| The attributes of the header tag this entry is derived from.
  37   |name|       The name of this entry, as pulled from the header tag this entry
  38                is derived from.
  39   |entries|    A list of child DocumentStructureEntry items.
  40   '''
  41
  42   def __init__(self, tag, attributes):
  43     self.attributes = attributes
  44     self.name = ''
  45     self.entries = []
  46     # Callers shouldn't care about the tag, but we need it for sanity checking,
  47     # so make it private. In particular we pretend that anything but the first
  48     # h1 is an h2, and it'd be odd to expose that.
  49     self._tag = tag
  50     # Documents can override the name of the entry using title="".
  51     self._has_explicit_name = False
  52
  53   def __repr__(self):
  54     return '<%s>%s</%s>' % (self._tag, self.name, self._tag)
  55
  56   def __str__(self):
  57     return repr(self)
  58
  59
  60 def ParseDocument(document, expect_title=False):
  61   '''Parses the title and a document structure form |document| and returns a
  62   ParseResult.
  63   '''
  64   parser = _DocumentParser(expect_title)
  65   parser.feed(document)
  66   parser.close()
  67   return parser.parse_result
  68
  69
  70 def RemoveTitle(document):
  71   '''Removes the first <h1>..</h1> tag found in |document| and returns a
  72   (result, warning) tuple.
  73
  74   If no title is found or |document| is malformed in some way, returns the
  75   original document and a warning message. Otherwise, returns the result of
  76   removing the title from |document| with a None warning message.
  77   '''
  78
  79   def min_index(lhs, rhs):
  80     lhs_index, rhs_index = document.find(lhs), document.find(rhs)
  81     if lhs_index == -1: return rhs_index
  82     if rhs_index == -1: return lhs_index
  83     return min(lhs_index, rhs_index)
  84
  85   title_start = min_index('<h1', '<H1')
  86   if title_start == -1:
  87     return document, 'No opening <h1> was found'
  88   title_end = min_index('/h1>', '/H1>')
  89   if title_end == -1:
  90     return document, 'No closing </h1> was found'
  91   if title_end < title_start:
  92     return document, 'The </h1> appeared before the <h1>'
  93
  94   return (document[:title_start] + document[title_end + 4:], None)
  95
  96
# Heading tags that become document structure entries, ordered from the
# outermost nesting level to the innermost. The parser relies on this order
# to find the parent of a heading. <h1> is not listed because it is handled
# separately, as the page title.
_HEADER_TAGS = ['h2', 'h3', 'h4']
  98
  99
 100 class _DocumentParser(HTMLParser):
 101   '''HTMLParser for ParseDocument.
 102   '''
 103
 104   def __init__(self, expect_title):
 105     HTMLParser.__init__(self)
 106     # Public.
 107     self.parse_result = None
 108     # Private.
 109     self._expect_title = expect_title
 110     self._title_entry = None
 111     self._sections = []
 112     self._processing_section = DocumentSection()
 113     self._processing_entry = None
 114     self._warnings = []
 115
 116   def handle_starttag(self, tag, attrs):
 117     if tag == 'section':
 118       self._OnSectionBoundary()
 119       return
 120
 121     if tag != 'h1' and tag not in _HEADER_TAGS:
 122       return
 123
 124     if self._processing_entry is not None:
 125       self._WarnWithPosition('Found <%s> in the middle of processing a <%s>' %
 126                              (tag, self._processing_entry._tag))
 127       return
 128
 129     attrs_dict = dict(attrs)
 130     self._processing_entry = DocumentStructureEntry(tag, attrs_dict)
 131
 132     explicit_name = attrs_dict.pop('title', None)
 133     if explicit_name == '':
 134       # Don't create a TOC entry at all if the tag has specified title="".
 135       return
 136     if explicit_name is not None:
 137       self._processing_entry.name = explicit_name
 138       self._processing_entry._has_explicit_name = True
 139
 140     if tag == 'h1' and self._title_entry is not None:
 141       self._WarnWithPosition('Found multiple <h1> tags. Subsequent <h1> tags '
 142                              'will be classified as <h2> for the purpose of '
 143                              'the structure')
 144       tag = 'h2'
 145
 146     if tag == 'h1':
 147       self._title_entry = self._processing_entry
 148     else:
 149       belongs_to = self._processing_section.structure
 150       for header in _HEADER_TAGS[:_HEADER_TAGS.index(tag)]:
 151         if len(belongs_to) == 0:
 152           # TODO(kalman): Re-enable this warning once the reference pages have
 153           # their references fixed.
 154           #self._WarnWithPosition('Found <%s> without any preceding <%s>' %
 155           #                       (tag, header))
 156           break
 157         belongs_to = belongs_to[-1].entries
 158       belongs_to.append(self._processing_entry)
 159
 160   def handle_endtag(self, tag):
 161     if tag == 'section':
 162       self._OnSectionBoundary()
 163       return
 164
 165     if tag != 'h1' and tag not in _HEADER_TAGS:
 166       return
 167
 168     if self._processing_entry is None:
 169       self._WarnWithPosition('Found closing </%s> without an opening <%s>' %
 170                              (tag, tag))
 171       return
 172
 173     if self._processing_entry._tag != tag:
 174       self._WarnWithPosition('Found closing </%s> while processing a <%s>' %
 175                              (tag, self._processing_entry._tag))
 176       # Note: no early return, it's more likely that the mismatched header was
 177       # a typo rather than a misplaced closing header tag.
 178
 179     self._processing_entry = None
 180
 181   def handle_data(self, data):
 182     if (self._processing_entry is not None and
 183         not self._processing_entry._has_explicit_name):
 184       # += is inefficient, but probably fine here because the chances of a
 185       # large number of nested tags within header tags is pretty low.
 186       self._processing_entry.name += data
 187
 188   def close(self):
 189     HTMLParser.close(self)
 190
 191     self._OnSectionBoundary()
 192
 193     if self._processing_entry is not None:
 194       self._warnings.append('Finished parsing while still processing a <%s>' %
 195                             parser._processing_entry._tag)
 196
 197     if self._expect_title:
 198       if not self._title_entry:
 199         self._warnings.append('Expected a title')
 200         title, title_attributes = '', {}
 201       else:
 202         title, title_attributes = (
 203             self._title_entry.name, self._title_entry.attributes)
 204     else:
 205       if self._title_entry:
 206         self._warnings.append('Found unexpected title "%s"' %
 207                               self._title_entry.name)
 208       title, title_attributes = None, None
 209
 210     self.parse_result = ParseResult(
 211         title, title_attributes, self._sections, self._warnings)
 212
 213   def _OnSectionBoundary(self):
 214     # Only start a new section if the previous section was non-empty.
 215     if self._processing_section.structure:
 216       self._sections.append(self._processing_section)
 217       self._processing_section = DocumentSection()
 218
 219   def _WarnWithPosition(self, message):
 220     line, col = self.getpos()
 221     self._warnings.append('%s (line %s, column %s)' % (message, line, col + 1))