cygprofile: increase timeouts to allow showing web contents
[chromium-blink-merge.git] / chrome / common / extensions / docs / server2 / document_parser.py
blob bfd88e7d70c984a4fe3f6533fc75ff28c572a7a4
1 # Copyright 2013 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 from HTMLParser import HTMLParser
class ParseResult(object):
  '''What |ParseDocument| returns to its caller. Attributes:
  |title| The page title, taken from the first <h1>.
  |title_attributes| The attributes of the <h1> tag that supplied the title.
  |sections| The Sections found within the document, as a list.
  |warnings| Warnings collected while the document was being parsed.
  '''

  def __init__(self, title, title_attributes, sections, warnings):
    # Stored exactly as given; no copying or validation is done here.
    self.title, self.title_attributes = title, title_attributes
    self.sections, self.warnings = sections, warnings
class DocumentSection(object):
  '''One section of a document, as delimited by <section>...</section>.

  Content outside of any section tags counts as an implicit section of its
  own, which means "Foo <section>Bar</section> Baz" is 3 sections.

  |structure| A list holding one DocumentStructureEntry per top-level heading.
  '''

  def __init__(self):
    # Every section starts out with no headings.
    self.structure = list()
class DocumentStructureEntry(object):
  '''A single heading within the document structure.

  |attributes| The attributes of the header tag that produced this entry.
  |name| This entry's name, taken from the header tag that produced it.
  |entries| The child DocumentStructureEntry items, as a list.
  '''

  def __init__(self, tag, attributes):
    # The tag is kept private: callers shouldn't need it, but it is required
    # for sanity checking. In particular, every h1 after the first is treated
    # as an h2, and exposing that would be odd.
    self._tag = tag
    # Set when the document overrides the entry's name with title="".
    self._has_explicit_name = False
    self.name = ''
    self.entries = []
    self.attributes = attributes

  def __repr__(self):
    # Rendered as the heading would look in markup, e.g. <h2>Name</h2>.
    tag = self._tag
    return '<%s>%s</%s>' % (tag, self.name, tag)

  def __str__(self):
    # The string form is the same as the repr.
    return repr(self)
def ParseDocument(document, expect_title=False):
  '''Extracts the title and the document structure from |document|.

  |document| is fed through a _DocumentParser created with |expect_title|;
  once the parser has been closed its ParseResult is returned.
  '''
  doc_parser = _DocumentParser(expect_title)
  doc_parser.feed(document)
  # parse_result is only populated by close().
  doc_parser.close()
  return doc_parser.parse_result
def RemoveTitle(document):
  '''Strips the first <h1>..</h1> out of |document|.

  Returns a (result, warning) tuple. When the title is removed, |result| is
  |document| without it and |warning| is None. When no title can be found, or
  |document| is malformed in some way, |result| is the unmodified |document|
  and |warning| is a message describing the problem.

  The tags are located with a plain substring search for '<h1' / '<H1' and
  '/h1>' / '/H1>'; |document| is not parsed.
  '''

  def earliest(*needles):
    # Index of whichever needle occurs first in |document|, or -1 when none of
    # them occur at all.
    hits = [pos for pos in map(document.find, needles) if pos != -1]
    return min(hits) if hits else -1

  opening = earliest('<h1', '<H1')
  if opening < 0:
    return document, 'No opening <h1> was found'

  closing = earliest('/h1>', '/H1>')
  if closing < 0:
    return document, 'No closing </h1> was found'

  if closing < opening:
    return document, 'The </h1> appeared before the <h1>'

  # |closing| points at the '/' of the closing tag, so skip past '/h1>'.
  after_title = closing + len('/h1>')
  return (document[:opening] + document[after_title:], None)
# Heading tags that become entries in a section's structure, ordered from
# outermost to innermost; a tag's index here decides how deeply its entry is
# nested. <h1> is not listed because it is handled separately as the title.
_HEADER_TAGS = ['h2', 'h3', 'h4']
class _DocumentParser(HTMLParser):
  '''HTMLParser for ParseDocument.

  Tracks <section> boundaries and <h1>/<h2>/<h3>/<h4> headings while a
  document is fed in, and builds |parse_result| when close() is called.
  '''

  def __init__(self, expect_title):
    HTMLParser.__init__(self)
    # Public.
    # Set to a ParseResult by close(); None until then.
    self.parse_result = None
    # Private.
    self._expect_title = expect_title
    # The entry for the first <h1> that was given a TOC entry, if any.
    self._title_entry = None
    # Finished, non-empty sections in the order they were completed.
    self._sections = []
    # The section that headings are currently being added to.
    self._processing_section = DocumentSection()
    # The heading whose start tag has been seen but whose end tag has not.
    self._processing_entry = None
    self._warnings = []

  def handle_starttag(self, tag, attrs):
    '''Starts a new section on <section>, or a new entry on a heading tag.
    All other tags are ignored.
    '''
    if tag == 'section':
      self._OnSectionBoundary()
      return

    if tag != 'h1' and tag not in _HEADER_TAGS:
      return

    if self._processing_entry is not None:
      self._WarnWithPosition('Found <%s> in the middle of processing a <%s>' %
                             (tag, self._processing_entry._tag))
      return

    attrs_dict = dict(attrs)
    self._processing_entry = DocumentStructureEntry(tag, attrs_dict)

    # The entry holds this same dict, so popping 'title' here also removes it
    # from the entry's |attributes|.
    explicit_name = attrs_dict.pop('title', None)
    if explicit_name == '':
      # Don't create a TOC entry at all if the tag has specified title="".
      # The entry stays as |_processing_entry| so that its closing tag is still
      # matched up in handle_endtag.
      return
    if explicit_name is not None:
      self._processing_entry.name = explicit_name
      self._processing_entry._has_explicit_name = True

    if tag == 'h1' and self._title_entry is not None:
      self._WarnWithPosition('Found multiple <h1> tags. Subsequent <h1> tags '
                             'will be classified as <h2> for the purpose of '
                             'the structure')
      # Only the local |tag| changes; the entry keeps its 'h1' |_tag| so that
      # the closing </h1> still matches it.
      tag = 'h2'

    if tag == 'h1':
      self._title_entry = self._processing_entry
    else:
      # Walk down through the most recent entry of each shallower heading
      # level to find the list that this entry belongs in.
      belongs_to = self._processing_section.structure
      for header in _HEADER_TAGS[:_HEADER_TAGS.index(tag)]:
        if not belongs_to:
          # TODO(kalman): Re-enable this warning once the reference pages have
          # their references fixed.
          #self._WarnWithPosition('Found <%s> without any preceding <%s>' %
          #                       (tag, header))
          break
        belongs_to = belongs_to[-1].entries
      belongs_to.append(self._processing_entry)

  def handle_endtag(self, tag):
    '''Ends the current section on </section>, or the entry being processed
    on a closing heading tag. All other tags are ignored.
    '''
    if tag == 'section':
      self._OnSectionBoundary()
      return

    if tag != 'h1' and tag not in _HEADER_TAGS:
      return

    if self._processing_entry is None:
      self._WarnWithPosition('Found closing </%s> without an opening <%s>' %
                             (tag, tag))
      return

    if self._processing_entry._tag != tag:
      self._WarnWithPosition('Found closing </%s> while processing a <%s>' %
                             (tag, self._processing_entry._tag))
      # Note: no early return, it's more likely that the mismatched header was
      # a typo rather than a misplaced closing header tag.

    self._processing_entry = None

  def handle_data(self, data):
    '''Appends |data| to the name of the entry being processed, unless there
    is no such entry or its name was set explicitly with title="...".
    '''
    if (self._processing_entry is not None and
        not self._processing_entry._has_explicit_name):
      # += is inefficient, but probably fine here because the chances of a
      # large number of nested tags within header tags is pretty low.
      self._processing_entry.name += data

  def close(self):
    '''Finishes parsing and populates |parse_result|.

    The title and its attributes are '' and {} when a title was expected but
    not found, and None and None when no title was expected.
    '''
    HTMLParser.close(self)

    # Flush the section that was still being built, if it has any headings.
    self._OnSectionBoundary()

    if self._processing_entry is not None:
      # This previously referenced an undefined |parser| rather than |self|,
      # raising a NameError for any document ending inside an open heading.
      self._warnings.append('Finished parsing while still processing a <%s>' %
                            self._processing_entry._tag)

    if self._expect_title:
      if not self._title_entry:
        self._warnings.append('Expected a title')
        title, title_attributes = '', {}
      else:
        title, title_attributes = (
            self._title_entry.name, self._title_entry.attributes)
    else:
      if self._title_entry:
        self._warnings.append('Found unexpected title "%s"' %
                              self._title_entry.name)
      title, title_attributes = None, None

    self.parse_result = ParseResult(
        title, title_attributes, self._sections, self._warnings)

  def _OnSectionBoundary(self):
    '''Records the section being built and starts a fresh one.'''
    # Only start a new section if the previous section was non-empty.
    if self._processing_section.structure:
      self._sections.append(self._processing_section)
      self._processing_section = DocumentSection()

  def _WarnWithPosition(self, message):
    '''Adds |message| to the warnings, suffixed with the parser's position.'''
    line, col = self.getpos()
    # The reported column is the parser's offset plus one.
    self._warnings.append('%s (line %s, column %s)' % (message, line, col + 1))