1 """A parser for SGML, using the derived class as a static DTD."""
3 # XXX This only supports those SGML features used by HTML.
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special). RCDATA is
9 # not supported at all.
# Names exported by "from <this module> import *".
__all__ = ["SGMLParser"]

# Regular expressions used for parsing

# Locates the next character that may begin markup ('<') or a
# character/entity reference ('&').
interesting = re.compile('[&<]')
20 incomplete
= re
.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
# Entity reference: '&', a name (a letter followed by letters, digits,
# '-' or '.'), then one non-alphanumeric terminator character.
# Group 1 is the name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric character reference: '&#', decimal digits, then one non-digit
# terminator character.  Group 1 is the digit string.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by '>' or a letter: the opening of a start tag, including
# the empty '<>' form.
starttagopen = re.compile('<[>a-zA-Z]')
# '<name/': the opening of an SGML short tag.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# A complete short tag '<name/data/'.  Group 1 is the name, group 2 the
# data, which cannot contain '/'.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator searched for by parse_pi() to end a processing instruction.
piclose = re.compile('>')
# Either angle bracket; searched for by parse_starttag() and
# parse_endtag() to find where a tag ends.
endbracket = re.compile('[<>]')
# Comment terminator: '--', optional whitespace, '>'.
commentclose = re.compile(r'--\s*>')
# A tag name: a letter followed by letters, digits, '-', '_' or '.'.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: optional leading whitespace, the attribute name
# (group 1), then an optional '= value' part (group 2) whose value
# (group 3) is single-quoted, double-quoted, or an unquoted run of the
# characters listed in the final character class.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')
class SGMLParseError(RuntimeError):
    """Error type the parser raises for every kind of parse failure."""
45 # SGML parser base class -- find tags and call handler functions.
46 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
47 # The dtd is defined by deriving a class which defines methods
48 # with special names to handle tags: start_foo and end_foo to handle
49 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
50 # (Tags are converted to lower case for this purpose.) The data
51 # between tags is passed to the parser by calling self.handle_data()
52 # with some data as argument (the data may be split up in arbitrary
53 # chunks). Entity references are passed by calling
54 # self.handle_entityref() with the entity reference as argument.
56 class SGMLParser(markupbase.ParserBase):
58 def __init__(self, verbose=0):
59 """Initialize and reset this instance."""
60 self.verbose = verbose
64 """Reset this instance. Loses all unprocessed data."""
70 markupbase.ParserBase.reset(self)
def setnomoretags(self):
    """Switch to literal (CDATA) mode until EOF.

    Sets both the ``nomoretags`` and the ``literal`` flag to 1.
    Intended to be called by derived classes only.
    """
    self.nomoretags = 1
    self.literal = 1
79 def setliteral(self, *args):
80 """Enter literal mode (CDATA).
82 Intended for derived classes only.
87 """Feed some data to the parser.
89 Call this as often as you want, with as little or as much text
90 as you want (may include '\n'). (This just saves the text,
91 all the processing is done by goahead().)
94 self.rawdata = self.rawdata + data
98 """Handle the remaining data."""
def error(self, message):
    """Signal a parse error by raising SGMLParseError carrying *message*."""
    raise SGMLParseError(message)
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
107 def goahead(self, end):
108 rawdata = self.rawdata
113 self.handle_data(rawdata[i:n])
116 match = interesting.search(rawdata, i)
117 if match: j = match.start()
120 self.handle_data(rawdata[i:j])
123 if rawdata[i] == '<':
124 if starttagopen.match(rawdata, i):
126 self.handle_data(rawdata[i])
129 k = self.parse_starttag(i)
133 if rawdata.startswith("</", i):
134 k = self.parse_endtag(i)
141 self.handle_data("<")
147 if rawdata.startswith("<!--", i):
148 k = self.parse_comment(i)
152 if rawdata.startswith("<?", i):
157 if rawdata.startswith("<!", i):
158 # This is some sort of declaration; in "HTML as
159 # deployed," this should only be the document type
160 # declaration ("<!DOCTYPE html...>").
161 k = self.parse_declaration(i)
165 elif rawdata[i] == '&':
167 self.handle_data(rawdata[i])
170 match = charref.match(rawdata, i)
172 name = match.group(1)
173 self.handle_charref(name)
175 if rawdata[i-1] != ';': i = i-1
177 match = entityref.match(rawdata, i)
179 name = match.group(1)
180 self.handle_entityref(name)
182 if rawdata[i-1] != ';': i = i-1
185 self.error('neither
< nor
& ??
')
186 # We get here only if incomplete matches but
188 match = incomplete.match(rawdata, i)
190 self.handle_data(rawdata[i])
195 break # Really incomplete
196 self.handle_data(rawdata[i:j])
200 self.handle_data(rawdata[i:n])
202 self.rawdata = rawdata[i:]
203 # XXX if end: check for empty stack
205 # Internal -- parse comment, return length or -1 if not terminated
206 def parse_comment(self, i, report=1):
207 rawdata = self.rawdata
208 if rawdata[i:i+4] != '<!--':
209 self.error('unexpected call to
parse_comment()')
210 match = commentclose.search(rawdata, i+4)
215 self.handle_comment(rawdata[i+4: j])
# Extensions for the DOCTYPE scanner:
# NOTE(review): presumably read by the declaration parser inherited from
# markupbase.ParserBase as extra characters to accept -- confirm there.
_decl_otherchars = '='
221 # Internal -- parse processing instr, return length or -1 if not terminated
222 def parse_pi(self, i):
223 rawdata = self.rawdata
224 if rawdata[i:i+2] != '<?
':
225 self.error('unexpected call to
parse_pi()')
226 match = piclose.search(rawdata, i+2)
230 self.handle_pi(rawdata[i+2: j])
# Class-level default for the start-tag text that parse_starttag()
# stores on the instance; None when no text has been stored.
__starttag_text = None
def get_starttag_text(self):
    """Return the start-tag text stored by parse_starttag(), or None."""
    return self.__starttag_text
238 # Internal -- handle starttag, return length or -1 if not terminated
239 def parse_starttag(self, i):
240 self.__starttag_text = None
242 rawdata = self.rawdata
243 if shorttagopen.match(rawdata, i):
244 # SGML shorthand: <tag/data/ == <tag>data</tag>
245 # XXX Can data contain &... (entity or char refs)?
246 # XXX Can data contain < or > (tag characters)?
247 # XXX Can there be whitespace before the first /?
248 match = shorttag.match(rawdata, i)
251 tag, data = match.group(1, 2)
252 self.__starttag_text = '<%s/' % tag
255 self.finish_shorttag(tag, data)
256 self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
258 # XXX The following should skip matching quotes (' or ")
259 match = endbracket.search(rawdata, i+1)
263 # Now parse the data between i+1 and j into a tag and attrs
265 if rawdata[i:i+2] == '<>':
266 # SGML shorthand: <> == <last open tag seen>
270 match = tagfind.match(rawdata, i+1)
272 self.error('unexpected call to parse_starttag')
274 tag = rawdata[i+1:k].lower()
277 match = attrfind.match(rawdata, k)
279 attrname, rest, attrvalue = match.group(1, 2, 3)
282 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
283 attrvalue[:1] == '"' == attrvalue[-1:]:
284 attrvalue = attrvalue[1:-1]
285 attrs.append((attrname.lower(), attrvalue))
287 if rawdata[j] == '>':
289 self.__starttag_text = rawdata[start_pos:j]
290 self.finish_starttag(tag, attrs)
293 # Internal -- parse endtag
294 def parse_endtag(self, i):
295 rawdata = self.rawdata
296 match = endbracket.search(rawdata, i+1)
300 tag = rawdata[i+2:j].strip().lower()
301 if rawdata[j] == '>':
303 self.finish_endtag(tag)
306 # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
def finish_shorttag(self, tag, data):
    """Process the short form <tag/data/ as if it were <tag>data</tag>.

    Opens *tag* with an empty attribute list, hands *data* to
    handle_data(), then closes *tag* again.
    """
    no_attrs = []
    self.finish_starttag(tag, no_attrs)
    self.handle_data(data)
    self.finish_endtag(tag)
312 # Internal -- finish processing of start tag
313 # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
314 def finish_starttag(self, tag, attrs):
316 method = getattr(self, 'start_
' + tag)
317 except AttributeError:
319 method = getattr(self, 'do_
' + tag)
320 except AttributeError:
321 self.unknown_starttag(tag, attrs)
324 self.handle_starttag(tag, method, attrs)
327 self.stack.append(tag)
328 self.handle_starttag(tag, method, attrs)
331 # Internal -- finish processing of end tag
332 def finish_endtag(self, tag):
334 found = len(self.stack) - 1
336 self.unknown_endtag(tag)
339 if tag not in self.stack:
341 method = getattr(self, 'end_
' + tag)
342 except AttributeError:
343 self.unknown_endtag(tag)
345 self.report_unbalanced(tag)
347 found = len(self.stack)
348 for i in range(found):
349 if self.stack[i] == tag: found = i
350 while len(self.stack) > found:
353 method = getattr(self, 'end_
' + tag)
354 except AttributeError:
357 self.handle_endtag(tag, method)
359 self.unknown_endtag(tag)
362 # Overridable -- handle start tag
363 def handle_starttag(self, tag, method, attrs):
366 # Overridable -- handle end tag
367 def handle_endtag(self, tag, method):
370 # Example -- report an unbalanced </...> tag.
371 def report_unbalanced(self, tag):
373 print '*** Unbalanced
</' + tag + '>'
374 print '*** Stack
:', self.stack
376 def handle_charref(self, name):
377 """Handle character reference, no need to override."""
381 self.unknown_charref(name)
383 if not 0 <= n <= 255:
384 self.unknown_charref(name)
386 self.handle_data(chr(n))
388 # Definition of entities -- derived classes may override
390 {'lt
': '<', 'gt
': '>', 'amp
': '&', 'quot
': '"', 'apos': '\''}
392 def handle_entityref(self, name):
393 """Handle entity references.
395 There should be no need to override this method; it can be
396 tailored by setting up the self.entitydefs mapping appropriately.
398 table = self.entitydefs
399 if table.has_key(name):
400 self.handle_data(table[name])
402 self.unknown_entityref(name)
405 # Example -- handle data, should be overridden
406 def handle_data(self, data):
409 # Example -- handle comment, could be overridden
410 def handle_comment(self, data):
413 # Example -- handle declaration, could be overridden
414 def handle_decl(self, decl):
417 # Example -- handle processing instruction, could be overridden
418 def handle_pi(self, data):
421 # To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs):
    """Hook for a start tag that has no handler; does nothing by default."""
    return None
def unknown_endtag(self, tag):
    """Hook for an end tag that has no handler; does nothing by default."""
    return None
def unknown_charref(self, ref):
    """Hook for an unhandled character reference; does nothing by default."""
    return None
def unknown_entityref(self, ref):
    """Hook for an entity reference not in entitydefs; does nothing by default."""
    return None
428 class TestSGMLParser(SGMLParser):
430 def __init__(self, verbose=0):
432 SGMLParser.__init__(self, verbose)
434 def handle_data(self, data):
435 self.testdata = self.testdata + data
436 if len(`self.testdata`) >= 70:
443 print 'data:', `data`
445 def handle_comment(self, data):
449 r = r[:32] + '...' + r[-32:]
452 def unknown_starttag(self, tag, attrs):
455 print 'start tag: <' + tag + '>'
457 print 'start tag: <' + tag,
458 for name, value in attrs:
459 print name + '=' + '"' + value + '"',
462 def unknown_endtag(self, tag):
464 print 'end tag: </' + tag + '>'
466 def unknown_entityref(self, ref):
468 print '*** unknown entity ref: &' + ref + ';'
470 def unknown_charref(self, ref):
472 print '*** unknown char ref: &#' + ref + ';'
475 SGMLParser.close(self)
479 def test(args = None):
485 if args and args[0] == '-s':
489 klass = TestSGMLParser
506 if f is not sys.stdin:
515 if __name__ == '__main__':