1 """A parser for SGML, using the derived class as a static DTD."""
3 # XXX This only supports those SGML features used by HTML.
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special). RCDATA is
9 # not supported at all.
14 __all__
= ["SGMLParser", "SGMLParseError"]
16 # Regular expressions used for parsing
# Matches the next character that may start markup: '&' (entity or
# character reference) or '<' (tag, comment, declaration, ...).
interesting = re.compile(r'[&<]')
19 incomplete
= re
.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
# '&name' followed by one terminating non-alphanumeric character;
# group 1 is the entity name.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' followed by one terminating non-digit character;
# group 1 is the decimal number.
charref = re.compile(r'&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the empty tag '<>'.
starttagopen = re.compile(r'<[>a-zA-Z]')
# Opening part of the SGML short tag form '<tag/'.
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete short tag '<tag/data/'; groups are (tag, data).
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator searched for when scanning a processing instruction.
piclose = re.compile(r'>')
# Either bracket; used to locate the end of a tag.
endbracket = re.compile(r'[<>]')
# A tag name.
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '=value' part,
# group 3 the value itself (still carrying its quotes when quoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
43 # SGML parser base class -- find tags and call handler functions.
44 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
45 # The dtd is defined by deriving a class which defines methods
46 # with special names to handle tags: start_foo and end_foo to handle
47 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
48 # (Tags are converted to lower case for this purpose.) The data
49 # between tags is passed to the parser by calling self.handle_data()
50 # with some data as argument (the data may be split up in arbitrary
51 # chunks). Entity references are passed by calling
52 # self.handle_entityref() with the entity reference as argument.
54 class SGMLParser(_markupbase.ParserBase):
55 # Definition of entities -- derived classes may override
56 entity_or_charref = re.compile('&(?
:'
57 '([a
-zA
-Z
][-.a
-zA
-Z0
-9]*)|
#([0-9]+)'
60 def __init__(self
, verbose
=0):
61 """Initialize and reset this instance."""
62 self
.verbose
= verbose
66 """Reset this instance. Loses all unprocessed data."""
67 self
.__starttag
_text
= None
73 _markupbase
.ParserBase
.reset(self
)
def setnomoretags(self):
    """Enter literal mode (CDATA) till EOF.

    Intended for derived classes only.

    Sets both ``self.nomoretags`` and ``self.literal`` to 1.
    """
    self.nomoretags = self.literal = 1
82 def setliteral(self
, *args
):
83 """Enter literal mode (CDATA).
85 Intended for derived classes only.
90 """Feed some data to the parser.
92 Call this as often as you want, with as little or as much text
93 as you want (may include '\n'). (This just saves the text,
94 all the processing is done by goahead().)
97 self
.rawdata
= self
.rawdata
+ data
101 """Handle the remaining data."""
def error(self, message):
    """Report a parse error by raising SGMLParseError.

    message -- text describing the problem; it becomes the exception
    argument.

    Raises SGMLParseError unconditionally.
    """
    raise SGMLParseError(message)
107 # Internal -- handle data as far as reasonable. May leave state
108 # and data to be processed by a subsequent call. If 'end' is
109 # true, force handling all data as if followed by EOF marker.
110 def goahead(self
, end
):
111 rawdata
= self
.rawdata
116 self
.handle_data(rawdata
[i
:n
])
119 match
= interesting
.search(rawdata
, i
)
120 if match
: j
= match
.start()
123 self
.handle_data(rawdata
[i
:j
])
126 if rawdata
[i
] == '<':
127 if starttagopen
.match(rawdata
, i
):
129 self
.handle_data(rawdata
[i
])
132 k
= self
.parse_starttag(i
)
136 if rawdata
.startswith("</", i
):
137 k
= self
.parse_endtag(i
)
144 self
.handle_data("<")
150 if rawdata
.startswith("<!--", i
):
151 # Strictly speaking, a comment is --.*--
152 # within a declaration tag <!...>.
153 # This should be removed,
154 # and comments handled only in parse_declaration.
155 k
= self
.parse_comment(i
)
159 if rawdata
.startswith("<?", i
):
164 if rawdata
.startswith("<!", i
):
165 # This is some sort of declaration; in "HTML as
166 # deployed," this should only be the document type
167 # declaration ("<!DOCTYPE html...>").
168 k
= self
.parse_declaration(i
)
172 elif rawdata
[i
] == '&':
174 self
.handle_data(rawdata
[i
])
177 match
= charref
.match(rawdata
, i
)
179 name
= match
.group(1)
180 self
.handle_charref(name
)
182 if rawdata
[i
-1] != ';': i
= i
-1
184 match
= entityref
.match(rawdata
, i
)
186 name
= match
.group(1)
187 self
.handle_entityref(name
)
189 if rawdata
[i
-1] != ';': i
= i
-1
192 self
.error('neither < nor & ??')
193 # We get here only if incomplete matches but
195 match
= incomplete
.match(rawdata
, i
)
197 self
.handle_data(rawdata
[i
])
202 break # Really incomplete
203 self
.handle_data(rawdata
[i
:j
])
207 self
.handle_data(rawdata
[i
:n
])
209 self
.rawdata
= rawdata
[i
:]
210 # XXX if end: check for empty stack
212 # Extensions for the DOCTYPE scanner:
213 _decl_otherchars
= '='
215 # Internal -- parse processing instr, return length or -1 if not terminated
216 def parse_pi(self
, i
):
217 rawdata
= self
.rawdata
218 if rawdata
[i
:i
+2] != '<?':
219 self
.error('unexpected call to parse_pi()')
220 match
= piclose
.search(rawdata
, i
+2)
224 self
.handle_pi(rawdata
[i
+2: j
])
def get_starttag_text(self):
    """Return the start-tag text most recently stored by parse_starttag().

    The value is None when parse_starttag() has reset it and not yet
    stored new text.
    """
    return self.__starttag_text
231 # Internal -- handle starttag, return length or -1 if not terminated
232 def parse_starttag(self
, i
):
233 self
.__starttag
_text
= None
235 rawdata
= self
.rawdata
236 if shorttagopen
.match(rawdata
, i
):
237 # SGML shorthand: <tag/data/ == <tag>data</tag>
238 # XXX Can data contain &... (entity or char refs)?
239 # XXX Can data contain < or > (tag characters)?
240 # XXX Can there be whitespace before the first /?
241 match
= shorttag
.match(rawdata
, i
)
244 tag
, data
= match
.group(1, 2)
245 self
.__starttag
_text
= '<%s/' % tag
248 self
.finish_shorttag(tag
, data
)
249 self
.__starttag
_text
= rawdata
[start_pos
:match
.end(1) + 1]
251 # XXX The following should skip matching quotes (' or ")
252 # As a shortcut way to exit, this isn't so bad, but shouldn't
253 # be used to locate the actual end of the start tag since the
254 # < or > characters may be embedded in an attribute value.
255 match
= endbracket
.search(rawdata
, i
+1)
259 # Now parse the data between i+1 and j into a tag and attrs
261 if rawdata
[i
:i
+2] == '<>':
262 # SGML shorthand: <> == <last open tag seen>
266 match
= tagfind
.match(rawdata
, i
+1)
268 self
.error('unexpected call to parse_starttag')
270 tag
= rawdata
[i
+1:k
].lower()
273 match
= attrfind
.match(rawdata
, k
)
275 attrname
, rest
, attrvalue
= match
.group(1, 2, 3)
279 if (attrvalue
[:1] == "'" == attrvalue
[-1:] or
280 attrvalue
[:1] == '"' == attrvalue
[-1:]):
282 attrvalue
= attrvalue
[1:-1]
283 attrvalue
= self
.entity_or_charref
.sub(
284 self
._convert
_ref
, attrvalue
)
285 attrs
.append((attrname
.lower(), attrvalue
))
287 if rawdata
[j
] == '>':
289 self
.__starttag
_text
= rawdata
[start_pos
:j
]
290 self
.finish_starttag(tag
, attrs
)
293 # Internal -- convert entity or character reference
294 def _convert_ref(self
, match
):
296 return self
.convert_charref(match
.group(2)) or \
297 '&#%s%s' % match
.groups()[1:]
299 return self
.convert_entityref(match
.group(1)) or \
300 '&%s;' % match
.group(1)
302 return '&%s' % match
.group(1)
304 # Internal -- parse endtag
305 def parse_endtag(self
, i
):
306 rawdata
= self
.rawdata
307 match
= endbracket
.search(rawdata
, i
+1)
311 tag
= rawdata
[i
+2:j
].strip().lower()
312 if rawdata
[j
] == '>':
314 self
.finish_endtag(tag
)
317 # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
def finish_shorttag(self, tag, data):
    """Process the short tag form <tag/data/ exactly like <tag>data</tag>.

    tag  -- the tag name
    data -- the text enclosed between the two slashes

    The start tag is reported with an empty attribute list, then the
    data, then the matching end tag.
    """
    self.finish_starttag(tag, [])
    self.handle_data(data)
    self.finish_endtag(tag)
323 # Internal -- finish processing of start tag
324 # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
325 def finish_starttag(self
, tag
, attrs
):
327 method
= getattr(self
, 'start_' + tag
)
328 except AttributeError:
330 method
= getattr(self
, 'do_' + tag
)
331 except AttributeError:
332 self
.unknown_starttag(tag
, attrs
)
335 self
.handle_starttag(tag
, method
, attrs
)
338 self
.stack
.append(tag
)
339 self
.handle_starttag(tag
, method
, attrs
)
342 # Internal -- finish processing of end tag
343 def finish_endtag(self
, tag
):
345 found
= len(self
.stack
) - 1
347 self
.unknown_endtag(tag
)
350 if tag
not in self
.stack
:
352 method
= getattr(self
, 'end_' + tag
)
353 except AttributeError:
354 self
.unknown_endtag(tag
)
356 self
.report_unbalanced(tag
)
358 found
= len(self
.stack
)
359 for i
in range(found
):
360 if self
.stack
[i
] == tag
: found
= i
361 while len(self
.stack
) > found
:
364 method
= getattr(self
, 'end_' + tag
)
365 except AttributeError:
368 self
.handle_endtag(tag
, method
)
370 self
.unknown_endtag(tag
)
373 # Overridable -- handle start tag
374 def handle_starttag(self
, tag
, method
, attrs
):
377 # Overridable -- handle end tag
378 def handle_endtag(self
, tag
, method
):
381 # Example -- report an unbalanced </...> tag.
382 def report_unbalanced(self
, tag
):
384 print('*** Unbalanced </' + tag
+ '>')
385 print('*** Stack:', self
.stack
)
387 def convert_charref(self
, name
):
388 """Convert character reference, may be overridden."""
393 if not 0 <= n
<= 127:
395 return self
.convert_codepoint(n
)
def convert_codepoint(self, codepoint):
    """Return the one-character string for the integer *codepoint*.

    Raises whatever chr() raises for a value it does not accept.
    """
    return chr(codepoint)
400 def handle_charref(self
, name
):
401 """Handle character reference, no need to override."""
402 replacement
= self
.convert_charref(name
)
403 if replacement
is None:
404 self
.unknown_charref(name
)
406 self
.handle_data(replacement
)
408 # Definition of entities -- derived classes may override
410 {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
412 def convert_entityref(self
, name
):
413 """Convert entity references.
415 As an alternative to overriding this method; one can tailor the
416 results by setting up the self.entitydefs mapping appropriately.
418 table
= self
.entitydefs
424 def handle_entityref(self
, name
):
425 """Handle entity references, no need to override."""
426 replacement
= self
.convert_entityref(name
)
427 if replacement
is None:
428 self
.unknown_entityref(name
)
430 self
.handle_data(replacement
)
432 # Example -- handle data, should be overridden
433 def handle_data(self
, data
):
436 # Example -- handle comment, could be overridden
437 def handle_comment(self
, data
):
440 # Example -- handle declaration, could be overridden
441 def handle_decl(self
, decl
):
444 # Example -- handle processing instruction, could be overridden
445 def handle_pi(self
, data
):
448 # To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs):
    """Hook for a start tag with no start_<tag> or do_<tag> handler.

    Does nothing by default; meant to be overridden.
    """
def unknown_endtag(self, tag):
    """Hook for an end tag that has no handler.

    Does nothing by default; meant to be overridden.
    """
def unknown_charref(self, ref):
    """Hook for a character reference that could not be converted.

    Does nothing by default; meant to be overridden.
    """
def unknown_entityref(self, ref):
    """Hook for an entity reference that could not be converted.

    Does nothing by default; meant to be overridden.
    """
455 class TestSGMLParser(SGMLParser
):
457 def __init__(self
, verbose
=0):
459 SGMLParser
.__init
__(self
, verbose
)
461 def handle_data(self
, data
):
462 self
.testdata
= self
.testdata
+ data
463 if len(repr(self
.testdata
)) >= 70:
470 print('data:', repr(data
))
472 def handle_comment(self
, data
):
476 r
= r
[:32] + '...' + r
[-32:]
479 def unknown_starttag(self
, tag
, attrs
):
482 print('start tag: <' + tag
+ '>')
484 print('start tag: <' + tag
, end
=' ')
485 for name
, value
in attrs
:
486 print(name
+ '=' + '"' + value
+ '"', end
=' ')
489 def unknown_endtag(self
, tag
):
491 print('end tag: </' + tag
+ '>')
493 def unknown_entityref(self
, ref
):
495 print('*** unknown entity ref: &' + ref
+ ';')
497 def unknown_charref(self
, ref
):
499 print('*** unknown char ref: &#' + ref
+ ';')
501 def unknown_decl(self
, data
):
503 print('*** unknown decl: [' + data
+ ']')
506 SGMLParser
.close(self
)
510 def test(args
= None):
516 if args
and args
[0] == '-s':
520 klass
= TestSGMLParser
532 except IOError as msg
:
533 print(file, ":", msg
)
537 if f
is not sys
.stdin
:
546 if __name__
== '__main__':