Fix the availability statement for the spawn*() functions to reflect the
[python/dscho.git] / Lib / sgmllib.py
blob1db5423254c9a89674881babd9025f58660ac68a
1 """A parser for SGML, using the derived class as a static DTD."""
3 # XXX This only supports those SGML features used by HTML.
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special). RCDATA is
9 # not supported at all.
12 import markupbase
13 import re
15 __all__ = ["SGMLParser"]
# Regular expressions used for parsing.  All patterns are compiled once at
# import time and shared by every parser instance.

# Next character that could start markup or a reference.
interesting = re.compile(r'[&<]')

# A reference or tag that might still be completed by more input.
incomplete = re.compile(r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        r'<([a-zA-Z][^<>]*|'
                        r'/([a-zA-Z][^<>]*)?|'
                        r'![^<>]*)?')

# Entity (&name) and numeric character (&#nnn) references.  Both patterns
# also consume the single character that terminates the reference.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#([0-9]+)[^0-9]')

# Start tags, including the SGML "<tag/data/" short form.
starttagopen = re.compile(r'<[>a-zA-Z]')
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')

# Terminators for processing instructions, tags and comments.
piclose = re.compile(r'>')
endbracket = re.compile(r'[<>]')
commentclose = re.compile(r'--\s*>')

# Tag name, and one attribute with an optional (possibly quoted) value.
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')
class SGMLParseError(RuntimeError):
    """Raised for every parse error reported by the SGML parser."""
45 # SGML parser base class -- find tags and call handler functions.
46 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
47 # The dtd is defined by deriving a class which defines methods
48 # with special names to handle tags: start_foo and end_foo to handle
49 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
50 # (Tags are converted to lower case for this purpose.) The data
51 # between tags is passed to the parser by calling self.handle_data()
52 # with some data as argument (the data may be split up in arbitrary
53 # chunks). Entity references are passed by calling
54 # self.handle_entityref() with the entity reference as argument.
class SGMLParser(markupbase.ParserBase):
    """SGML parser that finds tags and calls per-tag handler methods.

    Feed text with feed() and finish with close().  For a start tag <foo>
    the parser looks up start_foo and then do_foo on the instance; for an
    end tag </foo> it looks up end_foo.  Tags without a handler are passed
    to the unknown_*() hooks, and text between tags goes to handle_data().
    """

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Any positional arguments are accepted and ignored.
        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying *message*."""
        raise SGMLParseError(message)

    def goahead(self, end):
        """Internal -- handle buffered data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If *end* is true, force handling of all data as if followed by
        an EOF marker.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything that is left is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag leaves literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    # parse_pi() returns a length, not an absolute index.
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # NOTE(review): parse_declaration() is not defined in
                    # this class; presumably inherited from
                    # markupbase.ParserBase.
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminating character; give
                    # it back unless it was the ';' that ends the reference.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    # Same terminator handling as for character references.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        # Keep whatever could not be handled yet for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    def parse_comment(self, i, report=1):
        """Internal -- parse the comment starting at index *i*.

        Returns the index just past the comment terminator, or -1 if the
        comment is not terminated yet.  handle_comment() is called with
        the comment text only when *report* is true.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Extensions for the DOCTYPE scanner:
    # NOTE(review): not read in this class; presumably consumed by
    # markupbase.ParserBase -- confirm.
    _decl_otherchars = '='

    def parse_pi(self, i):
        """Internal -- parse the processing instruction at index *i*.

        Returns the length of the instruction (not an absolute index), or
        -1 if it is not terminated yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    __starttag_text = None

    def get_starttag_text(self):
        """Return the text recorded for the most recent start tag, or None."""
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal -- handle the start tag beginning at index *i*.

        Returns the index just past the tag, or -1 if the tag is not
        terminated yet.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without a value: the name doubles as the value.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of quotes.
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            # Consume the closing bracket; a '<' is left for the next tag.
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def parse_endtag(self, i):
        """Internal -- parse the end tag beginning at index *i*.

        Returns the index just past the tag, or -1 if the tag is not
        terminated yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish <tag/data/ (same as <tag>data</tag>)."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- finish processing of a start tag.

        Returns -1 for an unknown tag, 0 for an open-only tag (handled by
        a do_<tag> method) and 1 for a balanced tag (handled by a
        start_<tag> method and pushed on the stack).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal -- finish processing of an end tag.

        An empty *tag* closes the innermost open element.  Otherwise every
        element down to the innermost open *tag* is closed; a tag that is
        not open is reported as unbalanced or unknown.
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    # Only the existence of the handler matters here.
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            found = len(self.stack)
            # The loop keeps going after a hit, so 'found' ends up as the
            # index of the last (innermost) occurrence of the tag.
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    def handle_starttag(self, tag, method, attrs):
        """Overridable -- handle start tag by calling method(attrs)."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Overridable -- handle end tag by calling method()."""
        method()

    def report_unbalanced(self, tag):
        """Example -- report an unbalanced </...> tag when verbose is set."""
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def handle_entityref(self, name):
        """Handle entity references.

        There should be no need to override this method; it can be
        tailored by setting up the self.entitydefs mapping appropriately.
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)

    def handle_data(self, data):
        """Example -- handle data, should be overridden."""
        pass

    def handle_comment(self, data):
        """Example -- handle comment, could be overridden."""
        pass

    def handle_decl(self, decl):
        """Example -- handle declaration, could be overridden."""
        pass

    def handle_pi(self, data):
        """Example -- handle processing instruction, could be overridden."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
class TestSGMLParser(SGMLParser):
    """Parser used by test(): prints every event it receives.

    Character data is buffered in self.testdata and printed by flush()
    once its repr reaches 70 characters or another event arrives.
    """

    def __init__(self, verbose=0):
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer *data*, flushing once the buffered repr is long enough."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if there is any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment, eliding the middle of a long repr."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # One space-separated line: the tag, each attribute, then '>'.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print the end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print a notice about an unknown entity reference."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a notice about an unknown character reference."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        """Finish parsing, then print any data that is still buffered."""
        SGMLParser.close(self)
        self.flush()
def test(args=None):
    """Parse a file one character at a time and print the parser events.

    *args* defaults to sys.argv[1:].  A leading '-s' selects the silent
    SGMLParser instead of TestSGMLParser.  The next argument is the file
    to read ('-' means standard input); the default is 'test.html'.
    Exits with status 1 if the file cannot be opened.
    """
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() avoids version-specific "except ..." syntax.
            msg = sys.exc_info()[1]
            print('%s : %s' % (filename, msg))
            sys.exit(1)

    try:
        data = f.read()
    finally:
        # Close the file even if reading fails; never close stdin.
        if f is not sys.stdin:
            f.close()

    x = klass()
    # Feed one character per call to exercise incremental parsing.
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()