Lib/sgmllib.py

   1 """A parser for SGML, using the derived class as a static DTD."""
   2
   3 # XXX This only supports those SGML features used by HTML.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).  RCDATA is
   9 # not supported at all.
  10
  11
  12 import markupbase
  13 import re
  14
  15 __all__ = ["SGMLParser"]
  16
  17 # Regular expressions used for parsing
  18
# Finds the next '&' or '<', the only characters that can start markup.
interesting = re.compile('[&<]')
# Matches the beginning of a reference or tag that more input might still
# complete; goahead() stops and waits when this match reaches the end of
# the buffer.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# Named entity reference (group 1 is the name) plus the single character
# that terminates it.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal character reference (group 1 is the digits) plus the single
# character that terminates it.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the empty start tag '<>'.
starttagopen = re.compile('<[>a-zA-Z]')
# Opening part of the short-tag form <tag/data/.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete short tag: group 1 is the tag name, group 2 the data.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# End of a processing instruction.
piclose = re.compile('>')
# First '<' or '>' after a tag opener; used to find where a tag ends.
endbracket = re.compile('[<>]')
# End of a comment: '--', optional whitespace, then '>'.
commentclose = re.compile(r'--\s*>')
# Tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '=value' part,
# group 3 the value (single-quoted, double-quoted or unquoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')
  38
  39
  40 class SGMLParseError(RuntimeError):
  41     """Exception raised for all parse errors."""
  42     pass
  43
  44
  45 # SGML parser base class -- find tags and call handler functions.
  46 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
  47 # The dtd is defined by deriving a class which defines methods
  48 # with special names to handle tags: start_foo and end_foo to handle
  49 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
  50 # (Tags are converted to lower case for this purpose.)  The data
  51 # between tags is passed to the parser by calling self.handle_data()
  52 # with some data as argument (the data may be split up in arbitrary
  53 # chunks).  Entity references are passed by calling
  54 # self.handle_entityref() with the entity reference as argument.
  55
  56 class SGMLParser(markupbase.ParserBase):
  57
    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        verbose is stored as self.verbose; when it is true,
        report_unbalanced() prints a diagnostic.
        """
        self.verbose = verbose
        self.reset()
  62
  63     def reset(self):
  64         """Reset this instance. Loses all unprocessed data."""
  65         self.rawdata = ''
  66         self.stack = []
  67         self.lasttag = '???'
  68         self.nomoretags = 0
  69         self.literal = 0
  70         markupbase.ParserBase.reset(self)
  71
    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.  Once self.nomoretags is set,
        goahead() passes all remaining input to handle_data() unparsed.
        """
        self.nomoretags = self.literal = 1
  78
    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.  Any positional arguments are
        accepted and ignored.  goahead() clears the flag again after the
        next end tag it parses.
        """
        self.literal = 1
  85
  86     def feed(self, data):
  87         """Feed some data to the parser.
  88
  89         Call this as often as you want, with as little or as much text
  90         as you want (may include '\n').  (This just saves the text,
  91         all the processing is done by goahead().)
  92         """
  93
  94         self.rawdata = self.rawdata + data
  95         self.goahead(0)
  96
    def close(self):
        """Handle the remaining data.

        Runs goahead() with end set, so input that is still buffered as
        incomplete is flushed to handle_data().
        """
        self.goahead(1)
 100
    def error(self, message):
        """Report a parse error by raising SGMLParseError(message)."""
        raise SGMLParseError(message)
 103
 104     # Internal -- handle data as far as reasonable.  May leave state
 105     # and data to be processed by a subsequent call.  If 'end' is
 106     # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process as much of self.rawdata as possible.

        Plain text goes to handle_data(), markup is dispatched to the
        parse_*() methods, and references go to handle_charref() and
        handle_entityref().  A construct that is still incomplete is left
        in self.rawdata for a later call, unless end is true, in which
        case whatever remains is passed to handle_data() as text.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # setnomoretags() was called: the rest is all data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Emit the text up to the next '&' or '<' (or end of buffer).
            match = interesting.search(rawdata, i)
            if match: j = match.start()
            else: j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is ordinary text.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    # A parsed end tag always leaves literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    k = self.parse_pi(i)
                    if k < 0: break
                    # parse_pi() returns a length, not an absolute index.
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0: break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern also consumed the terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    # Same as above: keep the terminator unless it is ';'.
                    if rawdata[i-1] != ';': i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                # Not even the start of markup: emit the character as data.
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            # The partial match is followed by more input and so can never
            # become valid markup; treat it as text.
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        # Keep whatever was not consumed for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack
 204
    # Internal -- parse comment, return end index or -1 if not terminated
 206     def parse_comment(self, i, report=1):
 207         rawdata = self.rawdata
 208         if rawdata[i:i+4] != '<!--':
 209             self.error('unexpected call to parse_comment()')
 210         match = commentclose.search(rawdata, i+4)
 211         if not match:
 212             return -1
 213         if report:
 214             j = match.start(0)
 215             self.handle_comment(rawdata[i+4: j])
 216         return match.end(0)
 217
 218     # Extensions for the DOCTYPE scanner:
    # NOTE(review): not referenced anywhere in this file; presumably read by
    # markupbase.ParserBase while scanning declarations -- confirm.
    _decl_otherchars = '='
 220
 221     # Internal -- parse processing instr, return length or -1 if not terminated
 222     def parse_pi(self, i):
 223         rawdata = self.rawdata
 224         if rawdata[i:i+2] != '<?':
 225             self.error('unexpected call to parse_pi()')
 226         match = piclose.search(rawdata, i+2)
 227         if not match:
 228             return -1
 229         j = match.start(0)
 230         self.handle_pi(rawdata[i+2: j])
 231         j = match.end(0)
 232         return j-i
 233
    # Source text of the start tag most recently handled by
    # parse_starttag(); None until a tag has been parsed.
    __starttag_text = None
    def get_starttag_text(self):
        """Return the text recorded by parse_starttag(), or None.

        parse_starttag() clears the value when it begins and stores the
        tag's source text as it processes the tag.
        """
        return self.__starttag_text
 237
    # Internal -- handle starttag, return end index or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag that begins at rawdata[i].

        Returns the index at which parsing should resume, or -1 if the tag
        is not complete yet.  The short form <tag/data/ is handed to
        finish_shorttag(); every other tag goes to finish_starttag() with
        a list of (name, value) attribute pairs.  Attribute names are
        lowercased, one pair of surrounding quotes is removed from a
        value, and an attribute written without a value gets its own name
        (as written) as the value.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            # Visible through get_starttag_text() while the handlers run.
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            # Afterwards record the opener exactly as it appears in the
            # input: everything up to and including the first '/'.
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match: break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # No '=value' part: the attribute name doubles as value.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # Consume the closing '>'.  If the tag was cut short by a '<'
        # instead, leave that character for the next parse step.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j
 292
    # Internal -- parse endtag, return end index or -1 if not terminated
 294     def parse_endtag(self, i):
 295         rawdata = self.rawdata
 296         match = endbracket.search(rawdata, i+1)
 297         if not match:
 298             return -1
 299         j = match.start(0)
 300         tag = rawdata[i+2:j].strip().lower()
 301         if rawdata[j] == '>':
 302             j = j+1
 303         self.finish_endtag(tag)
 304         return j
 305
 306     # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        """Process <tag/data/ as a start tag without attributes, then the
        data, then the matching end tag."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)
 311
 312     # Internal -- finish processing of start tag
 313     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
 314     def finish_starttag(self, tag, attrs):
 315         try:
 316             method = getattr(self, 'start_' + tag)
 317         except AttributeError:
 318             try:
 319                 method = getattr(self, 'do_' + tag)
 320             except AttributeError:
 321                 self.unknown_starttag(tag, attrs)
 322                 return -1
 323             else:
 324                 self.handle_starttag(tag, method, attrs)
 325                 return 0
 326         else:
 327             self.stack.append(tag)
 328             self.handle_starttag(tag, method, attrs)
 329             return 1
 330
 331     # Internal -- finish processing of end tag
 332     def finish_endtag(self, tag):
 333         if not tag:
 334             found = len(self.stack) - 1
 335             if found < 0:
 336                 self.unknown_endtag(tag)
 337                 return
 338         else:
 339             if tag not in self.stack:
 340                 try:
 341                     method = getattr(self, 'end_' + tag)
 342                 except AttributeError:
 343                     self.unknown_endtag(tag)
 344                 else:
 345                     self.report_unbalanced(tag)
 346                 return
 347             found = len(self.stack)
 348             for i in range(found):
 349                 if self.stack[i] == tag: found = i
 350         while len(self.stack) > found:
 351             tag = self.stack[-1]
 352             try:
 353                 method = getattr(self, 'end_' + tag)
 354             except AttributeError:
 355                 method = None
 356             if method:
 357                 self.handle_endtag(tag, method)
 358             else:
 359                 self.unknown_endtag(tag)
 360             del self.stack[-1]
 361
 362     # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        """Invoke the start_<tag> or do_<tag> handler found by
        finish_starttag(), passing it the attribute list."""
        method(attrs)
 365
 366     # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        """Invoke the end_<tag> handler found by finish_endtag()."""
        method()
 369
 370     # Example -- report an unbalanced </...> tag.
 371     def report_unbalanced(self, tag):
 372         if self.verbose:
 373             print '*** Unbalanced </' + tag + '>'
 374             print '*** Stack:', self.stack
 375
 376     def handle_charref(self, name):
 377         """Handle character reference, no need to override."""
 378         try:
 379             n = int(name)
 380         except ValueError:
 381             self.unknown_charref(name)
 382             return
 383         if not 0 <= n <= 255:
 384             self.unknown_charref(name)
 385             return
 386         self.handle_data(chr(n))
 387
 388     # Definition of entities -- derived classes may override
    # Maps an entity name to its replacement text; consulted by
    # handle_entityref().
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
 391
 392     def handle_entityref(self, name):
 393         """Handle entity references.
 394
 395         There should be no need to override this method; it can be
 396         tailored by setting up the self.entitydefs mapping appropriately.
 397         """
 398         table = self.entitydefs
 399         if table.has_key(name):
 400             self.handle_data(table[name])
 401         else:
 402             self.unknown_entityref(name)
 403             return
 404
 405     # Example -- handle data, should be overridden
    def handle_data(self, data):
        """Receive a chunk of text; does nothing here.

        goahead() may deliver one run of text in several calls.
        """
        pass
 408
 409     # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        """Receive the text of a comment, as extracted by parse_comment();
        does nothing here."""
        pass
 412
 413     # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        """Receive a declaration; does nothing here.

        NOTE(review): never called in this file; presumably invoked by
        markupbase.ParserBase.parse_declaration() -- confirm.
        """
        pass
 416
 417     # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        """Receive the text of a processing instruction, as extracted by
        parse_pi(); does nothing here."""
        pass
 420
 421     # To be overridden -- handlers for unknown objects
    # Called by finish_starttag() when neither start_<tag> nor do_<tag>
    # is defined.
    def unknown_starttag(self, tag, attrs): pass
    # Called by finish_endtag() for an end tag that has no end_<tag>
    # method, and for '</>' when no element is open.
    def unknown_endtag(self, tag): pass
    # Called by handle_charref() when the reference is not a decimal
    # number in the range 0..255.
    def unknown_charref(self, ref): pass
    # Called by handle_entityref() when the name is not in entitydefs.
    def unknown_entityref(self, ref): pass
 426
 427
 428 class TestSGMLParser(SGMLParser):
 429
 430     def __init__(self, verbose=0):
 431         self.testdata = ""
 432         SGMLParser.__init__(self, verbose)
 433
 434     def handle_data(self, data):
 435         self.testdata = self.testdata + data
 436         if len(`self.testdata`) >= 70:
 437             self.flush()
 438
 439     def flush(self):
 440         data = self.testdata
 441         if data:
 442             self.testdata = ""
 443             print 'data:', `data`
 444
 445     def handle_comment(self, data):
 446         self.flush()
 447         r = `data`
 448         if len(r) > 68:
 449             r = r[:32] + '...' + r[-32:]
 450         print 'comment:', r
 451
 452     def unknown_starttag(self, tag, attrs):
 453         self.flush()
 454         if not attrs:
 455             print 'start tag: <' + tag + '>'
 456         else:
 457             print 'start tag: <' + tag,
 458             for name, value in attrs:
 459                 print name + '=' + '"' + value + '"',
 460             print '>'
 461
 462     def unknown_endtag(self, tag):
 463         self.flush()
 464         print 'end tag: </' + tag + '>'
 465
 466     def unknown_entityref(self, ref):
 467         self.flush()
 468         print '*** unknown entity ref: &' + ref + ';'
 469
 470     def unknown_charref(self, ref):
 471         self.flush()
 472         print '*** unknown char ref: &#' + ref + ';'
 473
 474     def close(self):
 475         SGMLParser.close(self)
 476         self.flush()
 477
 478
 479 def test(args = None):
 480     import sys
 481
 482     if not args:
 483         args = sys.argv[1:]
 484
 485     if args and args[0] == '-s':
 486         args = args[1:]
 487         klass = SGMLParser
 488     else:
 489         klass = TestSGMLParser
 490
 491     if args:
 492         file = args[0]
 493     else:
 494         file = 'test.html'
 495
 496     if file == '-':
 497         f = sys.stdin
 498     else:
 499         try:
 500             f = open(file, 'r')
 501         except IOError, msg:
 502             print file, ":", msg
 503             sys.exit(1)
 504
 505     data = f.read()
 506     if f is not sys.stdin:
 507         f.close()
 508
 509     x = klass()
 510     for c in data:
 511         x.feed(c)
 512     x.close()
 513
 514
# Run the command-line driver when executed as a script.
if __name__ == '__main__':
    test()