Lib/sgmllib.py

   1 """A parser for SGML, using the derived class as a static DTD."""
   2
   3 # XXX This only supports those SGML features used by HTML.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).
   9
  10
  11 import re
  12 import string
  13
  14 __all__ = ["SGMLParser"]
  15
  16 # Regular expressions used for parsing
  17
  18 interesting = re.compile('[&<]')
  19 incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
  20                            '<([a-zA-Z][^<>]*|'
  21                               '/([a-zA-Z][^<>]*)?|'
  22                               '![^<>]*)?')
  23
  24 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  25 charref = re.compile('&#([0-9]+)[^0-9]')
  26
  27 starttagopen = re.compile('<[>a-zA-Z]')
  28 shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
  29 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
  30 piopen = re.compile('<\?')
  31 piclose = re.compile('>')
  32 endtagopen = re.compile('</[<>a-zA-Z]')
  33 endbracket = re.compile('[<>]')
  34 special = re.compile('<![^<>]*>')
  35 commentopen = re.compile('<!--')
  36 commentclose = re.compile('--[%s]*>' % string.whitespace)
  37 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9]*')
  38 attrfind = re.compile(
  39     '[%s]*([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace
  40     + ('([%s]*=[%s]*' % (string.whitespace, string.whitespace))
  41     + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
  42
  43
  44 # SGML parser base class -- find tags and call handler functions.
  45 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
  46 # The dtd is defined by deriving a class which defines methods
  47 # with special names to handle tags: start_foo and end_foo to handle
  48 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
  49 # (Tags are converted to lower case for this purpose.)  The data
  50 # between tags is passed to the parser by calling self.handle_data()
  51 # with some data as argument (the data may be split up in arbitrary
  52 # chunks).  Entity references are passed by calling
  53 # self.handle_entityref() with the entity reference as argument.
  54
  55 class SGMLParser:
  56
  57     # Interface -- initialize and reset this instance
  58     def __init__(self, verbose=0):
  59         self.verbose = verbose
  60         self.reset()
  61
  62     # Interface -- reset this instance.  Loses all unprocessed data
  63     def reset(self):
  64         self.rawdata = ''
  65         self.stack = []
  66         self.lasttag = '???'
  67         self.nomoretags = 0
  68         self.literal = 0
  69
  70     # For derived classes only -- enter literal mode (CDATA) till EOF
  71     def setnomoretags(self):
  72         self.nomoretags = self.literal = 1
  73
  74     # For derived classes only -- enter literal mode (CDATA)
  75     def setliteral(self, *args):
  76         self.literal = 1
  77
  78     # Interface -- feed some data to the parser.  Call this as
  79     # often as you want, with as little or as much text as you
  80     # want (may include '\n').  (This just saves the text, all the
  81     # processing is done by goahead().)
  82     def feed(self, data):
  83         self.rawdata = self.rawdata + data
  84         self.goahead(0)
  85
  86     # Interface -- handle the remaining data
  87     def close(self):
  88         self.goahead(1)
  89
  90     # Internal -- handle data as far as reasonable.  May leave state
  91     # and data to be processed by a subsequent call.  If 'end' is
  92     # true, force handling all data as if followed by EOF marker.
  93     def goahead(self, end):
  94         rawdata = self.rawdata
  95         i = 0
  96         n = len(rawdata)
  97         while i < n:
  98             if self.nomoretags:
  99                 self.handle_data(rawdata[i:n])
 100                 i = n
 101                 break
 102             match = interesting.search(rawdata, i)
 103             if match: j = match.start(0)
 104             else: j = n
 105             if i < j: self.handle_data(rawdata[i:j])
 106             i = j
 107             if i == n: break
 108             if rawdata[i] == '<':
 109                 if starttagopen.match(rawdata, i):
 110                     if self.literal:
 111                         self.handle_data(rawdata[i])
 112                         i = i+1
 113                         continue
 114                     k = self.parse_starttag(i)
 115                     if k < 0: break
 116                     i = k
 117                     continue
 118                 if endtagopen.match(rawdata, i):
 119                     k = self.parse_endtag(i)
 120                     if k < 0: break
 121                     i =  k
 122                     self.literal = 0
 123                     continue
 124                 if commentopen.match(rawdata, i):
 125                     if self.literal:
 126                         self.handle_data(rawdata[i])
 127                         i = i+1
 128                         continue
 129                     k = self.parse_comment(i)
 130                     if k < 0: break
 131                     i = i+k
 132                     continue
 133                 if piopen.match(rawdata, i):
 134                     if self.literal:
 135                         self.handle_data(rawdata[i])
 136                         i = i+1
 137                         continue
 138                     k = self.parse_pi(i)
 139                     if k < 0: break
 140                     i = i+k
 141                     continue
 142                 match = special.match(rawdata, i)
 143                 if match:
 144                     if self.literal:
 145                         self.handle_data(rawdata[i])
 146                         i = i+1
 147                         continue
 148                     i = match.end(0)
 149                     continue
 150             elif rawdata[i] == '&':
 151                 match = charref.match(rawdata, i)
 152                 if match:
 153                     name = match.group(1)
 154                     self.handle_charref(name)
 155                     i = match.end(0)
 156                     if rawdata[i-1] != ';': i = i-1
 157                     continue
 158                 match = entityref.match(rawdata, i)
 159                 if match:
 160                     name = match.group(1)
 161                     self.handle_entityref(name)
 162                     i = match.end(0)
 163                     if rawdata[i-1] != ';': i = i-1
 164                     continue
 165             else:
 166                 raise RuntimeError, 'neither < nor & ??'
 167             # We get here only if incomplete matches but
 168             # nothing else
 169             match = incomplete.match(rawdata, i)
 170             if not match:
 171                 self.handle_data(rawdata[i])
 172                 i = i+1
 173                 continue
 174             j = match.end(0)
 175             if j == n:
 176                 break # Really incomplete
 177             self.handle_data(rawdata[i:j])
 178             i = j
 179         # end while
 180         if end and i < n:
 181             self.handle_data(rawdata[i:n])
 182             i = n
 183         self.rawdata = rawdata[i:]
 184         # XXX if end: check for empty stack
 185
 186     # Internal -- parse comment, return length or -1 if not terminated
 187     def parse_comment(self, i):
 188         rawdata = self.rawdata
 189         if rawdata[i:i+4] != '<!--':
 190             raise RuntimeError, 'unexpected call to handle_comment'
 191         match = commentclose.search(rawdata, i+4)
 192         if not match:
 193             return -1
 194         j = match.start(0)
 195         self.handle_comment(rawdata[i+4: j])
 196         j = match.end(0)
 197         return j-i
 198
 199     # Internal -- parse processing instr, return length or -1 if not terminated
 200     def parse_pi(self, i):
 201         rawdata = self.rawdata
 202         if rawdata[i:i+2] != '<?':
 203             raise RuntimeError, 'unexpected call to handle_pi'
 204         match = piclose.search(rawdata, i+2)
 205         if not match:
 206             return -1
 207         j = match.start(0)
 208         self.handle_pi(rawdata[i+2: j])
 209         j = match.end(0)
 210         return j-i
 211
 212     __starttag_text = None
 213     def get_starttag_text(self):
 214         return self.__starttag_text
 215
 216     # Internal -- handle starttag, return length or -1 if not terminated
 217     def parse_starttag(self, i):
 218         self.__starttag_text = None
 219         start_pos = i
 220         rawdata = self.rawdata
 221         if shorttagopen.match(rawdata, i):
 222             # SGML shorthand: <tag/data/ == <tag>data</tag>
 223             # XXX Can data contain &... (entity or char refs)?
 224             # XXX Can data contain < or > (tag characters)?
 225             # XXX Can there be whitespace before the first /?
 226             match = shorttag.match(rawdata, i)
 227             if not match:
 228                 return -1
 229             tag, data = match.group(1, 2)
 230             self.__starttag_text = '<%s/' % tag
 231             tag = tag.lower()
 232             k = match.end(0)
 233             self.finish_shorttag(tag, data)
 234             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
 235             return k
 236         # XXX The following should skip matching quotes (' or ")
 237         match = endbracket.search(rawdata, i+1)
 238         if not match:
 239             return -1
 240         j = match.start(0)
 241         # Now parse the data between i+1 and j into a tag and attrs
 242         attrs = []
 243         if rawdata[i:i+2] == '<>':
 244             # SGML shorthand: <> == <last open tag seen>
 245             k = j
 246             tag = self.lasttag
 247         else:
 248             match = tagfind.match(rawdata, i+1)
 249             if not match:
 250                 raise RuntimeError, 'unexpected call to parse_starttag'
 251             k = match.end(0)
 252             tag = rawdata[i+1:k].lower()
 253             self.lasttag = tag
 254         while k < j:
 255             match = attrfind.match(rawdata, k)
 256             if not match: break
 257             attrname, rest, attrvalue = match.group(1, 2, 3)
 258             if not rest:
 259                 attrvalue = attrname
 260             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
 261                  attrvalue[:1] == '"' == attrvalue[-1:]:
 262                 attrvalue = attrvalue[1:-1]
 263             attrs.append((attrname.lower(), attrvalue))
 264             k = match.end(0)
 265         if rawdata[j] == '>':
 266             j = j+1
 267         self.__starttag_text = rawdata[start_pos:j]
 268         self.finish_starttag(tag, attrs)
 269         return j
 270
 271     # Internal -- parse endtag
 272     def parse_endtag(self, i):
 273         rawdata = self.rawdata
 274         match = endbracket.search(rawdata, i+1)
 275         if not match:
 276             return -1
 277         j = match.start(0)
 278         tag = rawdata[i+2:j].strip().lower()
 279         if rawdata[j] == '>':
 280             j = j+1
 281         self.finish_endtag(tag)
 282         return j
 283
 284     # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
 285     def finish_shorttag(self, tag, data):
 286         self.finish_starttag(tag, [])
 287         self.handle_data(data)
 288         self.finish_endtag(tag)
 289
 290     # Internal -- finish processing of start tag
 291     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
 292     def finish_starttag(self, tag, attrs):
 293         try:
 294             method = getattr(self, 'start_' + tag)
 295         except AttributeError:
 296             try:
 297                 method = getattr(self, 'do_' + tag)
 298             except AttributeError:
 299                 self.unknown_starttag(tag, attrs)
 300                 return -1
 301             else:
 302                 self.handle_starttag(tag, method, attrs)
 303                 return 0
 304         else:
 305             self.stack.append(tag)
 306             self.handle_starttag(tag, method, attrs)
 307             return 1
 308
 309     # Internal -- finish processing of end tag
 310     def finish_endtag(self, tag):
 311         if not tag:
 312             found = len(self.stack) - 1
 313             if found < 0:
 314                 self.unknown_endtag(tag)
 315                 return
 316         else:
 317             if tag not in self.stack:
 318                 try:
 319                     method = getattr(self, 'end_' + tag)
 320                 except AttributeError:
 321                     self.unknown_endtag(tag)
 322                 else:
 323                     self.report_unbalanced(tag)
 324                 return
 325             found = len(self.stack)
 326             for i in range(found):
 327                 if self.stack[i] == tag: found = i
 328         while len(self.stack) > found:
 329             tag = self.stack[-1]
 330             try:
 331                 method = getattr(self, 'end_' + tag)
 332             except AttributeError:
 333                 method = None
 334             if method:
 335                 self.handle_endtag(tag, method)
 336             else:
 337                 self.unknown_endtag(tag)
 338             del self.stack[-1]
 339
 340     # Overridable -- handle start tag
 341     def handle_starttag(self, tag, method, attrs):
 342         method(attrs)
 343
 344     # Overridable -- handle end tag
 345     def handle_endtag(self, tag, method):
 346         method()
 347
 348     # Example -- report an unbalanced </...> tag.
 349     def report_unbalanced(self, tag):
 350         if self.verbose:
 351             print '*** Unbalanced </' + tag + '>'
 352             print '*** Stack:', self.stack
 353
 354     # Example -- handle character reference, no need to override
 355     def handle_charref(self, name):
 356         try:
 357             n = int(name)
 358         except ValueError:
 359             self.unknown_charref(name)
 360             return
 361         if not 0 <= n <= 255:
 362             self.unknown_charref(name)
 363             return
 364         self.handle_data(chr(n))
 365
 366     # Definition of entities -- derived classes may override
 367     entitydefs = \
 368             {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
 369
 370     # Example -- handle entity reference, no need to override
 371     def handle_entityref(self, name):
 372         table = self.entitydefs
 373         if table.has_key(name):
 374             self.handle_data(table[name])
 375         else:
 376             self.unknown_entityref(name)
 377             return
 378
 379     # Example -- handle data, should be overridden
 380     def handle_data(self, data):
 381         pass
 382
 383     # Example -- handle comment, could be overridden
 384     def handle_comment(self, data):
 385         pass
 386
 387     # Example -- handle processing instruction, could be overridden
 388     def handle_pi(self, data):
 389         pass
 390
 391     # To be overridden -- handlers for unknown objects
 392     def unknown_starttag(self, tag, attrs): pass
 393     def unknown_endtag(self, tag): pass
 394     def unknown_charref(self, ref): pass
 395     def unknown_entityref(self, ref): pass
 396
 397
 398 class TestSGMLParser(SGMLParser):
 399
 400     def __init__(self, verbose=0):
 401         self.testdata = ""
 402         SGMLParser.__init__(self, verbose)
 403
 404     def handle_data(self, data):
 405         self.testdata = self.testdata + data
 406         if len(`self.testdata`) >= 70:
 407             self.flush()
 408
 409     def flush(self):
 410         data = self.testdata
 411         if data:
 412             self.testdata = ""
 413             print 'data:', `data`
 414
 415     def handle_comment(self, data):
 416         self.flush()
 417         r = `data`
 418         if len(r) > 68:
 419             r = r[:32] + '...' + r[-32:]
 420         print 'comment:', r
 421
 422     def unknown_starttag(self, tag, attrs):
 423         self.flush()
 424         if not attrs:
 425             print 'start tag: <' + tag + '>'
 426         else:
 427             print 'start tag: <' + tag,
 428             for name, value in attrs:
 429                 print name + '=' + '"' + value + '"',
 430             print '>'
 431
 432     def unknown_endtag(self, tag):
 433         self.flush()
 434         print 'end tag: </' + tag + '>'
 435
 436     def unknown_entityref(self, ref):
 437         self.flush()
 438         print '*** unknown entity ref: &' + ref + ';'
 439
 440     def unknown_charref(self, ref):
 441         self.flush()
 442         print '*** unknown char ref: &#' + ref + ';'
 443
 444     def close(self):
 445         SGMLParser.close(self)
 446         self.flush()
 447
 448
 449 def test(args = None):
 450     import sys
 451
 452     if not args:
 453         args = sys.argv[1:]
 454
 455     if args and args[0] == '-s':
 456         args = args[1:]
 457         klass = SGMLParser
 458     else:
 459         klass = TestSGMLParser
 460
 461     if args:
 462         file = args[0]
 463     else:
 464         file = 'test.html'
 465
 466     if file == '-':
 467         f = sys.stdin
 468     else:
 469         try:
 470             f = open(file, 'r')
 471         except IOError, msg:
 472             print file, ":", msg
 473             sys.exit(1)
 474
 475     data = f.read()
 476     if f is not sys.stdin:
 477         f.close()
 478
 479     x = klass()
 480     for c in data:
 481         x.feed(c)
 482     x.close()
 483
 484
# Run the command-line test driver when this file is executed as a script
# rather than imported.
if __name__ == '__main__':
    test()