Lib/sgmllib.py

   1 """A parser for SGML, using the derived class as a static DTD."""
   2
   3 # XXX This only supports those SGML features used by HTML.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).
   9
  10
  11 import re
  12 import string
  13
  14
# Regular expressions used for parsing

# Finds the next character that can start markup: '&' (reference) or
# '<' (tag, comment, processing instruction or declaration).
interesting = re.compile('[&<]')
# Matches any prefix of a reference or tag.  SGMLParser.goahead() uses it
# when nothing else matched: if the match runs to the end of the buffer,
# more input is awaited, otherwise the matched text is passed on as data.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# Entity and character references.  Group 1 is the name or number; the
# match also includes one terminating character, which goahead() gives
# back to the input unless it is ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the empty start tag '<>'.
starttagopen = re.compile('<[>a-zA-Z]')
# SGML short tag '<tag/data/': group 1 is the tag, group 2 the data.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Processing instruction: '<?' ... '>'.
piopen = re.compile('<\?')
piclose = re.compile('>')
# '</' followed by a letter, '<' or '>' ('</>' is the empty end tag).
endtagopen = re.compile('</[<>a-zA-Z]')
# Either bracket ends the scan for the end of a tag.
endbracket = re.compile('[<>]')
# A complete '<!...>' declaration; goahead() skips it without calling
# a handler.
special = re.compile('<![^<>]*>')
# Comment delimiters; whitespace is allowed between '--' and '>'.
commentopen = re.compile('<!--')
commentclose = re.compile('--[%s]*>' % string.whitespace)
# A tag name.
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '= value'
# part, group 3 the value itself (single-quoted, double-quoted or bare).
attrfind = re.compile(
    '[%s]*([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace
    + ('([%s]*=[%s]*' % (string.whitespace, string.whitespace))
    + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!\(\)_#=~]*))?')
  41
  42
  43 # SGML parser base class -- find tags and call handler functions.
  44 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
  45 # The dtd is defined by deriving a class which defines methods
  46 # with special names to handle tags: start_foo and end_foo to handle
  47 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
  48 # (Tags are converted to lower case for this purpose.)  The data
  49 # between tags is passed to the parser by calling self.handle_data()
  50 # with some data as argument (the data may be split up in arbitrary
  51 # chunks).  Entity references are passed by calling
  52 # self.handle_entityref() with the entity reference as argument.
  53
  54 class SGMLParser:
  55
  56     # Interface -- initialize and reset this instance
  57     def __init__(self, verbose=0):
  58         self.verbose = verbose
  59         self.reset()
  60
  61     # Interface -- reset this instance.  Loses all unprocessed data
  62     def reset(self):
  63         self.rawdata = ''
  64         self.stack = []
  65         self.lasttag = '???'
  66         self.nomoretags = 0
  67         self.literal = 0
  68
  69     # For derived classes only -- enter literal mode (CDATA) till EOF
  70     def setnomoretags(self):
  71         self.nomoretags = self.literal = 1
  72
  73     # For derived classes only -- enter literal mode (CDATA)
  74     def setliteral(self, *args):
  75         self.literal = 1
  76
  77     # Interface -- feed some data to the parser.  Call this as
  78     # often as you want, with as little or as much text as you
  79     # want (may include '\n').  (This just saves the text, all the
  80     # processing is done by goahead().)
  81     def feed(self, data):
  82         self.rawdata = self.rawdata + data
  83         self.goahead(0)
  84
  85     # Interface -- handle the remaining data
  86     def close(self):
  87         self.goahead(1)
  88
  89     # Internal -- handle data as far as reasonable.  May leave state
  90     # and data to be processed by a subsequent call.  If 'end' is
  91     # true, force handling all data as if followed by EOF marker.
  92     def goahead(self, end):
  93         rawdata = self.rawdata
  94         i = 0
  95         n = len(rawdata)
  96         while i < n:
  97             if self.nomoretags:
  98                 self.handle_data(rawdata[i:n])
  99                 i = n
 100                 break
 101             match = interesting.search(rawdata, i)
 102             if match: j = match.start(0)
 103             else: j = n
 104             if i < j: self.handle_data(rawdata[i:j])
 105             i = j
 106             if i == n: break
 107             if rawdata[i] == '<':
 108                 if starttagopen.match(rawdata, i):
 109                     if self.literal:
 110                         self.handle_data(rawdata[i])
 111                         i = i+1
 112                         continue
 113                     k = self.parse_starttag(i)
 114                     if k < 0: break
 115                     i = k
 116                     continue
 117                 if endtagopen.match(rawdata, i):
 118                     k = self.parse_endtag(i)
 119                     if k < 0: break
 120                     i =  k
 121                     self.literal = 0
 122                     continue
 123                 if commentopen.match(rawdata, i):
 124                     if self.literal:
 125                         self.handle_data(rawdata[i])
 126                         i = i+1
 127                         continue
 128                     k = self.parse_comment(i)
 129                     if k < 0: break
 130                     i = i+k
 131                     continue
 132                 if piopen.match(rawdata, i):
 133                     if self.literal:
 134                         self.handle_data(rawdata[i])
 135                         i = i+1
 136                         continue
 137                     k = self.parse_pi(i)
 138                     if k < 0: break
 139                     i = i+k
 140                     continue
 141                 match = special.match(rawdata, i)
 142                 if match:
 143                     if self.literal:
 144                         self.handle_data(rawdata[i])
 145                         i = i+1
 146                         continue
 147                     i = match.end(0)
 148                     continue
 149             elif rawdata[i] == '&':
 150                 match = charref.match(rawdata, i)
 151                 if match:
 152                     name = match.group(1)
 153                     self.handle_charref(name)
 154                     i = match.end(0)
 155                     if rawdata[i-1] != ';': i = i-1
 156                     continue
 157                 match = entityref.match(rawdata, i)
 158                 if match:
 159                     name = match.group(1)
 160                     self.handle_entityref(name)
 161                     i = match.end(0)
 162                     if rawdata[i-1] != ';': i = i-1
 163                     continue
 164             else:
 165                 raise RuntimeError, 'neither < nor & ??'
 166             # We get here only if incomplete matches but
 167             # nothing else
 168             match = incomplete.match(rawdata, i)
 169             if not match:
 170                 self.handle_data(rawdata[i])
 171                 i = i+1
 172                 continue
 173             j = match.end(0)
 174             if j == n:
 175                 break # Really incomplete
 176             self.handle_data(rawdata[i:j])
 177             i = j
 178         # end while
 179         if end and i < n:
 180             self.handle_data(rawdata[i:n])
 181             i = n
 182         self.rawdata = rawdata[i:]
 183         # XXX if end: check for empty stack
 184
 185     # Internal -- parse comment, return length or -1 if not terminated
 186     def parse_comment(self, i):
 187         rawdata = self.rawdata
 188         if rawdata[i:i+4] <> '<!--':
 189             raise RuntimeError, 'unexpected call to handle_comment'
 190         match = commentclose.search(rawdata, i+4)
 191         if not match:
 192             return -1
 193         j = match.start(0)
 194         self.handle_comment(rawdata[i+4: j])
 195         j = match.end(0)
 196         return j-i
 197
 198     # Internal -- parse processing instr, return length or -1 if not terminated
 199     def parse_pi(self, i):
 200         rawdata = self.rawdata
 201         if rawdata[i:i+2] <> '<?':
 202             raise RuntimeError, 'unexpected call to handle_pi'
 203         match = piclose.search(rawdata, i+2)
 204         if not match:
 205             return -1
 206         j = match.start(0)
 207         self.handle_pi(rawdata[i+2: j])
 208         j = match.end(0)
 209         return j-i
 210
 211     # Internal -- handle starttag, return length or -1 if not terminated
 212     def parse_starttag(self, i):
 213         rawdata = self.rawdata
 214         if shorttagopen.match(rawdata, i):
 215             # SGML shorthand: <tag/data/ == <tag>data</tag>
 216             # XXX Can data contain &... (entity or char refs)?
 217             # XXX Can data contain < or > (tag characters)?
 218             # XXX Can there be whitespace before the first /?
 219             match = shorttag.match(rawdata, i)
 220             if not match:
 221                 return -1
 222             tag, data = match.group(1, 2)
 223             tag = string.lower(tag)
 224             self.finish_shorttag(tag, data)
 225             k = match.end(0)
 226             return k
 227         # XXX The following should skip matching quotes (' or ")
 228         match = endbracket.search(rawdata, i+1)
 229         if not match:
 230             return -1
 231         j = match.start(0)
 232         # Now parse the data between i+1 and j into a tag and attrs
 233         attrs = []
 234         if rawdata[i:i+2] == '<>':
 235             # SGML shorthand: <> == <last open tag seen>
 236             k = j
 237             tag = self.lasttag
 238         else:
 239             match = tagfind.match(rawdata, i+1)
 240             if not match:
 241                 raise RuntimeError, 'unexpected call to parse_starttag'
 242             k = match.end(0)
 243             tag = string.lower(rawdata[i+1:k])
 244             self.lasttag = tag
 245         while k < j:
 246             match = attrfind.match(rawdata, k)
 247             if not match: break
 248             attrname, rest, attrvalue = match.group(1, 2, 3)
 249             if not rest:
 250                 attrvalue = attrname
 251             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
 252                  attrvalue[:1] == '"' == attrvalue[-1:]:
 253                 attrvalue = attrvalue[1:-1]
 254             attrs.append((string.lower(attrname), attrvalue))
 255             k = match.end(0)
 256         if rawdata[j] == '>':
 257             j = j+1
 258         self.finish_starttag(tag, attrs)
 259         return j
 260
 261     # Internal -- parse endtag
 262     def parse_endtag(self, i):
 263         rawdata = self.rawdata
 264         match = endbracket.search(rawdata, i+1)
 265         if not match:
 266             return -1
 267         j = match.start(0)
 268         tag = string.lower(string.strip(rawdata[i+2:j]))
 269         if rawdata[j] == '>':
 270             j = j+1
 271         self.finish_endtag(tag)
 272         return j
 273
 274     # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
 275     def finish_shorttag(self, tag, data):
 276         self.finish_starttag(tag, [])
 277         self.handle_data(data)
 278         self.finish_endtag(tag)
 279
 280     # Internal -- finish processing of start tag
 281     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
 282     def finish_starttag(self, tag, attrs):
 283         try:
 284             method = getattr(self, 'start_' + tag)
 285         except AttributeError:
 286             try:
 287                 method = getattr(self, 'do_' + tag)
 288             except AttributeError:
 289                 self.unknown_starttag(tag, attrs)
 290                 return -1
 291             else:
 292                 self.handle_starttag(tag, method, attrs)
 293                 return 0
 294         else:
 295             self.stack.append(tag)
 296             self.handle_starttag(tag, method, attrs)
 297             return 1
 298
 299     # Internal -- finish processing of end tag
 300     def finish_endtag(self, tag):
 301         if not tag:
 302             found = len(self.stack) - 1
 303             if found < 0:
 304                 self.unknown_endtag(tag)
 305                 return
 306         else:
 307             if tag not in self.stack:
 308                 try:
 309                     method = getattr(self, 'end_' + tag)
 310                 except AttributeError:
 311                     self.unknown_endtag(tag)
 312                 else:
 313                     self.report_unbalanced(tag)
 314                 return
 315             found = len(self.stack)
 316             for i in range(found):
 317                 if self.stack[i] == tag: found = i
 318         while len(self.stack) > found:
 319             tag = self.stack[-1]
 320             try:
 321                 method = getattr(self, 'end_' + tag)
 322             except AttributeError:
 323                 method = None
 324             if method:
 325                 self.handle_endtag(tag, method)
 326             else:
 327                 self.unknown_endtag(tag)
 328             del self.stack[-1]
 329
 330     # Overridable -- handle start tag
 331     def handle_starttag(self, tag, method, attrs):
 332         method(attrs)
 333
 334     # Overridable -- handle end tag
 335     def handle_endtag(self, tag, method):
 336         method()
 337
 338     # Example -- report an unbalanced </...> tag.
 339     def report_unbalanced(self, tag):
 340         if self.verbose:
 341             print '*** Unbalanced </' + tag + '>'
 342             print '*** Stack:', self.stack
 343
 344     # Example -- handle character reference, no need to override
 345     def handle_charref(self, name):
 346         try:
 347             n = string.atoi(name)
 348         except string.atoi_error:
 349             self.unknown_charref(name)
 350             return
 351         if not 0 <= n <= 255:
 352             self.unknown_charref(name)
 353             return
 354         self.handle_data(chr(n))
 355
 356     # Definition of entities -- derived classes may override
 357     entitydefs = \
 358             {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
 359
 360     # Example -- handle entity reference, no need to override
 361     def handle_entityref(self, name):
 362         table = self.entitydefs
 363         if table.has_key(name):
 364             self.handle_data(table[name])
 365         else:
 366             self.unknown_entityref(name)
 367             return
 368
 369     # Example -- handle data, should be overridden
 370     def handle_data(self, data):
 371         pass
 372
 373     # Example -- handle comment, could be overridden
 374     def handle_comment(self, data):
 375         pass
 376
 377     # Example -- handle processing instruction, could be overridden
 378     def handle_pi(self, data):
 379         pass
 380
 381     # To be overridden -- handlers for unknown objects
 382     def unknown_starttag(self, tag, attrs): pass
 383     def unknown_endtag(self, tag): pass
 384     def unknown_charref(self, ref): pass
 385     def unknown_entityref(self, ref): pass
 386
 387
 388 class TestSGMLParser(SGMLParser):
 389
 390     def __init__(self, verbose=0):
 391         self.testdata = ""
 392         SGMLParser.__init__(self, verbose)
 393
 394     def handle_data(self, data):
 395         self.testdata = self.testdata + data
 396         if len(`self.testdata`) >= 70:
 397             self.flush()
 398
 399     def flush(self):
 400         data = self.testdata
 401         if data:
 402             self.testdata = ""
 403             print 'data:', `data`
 404
 405     def handle_comment(self, data):
 406         self.flush()
 407         r = `data`
 408         if len(r) > 68:
 409             r = r[:32] + '...' + r[-32:]
 410         print 'comment:', r
 411
 412     def unknown_starttag(self, tag, attrs):
 413         self.flush()
 414         if not attrs:
 415             print 'start tag: <' + tag + '>'
 416         else:
 417             print 'start tag: <' + tag,
 418             for name, value in attrs:
 419                 print name + '=' + '"' + value + '"',
 420             print '>'
 421
 422     def unknown_endtag(self, tag):
 423         self.flush()
 424         print 'end tag: </' + tag + '>'
 425
 426     def unknown_entityref(self, ref):
 427         self.flush()
 428         print '*** unknown entity ref: &' + ref + ';'
 429
 430     def unknown_charref(self, ref):
 431         self.flush()
 432         print '*** unknown char ref: &#' + ref + ';'
 433
 434     def close(self):
 435         SGMLParser.close(self)
 436         self.flush()
 437
 438
 439 def test(args = None):
 440     import sys
 441
 442     if not args:
 443         args = sys.argv[1:]
 444
 445     if args and args[0] == '-s':
 446         args = args[1:]
 447         klass = SGMLParser
 448     else:
 449         klass = TestSGMLParser
 450
 451     if args:
 452         file = args[0]
 453     else:
 454         file = 'test.html'
 455
 456     if file == '-':
 457         f = sys.stdin
 458     else:
 459         try:
 460             f = open(file, 'r')
 461         except IOError, msg:
 462             print file, ":", msg
 463             sys.exit(1)
 464
 465     data = f.read()
 466     if f is not sys.stdin:
 467         f.close()
 468
 469     x = klass()
 470     for c in data:
 471         x.feed(c)
 472     x.close()
 473
 474
# When run as a script, call test(), which takes its arguments from
# the command line.
if __name__ == '__main__':
    test()