src/searchengine/nova3/sgmllib3.py

   1 """A parser for SGML, using the derived class as a static DTD."""
   2
   3 # XXX This only supports those SGML features used by HTML.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).  RCDATA is
   9 # not supported at all.
  10
  11 import _markupbase
  12 import re
  13
# Public names exported by a star-import of this module.
__all__ = ["SGMLParser", "SGMLParseError"]
  15
  16 # Regular expressions used for parsing
  17
  18 interesting = re.compile('[&<]')
  19 incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
  20                            '<([a-zA-Z][^<>]*|'
  21                               '/([a-zA-Z][^<>]*)?|'
  22                               '![^<>]*)?')
  23
  24 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  25 charref = re.compile('&#([0-9]+)[^0-9]')
  26
  27 starttagopen = re.compile('<[>a-zA-Z]')
  28 shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
  29 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
  30 piclose = re.compile('>')
  31 endbracket = re.compile('[<>]')
  32 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
  33 attrfind = re.compile(
  34     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
  35     r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
  36
  37
  38 class SGMLParseError(RuntimeError):
  39     """Exception raised for all parse errors."""
  40     pass
  41
  42
  43 # SGML parser base class -- find tags and call handler functions.
  44 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
  45 # The dtd is defined by deriving a class which defines methods
  46 # with special names to handle tags: start_foo and end_foo to handle
  47 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
  48 # (Tags are converted to lower case for this purpose.)  The data
  49 # between tags is passed to the parser by calling self.handle_data()
  50 # with some data as argument (the data may be split up in arbitrary
  51 # chunks).  Entity references are passed by calling
  52 # self.handle_entityref() with the entity reference as argument.
  53
  54 class SGMLParser(_markupbase.ParserBase):
  55     # Definition of entities -- derived classes may override
  56     entity_or_charref = re.compile('&(?:'
  57       '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
  58       ')(;?)')
  59
  60     def __init__(self, verbose=0):
  61         """Initialize and reset this instance."""
  62         self.verbose = verbose
  63         self.reset()
  64
  65     def reset(self):
  66         """Reset this instance. Loses all unprocessed data."""
  67         self.__starttag_text = None
  68         self.rawdata = ''
  69         self.stack = []
  70         self.lasttag = '???'
  71         self.nomoretags = 0
  72         self.literal = 0
  73         _markupbase.ParserBase.reset(self)
  74
  75     def setnomoretags(self):
  76         """Enter literal mode (CDATA) till EOF.
  77
  78         Intended for derived classes only.
  79         """
  80         self.nomoretags = self.literal = 1
  81
  82     def setliteral(self, *args):
  83         """Enter literal mode (CDATA).
  84
  85         Intended for derived classes only.
  86         """
  87         self.literal = 1
  88
  89     def feed(self, data):
  90         """Feed some data to the parser.
  91
  92         Call this as often as you want, with as little or as much text
  93         as you want (may include '\n').  (This just saves the text,
  94         all the processing is done by goahead().)
  95         """
  96
  97         self.rawdata = self.rawdata + data
  98         self.goahead(0)
  99
 100     def close(self):
 101         """Handle the remaining data."""
 102         self.goahead(1)
 103
 104     def error(self, message):
 105         raise SGMLParseError(message)
 106
 107     # Internal -- handle data as far as reasonable.  May leave state
 108     # and data to be processed by a subsequent call.  If 'end' is
 109     # true, force handling all data as if followed by EOF marker.
 110     def goahead(self, end):
 111         rawdata = self.rawdata
 112         i = 0
 113         n = len(rawdata)
 114         while i < n:
 115             if self.nomoretags:
 116                 self.handle_data(rawdata[i:n])
 117                 i = n
 118                 break
 119             match = interesting.search(rawdata, i)
 120             if match: j = match.start()
 121             else: j = n
 122             if i < j:
 123                 self.handle_data(rawdata[i:j])
 124             i = j
 125             if i == n: break
 126             if rawdata[i] == '<':
 127                 if starttagopen.match(rawdata, i):
 128                     if self.literal:
 129                         self.handle_data(rawdata[i])
 130                         i = i+1
 131                         continue
 132                     k = self.parse_starttag(i)
 133                     if k < 0: break
 134                     i = k
 135                     continue
 136                 if rawdata.startswith("</", i):
 137                     k = self.parse_endtag(i)
 138                     if k < 0: break
 139                     i = k
 140                     self.literal = 0
 141                     continue
 142                 if self.literal:
 143                     if n > (i + 1):
 144                         self.handle_data("<")
 145                         i = i+1
 146                     else:
 147                         # incomplete
 148                         break
 149                     continue
 150                 if rawdata.startswith("<!--", i):
 151                         # Strictly speaking, a comment is --.*--
 152                         # within a declaration tag <!...>.
 153                         # This should be removed,
 154                         # and comments handled only in parse_declaration.
 155                     k = self.parse_comment(i)
 156                     if k < 0: break
 157                     i = k
 158                     continue
 159                 if rawdata.startswith("<?", i):
 160                     k = self.parse_pi(i)
 161                     if k < 0: break
 162                     i = i+k
 163                     continue
 164                 if rawdata.startswith("<!", i):
 165                     # This is some sort of declaration; in "HTML as
 166                     # deployed," this should only be the document type
 167                     # declaration ("<!DOCTYPE html...>").
 168                     k = self.parse_declaration(i)
 169                     if k < 0: break
 170                     i = k
 171                     continue
 172             elif rawdata[i] == '&':
 173                 if self.literal:
 174                     self.handle_data(rawdata[i])
 175                     i = i+1
 176                     continue
 177                 match = charref.match(rawdata, i)
 178                 if match:
 179                     name = match.group(1)
 180                     self.handle_charref(name)
 181                     i = match.end(0)
 182                     if rawdata[i-1] != ';': i = i-1
 183                     continue
 184                 match = entityref.match(rawdata, i)
 185                 if match:
 186                     name = match.group(1)
 187                     self.handle_entityref(name)
 188                     i = match.end(0)
 189                     if rawdata[i-1] != ';': i = i-1
 190                     continue
 191             else:
 192                 self.error('neither < nor & ??')
 193             # We get here only if incomplete matches but
 194             # nothing else
 195             match = incomplete.match(rawdata, i)
 196             if not match:
 197                 self.handle_data(rawdata[i])
 198                 i = i+1
 199                 continue
 200             j = match.end(0)
 201             if j == n:
 202                 break # Really incomplete
 203             self.handle_data(rawdata[i:j])
 204             i = j
 205         # end while
 206         if end and i < n:
 207             self.handle_data(rawdata[i:n])
 208             i = n
 209         self.rawdata = rawdata[i:]
 210         # XXX if end: check for empty stack
 211
 212     # Extensions for the DOCTYPE scanner:
 213     _decl_otherchars = '='
 214
 215     # Internal -- parse processing instr, return length or -1 if not terminated
 216     def parse_pi(self, i):
 217         rawdata = self.rawdata
 218         if rawdata[i:i+2] != '<?':
 219             self.error('unexpected call to parse_pi()')
 220         match = piclose.search(rawdata, i+2)
 221         if not match:
 222             return -1
 223         j = match.start(0)
 224         self.handle_pi(rawdata[i+2: j])
 225         j = match.end(0)
 226         return j-i
 227
 228     def get_starttag_text(self):
 229         return self.__starttag_text
 230
 231     # Internal -- handle starttag, return length or -1 if not terminated
 232     def parse_starttag(self, i):
 233         self.__starttag_text = None
 234         start_pos = i
 235         rawdata = self.rawdata
 236         if shorttagopen.match(rawdata, i):
 237             # SGML shorthand: <tag/data/ == <tag>data</tag>
 238             # XXX Can data contain &... (entity or char refs)?
 239             # XXX Can data contain < or > (tag characters)?
 240             # XXX Can there be whitespace before the first /?
 241             match = shorttag.match(rawdata, i)
 242             if not match:
 243                 return -1
 244             tag, data = match.group(1, 2)
 245             self.__starttag_text = '<%s/' % tag
 246             tag = tag.lower()
 247             k = match.end(0)
 248             self.finish_shorttag(tag, data)
 249             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
 250             return k
 251         # XXX The following should skip matching quotes (' or ")
 252         # As a shortcut way to exit, this isn't so bad, but shouldn't
 253         # be used to locate the actual end of the start tag since the
 254         # < or > characters may be embedded in an attribute value.
 255         match = endbracket.search(rawdata, i+1)
 256         if not match:
 257             return -1
 258         j = match.start(0)
 259         # Now parse the data between i+1 and j into a tag and attrs
 260         attrs = []
 261         if rawdata[i:i+2] == '<>':
 262             # SGML shorthand: <> == <last open tag seen>
 263             k = j
 264             tag = self.lasttag
 265         else:
 266             match = tagfind.match(rawdata, i+1)
 267             if not match:
 268                 self.error('unexpected call to parse_starttag')
 269             k = match.end(0)
 270             tag = rawdata[i+1:k].lower()
 271             self.lasttag = tag
 272         while k < j:
 273             match = attrfind.match(rawdata, k)
 274             if not match: break
 275             attrname, rest, attrvalue = match.group(1, 2, 3)
 276             if not rest:
 277                 attrvalue = attrname
 278             else:
 279                 if (attrvalue[:1] == "'" == attrvalue[-1:] or
 280                     attrvalue[:1] == '"' == attrvalue[-1:]):
 281                     # strip quotes
 282                     attrvalue = attrvalue[1:-1]
 283                 attrvalue = self.entity_or_charref.sub(
 284                     self._convert_ref, attrvalue)
 285             attrs.append((attrname.lower(), attrvalue))
 286             k = match.end(0)
 287         if rawdata[j] == '>':
 288             j = j+1
 289         self.__starttag_text = rawdata[start_pos:j]
 290         self.finish_starttag(tag, attrs)
 291         return j
 292
 293     # Internal -- convert entity or character reference
 294     def _convert_ref(self, match):
 295         if match.group(2):
 296             return self.convert_charref(match.group(2)) or \
 297                 '&#%s%s' % match.groups()[1:]
 298         elif match.group(3):
 299             return self.convert_entityref(match.group(1)) or \
 300                 '&%s;' % match.group(1)
 301         else:
 302             return '&%s' % match.group(1)
 303
 304     # Internal -- parse endtag
 305     def parse_endtag(self, i):
 306         rawdata = self.rawdata
 307         match = endbracket.search(rawdata, i+1)
 308         if not match:
 309             return -1
 310         j = match.start(0)
 311         tag = rawdata[i+2:j].strip().lower()
 312         if rawdata[j] == '>':
 313             j = j+1
 314         self.finish_endtag(tag)
 315         return j
 316
 317     # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
 318     def finish_shorttag(self, tag, data):
 319         self.finish_starttag(tag, [])
 320         self.handle_data(data)
 321         self.finish_endtag(tag)
 322
 323     # Internal -- finish processing of start tag
 324     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
 325     def finish_starttag(self, tag, attrs):
 326         try:
 327             method = getattr(self, 'start_' + tag)
 328         except AttributeError:
 329             try:
 330                 method = getattr(self, 'do_' + tag)
 331             except AttributeError:
 332                 self.unknown_starttag(tag, attrs)
 333                 return -1
 334             else:
 335                 self.handle_starttag(tag, method, attrs)
 336                 return 0
 337         else:
 338             self.stack.append(tag)
 339             self.handle_starttag(tag, method, attrs)
 340             return 1
 341
 342     # Internal -- finish processing of end tag
 343     def finish_endtag(self, tag):
 344         if not tag:
 345             found = len(self.stack) - 1
 346             if found < 0:
 347                 self.unknown_endtag(tag)
 348                 return
 349         else:
 350             if tag not in self.stack:
 351                 try:
 352                     method = getattr(self, 'end_' + tag)
 353                 except AttributeError:
 354                     self.unknown_endtag(tag)
 355                 else:
 356                     self.report_unbalanced(tag)
 357                 return
 358             found = len(self.stack)
 359             for i in range(found):
 360                 if self.stack[i] == tag: found = i
 361         while len(self.stack) > found:
 362             tag = self.stack[-1]
 363             try:
 364                 method = getattr(self, 'end_' + tag)
 365             except AttributeError:
 366                 method = None
 367             if method:
 368                 self.handle_endtag(tag, method)
 369             else:
 370                 self.unknown_endtag(tag)
 371             del self.stack[-1]
 372
 373     # Overridable -- handle start tag
 374     def handle_starttag(self, tag, method, attrs):
 375         method(attrs)
 376
 377     # Overridable -- handle end tag
 378     def handle_endtag(self, tag, method):
 379         method()
 380
 381     # Example -- report an unbalanced </...> tag.
 382     def report_unbalanced(self, tag):
 383         if self.verbose:
 384             print('*** Unbalanced </' + tag + '>')
 385             print('*** Stack:', self.stack)
 386
 387     def convert_charref(self, name):
 388         """Convert character reference, may be overridden."""
 389         try:
 390             n = int(name)
 391         except ValueError:
 392             return
 393         if not 0 <= n <= 127:
 394             return
 395         return self.convert_codepoint(n)
 396
 397     def convert_codepoint(self, codepoint):
 398         return chr(codepoint)
 399
 400     def handle_charref(self, name):
 401         """Handle character reference, no need to override."""
 402         replacement = self.convert_charref(name)
 403         if replacement is None:
 404             self.unknown_charref(name)
 405         else:
 406             self.handle_data(replacement)
 407
 408     # Definition of entities -- derived classes may override
 409     entitydefs = \
 410             {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
 411
 412     def convert_entityref(self, name):
 413         """Convert entity references.
 414
 415         As an alternative to overriding this method; one can tailor the
 416         results by setting up the self.entitydefs mapping appropriately.
 417         """
 418         table = self.entitydefs
 419         if name in table:
 420             return table[name]
 421         else:
 422             return
 423
 424     def handle_entityref(self, name):
 425         """Handle entity references, no need to override."""
 426         replacement = self.convert_entityref(name)
 427         if replacement is None:
 428             self.unknown_entityref(name)
 429         else:
 430             self.handle_data(replacement)
 431
 432     # Example -- handle data, should be overridden
 433     def handle_data(self, data):
 434         pass
 435
 436     # Example -- handle comment, could be overridden
 437     def handle_comment(self, data):
 438         pass
 439
 440     # Example -- handle declaration, could be overridden
 441     def handle_decl(self, decl):
 442         pass
 443
 444     # Example -- handle processing instruction, could be overridden
 445     def handle_pi(self, data):
 446         pass
 447
 448     # To be overridden -- handlers for unknown objects
 449     def unknown_starttag(self, tag, attrs): pass
 450     def unknown_endtag(self, tag): pass
 451     def unknown_charref(self, ref): pass
 452     def unknown_entityref(self, ref): pass
 453
 454
 455 class TestSGMLParser(SGMLParser):
 456
 457     def __init__(self, verbose=0):
 458         self.testdata = ""
 459         SGMLParser.__init__(self, verbose)
 460
 461     def handle_data(self, data):
 462         self.testdata = self.testdata + data
 463         if len(repr(self.testdata)) >= 70:
 464             self.flush()
 465
 466     def flush(self):
 467         data = self.testdata
 468         if data:
 469             self.testdata = ""
 470             print('data:', repr(data))
 471
 472     def handle_comment(self, data):
 473         self.flush()
 474         r = repr(data)
 475         if len(r) > 68:
 476             r = r[:32] + '...' + r[-32:]
 477         print('comment:', r)
 478
 479     def unknown_starttag(self, tag, attrs):
 480         self.flush()
 481         if not attrs:
 482             print('start tag: <' + tag + '>')
 483         else:
 484             print('start tag: <' + tag, end=' ')
 485             for name, value in attrs:
 486                 print(name + '=' + '"' + value + '"', end=' ')
 487             print('>')
 488
 489     def unknown_endtag(self, tag):
 490         self.flush()
 491         print('end tag: </' + tag + '>')
 492
 493     def unknown_entityref(self, ref):
 494         self.flush()
 495         print('*** unknown entity ref: &' + ref + ';')
 496
 497     def unknown_charref(self, ref):
 498         self.flush()
 499         print('*** unknown char ref: &#' + ref + ';')
 500
 501     def unknown_decl(self, data):
 502         self.flush()
 503         print('*** unknown decl: [' + data + ']')
 504
 505     def close(self):
 506         SGMLParser.close(self)
 507         self.flush()
 508
 509
 510 def test(args = None):
 511     import sys
 512
 513     if args is None:
 514         args = sys.argv[1:]
 515
 516     if args and args[0] == '-s':
 517         args = args[1:]
 518         klass = SGMLParser
 519     else:
 520         klass = TestSGMLParser
 521
 522     if args:
 523         file = args[0]
 524     else:
 525         file = 'test.html'
 526
 527     if file == '-':
 528         f = sys.stdin
 529     else:
 530         try:
 531             f = open(file, 'r')
 532         except IOError as msg:
 533             print(file, ":", msg)
 534             sys.exit(1)
 535
 536     data = f.read()
 537     if f is not sys.stdin:
 538         f.close()
 539
 540     x = klass()
 541     for c in data:
 542         x.feed(c)
 543     x.close()
 544
 545
# Run the test driver when this file is executed as a script.
if __name__ == '__main__':
    test()