Lib/xmllib.py

   1 """A parser for XML, using the derived class as static DTD."""
   2
   3 # Author: Sjoerd Mullender.
   4
   5 import re
   6 import string
   7
   8
   9 version = '0.3'
  10
# Regular expressions used for parsing
#
# The names starting with an underscore are pattern fragments (plain
# strings) that are combined into the compiled expressions below.

_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
# characters at which XMLParser.goahead() stops collecting plain data
interesting = re.compile('[]&<]')

amp = re.compile('&')
# The three reference patterns below also match the single character that
# follows the reference; the code using them checks whether it is a `;'.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
# note: the `char' group includes the terminating character
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')            # matches all-white-space data
newline = re.compile('\n')

# one attribute, including the white space in front of it; the value is
# optional and may be quoted or unquoted
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
# a complete start tag: name, all attributes, optional `/' and the `>'
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                      '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                      starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
# everything up to and including the next `>' that is not inside quotes
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
# (the %s in the two literal fragments is filled in with the group name)
_SystemLiteral = '(?P<%s>'+_QStr+')'
_PublicLiteral = '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
# the start of a DOCTYPE declaration, up to but not including an internal
# subset (`[...]') or the closing `>'
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
# the <?xml ...?> declaration; the version, encoding and standalone groups
# include the surrounding quotes
xmldecl = re.compile('<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# translation table turning CR, LF and TAB inside attribute values into spaces
attrtrans = string.maketrans(' \r\n\t', '    ')

# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
                   '(?P<local>' + _NCName + ')$')

# an attribute name that declares a namespace: `xmlns' or `xmlns:prefix'
xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
  75
  76 # XML parser base class -- find tags and call handler functions.
  77 # Usage: p = XMLParser(); p.feed(data); ...; p.close().
  78 # The dtd is defined by deriving a class which defines methods with
  79 # special names to handle tags: start_foo and end_foo to handle <foo>
  80 # and </foo>, respectively.  The data between tags is passed to the
  81 # parser by calling self.handle_data() with some data as argument (the
  82 # data may be split up in arbitrary chunks).
  83
  84 class XMLParser:
    # attributes maps a (namespace-qualified) tag name to a dictionary of
    # the attributes allowed on that tag; a value other than None is used
    # as the default when the attribute is absent (see parse_starttag).
    attributes = {}                     # default, to be overridden
    # elements maps a tag name to a (start handler, end handler) pair.
    elements = {}                       # default, to be overridden

    # parsing options, settable using keyword args in __init__
    # if false, an unquoted attribute value is reported as a syntax error
    __accept_unquoted_attributes = 0
    # if false, an end tag without a name (</>) is reported as a syntax error
    __accept_missing_endtag_name = 0
    # if true, tag, attribute, entity and DOCTYPE names are lower-cased
    __map_case = 0
    # if false, characters matched by `illegal' are reported as syntax errors
    __accept_utf8 = 0
    # if false, translate_references() returns its argument unchanged
    __translate_attribute_references = 1
  94
  95     # Interface -- initialize and reset this instance
  96     def __init__(self, **kw):
  97         self.__fixed = 0
  98         if kw.has_key('accept_unquoted_attributes'):
  99             self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
 100         if kw.has_key('accept_missing_endtag_name'):
 101             self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
 102         if kw.has_key('map_case'):
 103             self.__map_case = kw['map_case']
 104         if kw.has_key('accept_utf8'):
 105             self.__accept_utf8 = kw['accept_utf8']
 106         if kw.has_key('translate_attribute_references'):
 107             self.__translate_attribute_references = kw['translate_attribute_references']
 108         self.reset()
 109
 110     def __fixelements(self):
 111         self.__fixed = 1
 112         self.elements = {}
 113         self.__fixdict(self.__dict__)
 114         self.__fixclass(self.__class__)
 115
 116     def __fixclass(self, kl):
 117         self.__fixdict(kl.__dict__)
 118         for k in kl.__bases__:
 119             self.__fixclass(k)
 120
 121     def __fixdict(self, dict):
 122         for key in dict.keys():
 123             if key[:6] == 'start_':
 124                 tag = key[6:]
 125                 start, end = self.elements.get(tag, (None, None))
 126                 if start is None:
 127                     self.elements[tag] = getattr(self, key), end
 128             elif key[:4] == 'end_':
 129                 tag = key[4:]
 130                 start, end = self.elements.get(tag, (None, None))
 131                 if end is None:
 132                     self.elements[tag] = start, getattr(self, key)
 133
 134     # Interface -- reset this instance.  Loses all unprocessed data
 135     def reset(self):
 136         self.rawdata = ''
 137         self.stack = []
 138         self.nomoretags = 0
 139         self.literal = 0
 140         self.lineno = 1
 141         self.__at_start = 1
 142         self.__seen_doctype = None
 143         self.__seen_starttag = 0
 144         self.__use_namespaces = 0
 145         self.__namespaces = {'xml':None}   # xml is implicitly declared
 146         # backward compatibility hack: if elements not overridden,
 147         # fill it in ourselves
 148         if self.elements is XMLParser.elements:
 149             self.__fixelements()
 150
 151     # For derived classes only -- enter literal mode (CDATA) till EOF
 152     def setnomoretags(self):
 153         self.nomoretags = self.literal = 1
 154
 155     # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
        """Switch the parser to literal mode; any arguments are ignored.

        In literal mode markup is passed to handle_data() as text.  The
        mode ends at the end tag that matches the innermost open element.
        """
        self.literal = 1
 158
 159     # Interface -- feed some data to the parser.  Call this as
 160     # often as you want, with as little or as much text as you
 161     # want (may include '\n').  (This just saves the text, all the
 162     # processing is done by goahead().)
 163     def feed(self, data):
 164         self.rawdata = self.rawdata + data
 165         self.goahead(0)
 166
 167     # Interface -- handle the remaining data
 168     def close(self):
 169         self.goahead(1)
 170         if self.__fixed:
 171             self.__fixed = 0
 172             # remove self.elements so that we don't leak
 173             del self.elements
 174
 175     # Interface -- translate references
 176     def translate_references(self, data, all = 1):
 177         if not self.__translate_attribute_references:
 178             return data
 179         i = 0
 180         while 1:
 181             res = amp.search(data, i)
 182             if res is None:
 183                 return data
 184             s = res.start(0)
 185             res = ref.match(data, s)
 186             if res is None:
 187                 self.syntax_error("bogus `&'")
 188                 i = s+1
 189                 continue
 190             i = res.end(0)
 191             str = res.group(1)
 192             rescan = 0
 193             if str[0] == '#':
 194                 if str[1] == 'x':
 195                     str = chr(string.atoi(str[2:], 16))
 196                 else:
 197                     str = chr(string.atoi(str[1:]))
 198                 if data[i - 1] != ';':
 199                     self.syntax_error("`;' missing after char reference")
 200                     i = i-1
 201             elif all:
 202                 if self.entitydefs.has_key(str):
 203                     str = self.entitydefs[str]
 204                     rescan = 1
 205                 elif data[i - 1] != ';':
 206                     self.syntax_error("bogus `&'")
 207                     i = s + 1 # just past the &
 208                     continue
 209                 else:
 210                     self.syntax_error("reference to unknown entity `&%s;'" % str)
 211                     str = '&' + str + ';'
 212             elif data[i - 1] != ';':
 213                 self.syntax_error("bogus `&'")
 214                 i = s + 1 # just past the &
 215                 continue
 216
 217             # when we get here, str contains the translated text and i points
 218             # to the end of the string that is to be replaced
 219             data = data[:s] + str + data[i:]
 220             if rescan:
 221                 i = s
 222             else:
 223                 i = s + len(str)
 224
 225     # Interface - return a dictionary of all namespaces currently valid
 226     def getnamespace(self):
 227         nsdict = {}
 228         for t, d, nst in self.stack:
 229             nsdict.update(d)
 230         return nsdict
 231
 232     # Internal -- handle data as far as reasonable.  May leave state
 233     # and data to be processed by a subsequent call.  If 'end' is
 234     # true, force handling all data as if followed by EOF marker.
 235     def goahead(self, end):
 236         rawdata = self.rawdata
 237         i = 0
 238         n = len(rawdata)
 239         while i < n:
 240             if i > 0:
 241                 self.__at_start = 0
 242             if self.nomoretags:
 243                 data = rawdata[i:n]
 244                 self.handle_data(data)
 245                 self.lineno = self.lineno + string.count(data, '\n')
 246                 i = n
 247                 break
 248             res = interesting.search(rawdata, i)
 249             if res:
 250                     j = res.start(0)
 251             else:
 252                     j = n
 253             if i < j:
 254                 data = rawdata[i:j]
 255                 if self.__at_start and space.match(data) is None:
 256                     self.syntax_error('illegal data at start of file')
 257                 self.__at_start = 0
 258                 if not self.stack and space.match(data) is None:
 259                     self.syntax_error('data not in content')
 260                 if not self.__accept_utf8 and illegal.search(data):
 261                     self.syntax_error('illegal character in content')
 262                 self.handle_data(data)
 263                 self.lineno = self.lineno + string.count(data, '\n')
 264             i = j
 265             if i == n: break
 266             if rawdata[i] == '<':
 267                 if starttagopen.match(rawdata, i):
 268                     if self.literal:
 269                         data = rawdata[i]
 270                         self.handle_data(data)
 271                         self.lineno = self.lineno + string.count(data, '\n')
 272                         i = i+1
 273                         continue
 274                     k = self.parse_starttag(i)
 275                     if k < 0: break
 276                     self.__seen_starttag = 1
 277                     self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
 278                     i = k
 279                     continue
 280                 if endtagopen.match(rawdata, i):
 281                     k = self.parse_endtag(i)
 282                     if k < 0: break
 283                     self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
 284                     i =  k
 285                     continue
 286                 if commentopen.match(rawdata, i):
 287                     if self.literal:
 288                         data = rawdata[i]
 289                         self.handle_data(data)
 290                         self.lineno = self.lineno + string.count(data, '\n')
 291                         i = i+1
 292                         continue
 293                     k = self.parse_comment(i)
 294                     if k < 0: break
 295                     self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
 296                     i = k
 297                     continue
 298                 if cdataopen.match(rawdata, i):
 299                     k = self.parse_cdata(i)
 300                     if k < 0: break
 301                     self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
 302                     i = k
 303                     continue
 304                 res = xmldecl.match(rawdata, i)
 305                 if res:
 306                     if not self.__at_start:
 307                         self.syntax_error("<?xml?> declaration not at start of document")
 308                     version, encoding, standalone = res.group('version',
 309                                                               'encoding',
 310                                                               'standalone')
 311                     if version[1:-1] != '1.0':
 312                         raise RuntimeError, 'only XML version 1.0 supported'
 313                     if encoding: encoding = encoding[1:-1]
 314                     if standalone: standalone = standalone[1:-1]
 315                     self.handle_xml(encoding, standalone)
 316                     i = res.end(0)
 317                     continue
 318                 res = procopen.match(rawdata, i)
 319                 if res:
 320                     k = self.parse_proc(i)
 321                     if k < 0: break
 322                     self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
 323                     i = k
 324                     continue
 325                 res = doctype.match(rawdata, i)
 326                 if res:
 327                     if self.literal:
 328                         data = rawdata[i]
 329                         self.handle_data(data)
 330                         self.lineno = self.lineno + string.count(data, '\n')
 331                         i = i+1
 332                         continue
 333                     if self.__seen_doctype:
 334                         self.syntax_error('multiple DOCTYPE elements')
 335                     if self.__seen_starttag:
 336                         self.syntax_error('DOCTYPE not at beginning of document')
 337                     k = self.parse_doctype(res)
 338                     if k < 0: break
 339                     self.__seen_doctype = res.group('name')
 340                     if self.__map_case:
 341                         self.__seen_doctype = string.lower(self.__seen_doctype)
 342                     self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
 343                     i = k
 344                     continue
 345             elif rawdata[i] == '&':
 346                 if self.literal:
 347                     data = rawdata[i]
 348                     self.handle_data(data)
 349                     i = i+1
 350                     continue
 351                 res = charref.match(rawdata, i)
 352                 if res is not None:
 353                     i = res.end(0)
 354                     if rawdata[i-1] != ';':
 355                         self.syntax_error("`;' missing in charref")
 356                         i = i-1
 357                     if not self.stack:
 358                         self.syntax_error('data not in content')
 359                     self.handle_charref(res.group('char')[:-1])
 360                     self.lineno = self.lineno + string.count(res.group(0), '\n')
 361                     continue
 362                 res = entityref.match(rawdata, i)
 363                 if res is not None:
 364                     i = res.end(0)
 365                     if rawdata[i-1] != ';':
 366                         self.syntax_error("`;' missing in entityref")
 367                         i = i-1
 368                     name = res.group('name')
 369                     if self.__map_case:
 370                         name = string.lower(name)
 371                     if self.entitydefs.has_key(name):
 372                         self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
 373                         n = len(rawdata)
 374                         i = res.start(0)
 375                     else:
 376                         self.unknown_entityref(name)
 377                     self.lineno = self.lineno + string.count(res.group(0), '\n')
 378                     continue
 379             elif rawdata[i] == ']':
 380                 if self.literal:
 381                     data = rawdata[i]
 382                     self.handle_data(data)
 383                     i = i+1
 384                     continue
 385                 if n-i < 3:
 386                     break
 387                 if cdataclose.match(rawdata, i):
 388                     self.syntax_error("bogus `]]>'")
 389                 self.handle_data(rawdata[i])
 390                 i = i+1
 391                 continue
 392             else:
 393                 raise RuntimeError, 'neither < nor & ??'
 394             # We get here only if incomplete matches but
 395             # nothing else
 396             break
 397         # end while
 398         if i > 0:
 399             self.__at_start = 0
 400         if end and i < n:
 401             data = rawdata[i]
 402             self.syntax_error("bogus `%s'" % data)
 403             if not self.__accept_utf8 and illegal.search(data):
 404                 self.syntax_error('illegal character in content')
 405             self.handle_data(data)
 406             self.lineno = self.lineno + string.count(data, '\n')
 407             self.rawdata = rawdata[i+1:]
 408             return self.goahead(end)
 409         self.rawdata = rawdata[i:]
 410         if end:
 411             if not self.__seen_starttag:
 412                 self.syntax_error('no elements in file')
 413             if self.stack:
 414                 self.syntax_error('missing end tags')
 415                 while self.stack:
 416                     self.finish_endtag(self.stack[-1][0])
 417
 418     # Internal -- parse comment, return length or -1 if not terminated
 419     def parse_comment(self, i):
 420         rawdata = self.rawdata
 421         if rawdata[i:i+4] <> '<!--':
 422             raise RuntimeError, 'unexpected call to handle_comment'
 423         res = commentclose.search(rawdata, i+4)
 424         if res is None:
 425             return -1
 426         if doubledash.search(rawdata, i+4, res.start(0)):
 427             self.syntax_error("`--' inside comment")
 428         if rawdata[res.start(0)-1] == '-':
 429             self.syntax_error('comment cannot end in three dashes')
 430         if not self.__accept_utf8 and \
 431            illegal.search(rawdata, i+4, res.start(0)):
 432             self.syntax_error('illegal character in comment')
 433         self.handle_comment(rawdata[i+4: res.start(0)])
 434         return res.end(0)
 435
 436     # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
 437     def parse_doctype(self, res):
 438         rawdata = self.rawdata
 439         n = len(rawdata)
 440         name = res.group('name')
 441         if self.__map_case:
 442             name = string.lower(name)
 443         pubid, syslit = res.group('pubid', 'syslit')
 444         if pubid is not None:
 445             pubid = pubid[1:-1]         # remove quotes
 446             pubid = string.join(string.split(pubid)) # normalize
 447         if syslit is not None: syslit = syslit[1:-1] # remove quotes
 448         j = k = res.end(0)
 449         if k >= n:
 450             return -1
 451         if rawdata[k] == '[':
 452             level = 0
 453             k = k+1
 454             dq = sq = 0
 455             while k < n:
 456                 c = rawdata[k]
 457                 if not sq and c == '"':
 458                     dq = not dq
 459                 elif not dq and c == "'":
 460                     sq = not sq
 461                 elif sq or dq:
 462                     pass
 463                 elif level <= 0 and c == ']':
 464                     res = endbracket.match(rawdata, k+1)
 465                     if res is None:
 466                         return -1
 467                     self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
 468                     return res.end(0)
 469                 elif c == '<':
 470                     level = level + 1
 471                 elif c == '>':
 472                     level = level - 1
 473                     if level < 0:
 474                         self.syntax_error("bogus `>' in DOCTYPE")
 475                 k = k+1
 476         res = endbracketfind.match(rawdata, k)
 477         if res is None:
 478             return -1
 479         if endbracket.match(rawdata, k) is None:
 480             self.syntax_error('garbage in DOCTYPE')
 481         self.handle_doctype(name, pubid, syslit, None)
 482         return res.end(0)
 483
 484     # Internal -- handle CDATA tag, return length or -1 if not terminated
 485     def parse_cdata(self, i):
 486         rawdata = self.rawdata
 487         if rawdata[i:i+9] <> '<![CDATA[':
 488             raise RuntimeError, 'unexpected call to parse_cdata'
 489         res = cdataclose.search(rawdata, i+9)
 490         if res is None:
 491             return -1
 492         if not self.__accept_utf8 and \
 493            illegal.search(rawdata, i+9, res.start(0)):
 494             self.syntax_error('illegal character in CDATA')
 495         if not self.stack:
 496             self.syntax_error('CDATA not in content')
 497         self.handle_cdata(rawdata[i+9:res.start(0)])
 498         return res.end(0)
 499
 500     __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
 501     # Internal -- handle a processing instruction tag
 502     def parse_proc(self, i):
 503         rawdata = self.rawdata
 504         end = procclose.search(rawdata, i)
 505         if end is None:
 506             return -1
 507         j = end.start(0)
 508         if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
 509             self.syntax_error('illegal character in processing instruction')
 510         res = tagfind.match(rawdata, i+2)
 511         if res is None:
 512             raise RuntimeError, 'unexpected call to parse_proc'
 513         k = res.end(0)
 514         name = res.group(0)
 515         if self.__map_case:
 516             name = string.lower(name)
 517         if name == 'xml:namespace':
 518             self.syntax_error('old-fashioned namespace declaration')
 519             self.__use_namespaces = -1
 520             # namespace declaration
 521             # this must come after the <?xml?> declaration (if any)
 522             # and before the <!DOCTYPE> (if any).
 523             if self.__seen_doctype or self.__seen_starttag:
 524                 self.syntax_error('xml:namespace declaration too late in document')
 525             attrdict, namespace, k = self.parse_attributes(name, k, j)
 526             if namespace:
 527                 self.syntax_error('namespace declaration inside namespace declaration')
 528             for attrname in attrdict.keys():
 529                 if not self.__xml_namespace_attributes.has_key(attrname):
 530                     self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
 531             if not attrdict.has_key('ns') or not attrdict.has_key('prefix'):
 532                 self.syntax_error('xml:namespace without required attributes')
 533             prefix = attrdict.get('prefix')
 534             if ncname.match(prefix) is None:
 535                 self.syntax_error('xml:namespace illegal prefix value')
 536                 return end.end(0)
 537             if self.__namespaces.has_key(prefix):
 538                 self.syntax_error('xml:namespace prefix not unique')
 539             self.__namespaces[prefix] = attrdict['ns']
 540         else:
 541             if string.lower(name) == 'xml':
 542                 self.syntax_error('illegal processing instruction target name')
 543             self.handle_proc(name, rawdata[k:j])
 544         return end.end(0)
 545
 546     # Internal -- parse attributes between i and j
 547     def parse_attributes(self, tag, i, j):
 548         rawdata = self.rawdata
 549         attrdict = {}
 550         namespace = {}
 551         while i < j:
 552             res = attrfind.match(rawdata, i)
 553             if res is None:
 554                 break
 555             attrname, attrvalue = res.group('name', 'value')
 556             if self.__map_case:
 557                 attrname = string.lower(attrname)
 558             i = res.end(0)
 559             if attrvalue is None:
 560                 self.syntax_error("no value specified for attribute `%s'" % attrname)
 561                 attrvalue = attrname
 562             elif attrvalue[:1] == "'" == attrvalue[-1:] or \
 563                  attrvalue[:1] == '"' == attrvalue[-1:]:
 564                 attrvalue = attrvalue[1:-1]
 565             elif not self.__accept_unquoted_attributes:
 566                 self.syntax_error("attribute `%s' value not quoted" % attrname)
 567             res = xmlns.match(attrname)
 568             if res is not None:
 569                 # namespace declaration
 570                 ncname = res.group('ncname')
 571                 namespace[ncname or ''] = attrvalue or None
 572                 if not self.__use_namespaces:
 573                     self.__use_namespaces = len(self.stack)+1
 574                 continue
 575             if '<' in attrvalue:
 576                 self.syntax_error("`<' illegal in attribute value")
 577             if attrdict.has_key(attrname):
 578                 self.syntax_error("attribute `%s' specified twice" % attrname)
 579             attrvalue = string.translate(attrvalue, attrtrans)
 580             attrdict[attrname] = self.translate_references(attrvalue)
 581         return attrdict, namespace, i
 582
 583     # Internal -- handle starttag, return length or -1 if not terminated
 584     def parse_starttag(self, i):
 585         rawdata = self.rawdata
 586         # i points to start of tag
 587         end = endbracketfind.match(rawdata, i+1)
 588         if end is None:
 589             return -1
 590         tag = starttagmatch.match(rawdata, i)
 591         if tag is None or tag.end(0) != end.end(0):
 592             self.syntax_error('garbage in starttag')
 593             return end.end(0)
 594         nstag = tagname = tag.group('tagname')
 595         if self.__map_case:
 596             nstag = tagname = string.lower(nstag)
 597         if not self.__seen_starttag and self.__seen_doctype and \
 598            tagname != self.__seen_doctype:
 599             self.syntax_error('starttag does not match DOCTYPE')
 600         if self.__seen_starttag and not self.stack:
 601             self.syntax_error('multiple elements on top level')
 602         k, j = tag.span('attrs')
 603         attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
 604         self.stack.append((tagname, nsdict, nstag))
 605         if self.__use_namespaces:
 606             res = qname.match(tagname)
 607         else:
 608             res = None
 609         if res is not None:
 610             prefix, nstag = res.group('prefix', 'local')
 611             if prefix is None:
 612                 prefix = ''
 613             ns = None
 614             for t, d, nst in self.stack:
 615                 if d.has_key(prefix):
 616                     ns = d[prefix]
 617             if ns is None and prefix != '':
 618                 ns = self.__namespaces.get(prefix)
 619             if ns is not None:
 620                 nstag = ns + ' ' + nstag
 621             elif prefix != '':
 622                 nstag = prefix + ':' + nstag # undo split
 623             self.stack[-1] = tagname, nsdict, nstag
 624         # translate namespace of attributes
 625         if self.__use_namespaces:
 626             nattrdict = {}
 627             for key, val in attrdict.items():
 628                 res = qname.match(key)
 629                 if res is not None:
 630                     aprefix, key = res.group('prefix', 'local')
 631                     if self.__map_case:
 632                         key = string.lower(key)
 633                     if aprefix is None:
 634                         aprefix = ''
 635                     ans = None
 636                     for t, d, nst in self.stack:
 637                         if d.has_key(aprefix):
 638                             ans = d[aprefix]
 639                     if ans is None and aprefix != '':
 640                         ans = self.__namespaces.get(aprefix)
 641                     if ans is not None:
 642                         key = ans + ' ' + key
 643                     elif aprefix != '':
 644                         key = aprefix + ':' + key
 645                     elif ns is not None:
 646                         key = ns + ' ' + key
 647                 nattrdict[key] = val
 648             attrdict = nattrdict
 649         attributes = self.attributes.get(nstag)
 650         if attributes is not None:
 651             for key in attrdict.keys():
 652                 if not attributes.has_key(key):
 653                     self.syntax_error("unknown attribute `%s' in tag `%s'" % (key, tagname))
 654             for key, val in attributes.items():
 655                 if val is not None and not attrdict.has_key(key):
 656                     attrdict[key] = val
 657         method = self.elements.get(nstag, (None, None))[0]
 658         self.finish_starttag(nstag, attrdict, method)
 659         if tag.group('slash') == '/':
 660             self.finish_endtag(tagname)
 661         return tag.end(0)
 662
 663     # Internal -- parse endtag
 664     def parse_endtag(self, i):
 665         rawdata = self.rawdata
 666         end = endbracketfind.match(rawdata, i+1)
 667         if end is None:
 668             return -1
 669         res = tagfind.match(rawdata, i+2)
 670         if res is None:
 671             if self.literal:
 672                 self.handle_data(rawdata[i])
 673                 return i+1
 674             if not self.__accept_missing_endtag_name:
 675                 self.syntax_error('no name specified in end tag')
 676             tag = self.stack[-1][0]
 677             k = i+2
 678         else:
 679             tag = res.group(0)
 680             if self.__map_case:
 681                 tag = string.lower(tag)
 682             if self.literal:
 683                 if not self.stack or tag != self.stack[-1][0]:
 684                     self.handle_data(rawdata[i])
 685                     return i+1
 686                 self.literal = 0
 687             k = res.end(0)
 688         if endbracket.match(rawdata, k) is None:
 689             self.syntax_error('garbage in end tag')
 690         self.finish_endtag(tag)
 691         return end.end(0)
 692
 693     # Internal -- finish processing of start tag
 694     def finish_starttag(self, tagname, attrdict, method):
 695         if method is not None:
 696             self.handle_starttag(tagname, method, attrdict)
 697         else:
 698             self.unknown_starttag(tagname, attrdict)
 699
 700     # Internal -- finish processing of end tag
 701     def finish_endtag(self, tag):
 702         if not tag:
 703             self.syntax_error('name-less end tag')
 704             found = len(self.stack) - 1
 705             if found < 0:
 706                 self.unknown_endtag(tag)
 707                 return
 708         else:
 709             found = -1
 710             for i in range(len(self.stack)):
 711                 if tag == self.stack[i][0]:
 712                     found = i
 713             if found == -1:
 714                 self.syntax_error('unopened end tag')
 715                 return
 716         while len(self.stack) > found:
 717             if found < len(self.stack) - 1:
 718                 self.syntax_error('missing close tag for %s' % self.stack[-1][2])
 719             nstag = self.stack[-1][2]
 720             method = self.elements.get(nstag, (None, None))[1]
 721             if method is not None:
 722                 self.handle_endtag(nstag, method)
 723             else:
 724                 self.unknown_endtag(nstag)
 725             if self.__use_namespaces == len(self.stack):
 726                 self.__use_namespaces = 0
 727             del self.stack[-1]
 728
 729     # Overridable -- handle xml processing instruction
 730     def handle_xml(self, encoding, standalone):
 731         pass
 732
 733     # Overridable -- handle DOCTYPE
 734     def handle_doctype(self, tag, pubid, syslit, data):
 735         pass
 736
 737     # Overridable -- handle start tag
 738     def handle_starttag(self, tag, method, attrs):
 739         method(attrs)
 740
 741     # Overridable -- handle end tag
 742     def handle_endtag(self, tag, method):
 743         method()
 744
 745     # Example -- handle character reference, no need to override
 746     def handle_charref(self, name):
 747         try:
 748             if name[0] == 'x':
 749                 n = string.atoi(name[1:], 16)
 750             else:
 751                 n = string.atoi(name)
 752         except string.atoi_error:
 753             self.unknown_charref(name)
 754             return
 755         if not 0 <= n <= 255:
 756             self.unknown_charref(name)
 757             return
 758         self.handle_data(chr(n))
 759
 760     # Definition of entities -- derived classes may override
 761     entitydefs = {'lt': '&#60;',        # must use charref
 762                   'gt': '&#62;',
 763                   'amp': '&#38;',       # must use charref
 764                   'quot': '&#34;',
 765                   'apos': '&#39;',
 766                   }
 767
 768     # Example -- handle data, should be overridden
 769     def handle_data(self, data):
 770         pass
 771
 772     # Example -- handle cdata, could be overridden
 773     def handle_cdata(self, data):
 774         pass
 775
 776     # Example -- handle comment, could be overridden
 777     def handle_comment(self, data):
 778         pass
 779
 780     # Example -- handle processing instructions, could be overridden
 781     def handle_proc(self, name, data):
 782         pass
 783
 784     # Example -- handle relatively harmless syntax errors, could be overridden
 785     def syntax_error(self, message):
 786         raise RuntimeError, 'Syntax error at line %d: %s' % (self.lineno, message)
 787
 788     # To be overridden -- handlers for unknown objects
 789     def unknown_starttag(self, tag, attrs): pass
 790     def unknown_endtag(self, tag): pass
 791     def unknown_charref(self, ref): pass
 792     def unknown_entityref(self, name):
 793         self.syntax_error("reference to unknown entity `&%s;'" % name)
 794
 795
 796 class TestXMLParser(XMLParser):
 797
 798     def __init__(self, **kw):
 799         self.testdata = ""
 800         apply(XMLParser.__init__, (self,), kw)
 801
 802     def handle_xml(self, encoding, standalone):
 803         self.flush()
 804         print 'xml: encoding =',encoding,'standalone =',standalone
 805
 806     def handle_doctype(self, tag, pubid, syslit, data):
 807         self.flush()
 808         print 'DOCTYPE:',tag, `data`
 809
 810     def handle_data(self, data):
 811         self.testdata = self.testdata + data
 812         if len(`self.testdata`) >= 70:
 813             self.flush()
 814
 815     def flush(self):
 816         data = self.testdata
 817         if data:
 818             self.testdata = ""
 819             print 'data:', `data`
 820
 821     def handle_cdata(self, data):
 822         self.flush()
 823         print 'cdata:', `data`
 824
 825     def handle_proc(self, name, data):
 826         self.flush()
 827         print 'processing:',name,`data`
 828
 829     def handle_comment(self, data):
 830         self.flush()
 831         r = `data`
 832         if len(r) > 68:
 833             r = r[:32] + '...' + r[-32:]
 834         print 'comment:', r
 835
 836     def syntax_error(self, message):
 837         print 'error at line %d:' % self.lineno, message
 838
 839     def unknown_starttag(self, tag, attrs):
 840         self.flush()
 841         if not attrs:
 842             print 'start tag: <' + tag + '>'
 843         else:
 844             print 'start tag: <' + tag,
 845             for name, value in attrs.items():
 846                 print name + '=' + '"' + value + '"',
 847             print '>'
 848
 849     def unknown_endtag(self, tag):
 850         self.flush()
 851         print 'end tag: </' + tag + '>'
 852
 853     def unknown_entityref(self, ref):
 854         self.flush()
 855         print '*** unknown entity ref: &' + ref + ';'
 856
 857     def unknown_charref(self, ref):
 858         self.flush()
 859         print '*** unknown char ref: &#' + ref + ';'
 860
 861     def close(self):
 862         XMLParser.close(self)
 863         self.flush()
 864
 865 def test(args = None):
 866     import sys, getopt
 867     from time import time
 868
 869     if not args:
 870         args = sys.argv[1:]
 871
 872     opts, args = getopt.getopt(args, 'st')
 873     klass = TestXMLParser
 874     do_time = 0
 875     for o, a in opts:
 876         if o == '-s':
 877             klass = XMLParser
 878         elif o == '-t':
 879             do_time = 1
 880
 881     if args:
 882         file = args[0]
 883     else:
 884         file = 'test.xml'
 885
 886     if file == '-':
 887         f = sys.stdin
 888     else:
 889         try:
 890             f = open(file, 'r')
 891         except IOError, msg:
 892             print file, ":", msg
 893             sys.exit(1)
 894
 895     data = f.read()
 896     if f is not sys.stdin:
 897         f.close()
 898
 899     x = klass()
 900     t0 = time()
 901     try:
 902         if do_time:
 903             x.feed(data)
 904             x.close()
 905         else:
 906             for c in data:
 907                 x.feed(c)
 908             x.close()
 909     except RuntimeError, msg:
 910         t1 = time()
 911         print msg
 912         if do_time:
 913             print 'total time: %g' % (t1-t0)
 914         sys.exit(1)
 915     t1 = time()
 916     if do_time:
 917         print 'total time: %g' % (t1-t0)
 918
 919
# Run the command-line test driver when this file is executed as a script.
if __name__ == '__main__':
    test()