Lib/xmllib.py

   1 """A parser for XML, using the derived class as static DTD."""
   2
   3 # Author: Sjoerd Mullender.
   4
   5 import re
   6 import string
   7
   8
# Version string of this module.
version = '0.3'
  10
  11 class Error(RuntimeError):
  12     pass
  13
# Regular expressions used for parsing

# Pattern fragments (plain strings) that are spliced into the compiled
# expressions below.
_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
# Any character other than tab, CR, LF, space through '\176' and
# '\240' through '\377'.
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
# The characters `]', `&' and `<'; goahead() stops plain data at the
# first of these.
interesting = re.compile('[]&<]')

amp = re.compile('&')
# `&' followed by a name or a decimal/hexadecimal character number, plus
# exactly one following character (the terminator, normally `;').
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
# Entity reference: group `name' is the entity name; the match includes
# one trailing non-name character.
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
# Character reference: group `char' holds the digits (with a leading `x'
# for hexadecimal) followed by the one terminating character.
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
# With match(): a non-empty run of white space reaching the end of the
# string.
space = re.compile(_S + '$')
newline = re.compile('\n')
  29
# One attribute: white space, the attribute name (group `name') and an
# optional `= value' part (group `value', either a quoted string or a
# run of unquoted characters).
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
# `<' immediately followed by a name.
starttagopen = re.compile('<' + _Name)
# End of a start tag; group `slash' is `/' for an empty-element tag.
starttagend = re.compile(_opS + '(?P<slash>/?)>')
# A complete start tag: groups `tagname', `attrs' (all attributes as one
# string) and `slash'.
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                      '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                      starttagend.pattern)
endtagopen = re.compile('</')
# Optional white space followed by `>'.
endbracket = re.compile(_opS + '>')
# Everything up to and including the next `>' that is not inside a
# quoted string.
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
# (_SystemLiteral and _PublicLiteral are templates; `%s' is replaced by
# the name of the capturing group.)
_SystemLiteral = '(?P<%s>'+_QStr+')'
_PublicLiteral = '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
# Start of a DOCTYPE declaration: group `name' plus the optional external
# id (groups `pubid' and `syslit').  The internal subset and the closing
# `>' are not part of this pattern; parse_doctype() handles them.
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
# The <?xml ...?> declaration: groups `version', `encoding' and
# `standalone' each include their surrounding quotes.
xmldecl = re.compile('<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+'\?>')
# Start of a processing instruction; group `proc' is the target name.
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# Translation table that turns CR, LF and tab into a space; applied to
# attribute values in parse_attributes().
attrtrans = string.maketrans(' \r\n\t', '    ')
  70
# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
# With match(): a string that is exactly one colon-free name.
ncname = re.compile(_NCName + '$')
# A qualified name: optional `prefix:' (group `prefix') followed by the
# local part (group `local').
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
                   '(?P<local>' + _NCName + ')$')

# An `xmlns' or `xmlns:prefix' attribute name; group `ncname' is the
# declared prefix, or None for the unprefixed form.
xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
  78
  79 # XML parser base class -- find tags and call handler functions.
  80 # Usage: p = XMLParser(); p.feed(data); ...; p.close().
  81 # The dtd is defined by deriving a class which defines methods with
  82 # special names to handle tags: start_foo and end_foo to handle <foo>
  83 # and </foo>, respectively.  The data between tags is passed to the
  84 # parser by calling self.handle_data() with some data as argument (the
  85 # data may be split up in arbitrary chunks).
  86
  87 class XMLParser:
    # Maps a tag name to a dictionary of the attributes allowed on that
    # tag; a value that is not None is used as the attribute's default
    # (see parse_starttag).
    attributes = {}                     # default, to be overridden
    # Maps a tag name to a (start handler, end handler) pair.
    elements = {}                       # default, to be overridden

    # parsing options, settable using keyword args in __init__
    # if false, an unquoted attribute value is reported as a syntax error
    __accept_unquoted_attributes = 0
    # if false, an end tag without a name (`</>') is reported as a
    # syntax error
    __accept_missing_endtag_name = 0
    # if true, tag, attribute, entity and DOCTYPE names are lower-cased
    __map_case = 0
    # if false, characters matched by `illegal' are reported as syntax
    # errors
    __accept_utf8 = 0
    # if false, translate_references() returns its argument unchanged
    __translate_attribute_references = 1
  97
  98     # Interface -- initialize and reset this instance
  99     def __init__(self, **kw):
 100         self.__fixed = 0
 101         if kw.has_key('accept_unquoted_attributes'):
 102             self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
 103         if kw.has_key('accept_missing_endtag_name'):
 104             self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
 105         if kw.has_key('map_case'):
 106             self.__map_case = kw['map_case']
 107         if kw.has_key('accept_utf8'):
 108             self.__accept_utf8 = kw['accept_utf8']
 109         if kw.has_key('translate_attribute_references'):
 110             self.__translate_attribute_references = kw['translate_attribute_references']
 111         self.reset()
 112
 113     def __fixelements(self):
 114         self.__fixed = 1
 115         self.elements = {}
 116         self.__fixdict(self.__dict__)
 117         self.__fixclass(self.__class__)
 118
 119     def __fixclass(self, kl):
 120         self.__fixdict(kl.__dict__)
 121         for k in kl.__bases__:
 122             self.__fixclass(k)
 123
 124     def __fixdict(self, dict):
 125         for key in dict.keys():
 126             if key[:6] == 'start_':
 127                 tag = key[6:]
 128                 start, end = self.elements.get(tag, (None, None))
 129                 if start is None:
 130                     self.elements[tag] = getattr(self, key), end
 131             elif key[:4] == 'end_':
 132                 tag = key[4:]
 133                 start, end = self.elements.get(tag, (None, None))
 134                 if end is None:
 135                     self.elements[tag] = start, getattr(self, key)
 136
 137     # Interface -- reset this instance.  Loses all unprocessed data
 138     def reset(self):
 139         self.rawdata = ''
 140         self.stack = []
 141         self.nomoretags = 0
 142         self.literal = 0
 143         self.lineno = 1
 144         self.__at_start = 1
 145         self.__seen_doctype = None
 146         self.__seen_starttag = 0
 147         self.__use_namespaces = 0
 148         self.__namespaces = {'xml':None}   # xml is implicitly declared
 149         # backward compatibility hack: if elements not overridden,
 150         # fill it in ourselves
 151         if self.elements is XMLParser.elements:
 152             self.__fixelements()
 153
 154     # For derived classes only -- enter literal mode (CDATA) till EOF
 155     def setnomoretags(self):
 156         self.nomoretags = self.literal = 1
 157
 158     # For derived classes only -- enter literal mode (CDATA)
 159     def setliteral(self, *args):
 160         self.literal = 1
 161
 162     # Interface -- feed some data to the parser.  Call this as
 163     # often as you want, with as little or as much text as you
 164     # want (may include '\n').  (This just saves the text, all the
 165     # processing is done by goahead().)
 166     def feed(self, data):
 167         self.rawdata = self.rawdata + data
 168         self.goahead(0)
 169
 170     # Interface -- handle the remaining data
 171     def close(self):
 172         self.goahead(1)
 173         if self.__fixed:
 174             self.__fixed = 0
 175             # remove self.elements so that we don't leak
 176             del self.elements
 177
 178     # Interface -- translate references
 179     def translate_references(self, data, all = 1):
 180         if not self.__translate_attribute_references:
 181             return data
 182         i = 0
 183         while 1:
 184             res = amp.search(data, i)
 185             if res is None:
 186                 return data
 187             s = res.start(0)
 188             res = ref.match(data, s)
 189             if res is None:
 190                 self.syntax_error("bogus `&'")
 191                 i = s+1
 192                 continue
 193             i = res.end(0)
 194             str = res.group(1)
 195             rescan = 0
 196             if str[0] == '#':
 197                 if str[1] == 'x':
 198                     str = chr(int(str[2:], 16))
 199                 else:
 200                     str = chr(int(str[1:]))
 201                 if data[i - 1] != ';':
 202                     self.syntax_error("`;' missing after char reference")
 203                     i = i-1
 204             elif all:
 205                 if self.entitydefs.has_key(str):
 206                     str = self.entitydefs[str]
 207                     rescan = 1
 208                 elif data[i - 1] != ';':
 209                     self.syntax_error("bogus `&'")
 210                     i = s + 1 # just past the &
 211                     continue
 212                 else:
 213                     self.syntax_error("reference to unknown entity `&%s;'" % str)
 214                     str = '&' + str + ';'
 215             elif data[i - 1] != ';':
 216                 self.syntax_error("bogus `&'")
 217                 i = s + 1 # just past the &
 218                 continue
 219
 220             # when we get here, str contains the translated text and i points
 221             # to the end of the string that is to be replaced
 222             data = data[:s] + str + data[i:]
 223             if rescan:
 224                 i = s
 225             else:
 226                 i = s + len(str)
 227
 228     # Interface - return a dictionary of all namespaces currently valid
 229     def getnamespace(self):
 230         nsdict = {}
 231         for t, d, nst in self.stack:
 232             nsdict.update(d)
 233         return nsdict
 234
 235     # Internal -- handle data as far as reasonable.  May leave state
 236     # and data to be processed by a subsequent call.  If 'end' is
 237     # true, force handling all data as if followed by EOF marker.
 238     def goahead(self, end):
 239         rawdata = self.rawdata
 240         i = 0
 241         n = len(rawdata)
 242         while i < n:
 243             if i > 0:
 244                 self.__at_start = 0
 245             if self.nomoretags:
 246                 data = rawdata[i:n]
 247                 self.handle_data(data)
 248                 self.lineno = self.lineno + data.count('\n')
 249                 i = n
 250                 break
 251             res = interesting.search(rawdata, i)
 252             if res:
 253                 j = res.start(0)
 254             else:
 255                 j = n
 256             if i < j:
 257                 data = rawdata[i:j]
 258                 if self.__at_start and space.match(data) is None:
 259                     self.syntax_error('illegal data at start of file')
 260                 self.__at_start = 0
 261                 if not self.stack and space.match(data) is None:
 262                     self.syntax_error('data not in content')
 263                 if not self.__accept_utf8 and illegal.search(data):
 264                     self.syntax_error('illegal character in content')
 265                 self.handle_data(data)
 266                 self.lineno = self.lineno + data.count('\n')
 267             i = j
 268             if i == n: break
 269             if rawdata[i] == '<':
 270                 if starttagopen.match(rawdata, i):
 271                     if self.literal:
 272                         data = rawdata[i]
 273                         self.handle_data(data)
 274                         self.lineno = self.lineno + data.count('\n')
 275                         i = i+1
 276                         continue
 277                     k = self.parse_starttag(i)
 278                     if k < 0: break
 279                     self.__seen_starttag = 1
 280                     self.lineno = self.lineno + rawdata[i:k].count('\n')
 281                     i = k
 282                     continue
 283                 if endtagopen.match(rawdata, i):
 284                     k = self.parse_endtag(i)
 285                     if k < 0: break
 286                     self.lineno = self.lineno + rawdata[i:k].count('\n')
 287                     i =  k
 288                     continue
 289                 if commentopen.match(rawdata, i):
 290                     if self.literal:
 291                         data = rawdata[i]
 292                         self.handle_data(data)
 293                         self.lineno = self.lineno + data.count('\n')
 294                         i = i+1
 295                         continue
 296                     k = self.parse_comment(i)
 297                     if k < 0: break
 298                     self.lineno = self.lineno + rawdata[i:k].count('\n')
 299                     i = k
 300                     continue
 301                 if cdataopen.match(rawdata, i):
 302                     k = self.parse_cdata(i)
 303                     if k < 0: break
 304                     self.lineno = self.lineno + rawdata[i:k].count('\n')
 305                     i = k
 306                     continue
 307                 res = xmldecl.match(rawdata, i)
 308                 if res:
 309                     if not self.__at_start:
 310                         self.syntax_error("<?xml?> declaration not at start of document")
 311                     version, encoding, standalone = res.group('version',
 312                                                               'encoding',
 313                                                               'standalone')
 314                     if version[1:-1] != '1.0':
 315                         raise Error('only XML version 1.0 supported')
 316                     if encoding: encoding = encoding[1:-1]
 317                     if standalone: standalone = standalone[1:-1]
 318                     self.handle_xml(encoding, standalone)
 319                     i = res.end(0)
 320                     continue
 321                 res = procopen.match(rawdata, i)
 322                 if res:
 323                     k = self.parse_proc(i)
 324                     if k < 0: break
 325                     self.lineno = self.lineno + rawdata[i:k].count('\n')
 326                     i = k
 327                     continue
 328                 res = doctype.match(rawdata, i)
 329                 if res:
 330                     if self.literal:
 331                         data = rawdata[i]
 332                         self.handle_data(data)
 333                         self.lineno = self.lineno + data.count('\n')
 334                         i = i+1
 335                         continue
 336                     if self.__seen_doctype:
 337                         self.syntax_error('multiple DOCTYPE elements')
 338                     if self.__seen_starttag:
 339                         self.syntax_error('DOCTYPE not at beginning of document')
 340                     k = self.parse_doctype(res)
 341                     if k < 0: break
 342                     self.__seen_doctype = res.group('name')
 343                     if self.__map_case:
 344                         self.__seen_doctype = self.__seen_doctype.lower()
 345                     self.lineno = self.lineno + rawdata[i:k].count('\n')
 346                     i = k
 347                     continue
 348             elif rawdata[i] == '&':
 349                 if self.literal:
 350                     data = rawdata[i]
 351                     self.handle_data(data)
 352                     i = i+1
 353                     continue
 354                 res = charref.match(rawdata, i)
 355                 if res is not None:
 356                     i = res.end(0)
 357                     if rawdata[i-1] != ';':
 358                         self.syntax_error("`;' missing in charref")
 359                         i = i-1
 360                     if not self.stack:
 361                         self.syntax_error('data not in content')
 362                     self.handle_charref(res.group('char')[:-1])
 363                     self.lineno = self.lineno + res.group(0).count('\n')
 364                     continue
 365                 res = entityref.match(rawdata, i)
 366                 if res is not None:
 367                     i = res.end(0)
 368                     if rawdata[i-1] != ';':
 369                         self.syntax_error("`;' missing in entityref")
 370                         i = i-1
 371                     name = res.group('name')
 372                     if self.__map_case:
 373                         name = name.lower()
 374                     if self.entitydefs.has_key(name):
 375                         self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
 376                         n = len(rawdata)
 377                         i = res.start(0)
 378                     else:
 379                         self.unknown_entityref(name)
 380                     self.lineno = self.lineno + res.group(0).count('\n')
 381                     continue
 382             elif rawdata[i] == ']':
 383                 if self.literal:
 384                     data = rawdata[i]
 385                     self.handle_data(data)
 386                     i = i+1
 387                     continue
 388                 if n-i < 3:
 389                     break
 390                 if cdataclose.match(rawdata, i):
 391                     self.syntax_error("bogus `]]>'")
 392                 self.handle_data(rawdata[i])
 393                 i = i+1
 394                 continue
 395             else:
 396                 raise Error('neither < nor & ??')
 397             # We get here only if incomplete matches but
 398             # nothing else
 399             break
 400         # end while
 401         if i > 0:
 402             self.__at_start = 0
 403         if end and i < n:
 404             data = rawdata[i]
 405             self.syntax_error("bogus `%s'" % data)
 406             if not self.__accept_utf8 and illegal.search(data):
 407                 self.syntax_error('illegal character in content')
 408             self.handle_data(data)
 409             self.lineno = self.lineno + data.count('\n')
 410             self.rawdata = rawdata[i+1:]
 411             return self.goahead(end)
 412         self.rawdata = rawdata[i:]
 413         if end:
 414             if not self.__seen_starttag:
 415                 self.syntax_error('no elements in file')
 416             if self.stack:
 417                 self.syntax_error('missing end tags')
 418                 while self.stack:
 419                     self.finish_endtag(self.stack[-1][0])
 420
 421     # Internal -- parse comment, return length or -1 if not terminated
 422     def parse_comment(self, i):
 423         rawdata = self.rawdata
 424         if rawdata[i:i+4] != '<!--':
 425             raise Error('unexpected call to handle_comment')
 426         res = commentclose.search(rawdata, i+4)
 427         if res is None:
 428             return -1
 429         if doubledash.search(rawdata, i+4, res.start(0)):
 430             self.syntax_error("`--' inside comment")
 431         if rawdata[res.start(0)-1] == '-':
 432             self.syntax_error('comment cannot end in three dashes')
 433         if not self.__accept_utf8 and \
 434            illegal.search(rawdata, i+4, res.start(0)):
 435             self.syntax_error('illegal character in comment')
 436         self.handle_comment(rawdata[i+4: res.start(0)])
 437         return res.end(0)
 438
 439     # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
    def parse_doctype(self, res):
        """Finish parsing a DOCTYPE declaration.

        `res' is the match object of the `doctype' pattern on
        self.rawdata.  Returns the index just past the closing `>', or
        -1 if the declaration is not complete yet.  On success
        self.handle_doctype() is called with the name, the public id,
        the system literal and the text of the internal subset (None if
        there is no `[...]' part).
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name = res.group('name')
        if self.__map_case:
            name = name.lower()
        pubid, syslit = res.group('pubid', 'syslit')
        if pubid is not None:
            pubid = pubid[1:-1]         # remove quotes
            pubid = ' '.join(pubid.split()) # normalize
        if syslit is not None: syslit = syslit[1:-1] # remove quotes
        # j stays at the character after the matched prefix; k scans ahead
        j = k = res.end(0)
        if k >= n:
            return -1
        if rawdata[k] == '[':
            # internal subset: scan for the closing `]', skipping quoted
            # strings (dq/sq flag an open double/single quote) and
            # keeping track of `<...>' nesting in level
            level = 0
            k = k+1
            dq = sq = 0
            while k < n:
                c = rawdata[k]
                if not sq and c == '"':
                    dq = not dq
                elif not dq and c == "'":
                    sq = not sq
                elif sq or dq:
                    pass
                elif level <= 0 and c == ']':
                    # end of the subset; only white space may precede `>'
                    res = endbracket.match(rawdata, k+1)
                    if res is None:
                        return -1
                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
                    return res.end(0)
                elif c == '<':
                    level = level + 1
                elif c == '>':
                    level = level - 1
                    if level < 0:
                        self.syntax_error("bogus `>' in DOCTYPE")
                k = k+1
        # reached without an internal subset, or when the scan above ran
        # off the end of the buffer (k == n, so the match below fails)
        res = endbracketfind.match(rawdata, k)
        if res is None:
            return -1
        if endbracket.match(rawdata, k) is None:
            self.syntax_error('garbage in DOCTYPE')
        self.handle_doctype(name, pubid, syslit, None)
        return res.end(0)
 486
 487     # Internal -- handle CDATA tag, return length or -1 if not terminated
 488     def parse_cdata(self, i):
 489         rawdata = self.rawdata
 490         if rawdata[i:i+9] != '<![CDATA[':
 491             raise Error('unexpected call to parse_cdata')
 492         res = cdataclose.search(rawdata, i+9)
 493         if res is None:
 494             return -1
 495         if not self.__accept_utf8 and \
 496            illegal.search(rawdata, i+9, res.start(0)):
 497             self.syntax_error('illegal character in CDATA')
 498         if not self.stack:
 499             self.syntax_error('CDATA not in content')
 500         self.handle_cdata(rawdata[i+9:res.start(0)])
 501         return res.end(0)
 502
    # Attribute names accepted in an old-style <?xml:namespace ...?>
    # processing instruction; parse_proc() reports any other attribute.
    __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
 504     # Internal -- handle a processing instruction tag
 505     def parse_proc(self, i):
 506         rawdata = self.rawdata
 507         end = procclose.search(rawdata, i)
 508         if end is None:
 509             return -1
 510         j = end.start(0)
 511         if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
 512             self.syntax_error('illegal character in processing instruction')
 513         res = tagfind.match(rawdata, i+2)
 514         if res is None:
 515             raise Error('unexpected call to parse_proc')
 516         k = res.end(0)
 517         name = res.group(0)
 518         if self.__map_case:
 519             name = name.lower()
 520         if name == 'xml:namespace':
 521             self.syntax_error('old-fashioned namespace declaration')
 522             self.__use_namespaces = -1
 523             # namespace declaration
 524             # this must come after the <?xml?> declaration (if any)
 525             # and before the <!DOCTYPE> (if any).
 526             if self.__seen_doctype or self.__seen_starttag:
 527                 self.syntax_error('xml:namespace declaration too late in document')
 528             attrdict, namespace, k = self.parse_attributes(name, k, j)
 529             if namespace:
 530                 self.syntax_error('namespace declaration inside namespace declaration')
 531             for attrname in attrdict.keys():
 532                 if not self.__xml_namespace_attributes.has_key(attrname):
 533                     self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
 534             if not attrdict.has_key('ns') or not attrdict.has_key('prefix'):
 535                 self.syntax_error('xml:namespace without required attributes')
 536             prefix = attrdict.get('prefix')
 537             if ncname.match(prefix) is None:
 538                 self.syntax_error('xml:namespace illegal prefix value')
 539                 return end.end(0)
 540             if self.__namespaces.has_key(prefix):
 541                 self.syntax_error('xml:namespace prefix not unique')
 542             self.__namespaces[prefix] = attrdict['ns']
 543         else:
 544             if name.lower() == 'xml':
 545                 self.syntax_error('illegal processing instruction target name')
 546             self.handle_proc(name, rawdata[k:j])
 547         return end.end(0)
 548
 549     # Internal -- parse attributes between i and j
 550     def parse_attributes(self, tag, i, j):
 551         rawdata = self.rawdata
 552         attrdict = {}
 553         namespace = {}
 554         while i < j:
 555             res = attrfind.match(rawdata, i)
 556             if res is None:
 557                 break
 558             attrname, attrvalue = res.group('name', 'value')
 559             if self.__map_case:
 560                 attrname = attrname.lower()
 561             i = res.end(0)
 562             if attrvalue is None:
 563                 self.syntax_error("no value specified for attribute `%s'" % attrname)
 564                 attrvalue = attrname
 565             elif attrvalue[:1] == "'" == attrvalue[-1:] or \
 566                  attrvalue[:1] == '"' == attrvalue[-1:]:
 567                 attrvalue = attrvalue[1:-1]
 568             elif not self.__accept_unquoted_attributes:
 569                 self.syntax_error("attribute `%s' value not quoted" % attrname)
 570             res = xmlns.match(attrname)
 571             if res is not None:
 572                 # namespace declaration
 573                 ncname = res.group('ncname')
 574                 namespace[ncname or ''] = attrvalue or None
 575                 if not self.__use_namespaces:
 576                     self.__use_namespaces = len(self.stack)+1
 577                 continue
 578             if '<' in attrvalue:
 579                 self.syntax_error("`<' illegal in attribute value")
 580             if attrdict.has_key(attrname):
 581                 self.syntax_error("attribute `%s' specified twice" % attrname)
 582             attrvalue = attrvalue.translate(attrtrans)
 583             attrdict[attrname] = self.translate_references(attrvalue)
 584         return attrdict, namespace, i
 585
 586     # Internal -- handle starttag, return length or -1 if not terminated
 587     def parse_starttag(self, i):
 588         rawdata = self.rawdata
 589         # i points to start of tag
 590         end = endbracketfind.match(rawdata, i+1)
 591         if end is None:
 592             return -1
 593         tag = starttagmatch.match(rawdata, i)
 594         if tag is None or tag.end(0) != end.end(0):
 595             self.syntax_error('garbage in starttag')
 596             return end.end(0)
 597         nstag = tagname = tag.group('tagname')
 598         if self.__map_case:
 599             nstag = tagname = nstag.lower()
 600         if not self.__seen_starttag and self.__seen_doctype and \
 601            tagname != self.__seen_doctype:
 602             self.syntax_error('starttag does not match DOCTYPE')
 603         if self.__seen_starttag and not self.stack:
 604             self.syntax_error('multiple elements on top level')
 605         k, j = tag.span('attrs')
 606         attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
 607         self.stack.append((tagname, nsdict, nstag))
 608         if self.__use_namespaces:
 609             res = qname.match(tagname)
 610         else:
 611             res = None
 612         if res is not None:
 613             prefix, nstag = res.group('prefix', 'local')
 614             if prefix is None:
 615                 prefix = ''
 616             ns = None
 617             for t, d, nst in self.stack:
 618                 if d.has_key(prefix):
 619                     ns = d[prefix]
 620             if ns is None and prefix != '':
 621                 ns = self.__namespaces.get(prefix)
 622             if ns is not None:
 623                 nstag = ns + ' ' + nstag
 624             elif prefix != '':
 625                 nstag = prefix + ':' + nstag # undo split
 626             self.stack[-1] = tagname, nsdict, nstag
 627         # translate namespace of attributes
 628         attrnamemap = {} # map from new name to old name (used for error reporting)
 629         for key in attrdict.keys():
 630             attrnamemap[key] = key
 631         if self.__use_namespaces:
 632             nattrdict = {}
 633             for key, val in attrdict.items():
 634                 okey = key
 635                 res = qname.match(key)
 636                 if res is not None:
 637                     aprefix, key = res.group('prefix', 'local')
 638                     if self.__map_case:
 639                         key = key.lower()
 640                     if aprefix is None:
 641                         aprefix = ''
 642                     ans = None
 643                     for t, d, nst in self.stack:
 644                         if d.has_key(aprefix):
 645                             ans = d[aprefix]
 646                     if ans is None and aprefix != '':
 647                         ans = self.__namespaces.get(aprefix)
 648                     if ans is not None:
 649                         key = ans + ' ' + key
 650                     elif aprefix != '':
 651                         key = aprefix + ':' + key
 652                     elif ns is not None:
 653                         key = ns + ' ' + key
 654                 nattrdict[key] = val
 655                 attrnamemap[key] = okey
 656             attrdict = nattrdict
 657         attributes = self.attributes.get(nstag)
 658         if attributes is not None:
 659             for key in attrdict.keys():
 660                 if not attributes.has_key(key):
 661                     self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname))
 662             for key, val in attributes.items():
 663                 if val is not None and not attrdict.has_key(key):
 664                     attrdict[key] = val
 665         method = self.elements.get(nstag, (None, None))[0]
 666         self.finish_starttag(nstag, attrdict, method)
 667         if tag.group('slash') == '/':
 668             self.finish_endtag(tagname)
 669         return tag.end(0)
 670
 671     # Internal -- parse endtag
 672     def parse_endtag(self, i):
 673         rawdata = self.rawdata
 674         end = endbracketfind.match(rawdata, i+1)
 675         if end is None:
 676             return -1
 677         res = tagfind.match(rawdata, i+2)
 678         if res is None:
 679             if self.literal:
 680                 self.handle_data(rawdata[i])
 681                 return i+1
 682             if not self.__accept_missing_endtag_name:
 683                 self.syntax_error('no name specified in end tag')
 684             tag = self.stack[-1][0]
 685             k = i+2
 686         else:
 687             tag = res.group(0)
 688             if self.__map_case:
 689                 tag = tag.lower()
 690             if self.literal:
 691                 if not self.stack or tag != self.stack[-1][0]:
 692                     self.handle_data(rawdata[i])
 693                     return i+1
 694             k = res.end(0)
 695         if endbracket.match(rawdata, k) is None:
 696             self.syntax_error('garbage in end tag')
 697         self.finish_endtag(tag)
 698         return end.end(0)
 699
 700     # Internal -- finish processing of start tag
 701     def finish_starttag(self, tagname, attrdict, method):
 702         if method is not None:
 703             self.handle_starttag(tagname, method, attrdict)
 704         else:
 705             self.unknown_starttag(tagname, attrdict)
 706
 707     # Internal -- finish processing of end tag
 708     def finish_endtag(self, tag):
 709         self.literal = 0
 710         if not tag:
 711             self.syntax_error('name-less end tag')
 712             found = len(self.stack) - 1
 713             if found < 0:
 714                 self.unknown_endtag(tag)
 715                 return
 716         else:
 717             found = -1
 718             for i in range(len(self.stack)):
 719                 if tag == self.stack[i][0]:
 720                     found = i
 721             if found == -1:
 722                 self.syntax_error('unopened end tag')
 723                 return
 724         while len(self.stack) > found:
 725             if found < len(self.stack) - 1:
 726                 self.syntax_error('missing close tag for %s' % self.stack[-1][2])
 727             nstag = self.stack[-1][2]
 728             method = self.elements.get(nstag, (None, None))[1]
 729             if method is not None:
 730                 self.handle_endtag(nstag, method)
 731             else:
 732                 self.unknown_endtag(nstag)
 733             if self.__use_namespaces == len(self.stack):
 734                 self.__use_namespaces = 0
 735             del self.stack[-1]
 736
 737     # Overridable -- handle xml processing instruction
 738     def handle_xml(self, encoding, standalone):
 739         pass
 740
 741     # Overridable -- handle DOCTYPE
 742     def handle_doctype(self, tag, pubid, syslit, data):
 743         pass
 744
 745     # Overridable -- handle start tag
 746     def handle_starttag(self, tag, method, attrs):
 747         method(attrs)
 748
 749     # Overridable -- handle end tag
 750     def handle_endtag(self, tag, method):
 751         method()
 752
 753     # Example -- handle character reference, no need to override
 754     def handle_charref(self, name):
 755         try:
 756             if name[0] == 'x':
 757                 n = int(name[1:], 16)
 758             else:
 759                 n = int(name)
 760         except ValueError:
 761             self.unknown_charref(name)
 762             return
 763         if not 0 <= n <= 255:
 764             self.unknown_charref(name)
 765             return
 766         self.handle_data(chr(n))
 767
 768     # Definition of entities -- derived classes may override
    # Maps each predefined entity name to its replacement text.  Every
    # value is written as a decimal character reference, not as the bare
    # character.
    # NOTE(review): the "must use charref" entries are presumably required
    # so that '<' and '&' are not taken as markup when the replacement
    # text is scanned again -- confirm against the entity expansion code.
    entitydefs = {'lt': '&#60;',        # '<' -- must use charref
                  'gt': '&#62;',        # '>'
                  'amp': '&#38;',       # '&' -- must use charref
                  'quot': '&#34;',      # '"'
                  'apos': '&#39;',      # "'"
                  }
 775
 776     # Example -- handle data, should be overridden
 777     def handle_data(self, data):
 778         pass
 779
 780     # Example -- handle cdata, could be overridden
 781     def handle_cdata(self, data):
 782         pass
 783
 784     # Example -- handle comment, could be overridden
 785     def handle_comment(self, data):
 786         pass
 787
 788     # Example -- handle processing instructions, could be overridden
 789     def handle_proc(self, name, data):
 790         pass
 791
 792     # Example -- handle relatively harmless syntax errors, could be overridden
 793     def syntax_error(self, message):
 794         raise Error('Syntax error at line %d: %s' % (self.lineno, message))
 795
 796     # To be overridden -- handlers for unknown objects
 797     def unknown_starttag(self, tag, attrs): pass
 798     def unknown_endtag(self, tag): pass
 799     def unknown_charref(self, ref): pass
 800     def unknown_entityref(self, name):
 801         self.syntax_error("reference to unknown entity `&%s;'" % name)
 802
 803
 804 class TestXMLParser(XMLParser):
 805
 806     def __init__(self, **kw):
 807         self.testdata = ""
 808         apply(XMLParser.__init__, (self,), kw)
 809
 810     def handle_xml(self, encoding, standalone):
 811         self.flush()
 812         print 'xml: encoding =',encoding,'standalone =',standalone
 813
 814     def handle_doctype(self, tag, pubid, syslit, data):
 815         self.flush()
 816         print 'DOCTYPE:',tag, `data`
 817
 818     def handle_data(self, data):
 819         self.testdata = self.testdata + data
 820         if len(`self.testdata`) >= 70:
 821             self.flush()
 822
 823     def flush(self):
 824         data = self.testdata
 825         if data:
 826             self.testdata = ""
 827             print 'data:', `data`
 828
 829     def handle_cdata(self, data):
 830         self.flush()
 831         print 'cdata:', `data`
 832
 833     def handle_proc(self, name, data):
 834         self.flush()
 835         print 'processing:',name,`data`
 836
 837     def handle_comment(self, data):
 838         self.flush()
 839         r = `data`
 840         if len(r) > 68:
 841             r = r[:32] + '...' + r[-32:]
 842         print 'comment:', r
 843
 844     def syntax_error(self, message):
 845         print 'error at line %d:' % self.lineno, message
 846
 847     def unknown_starttag(self, tag, attrs):
 848         self.flush()
 849         if not attrs:
 850             print 'start tag: <' + tag + '>'
 851         else:
 852             print 'start tag: <' + tag,
 853             for name, value in attrs.items():
 854                 print name + '=' + '"' + value + '"',
 855             print '>'
 856
 857     def unknown_endtag(self, tag):
 858         self.flush()
 859         print 'end tag: </' + tag + '>'
 860
 861     def unknown_entityref(self, ref):
 862         self.flush()
 863         print '*** unknown entity ref: &' + ref + ';'
 864
 865     def unknown_charref(self, ref):
 866         self.flush()
 867         print '*** unknown char ref: &#' + ref + ';'
 868
 869     def close(self):
 870         XMLParser.close(self)
 871         self.flush()
 872
 873 def test(args = None):
 874     import sys, getopt
 875     from time import time
 876
 877     if not args:
 878         args = sys.argv[1:]
 879
 880     opts, args = getopt.getopt(args, 'st')
 881     klass = TestXMLParser
 882     do_time = 0
 883     for o, a in opts:
 884         if o == '-s':
 885             klass = XMLParser
 886         elif o == '-t':
 887             do_time = 1
 888
 889     if args:
 890         file = args[0]
 891     else:
 892         file = 'test.xml'
 893
 894     if file == '-':
 895         f = sys.stdin
 896     else:
 897         try:
 898             f = open(file, 'r')
 899         except IOError, msg:
 900             print file, ":", msg
 901             sys.exit(1)
 902
 903     data = f.read()
 904     if f is not sys.stdin:
 905         f.close()
 906
 907     x = klass()
 908     t0 = time()
 909     try:
 910         if do_time:
 911             x.feed(data)
 912             x.close()
 913         else:
 914             for c in data:
 915                 x.feed(c)
 916             x.close()
 917     except Error, msg:
 918         t1 = time()
 919         print msg
 920         if do_time:
 921             print 'total time: %g' % (t1-t0)
 922         sys.exit(1)
 923     t1 = time()
 924     if do_time:
 925         print 'total time: %g' % (t1-t0)
 926
 927
# Run the command-line test driver when this file is executed as a script.
if __name__ == '__main__':
    test()