Lib/xmllib.py

   1 """A parser for XML, using the derived class as static DTD."""
   2
   3 # Author: Sjoerd Mullender.
   4
   5 import re
   6 import string
   7
   8 import warnings
   9 warnings.warn("The xmllib module is obsolete.  Use xml.sax instead.",
  10               DeprecationWarning)
  11 del warnings
  12
  13 version = '0.3'
  14
  15 class Error(RuntimeError):
  16     pass
  17
  18 # Regular expressions used for parsing
  19
# Building blocks (plain strings) that the compiled patterns below are
# assembled from.
_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
# the characters at which goahead() stops scanning plain character data
interesting = re.compile('[]&<]')

# References.  ref, entityref and charref each consume one character
# following the reference, so that the caller can check whether that
# character is the terminating `;'.  Note that in charref the extra
# character is part of the `char' group.
amp = re.compile('&')
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
# with match(): the string consists of white space only
space = re.compile(_S + '$')
newline = re.compile('\n')

# One attribute inside a tag: leading white space, a name and an optional
# `= value' part, where the value is either quoted or a run of unquoted
# characters.
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
# A complete start tag: name, any number of attributes, optional `/', `>'.
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                      '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                      starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
# everything up to and including the next `>' that is not inside quotes
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
# (_SystemLiteral and _PublicLiteral are templates: the %s is filled in
# with the name of the regular expression group.)
_SystemLiteral = '(?P<%s>'+_QStr+')'
_PublicLiteral = '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
# The start of a DOCTYPE declaration: name, optional external id and
# trailing white space.  The remainder is handled by parse_doctype().
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
# The XML declaration: version is required, encoding and standalone are
# optional.  The groups include the surrounding quotes.
xmldecl = re.compile('<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# translation table turning CR, LF and TAB into a space; applied to
# attribute values in parse_attributes()
attrtrans = string.maketrans(' \r\n\t', '    ')

# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
                   '(?P<local>' + _NCName + ')$')

# an attribute name that declares a namespace: `xmlns' or `xmlns:prefix'
xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
  82
  83 # XML parser base class -- find tags and call handler functions.
  84 # Usage: p = XMLParser(); p.feed(data); ...; p.close().
  85 # The dtd is defined by deriving a class which defines methods with
  86 # special names to handle tags: start_foo and end_foo to handle <foo>
  87 # and </foo>, respectively.  The data between tags is passed to the
  88 # parser by calling self.handle_data() with some data as argument (the
  89 # data may be split up in arbitrary chunks).
  90
  91 class XMLParser:
  92     attributes = {}                     # default, to be overridden
  93     elements = {}                       # default, to be overridden
  94
  95     # parsing options, settable using keyword args in __init__
  96     __accept_unquoted_attributes = 0
  97     __accept_missing_endtag_name = 0
  98     __map_case = 0
  99     __accept_utf8 = 0
 100     __translate_attribute_references = 1
 101
 102     # Interface -- initialize and reset this instance
 103     def __init__(self, **kw):
 104         self.__fixed = 0
 105         if 'accept_unquoted_attributes' in kw:
 106             self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
 107         if 'accept_missing_endtag_name' in kw:
 108             self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
 109         if 'map_case' in kw:
 110             self.__map_case = kw['map_case']
 111         if 'accept_utf8' in kw:
 112             self.__accept_utf8 = kw['accept_utf8']
 113         if 'translate_attribute_references' in kw:
 114             self.__translate_attribute_references = kw['translate_attribute_references']
 115         self.reset()
 116
 117     def __fixelements(self):
 118         self.__fixed = 1
 119         self.elements = {}
 120         self.__fixdict(self.__dict__)
 121         self.__fixclass(self.__class__)
 122
 123     def __fixclass(self, kl):
 124         self.__fixdict(kl.__dict__)
 125         for k in kl.__bases__:
 126             self.__fixclass(k)
 127
 128     def __fixdict(self, dict):
 129         for key in dict.keys():
 130             if key[:6] == 'start_':
 131                 tag = key[6:]
 132                 start, end = self.elements.get(tag, (None, None))
 133                 if start is None:
 134                     self.elements[tag] = getattr(self, key), end
 135             elif key[:4] == 'end_':
 136                 tag = key[4:]
 137                 start, end = self.elements.get(tag, (None, None))
 138                 if end is None:
 139                     self.elements[tag] = start, getattr(self, key)
 140
 141     # Interface -- reset this instance.  Loses all unprocessed data
 142     def reset(self):
 143         self.rawdata = ''
 144         self.stack = []
 145         self.nomoretags = 0
 146         self.literal = 0
 147         self.lineno = 1
 148         self.__at_start = 1
 149         self.__seen_doctype = None
 150         self.__seen_starttag = 0
 151         self.__use_namespaces = 0
 152         self.__namespaces = {'xml':None}   # xml is implicitly declared
 153         # backward compatibility hack: if elements not overridden,
 154         # fill it in ourselves
 155         if self.elements is XMLParser.elements:
 156             self.__fixelements()
 157
 158     # For derived classes only -- enter literal mode (CDATA) till EOF
 159     def setnomoretags(self):
 160         self.nomoretags = self.literal = 1
 161
 162     # For derived classes only -- enter literal mode (CDATA)
 163     def setliteral(self, *args):
 164         self.literal = 1
 165
 166     # Interface -- feed some data to the parser.  Call this as
 167     # often as you want, with as little or as much text as you
 168     # want (may include '\n').  (This just saves the text, all the
 169     # processing is done by goahead().)
 170     def feed(self, data):
 171         self.rawdata = self.rawdata + data
 172         self.goahead(0)
 173
 174     # Interface -- handle the remaining data
 175     def close(self):
 176         self.goahead(1)
 177         if self.__fixed:
 178             self.__fixed = 0
 179             # remove self.elements so that we don't leak
 180             del self.elements
 181
 182     # Interface -- translate references
 183     def translate_references(self, data, all = 1):
 184         if not self.__translate_attribute_references:
 185             return data
 186         i = 0
 187         while 1:
 188             res = amp.search(data, i)
 189             if res is None:
 190                 return data
 191             s = res.start(0)
 192             res = ref.match(data, s)
 193             if res is None:
 194                 self.syntax_error("bogus `&'")
 195                 i = s+1
 196                 continue
 197             i = res.end(0)
 198             str = res.group(1)
 199             rescan = 0
 200             if str[0] == '#':
 201                 if str[1] == 'x':
 202                     str = chr(int(str[2:], 16))
 203                 else:
 204                     str = chr(int(str[1:]))
 205                 if data[i - 1] != ';':
 206                     self.syntax_error("`;' missing after char reference")
 207                     i = i-1
 208             elif all:
 209                 if str in self.entitydefs:
 210                     str = self.entitydefs[str]
 211                     rescan = 1
 212                 elif data[i - 1] != ';':
 213                     self.syntax_error("bogus `&'")
 214                     i = s + 1 # just past the &
 215                     continue
 216                 else:
 217                     self.syntax_error("reference to unknown entity `&%s;'" % str)
 218                     str = '&' + str + ';'
 219             elif data[i - 1] != ';':
 220                 self.syntax_error("bogus `&'")
 221                 i = s + 1 # just past the &
 222                 continue
 223
 224             # when we get here, str contains the translated text and i points
 225             # to the end of the string that is to be replaced
 226             data = data[:s] + str + data[i:]
 227             if rescan:
 228                 i = s
 229             else:
 230                 i = s + len(str)
 231
 232     # Interface - return a dictionary of all namespaces currently valid
 233     def getnamespace(self):
 234         nsdict = {}
 235         for t, d, nst in self.stack:
 236             nsdict.update(d)
 237         return nsdict
 238
 239     # Internal -- handle data as far as reasonable.  May leave state
 240     # and data to be processed by a subsequent call.  If 'end' is
 241     # true, force handling all data as if followed by EOF marker.
 242     def goahead(self, end):
 243         rawdata = self.rawdata
 244         i = 0
 245         n = len(rawdata)
 246         while i < n:
 247             if i > 0:
 248                 self.__at_start = 0
 249             if self.nomoretags:
 250                 data = rawdata[i:n]
 251                 self.handle_data(data)
 252                 self.lineno = self.lineno + data.count('\n')
 253                 i = n
 254                 break
 255             res = interesting.search(rawdata, i)
 256             if res:
 257                 j = res.start(0)
 258             else:
 259                 j = n
 260             if i < j:
 261                 data = rawdata[i:j]
 262                 if self.__at_start and space.match(data) is None:
 263                     self.syntax_error('illegal data at start of file')
 264                 self.__at_start = 0
 265                 if not self.stack and space.match(data) is None:
 266                     self.syntax_error('data not in content')
 267                 if not self.__accept_utf8 and illegal.search(data):
 268                     self.syntax_error('illegal character in content')
 269                 self.handle_data(data)
 270                 self.lineno = self.lineno + data.count('\n')
 271             i = j
 272             if i == n: break
 273             if rawdata[i] == '<':
 274                 if starttagopen.match(rawdata, i):
 275                     if self.literal:
 276                         data = rawdata[i]
 277                         self.handle_data(data)
 278                         self.lineno = self.lineno + data.count('\n')
 279                         i = i+1
 280                         continue
 281                     k = self.parse_starttag(i)
 282                     if k < 0: break
 283                     self.__seen_starttag = 1
 284                     self.lineno = self.lineno + rawdata[i:k].count('\n')
 285                     i = k
 286                     continue
 287                 if endtagopen.match(rawdata, i):
 288                     k = self.parse_endtag(i)
 289                     if k < 0: break
 290                     self.lineno = self.lineno + rawdata[i:k].count('\n')
 291                     i =  k
 292                     continue
 293                 if commentopen.match(rawdata, i):
 294                     if self.literal:
 295                         data = rawdata[i]
 296                         self.handle_data(data)
 297                         self.lineno = self.lineno + data.count('\n')
 298                         i = i+1
 299                         continue
 300                     k = self.parse_comment(i)
 301                     if k < 0: break
 302                     self.lineno = self.lineno + rawdata[i:k].count('\n')
 303                     i = k
 304                     continue
 305                 if cdataopen.match(rawdata, i):
 306                     k = self.parse_cdata(i)
 307                     if k < 0: break
 308                     self.lineno = self.lineno + rawdata[i:k].count('\n')
 309                     i = k
 310                     continue
 311                 res = xmldecl.match(rawdata, i)
 312                 if res:
 313                     if not self.__at_start:
 314                         self.syntax_error("<?xml?> declaration not at start of document")
 315                     version, encoding, standalone = res.group('version',
 316                                                               'encoding',
 317                                                               'standalone')
 318                     if version[1:-1] != '1.0':
 319                         raise Error('only XML version 1.0 supported')
 320                     if encoding: encoding = encoding[1:-1]
 321                     if standalone: standalone = standalone[1:-1]
 322                     self.handle_xml(encoding, standalone)
 323                     i = res.end(0)
 324                     continue
 325                 res = procopen.match(rawdata, i)
 326                 if res:
 327                     k = self.parse_proc(i)
 328                     if k < 0: break
 329                     self.lineno = self.lineno + rawdata[i:k].count('\n')
 330                     i = k
 331                     continue
 332                 res = doctype.match(rawdata, i)
 333                 if res:
 334                     if self.literal:
 335                         data = rawdata[i]
 336                         self.handle_data(data)
 337                         self.lineno = self.lineno + data.count('\n')
 338                         i = i+1
 339                         continue
 340                     if self.__seen_doctype:
 341                         self.syntax_error('multiple DOCTYPE elements')
 342                     if self.__seen_starttag:
 343                         self.syntax_error('DOCTYPE not at beginning of document')
 344                     k = self.parse_doctype(res)
 345                     if k < 0: break
 346                     self.__seen_doctype = res.group('name')
 347                     if self.__map_case:
 348                         self.__seen_doctype = self.__seen_doctype.lower()
 349                     self.lineno = self.lineno + rawdata[i:k].count('\n')
 350                     i = k
 351                     continue
 352             elif rawdata[i] == '&':
 353                 if self.literal:
 354                     data = rawdata[i]
 355                     self.handle_data(data)
 356                     i = i+1
 357                     continue
 358                 res = charref.match(rawdata, i)
 359                 if res is not None:
 360                     i = res.end(0)
 361                     if rawdata[i-1] != ';':
 362                         self.syntax_error("`;' missing in charref")
 363                         i = i-1
 364                     if not self.stack:
 365                         self.syntax_error('data not in content')
 366                     self.handle_charref(res.group('char')[:-1])
 367                     self.lineno = self.lineno + res.group(0).count('\n')
 368                     continue
 369                 res = entityref.match(rawdata, i)
 370                 if res is not None:
 371                     i = res.end(0)
 372                     if rawdata[i-1] != ';':
 373                         self.syntax_error("`;' missing in entityref")
 374                         i = i-1
 375                     name = res.group('name')
 376                     if self.__map_case:
 377                         name = name.lower()
 378                     if name in self.entitydefs:
 379                         self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
 380                         n = len(rawdata)
 381                         i = res.start(0)
 382                     else:
 383                         self.unknown_entityref(name)
 384                     self.lineno = self.lineno + res.group(0).count('\n')
 385                     continue
 386             elif rawdata[i] == ']':
 387                 if self.literal:
 388                     data = rawdata[i]
 389                     self.handle_data(data)
 390                     i = i+1
 391                     continue
 392                 if n-i < 3:
 393                     break
 394                 if cdataclose.match(rawdata, i):
 395                     self.syntax_error("bogus `]]>'")
 396                 self.handle_data(rawdata[i])
 397                 i = i+1
 398                 continue
 399             else:
 400                 raise Error('neither < nor & ??')
 401             # We get here only if incomplete matches but
 402             # nothing else
 403             break
 404         # end while
 405         if i > 0:
 406             self.__at_start = 0
 407         if end and i < n:
 408             data = rawdata[i]
 409             self.syntax_error("bogus `%s'" % data)
 410             if not self.__accept_utf8 and illegal.search(data):
 411                 self.syntax_error('illegal character in content')
 412             self.handle_data(data)
 413             self.lineno = self.lineno + data.count('\n')
 414             self.rawdata = rawdata[i+1:]
 415             return self.goahead(end)
 416         self.rawdata = rawdata[i:]
 417         if end:
 418             if not self.__seen_starttag:
 419                 self.syntax_error('no elements in file')
 420             if self.stack:
 421                 self.syntax_error('missing end tags')
 422                 while self.stack:
 423                     self.finish_endtag(self.stack[-1][0])
 424
 425     # Internal -- parse comment, return length or -1 if not terminated
 426     def parse_comment(self, i):
 427         rawdata = self.rawdata
 428         if rawdata[i:i+4] != '<!--':
 429             raise Error('unexpected call to handle_comment')
 430         res = commentclose.search(rawdata, i+4)
 431         if res is None:
 432             return -1
 433         if doubledash.search(rawdata, i+4, res.start(0)):
 434             self.syntax_error("`--' inside comment")
 435         if rawdata[res.start(0)-1] == '-':
 436             self.syntax_error('comment cannot end in three dashes')
 437         if not self.__accept_utf8 and \
 438            illegal.search(rawdata, i+4, res.start(0)):
 439             self.syntax_error('illegal character in comment')
 440         self.handle_comment(rawdata[i+4: res.start(0)])
 441         return res.end(0)
 442
 443     # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
    def parse_doctype(self, res):
        """Finish parsing a DOCTYPE declaration.

        res -- match object for the start of the declaration; it must
               provide the groups `name', `pubid' and `syslit' (goahead()
               passes a match of the module-level doctype pattern).

        Returns the index just past the closing `>', or -1 if the
        declaration is not complete in the data seen so far.  On success
        self.handle_doctype(name, pubid, syslit, data) is called, where
        data is the text of the internal subset (between `[' and `]') or
        None if there is none.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name = res.group('name')
        if self.__map_case:
            name = name.lower()
        pubid, syslit = res.group('pubid', 'syslit')
        if pubid is not None:
            pubid = pubid[1:-1]         # remove quotes
            pubid = ' '.join(pubid.split()) # normalize
        if syslit is not None: syslit = syslit[1:-1] # remove quotes
        # j remembers where the internal subset (if any) starts, k scans
        j = k = res.end(0)
        if k >= n:
            return -1
        if rawdata[k] == '[':
            # Internal subset: look for the closing `]', skipping quoted
            # strings (dq/sq: inside double/single quotes) and keeping
            # track of the nesting of `<' and `>' in level.
            level = 0
            k = k+1
            dq = sq = 0
            while k < n:
                c = rawdata[k]
                if not sq and c == '"':
                    dq = not dq
                elif not dq and c == "'":
                    sq = not sq
                elif sq or dq:
                    pass
                elif level <= 0 and c == ']':
                    # end of the subset; only white space and `>' may follow
                    res = endbracket.match(rawdata, k+1)
                    if res is None:
                        return -1
                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
                    return res.end(0)
                elif c == '<':
                    level = level + 1
                elif c == '>':
                    level = level - 1
                    if level < 0:
                        self.syntax_error("bogus `>' in DOCTYPE")
                k = k+1
            # falling out of the loop means the `]' was not found; k == n
            # then, so the match below fails and -1 is returned
        res = endbracketfind.match(rawdata, k)
        if res is None:
            return -1
        if endbracket.match(rawdata, k) is None:
            # something other than white space precedes the `>'
            self.syntax_error('garbage in DOCTYPE')
        self.handle_doctype(name, pubid, syslit, None)
        return res.end(0)
 490
 491     # Internal -- handle CDATA tag, return length or -1 if not terminated
 492     def parse_cdata(self, i):
 493         rawdata = self.rawdata
 494         if rawdata[i:i+9] != '<![CDATA[':
 495             raise Error('unexpected call to parse_cdata')
 496         res = cdataclose.search(rawdata, i+9)
 497         if res is None:
 498             return -1
 499         if not self.__accept_utf8 and \
 500            illegal.search(rawdata, i+9, res.start(0)):
 501             self.syntax_error('illegal character in CDATA')
 502         if not self.stack:
 503             self.syntax_error('CDATA not in content')
 504         self.handle_cdata(rawdata[i+9:res.start(0)])
 505         return res.end(0)
 506
 507     __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
 508     # Internal -- handle a processing instruction tag
 509     def parse_proc(self, i):
 510         rawdata = self.rawdata
 511         end = procclose.search(rawdata, i)
 512         if end is None:
 513             return -1
 514         j = end.start(0)
 515         if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
 516             self.syntax_error('illegal character in processing instruction')
 517         res = tagfind.match(rawdata, i+2)
 518         if res is None:
 519             raise Error('unexpected call to parse_proc')
 520         k = res.end(0)
 521         name = res.group(0)
 522         if self.__map_case:
 523             name = name.lower()
 524         if name == 'xml:namespace':
 525             self.syntax_error('old-fashioned namespace declaration')
 526             self.__use_namespaces = -1
 527             # namespace declaration
 528             # this must come after the <?xml?> declaration (if any)
 529             # and before the <!DOCTYPE> (if any).
 530             if self.__seen_doctype or self.__seen_starttag:
 531                 self.syntax_error('xml:namespace declaration too late in document')
 532             attrdict, namespace, k = self.parse_attributes(name, k, j)
 533             if namespace:
 534                 self.syntax_error('namespace declaration inside namespace declaration')
 535             for attrname in attrdict.keys():
 536                 if not attrname in self.__xml_namespace_attributes:
 537                     self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
 538             if not 'ns' in attrdict or not 'prefix' in attrdict:
 539                 self.syntax_error('xml:namespace without required attributes')
 540             prefix = attrdict.get('prefix')
 541             if ncname.match(prefix) is None:
 542                 self.syntax_error('xml:namespace illegal prefix value')
 543                 return end.end(0)
 544             if prefix in self.__namespaces:
 545                 self.syntax_error('xml:namespace prefix not unique')
 546             self.__namespaces[prefix] = attrdict['ns']
 547         else:
 548             if name.lower() == 'xml':
 549                 self.syntax_error('illegal processing instruction target name')
 550             self.handle_proc(name, rawdata[k:j])
 551         return end.end(0)
 552
 553     # Internal -- parse attributes between i and j
 554     def parse_attributes(self, tag, i, j):
 555         rawdata = self.rawdata
 556         attrdict = {}
 557         namespace = {}
 558         while i < j:
 559             res = attrfind.match(rawdata, i)
 560             if res is None:
 561                 break
 562             attrname, attrvalue = res.group('name', 'value')
 563             if self.__map_case:
 564                 attrname = attrname.lower()
 565             i = res.end(0)
 566             if attrvalue is None:
 567                 self.syntax_error("no value specified for attribute `%s'" % attrname)
 568                 attrvalue = attrname
 569             elif attrvalue[:1] == "'" == attrvalue[-1:] or \
 570                  attrvalue[:1] == '"' == attrvalue[-1:]:
 571                 attrvalue = attrvalue[1:-1]
 572             elif not self.__accept_unquoted_attributes:
 573                 self.syntax_error("attribute `%s' value not quoted" % attrname)
 574             res = xmlns.match(attrname)
 575             if res is not None:
 576                 # namespace declaration
 577                 ncname = res.group('ncname')
 578                 namespace[ncname or ''] = attrvalue or None
 579                 if not self.__use_namespaces:
 580                     self.__use_namespaces = len(self.stack)+1
 581                 continue
 582             if '<' in attrvalue:
 583                 self.syntax_error("`<' illegal in attribute value")
 584             if attrname in attrdict:
 585                 self.syntax_error("attribute `%s' specified twice" % attrname)
 586             attrvalue = attrvalue.translate(attrtrans)
 587             attrdict[attrname] = self.translate_references(attrvalue)
 588         return attrdict, namespace, i
 589
 590     # Internal -- handle starttag, return length or -1 if not terminated
 591     def parse_starttag(self, i):
 592         rawdata = self.rawdata
 593         # i points to start of tag
 594         end = endbracketfind.match(rawdata, i+1)
 595         if end is None:
 596             return -1
 597         tag = starttagmatch.match(rawdata, i)
 598         if tag is None or tag.end(0) != end.end(0):
 599             self.syntax_error('garbage in starttag')
 600             return end.end(0)
 601         nstag = tagname = tag.group('tagname')
 602         if self.__map_case:
 603             nstag = tagname = nstag.lower()
 604         if not self.__seen_starttag and self.__seen_doctype and \
 605            tagname != self.__seen_doctype:
 606             self.syntax_error('starttag does not match DOCTYPE')
 607         if self.__seen_starttag and not self.stack:
 608             self.syntax_error('multiple elements on top level')
 609         k, j = tag.span('attrs')
 610         attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
 611         self.stack.append((tagname, nsdict, nstag))
 612         if self.__use_namespaces:
 613             res = qname.match(tagname)
 614         else:
 615             res = None
 616         if res is not None:
 617             prefix, nstag = res.group('prefix', 'local')
 618             if prefix is None:
 619                 prefix = ''
 620             ns = None
 621             for t, d, nst in self.stack:
 622                 if prefix in d:
 623                     ns = d[prefix]
 624             if ns is None and prefix != '':
 625                 ns = self.__namespaces.get(prefix)
 626             if ns is not None:
 627                 nstag = ns + ' ' + nstag
 628             elif prefix != '':
 629                 nstag = prefix + ':' + nstag # undo split
 630             self.stack[-1] = tagname, nsdict, nstag
 631         # translate namespace of attributes
 632         attrnamemap = {} # map from new name to old name (used for error reporting)
 633         for key in attrdict.keys():
 634             attrnamemap[key] = key
 635         if self.__use_namespaces:
 636             nattrdict = {}
 637             for key, val in attrdict.items():
 638                 okey = key
 639                 res = qname.match(key)
 640                 if res is not None:
 641                     aprefix, key = res.group('prefix', 'local')
 642                     if self.__map_case:
 643                         key = key.lower()
 644                     if aprefix is None:
 645                         aprefix = ''
 646                     ans = None
 647                     for t, d, nst in self.stack:
 648                         if aprefix in d:
 649                             ans = d[aprefix]
 650                     if ans is None and aprefix != '':
 651                         ans = self.__namespaces.get(aprefix)
 652                     if ans is not None:
 653                         key = ans + ' ' + key
 654                     elif aprefix != '':
 655                         key = aprefix + ':' + key
 656                     elif ns is not None:
 657                         key = ns + ' ' + key
 658                 nattrdict[key] = val
 659                 attrnamemap[key] = okey
 660             attrdict = nattrdict
 661         attributes = self.attributes.get(nstag)
 662         if attributes is not None:
 663             for key in attrdict.keys():
 664                 if not key in attributes:
 665                     self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname))
 666             for key, val in attributes.items():
 667                 if val is not None and not key in attrdict:
 668                     attrdict[key] = val
 669         method = self.elements.get(nstag, (None, None))[0]
 670         self.finish_starttag(nstag, attrdict, method)
 671         if tag.group('slash') == '/':
 672             self.finish_endtag(tagname)
 673         return tag.end(0)
 674
 675     # Internal -- parse endtag
 676     def parse_endtag(self, i):
 677         rawdata = self.rawdata
 678         end = endbracketfind.match(rawdata, i+1)
 679         if end is None:
 680             return -1
 681         res = tagfind.match(rawdata, i+2)
 682         if res is None:
 683             if self.literal:
 684                 self.handle_data(rawdata[i])
 685                 return i+1
 686             if not self.__accept_missing_endtag_name:
 687                 self.syntax_error('no name specified in end tag')
 688             tag = self.stack[-1][0]
 689             k = i+2
 690         else:
 691             tag = res.group(0)
 692             if self.__map_case:
 693                 tag = tag.lower()
 694             if self.literal:
 695                 if not self.stack or tag != self.stack[-1][0]:
 696                     self.handle_data(rawdata[i])
 697                     return i+1
 698             k = res.end(0)
 699         if endbracket.match(rawdata, k) is None:
 700             self.syntax_error('garbage in end tag')
 701         self.finish_endtag(tag)
 702         return end.end(0)
 703
 704     # Internal -- finish processing of start tag
 705     def finish_starttag(self, tagname, attrdict, method):
 706         if method is not None:
 707             self.handle_starttag(tagname, method, attrdict)
 708         else:
 709             self.unknown_starttag(tagname, attrdict)
 710
 711     # Internal -- finish processing of end tag
 712     def finish_endtag(self, tag):
 713         self.literal = 0
 714         if not tag:
 715             self.syntax_error('name-less end tag')
 716             found = len(self.stack) - 1
 717             if found < 0:
 718                 self.unknown_endtag(tag)
 719                 return
 720         else:
 721             found = -1
 722             for i in range(len(self.stack)):
 723                 if tag == self.stack[i][0]:
 724                     found = i
 725             if found == -1:
 726                 self.syntax_error('unopened end tag')
 727                 return
 728         while len(self.stack) > found:
 729             if found < len(self.stack) - 1:
 730                 self.syntax_error('missing close tag for %s' % self.stack[-1][2])
 731             nstag = self.stack[-1][2]
 732             method = self.elements.get(nstag, (None, None))[1]
 733             if method is not None:
 734                 self.handle_endtag(nstag, method)
 735             else:
 736                 self.unknown_endtag(nstag)
 737             if self.__use_namespaces == len(self.stack):
 738                 self.__use_namespaces = 0
 739             del self.stack[-1]
 740
 741     # Overridable -- handle xml processing instruction
 742     def handle_xml(self, encoding, standalone):
 743         pass
 744
 745     # Overridable -- handle DOCTYPE
 746     def handle_doctype(self, tag, pubid, syslit, data):
 747         pass
 748
 749     # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        """Invoke the start-tag handler `method` with the attribute dict.

        finish_starttag() calls this when a handler exists for the tag.
        `tag` is not used by this default implementation; the handler's
        return value is discarded.
        """
        method(attrs)
 752
 753     # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        """Invoke the end-tag handler `method` without arguments.

        finish_endtag() calls this when a handler exists for the tag.
        `tag` is not used by this default implementation; the handler's
        return value is discarded.
        """
        method()
 756
 757     # Example -- handle character reference, no need to override
 758     def handle_charref(self, name):
 759         try:
 760             if name[0] == 'x':
 761                 n = int(name[1:], 16)
 762             else:
 763                 n = int(name)
 764         except ValueError:
 765             self.unknown_charref(name)
 766             return
 767         if not 0 <= n <= 255:
 768             self.unknown_charref(name)
 769             return
 770         self.handle_data(chr(n))
 771
 772     # Definition of entities -- derived classes may override
    # Maps each predefined entity name to its replacement text, written as
    # a decimal character reference: 60 is '<', 62 is '>', 38 is '&',
    # 34 is '"' and 39 is "'".
    # NOTE(review): 'lt' and 'amp' are marked as requiring a charref,
    # presumably because a literal '<' or '&' in the replacement text would
    # be parsed as markup again -- confirm against the reference-expansion
    # code.
    entitydefs = {'lt': '&#60;',        # must use charref
                  'gt': '&#62;',
                  'amp': '&#38;',       # must use charref
                  'quot': '&#34;',
                  'apos': '&#39;',
                  }
 779
 780     # Example -- handle data, should be overridden
 781     def handle_data(self, data):
 782         pass
 783
 784     # Example -- handle cdata, could be overridden
 785     def handle_cdata(self, data):
 786         pass
 787
 788     # Example -- handle comment, could be overridden
 789     def handle_comment(self, data):
 790         pass
 791
 792     # Example -- handle processing instructions, could be overridden
 793     def handle_proc(self, name, data):
 794         pass
 795
 796     # Example -- handle relatively harmless syntax errors, could be overridden
 797     def syntax_error(self, message):
 798         raise Error('Syntax error at line %d: %s' % (self.lineno, message))
 799
 800     # To be overridden -- handlers for unknown objects
 801     def unknown_starttag(self, tag, attrs): pass
 802     def unknown_endtag(self, tag): pass
 803     def unknown_charref(self, ref): pass
 804     def unknown_entityref(self, name):
 805         self.syntax_error("reference to unknown entity `&%s;'" % name)
 806
 807
 808 class TestXMLParser(XMLParser):
 809
 810     def __init__(self, **kw):
 811         self.testdata = ""
 812         apply(XMLParser.__init__, (self,), kw)
 813
 814     def handle_xml(self, encoding, standalone):
 815         self.flush()
 816         print 'xml: encoding =',encoding,'standalone =',standalone
 817
 818     def handle_doctype(self, tag, pubid, syslit, data):
 819         self.flush()
 820         print 'DOCTYPE:',tag, `data`
 821
 822     def handle_data(self, data):
 823         self.testdata = self.testdata + data
 824         if len(`self.testdata`) >= 70:
 825             self.flush()
 826
 827     def flush(self):
 828         data = self.testdata
 829         if data:
 830             self.testdata = ""
 831             print 'data:', `data`
 832
 833     def handle_cdata(self, data):
 834         self.flush()
 835         print 'cdata:', `data`
 836
 837     def handle_proc(self, name, data):
 838         self.flush()
 839         print 'processing:',name,`data`
 840
 841     def handle_comment(self, data):
 842         self.flush()
 843         r = `data`
 844         if len(r) > 68:
 845             r = r[:32] + '...' + r[-32:]
 846         print 'comment:', r
 847
 848     def syntax_error(self, message):
 849         print 'error at line %d:' % self.lineno, message
 850
 851     def unknown_starttag(self, tag, attrs):
 852         self.flush()
 853         if not attrs:
 854             print 'start tag: <' + tag + '>'
 855         else:
 856             print 'start tag: <' + tag,
 857             for name, value in attrs.items():
 858                 print name + '=' + '"' + value + '"',
 859             print '>'
 860
 861     def unknown_endtag(self, tag):
 862         self.flush()
 863         print 'end tag: </' + tag + '>'
 864
 865     def unknown_entityref(self, ref):
 866         self.flush()
 867         print '*** unknown entity ref: &' + ref + ';'
 868
 869     def unknown_charref(self, ref):
 870         self.flush()
 871         print '*** unknown char ref: &#' + ref + ';'
 872
 873     def close(self):
 874         XMLParser.close(self)
 875         self.flush()
 876
 877 def test(args = None):
 878     import sys, getopt
 879     from time import time
 880
 881     if not args:
 882         args = sys.argv[1:]
 883
 884     opts, args = getopt.getopt(args, 'st')
 885     klass = TestXMLParser
 886     do_time = 0
 887     for o, a in opts:
 888         if o == '-s':
 889             klass = XMLParser
 890         elif o == '-t':
 891             do_time = 1
 892
 893     if args:
 894         file = args[0]
 895     else:
 896         file = 'test.xml'
 897
 898     if file == '-':
 899         f = sys.stdin
 900     else:
 901         try:
 902             f = open(file, 'r')
 903         except IOError, msg:
 904             print file, ":", msg
 905             sys.exit(1)
 906
 907     data = f.read()
 908     if f is not sys.stdin:
 909         f.close()
 910
 911     x = klass()
 912     t0 = time()
 913     try:
 914         if do_time:
 915             x.feed(data)
 916             x.close()
 917         else:
 918             for c in data:
 919                 x.feed(c)
 920             x.close()
 921     except Error, msg:
 922         t1 = time()
 923         print msg
 924         if do_time:
 925             print 'total time: %g' % (t1-t0)
 926         sys.exit(1)
 927     t1 = time()
 928     if do_time:
 929         print 'total time: %g' % (t1-t0)
 930
 931
# Run the command-line test driver when this file is executed as a script.
if __name__ == '__main__':
    test()