1 """A parser for XML, using the derived class as static DTD."""
3 # Author: Sjoerd Mullender.
11 class Error(RuntimeError):
# Regular expressions used for parsing

# Building blocks: plain pattern strings that are concatenated into the
# compiled expressions below.
_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
interesting = re.compile('[]&<]')       # characters that end a run of data

# References.  Each pattern also consumes the single character that
# follows the reference, so callers can check whether it was a ';'.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')            # a string made only of white space
newline = re.compile('\n')

# One attribute: leading white space, a name and an optional "= value"
# part, where the value is either quoted or a run of unquoted characters.
# The last piece is a raw string so that the backslashes in front of the
# parentheses reach the regex engine without being an invalid string
# escape; the resulting pattern text is unchanged.
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>' + _QStr + r'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
# A complete start tag: name, any number of attributes, then the closing
# bracket.  The closing part comes from starttagend, which supplies the
# 'slash' group that parse_starttag() reads to detect empty-element tags.
starttagmatch = re.compile('<(?P<tagname>' + _Name + ')'
                           '(?P<attrs>(?:' + attrfind.pattern + ')*)' +
                           starttagend.pattern)
# Patterns for end tags, closing brackets, bare names and CDATA sections.
endtagopen = re.compile("</")
endbracket = re.compile(_opS + ">")
# Everything up to and including the next '>' that is not inside a
# quoted string.
endbracketfind = re.compile("(?:[^>'\"]|" + _QStr + ")*>")
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
#
# _SystemLiteral and _PublicLiteral are templates: the single '%s' is
# filled in with the name of the capturing group ('%%' is a literal '%').
_SystemLiteral = '(?P<%s>' + _QStr + ')'
# The backslashes in front of the parentheses are doubled so that they
# are valid string escapes; these literals cannot be raw strings because
# they also contain \n and \r, which must stay real control characters.
_PublicLiteral = '(?P<%s>"[-\'\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                 "'[-\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
              'PUBLIC' + _S + (_PublicLiteral % 'pubid') + \
              ')' + _S + (_SystemLiteral % 'syslit')
# Start of a document type declaration: the document element name and an
# optional external identifier (groups 'name', 'pubid' and 'syslit').
doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')'
                     '(?:' + _S + _ExternalId + ')?' + _opS)
56 xmldecl
= re
.compile('<\?xml'+_S
+
57 'version'+_opS
+'='+_opS
+'(?P<version>'+_QStr
+')'+
58 '(?:'+_S
+'encoding'+_opS
+'='+_opS
+
59 "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
60 '"[A-Za-z][-A-Za-z0-9._]*"))?'
61 '(?:'+_S
+'standalone'+_opS
+'='+_opS
+
62 '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
# Processing instructions: "<?target" followed by optional white space,
# and the matching "?>" terminator.
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
# Comment delimiters, and the "--" sequence that parse_comment() rejects
# inside a comment body.
commentopen = re.compile("<!--")
commentclose = re.compile("-->")
doubledash = re.compile("--")
# Translation table that turns each white-space character (space, CR, LF,
# TAB) into a plain space; parse_attributes() applies it to attribute
# values with .translate().  maketrans() requires both arguments to have
# the same length, so the replacement string is four spaces, one per
# source character.
try:
    _maketrans = string.maketrans       # Python 2
except AttributeError:
    _maketrans = str.maketrans          # string.maketrans is unavailable
attrtrans = _maketrans(' \r\n\t', '    ')
# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
# A complete string that is a single colon-free name.
ncname = re.compile('%s$' % _NCName)
# A qualified name: an optional "prefix:" followed by the local part.
qname = re.compile('(?:(?P<prefix>%s):)?(?P<local>%s)$' % (_NCName, _NCName))

# A namespace declaration attribute name: "xmlns" or "xmlns:prefix".
xmlns = re.compile('xmlns(?::(?P<ncname>%s))?$' % _NCName)
79 # XML parser base class -- find tags and call handler functions.
80 # Usage: p = XMLParser(); p.feed(data); ...; p.close().
81 # The dtd is defined by deriving a class which defines methods with
82 # special names to handle tags: start_foo and end_foo to handle <foo>
83 # and </foo>, respectively. The data between tags is passed to the
84 # parser by calling self.handle_data() with some data as argument (the
85 # data may be split up in arbitrary chunks).
# Class-level defaults, meant to be overridden.  They are two separate
# dict objects; the parser later compares self.elements with
# XMLParser.elements by identity to tell whether it was overridden.
attributes = {}         # default, to be overridden
elements = {}           # default, to be overridden
91 # parsing options, settable using keyword args in __init__
92 __accept_unquoted_attributes
= 0
93 __accept_missing_endtag_name
= 0
96 __translate_attribute_references
= 1
98 # Interface -- initialize and reset this instance
99 def __init__(self
, **kw
):
101 if kw
.has_key('accept_unquoted_attributes'):
102 self
.__accept
_unquoted
_attributes
= kw
['accept_unquoted_attributes']
103 if kw
.has_key('accept_missing_endtag_name'):
104 self
.__accept
_missing
_endtag
_name
= kw
['accept_missing_endtag_name']
105 if kw
.has_key('map_case'):
106 self
.__map
_case
= kw
['map_case']
107 if kw
.has_key('accept_utf8'):
108 self
.__accept
_utf
8 = kw
['accept_utf8']
109 if kw
.has_key('translate_attribute_references'):
110 self
.__translate
_attribute
_references
= kw
['translate_attribute_references']
113 def __fixelements(self
):
116 self
.__fixdict
(self
.__dict
__)
117 self
.__fixclass
(self
.__class
__)
119 def __fixclass(self
, kl
):
120 self
.__fixdict
(kl
.__dict
__)
121 for k
in kl
.__bases
__:
124 def __fixdict(self
, dict):
125 for key
in dict.keys():
126 if key
[:6] == 'start_':
128 start
, end
= self
.elements
.get(tag
, (None, None))
130 self
.elements
[tag
] = getattr(self
, key
), end
131 elif key
[:4] == 'end_':
133 start
, end
= self
.elements
.get(tag
, (None, None))
135 self
.elements
[tag
] = start
, getattr(self
, key
)
137 # Interface -- reset this instance. Loses all unprocessed data
145 self
.__seen
_doctype
= None
146 self
.__seen
_starttag
= 0
147 self
.__use
_namespaces
= 0
148 self
.__namespaces
= {'xml':None} # xml is implicitly declared
149 # backward compatibility hack: if elements not overridden,
150 # fill it in ourselves
151 if self
.elements
is XMLParser
.elements
:
154 # For derived classes only -- enter literal mode (CDATA) till EOF
def setnomoretags(self):
    """Enter literal (CDATA) mode until end of input.

    For derived classes only.  Sets both the ``nomoretags`` and the
    ``literal`` flag of this instance to 1.
    """
    self.nomoretags = 1
    self.literal = 1
158 # For derived classes only -- enter literal mode (CDATA)
159 def setliteral(self
, *args
):
162 # Interface -- feed some data to the parser. Call this as
163 # often as you want, with as little or as much text as you
164 # want (may include '\n'). (This just saves the text, all the
165 # processing is done by goahead().)
166 def feed(self
, data
):
167 self
.rawdata
= self
.rawdata
+ data
170 # Interface -- handle the remaining data
175 # remove self.elements so that we don't leak
178 # Interface -- translate references
179 def translate_references(self
, data
, all
= 1):
180 if not self
.__translate
_attribute
_references
:
184 res
= amp
.search(data
, i
)
188 res
= ref
.match(data
, s
)
190 self
.syntax_error("bogus `&'")
198 str = chr(int(str[2:], 16))
200 str = chr(int(str[1:]))
201 if data
[i
- 1] != ';':
202 self
.syntax_error("`;' missing after char reference")
205 if self
.entitydefs
.has_key(str):
206 str = self
.entitydefs
[str]
208 elif data
[i
- 1] != ';':
209 self
.syntax_error("bogus `&'")
210 i
= s
+ 1 # just past the &
213 self
.syntax_error("reference to unknown entity `&%s;'" % str)
214 str = '&' + str + ';'
215 elif data
[i
- 1] != ';':
216 self
.syntax_error("bogus `&'")
217 i
= s
+ 1 # just past the &
220 # when we get here, str contains the translated text and i points
221 # to the end of the string that is to be replaced
222 data
= data
[:s
] + str + data
[i
:]
228 # Interface - return a dictionary of all namespaces currently valid
229 def getnamespace(self
):
231 for t
, d
, nst
in self
.stack
:
235 # Internal -- handle data as far as reasonable. May leave state
236 # and data to be processed by a subsequent call. If 'end' is
237 # true, force handling all data as if followed by EOF marker.
238 def goahead(self
, end
):
239 rawdata
= self
.rawdata
247 self
.handle_data(data
)
248 self
.lineno
= self
.lineno
+ data
.count('\n')
251 res
= interesting
.search(rawdata
, i
)
258 if self
.__at
_start
and space
.match(data
) is None:
259 self
.syntax_error('illegal data at start of file')
261 if not self
.stack
and space
.match(data
) is None:
262 self
.syntax_error('data not in content')
263 if not self
.__accept
_utf
8 and illegal
.search(data
):
264 self
.syntax_error('illegal character in content')
265 self
.handle_data(data
)
266 self
.lineno
= self
.lineno
+ data
.count('\n')
269 if rawdata
[i
] == '<':
270 if starttagopen
.match(rawdata
, i
):
273 self
.handle_data(data
)
274 self
.lineno
= self
.lineno
+ data
.count('\n')
277 k
= self
.parse_starttag(i
)
279 self
.__seen
_starttag
= 1
280 self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
283 if endtagopen
.match(rawdata
, i
):
284 k
= self
.parse_endtag(i
)
286 self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
289 if commentopen
.match(rawdata
, i
):
292 self
.handle_data(data
)
293 self
.lineno
= self
.lineno
+ data
.count('\n')
296 k
= self
.parse_comment(i
)
298 self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
301 if cdataopen
.match(rawdata
, i
):
302 k
= self
.parse_cdata(i
)
304 self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
307 res
= xmldecl
.match(rawdata
, i
)
309 if not self
.__at
_start
:
310 self
.syntax_error("<?xml?> declaration not at start of document")
311 version
, encoding
, standalone
= res
.group('version',
314 if version
[1:-1] != '1.0':
315 raise Error('only XML version 1.0 supported')
316 if encoding
: encoding
= encoding
[1:-1]
317 if standalone
: standalone
= standalone
[1:-1]
318 self
.handle_xml(encoding
, standalone
)
321 res
= procopen
.match(rawdata
, i
)
323 k
= self
.parse_proc(i
)
325 self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
328 res
= doctype
.match(rawdata
, i
)
332 self
.handle_data(data
)
333 self
.lineno
= self
.lineno
+ data
.count('\n')
336 if self
.__seen
_doctype
:
337 self
.syntax_error('multiple DOCTYPE elements')
338 if self
.__seen
_starttag
:
339 self
.syntax_error('DOCTYPE not at beginning of document')
340 k
= self
.parse_doctype(res
)
342 self
.__seen
_doctype
= res
.group('name')
344 self
.__seen
_doctype
= self
.__seen
_doctype
.lower()
345 self
.lineno
= self
.lineno
+ rawdata
[i
:k
].count('\n')
348 elif rawdata
[i
] == '&':
351 self
.handle_data(data
)
354 res
= charref
.match(rawdata
, i
)
357 if rawdata
[i
-1] != ';':
358 self
.syntax_error("`;' missing in charref")
361 self
.syntax_error('data not in content')
362 self
.handle_charref(res
.group('char')[:-1])
363 self
.lineno
= self
.lineno
+ res
.group(0).count('\n')
365 res
= entityref
.match(rawdata
, i
)
368 if rawdata
[i
-1] != ';':
369 self
.syntax_error("`;' missing in entityref")
371 name
= res
.group('name')
374 if self
.entitydefs
.has_key(name
):
375 self
.rawdata
= rawdata
= rawdata
[:res
.start(0)] + self
.entitydefs
[name
] + rawdata
[i
:]
379 self
.unknown_entityref(name
)
380 self
.lineno
= self
.lineno
+ res
.group(0).count('\n')
382 elif rawdata
[i
] == ']':
385 self
.handle_data(data
)
390 if cdataclose
.match(rawdata
, i
):
391 self
.syntax_error("bogus `]]>'")
392 self
.handle_data(rawdata
[i
])
396 raise Error('neither < nor & ??')
397 # We get here only if incomplete matches but
405 self
.syntax_error("bogus `%s'" % data
)
406 if not self
.__accept
_utf
8 and illegal
.search(data
):
407 self
.syntax_error('illegal character in content')
408 self
.handle_data(data
)
409 self
.lineno
= self
.lineno
+ data
.count('\n')
410 self
.rawdata
= rawdata
[i
+1:]
411 return self
.goahead(end
)
412 self
.rawdata
= rawdata
[i
:]
414 if not self
.__seen
_starttag
:
415 self
.syntax_error('no elements in file')
417 self
.syntax_error('missing end tags')
419 self
.finish_endtag(self
.stack
[-1][0])
421 # Internal -- parse comment, return length or -1 if not terminated
422 def parse_comment(self
, i
):
423 rawdata
= self
.rawdata
424 if rawdata
[i
:i
+4] != '<!--':
425 raise Error('unexpected call to handle_comment')
426 res
= commentclose
.search(rawdata
, i
+4)
429 if doubledash
.search(rawdata
, i
+4, res
.start(0)):
430 self
.syntax_error("`--' inside comment")
431 if rawdata
[res
.start(0)-1] == '-':
432 self
.syntax_error('comment cannot end in three dashes')
433 if not self
.__accept
_utf
8 and \
434 illegal
.search(rawdata
, i
+4, res
.start(0)):
435 self
.syntax_error('illegal character in comment')
436 self
.handle_comment(rawdata
[i
+4: res
.start(0)])
439 # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
440 def parse_doctype(self
, res
):
441 rawdata
= self
.rawdata
443 name
= res
.group('name')
446 pubid
, syslit
= res
.group('pubid', 'syslit')
447 if pubid
is not None:
448 pubid
= pubid
[1:-1] # remove quotes
449 pubid
= ' '.join(pubid
.split()) # normalize
450 if syslit
is not None: syslit
= syslit
[1:-1] # remove quotes
454 if rawdata
[k
] == '[':
460 if not sq
and c
== '"':
462 elif not dq
and c
== "'":
466 elif level
<= 0 and c
== ']':
467 res
= endbracket
.match(rawdata
, k
+1)
470 self
.handle_doctype(name
, pubid
, syslit
, rawdata
[j
+1:k
])
477 self
.syntax_error("bogus `>' in DOCTYPE")
479 res
= endbracketfind
.match(rawdata
, k
)
482 if endbracket
.match(rawdata
, k
) is None:
483 self
.syntax_error('garbage in DOCTYPE')
484 self
.handle_doctype(name
, pubid
, syslit
, None)
487 # Internal -- handle CDATA tag, return length or -1 if not terminated
488 def parse_cdata(self
, i
):
489 rawdata
= self
.rawdata
490 if rawdata
[i
:i
+9] != '<![CDATA[':
491 raise Error('unexpected call to parse_cdata')
492 res
= cdataclose
.search(rawdata
, i
+9)
495 if not self
.__accept
_utf
8 and \
496 illegal
.search(rawdata
, i
+9, res
.start(0)):
497 self
.syntax_error('illegal character in CDATA')
499 self
.syntax_error('CDATA not in content')
500 self
.handle_cdata(rawdata
[i
+9:res
.start(0)])
503 __xml_namespace_attributes
= {'ns':None, 'src':None, 'prefix':None}
504 # Internal -- handle a processing instruction tag
505 def parse_proc(self
, i
):
506 rawdata
= self
.rawdata
507 end
= procclose
.search(rawdata
, i
)
511 if not self
.__accept
_utf
8 and illegal
.search(rawdata
, i
+2, j
):
512 self
.syntax_error('illegal character in processing instruction')
513 res
= tagfind
.match(rawdata
, i
+2)
515 raise Error('unexpected call to parse_proc')
520 if name
== 'xml:namespace':
521 self
.syntax_error('old-fashioned namespace declaration')
522 self
.__use
_namespaces
= -1
523 # namespace declaration
524 # this must come after the <?xml?> declaration (if any)
525 # and before the <!DOCTYPE> (if any).
526 if self
.__seen
_doctype
or self
.__seen
_starttag
:
527 self
.syntax_error('xml:namespace declaration too late in document')
528 attrdict
, namespace
, k
= self
.parse_attributes(name
, k
, j
)
530 self
.syntax_error('namespace declaration inside namespace declaration')
531 for attrname
in attrdict
.keys():
532 if not self
.__xml
_namespace
_attributes
.has_key(attrname
):
533 self
.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname
)
534 if not attrdict
.has_key('ns') or not attrdict
.has_key('prefix'):
535 self
.syntax_error('xml:namespace without required attributes')
536 prefix
= attrdict
.get('prefix')
537 if ncname
.match(prefix
) is None:
538 self
.syntax_error('xml:namespace illegal prefix value')
540 if self
.__namespaces
.has_key(prefix
):
541 self
.syntax_error('xml:namespace prefix not unique')
542 self
.__namespaces
[prefix
] = attrdict
['ns']
544 if name
.lower() == 'xml':
545 self
.syntax_error('illegal processing instruction target name')
546 self
.handle_proc(name
, rawdata
[k
:j
])
549 # Internal -- parse attributes between i and j
550 def parse_attributes(self
, tag
, i
, j
):
551 rawdata
= self
.rawdata
555 res
= attrfind
.match(rawdata
, i
)
558 attrname
, attrvalue
= res
.group('name', 'value')
560 attrname
= attrname
.lower()
562 if attrvalue
is None:
563 self
.syntax_error("no value specified for attribute `%s'" % attrname
)
565 elif attrvalue
[:1] == "'" == attrvalue
[-1:] or \
566 attrvalue
[:1] == '"' == attrvalue
[-1:]:
567 attrvalue
= attrvalue
[1:-1]
568 elif not self
.__accept
_unquoted
_attributes
:
569 self
.syntax_error("attribute `%s' value not quoted" % attrname
)
570 res
= xmlns
.match(attrname
)
572 # namespace declaration
573 ncname
= res
.group('ncname')
574 namespace
[ncname
or ''] = attrvalue
or None
575 if not self
.__use
_namespaces
:
576 self
.__use
_namespaces
= len(self
.stack
)+1
579 self
.syntax_error("`<' illegal in attribute value")
580 if attrdict
.has_key(attrname
):
581 self
.syntax_error("attribute `%s' specified twice" % attrname
)
582 attrvalue
= attrvalue
.translate(attrtrans
)
583 attrdict
[attrname
] = self
.translate_references(attrvalue
)
584 return attrdict
, namespace
, i
586 # Internal -- handle starttag, return length or -1 if not terminated
587 def parse_starttag(self
, i
):
588 rawdata
= self
.rawdata
589 # i points to start of tag
590 end
= endbracketfind
.match(rawdata
, i
+1)
593 tag
= starttagmatch
.match(rawdata
, i
)
594 if tag
is None or tag
.end(0) != end
.end(0):
595 self
.syntax_error('garbage in starttag')
597 nstag
= tagname
= tag
.group('tagname')
599 nstag
= tagname
= nstag
.lower()
600 if not self
.__seen
_starttag
and self
.__seen
_doctype
and \
601 tagname
!= self
.__seen
_doctype
:
602 self
.syntax_error('starttag does not match DOCTYPE')
603 if self
.__seen
_starttag
and not self
.stack
:
604 self
.syntax_error('multiple elements on top level')
605 k
, j
= tag
.span('attrs')
606 attrdict
, nsdict
, k
= self
.parse_attributes(tagname
, k
, j
)
607 self
.stack
.append((tagname
, nsdict
, nstag
))
608 if self
.__use
_namespaces
:
609 res
= qname
.match(tagname
)
613 prefix
, nstag
= res
.group('prefix', 'local')
617 for t
, d
, nst
in self
.stack
:
618 if d
.has_key(prefix
):
620 if ns
is None and prefix
!= '':
621 ns
= self
.__namespaces
.get(prefix
)
623 nstag
= ns
+ ' ' + nstag
625 nstag
= prefix
+ ':' + nstag
# undo split
626 self
.stack
[-1] = tagname
, nsdict
, nstag
627 # translate namespace of attributes
628 attrnamemap
= {} # map from new name to old name (used for error reporting)
629 for key
in attrdict
.keys():
630 attrnamemap
[key
] = key
631 if self
.__use
_namespaces
:
633 for key
, val
in attrdict
.items():
635 res
= qname
.match(key
)
637 aprefix
, key
= res
.group('prefix', 'local')
643 for t
, d
, nst
in self
.stack
:
644 if d
.has_key(aprefix
):
646 if ans
is None and aprefix
!= '':
647 ans
= self
.__namespaces
.get(aprefix
)
649 key
= ans
+ ' ' + key
651 key
= aprefix
+ ':' + key
655 attrnamemap
[key
] = okey
657 attributes
= self
.attributes
.get(nstag
)
658 if attributes
is not None:
659 for key
in attrdict
.keys():
660 if not attributes
.has_key(key
):
661 self
.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap
[key
], tagname
))
662 for key
, val
in attributes
.items():
663 if val
is not None and not attrdict
.has_key(key
):
665 method
= self
.elements
.get(nstag
, (None, None))[0]
666 self
.finish_starttag(nstag
, attrdict
, method
)
667 if tag
.group('slash') == '/':
668 self
.finish_endtag(tagname
)
671 # Internal -- parse endtag
672 def parse_endtag(self
, i
):
673 rawdata
= self
.rawdata
674 end
= endbracketfind
.match(rawdata
, i
+1)
677 res
= tagfind
.match(rawdata
, i
+2)
680 self
.handle_data(rawdata
[i
])
682 if not self
.__accept
_missing
_endtag
_name
:
683 self
.syntax_error('no name specified in end tag')
684 tag
= self
.stack
[-1][0]
691 if not self
.stack
or tag
!= self
.stack
[-1][0]:
692 self
.handle_data(rawdata
[i
])
695 if endbracket
.match(rawdata
, k
) is None:
696 self
.syntax_error('garbage in end tag')
697 self
.finish_endtag(tag
)
700 # Internal -- finish processing of start tag
701 def finish_starttag(self
, tagname
, attrdict
, method
):
702 if method
is not None:
703 self
.handle_starttag(tagname
, method
, attrdict
)
705 self
.unknown_starttag(tagname
, attrdict
)
707 # Internal -- finish processing of end tag
708 def finish_endtag(self
, tag
):
711 self
.syntax_error('name-less end tag')
712 found
= len(self
.stack
) - 1
714 self
.unknown_endtag(tag
)
718 for i
in range(len(self
.stack
)):
719 if tag
== self
.stack
[i
][0]:
722 self
.syntax_error('unopened end tag')
724 while len(self
.stack
) > found
:
725 if found
< len(self
.stack
) - 1:
726 self
.syntax_error('missing close tag for %s' % self
.stack
[-1][2])
727 nstag
= self
.stack
[-1][2]
728 method
= self
.elements
.get(nstag
, (None, None))[1]
729 if method
is not None:
730 self
.handle_endtag(nstag
, method
)
732 self
.unknown_endtag(nstag
)
733 if self
.__use
_namespaces
== len(self
.stack
):
734 self
.__use
_namespaces
= 0
737 # Overridable -- handle xml processing instruction
738 def handle_xml(self
, encoding
, standalone
):
741 # Overridable -- handle DOCTYPE
742 def handle_doctype(self
, tag
, pubid
, syslit
, data
):
745 # Overridable -- handle start tag
746 def handle_starttag(self
, tag
, method
, attrs
):
749 # Overridable -- handle end tag
750 def handle_endtag(self
, tag
, method
):
753 # Example -- handle character reference, no need to override
754 def handle_charref(self
, name
):
757 n
= int(name
[1:], 16)
761 self
.unknown_charref(name
)
763 if not 0 <= n
<= 255:
764 self
.unknown_charref(name
)
766 self
.handle_data(chr(n
))
768 # Definition of entities -- derived classes may override
769 entitydefs
= {'lt': '<', # must use charref
771 'amp': '&', # must use charref
776 # Example -- handle data, should be overridden
777 def handle_data(self
, data
):
780 # Example -- handle cdata, could be overridden
781 def handle_cdata(self
, data
):
784 # Example -- handle comment, could be overridden
785 def handle_comment(self
, data
):
788 # Example -- handle processing instructions, could be overridden
789 def handle_proc(self
, name
, data
):
792 # Example -- handle relatively harmless syntax errors, could be overridden
def syntax_error(self, message):
    """Report a syntax error found while parsing.

    This default implementation raises Error with a text made of the
    current line number (``self.lineno``) and *message*.  It could be
    overridden to treat such errors more leniently.
    """
    text = 'Syntax error at line %d: %s' % (self.lineno, message)
    raise Error(text)
796 # To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs):
    """Handle a start tag that has no registered handler method.

    Does nothing by default; to be overridden.  finish_starttag() calls
    this with the tag name and the attribute dictionary.
    """
def unknown_endtag(self, tag):
    """Handle an end tag that has no registered handler method.

    Does nothing by default; to be overridden.
    """
def unknown_charref(self, ref):
    """Handle a character reference that handle_charref() rejected.

    Does nothing by default; to be overridden.  *ref* is the reference
    text as passed on by handle_charref().
    """
def unknown_entityref(self, name):
    """Handle a reference to an entity that is not defined.

    The default implementation reports it through self.syntax_error();
    to be overridden for different behaviour.
    """
    complaint = "reference to unknown entity `&%s;'" % name
    self.syntax_error(complaint)
804 class TestXMLParser(XMLParser
):
806 def __init__(self
, **kw
):
808 apply(XMLParser
.__init
__, (self
,), kw
)
810 def handle_xml(self
, encoding
, standalone
):
812 print 'xml: encoding =',encoding
,'standalone =',standalone
814 def handle_doctype(self
, tag
, pubid
, syslit
, data
):
816 print 'DOCTYPE:',tag
, `data`
818 def handle_data(self
, data
):
819 self
.testdata
= self
.testdata
+ data
820 if len(`self
.testdata`
) >= 70:
827 print 'data:', `data`
829 def handle_cdata(self
, data
):
831 print 'cdata:', `data`
833 def handle_proc(self
, name
, data
):
835 print 'processing:',name
,`data`
837 def handle_comment(self
, data
):
841 r
= r
[:32] + '...' + r
[-32:]
844 def syntax_error(self
, message
):
845 print 'error at line %d:' % self
.lineno
, message
847 def unknown_starttag(self
, tag
, attrs
):
850 print 'start tag: <' + tag
+ '>'
852 print 'start tag: <' + tag
,
853 for name
, value
in attrs
.items():
854 print name
+ '=' + '"' + value
+ '"',
857 def unknown_endtag(self
, tag
):
859 print 'end tag: </' + tag
+ '>'
861 def unknown_entityref(self
, ref
):
863 print '*** unknown entity ref: &' + ref
+ ';'
865 def unknown_charref(self
, ref
):
867 print '*** unknown char ref: &#' + ref
+ ';'
870 XMLParser
.close(self
)
873 def test(args
= None):
875 from time
import time
880 opts
, args
= getopt
.getopt(args
, 'st')
881 klass
= TestXMLParser
904 if f
is not sys
.stdin
:
921 print 'total time: %g' % (t1
-t0
)
925 print 'total time: %g' % (t1
-t0
)
928 if __name__
== '__main__':