1 """A parser for XML, using the derived class as static DTD."""
3 # Author: Sjoerd Mullender.
11 # Regular expressions used for parsing
# Pattern fragments (plain strings) that are spliced into the compiled
# regular expressions below.
_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string

# Any character that is not allowed in content.
illegal = re.compile('[^\t\r\n -\176\240-\377]')
# The three characters the scanner searches for: "]", "&" and "<".
interesting = re.compile('[]&<]')

# A reference: "&", then a name, a decimal number or a hexadecimal
# number, followed by the single character that terminates it.
ref = re.compile(
    '&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile(
    '&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
# Note that the "char" group includes the terminating character.
charref = re.compile(
    '&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
# White space running up to the end of the string.
space = re.compile(_S + '$')
newline = re.compile('\n')

# One attribute: white space, a name and an optional "= value" part in
# which the value is either a quoted string or a bare token.
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>' + _QStr + '|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
# End of a start tag; the "slash" group is "/" for an empty-element tag.
starttagend = re.compile(_opS + '(?P<slash>/?)>')
33 starttagmatch
= re
.compile('<(?P<tagname>'+_Name
+')'
34 '(?P<attrs>(?:'+attrfind
.pattern
+')*)'+
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
# Everything up to and including the next ">" that is not inside a
# quoted string.
endbracketfind = re.compile('(?:[^>\'"]|' + _QStr + ')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')

# Templates for the external identifier of a DOCTYPE; the %s is filled
# in with the name of the capturing group.  _ExternalId matches one of
# the following:
#     SYSTEM SystemLiteral
#     PUBLIC PubidLiteral SystemLiteral
_SystemLiteral = '(?P<%s>' + _QStr + ')'
_PublicLiteral = (
    '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|'
    "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')")
_ExternalId = (
    '(?:SYSTEM|'
    'PUBLIC' + _S + _PublicLiteral % 'pubid' +
    ')' + _S + _SystemLiteral % 'syslit')
doctype = re.compile(
    '<!DOCTYPE' + _S + '(?P<name>' + _Name + ')'
    '(?:' + _S + _ExternalId + ')?' + _opS)
53 xmldecl
= re
.compile('<\?xml'+_S
+
54 'version'+_opS
+'='+_opS
+'(?P<version>'+_QStr
+')'+
55 '(?:'+_S
+'encoding'+_opS
+'='+_opS
+
56 "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
57 '"[A-Za-z][-A-Za-z0-9._]*"))?'
58 '(?:'+_S
+'standalone'+_opS
+'='+_opS
+
59 '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
# Processing instructions: "<?" plus the target name, and the closing "?>".
procopen = re.compile(
    r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(
    _opS + r'\?>')

# Comment delimiters, and the "--" sequence that is searched for inside
# comment text.
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
66 attrtrans
= string
.maketrans(' \r\n\t', ' ')
68 # definitions for XML namespaces
# XML Name, minus the ":"
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'

# A string that consists of exactly one NCName.
ncname = re.compile(_NCName + '$')

# A qualified name: an optional "prefix:" followed by the local part.
qname = re.compile(
    '(?:(?P<prefix>' + _NCName + '):)?'
    '(?P<local>' + _NCName + ')$')

# A namespace declaration attribute name: "xmlns" or "xmlns:<ncname>".
xmlns = re.compile(
    'xmlns(?::(?P<ncname>' + _NCName + '))?$')
76 # XML parser base class -- find tags and call handler functions.
77 # Usage: p = XMLParser(); p.feed(data); ...; p.close().
78 # The dtd is defined by deriving a class which defines methods with
79 # special names to handle tags: start_foo and end_foo to handle <foo>
80 # and </foo>, respectively. The data between tags is passed to the
81 # parser by calling self.handle_data() with some data as argument (the
82 # data may be split up in arbitrary chunks).
85 attributes
= {} # default, to be overridden
86 elements
= {} # default, to be overridden
88 # parsing options, settable using keyword args in __init__
89 __accept_unquoted_attributes
= 0
90 __accept_missing_endtag_name
= 0
93 __translate_attribute_references
= 1
95 # Interface -- initialize and reset this instance
96 def __init__(self
, **kw
):
98 if kw
.has_key('accept_unquoted_attributes'):
99 self
.__accept
_unquoted
_attributes
= kw
['accept_unquoted_attributes']
100 if kw
.has_key('accept_missing_endtag_name'):
101 self
.__accept
_missing
_endtag
_name
= kw
['accept_missing_endtag_name']
102 if kw
.has_key('map_case'):
103 self
.__map
_case
= kw
['map_case']
104 if kw
.has_key('accept_utf8'):
105 self
.__accept
_utf
8 = kw
['accept_utf8']
106 if kw
.has_key('translate_attribute_references'):
107 self
.__translate
_attribute
_references
= kw
['translate_attribute_references']
110 def __fixelements(self
):
113 self
.__fixdict
(self
.__dict
__)
114 self
.__fixclass
(self
.__class
__)
116 def __fixclass(self
, kl
):
117 self
.__fixdict
(kl
.__dict
__)
118 for k
in kl
.__bases
__:
121 def __fixdict(self
, dict):
122 for key
in dict.keys():
123 if key
[:6] == 'start_':
125 start
, end
= self
.elements
.get(tag
, (None, None))
127 self
.elements
[tag
] = getattr(self
, key
), end
128 elif key
[:4] == 'end_':
130 start
, end
= self
.elements
.get(tag
, (None, None))
132 self
.elements
[tag
] = start
, getattr(self
, key
)
134 # Interface -- reset this instance. Loses all unprocessed data
142 self
.__seen
_doctype
= None
143 self
.__seen
_starttag
= 0
144 self
.__use
_namespaces
= 0
145 self
.__namespaces
= {'xml':None} # xml is implicitly declared
146 # backward compatibility hack: if elements not overridden,
147 # fill it in ourselves
148 if self
.elements
is XMLParser
.elements
:
151 # For derived classes only -- enter literal mode (CDATA) till EOF
def setnomoretags(self):
    """Enter literal (CDATA) mode until end of input.

    Meant to be called by derived classes only.  Sets both the
    ``nomoretags`` and the ``literal`` flag of the parser to 1.
    """
    self.nomoretags = 1
    self.literal = 1
155 # For derived classes only -- enter literal mode (CDATA)
156 def setliteral(self
, *args
):
159 # Interface -- feed some data to the parser. Call this as
160 # often as you want, with as little or as much text as you
161 # want (may include '\n'). (This just saves the text, all the
162 # processing is done by goahead().)
163 def feed(self
, data
):
164 self
.rawdata
= self
.rawdata
+ data
167 # Interface -- handle the remaining data
172 # remove self.elements so that we don't leak
175 # Interface -- translate references
176 def translate_references(self
, data
, all
= 1):
177 if not self
.__translate
_attribute
_references
:
181 res
= amp
.search(data
, i
)
185 res
= ref
.match(data
, s
)
187 self
.syntax_error("bogus `&'")
195 str = chr(string
.atoi(str[2:], 16))
197 str = chr(string
.atoi(str[1:]))
198 if data
[i
- 1] != ';':
199 self
.syntax_error("`;' missing after char reference")
202 if self
.entitydefs
.has_key(str):
203 str = self
.entitydefs
[str]
205 elif data
[i
- 1] != ';':
206 self
.syntax_error("bogus `&'")
207 i
= s
+ 1 # just past the &
210 self
.syntax_error("reference to unknown entity `&%s;'" % str)
211 str = '&' + str + ';'
212 elif data
[i
- 1] != ';':
213 self
.syntax_error("bogus `&'")
214 i
= s
+ 1 # just past the &
217 # when we get here, str contains the translated text and i points
218 # to the end of the string that is to be replaced
219 data
= data
[:s
] + str + data
[i
:]
225 # Interface - return a dictionary of all namespaces currently valid
226 def getnamespace(self
):
228 for t
, d
, nst
in self
.stack
:
232 # Internal -- handle data as far as reasonable. May leave state
233 # and data to be processed by a subsequent call. If 'end' is
234 # true, force handling all data as if followed by EOF marker.
235 def goahead(self
, end
):
236 rawdata
= self
.rawdata
244 self
.handle_data(data
)
245 self
.lineno
= self
.lineno
+ string
.count(data
, '\n')
248 res
= interesting
.search(rawdata
, i
)
255 if self
.__at
_start
and space
.match(data
) is None:
256 self
.syntax_error('illegal data at start of file')
258 if not self
.stack
and space
.match(data
) is None:
259 self
.syntax_error('data not in content')
260 if not self
.__accept
_utf
8 and illegal
.search(data
):
261 self
.syntax_error('illegal character in content')
262 self
.handle_data(data
)
263 self
.lineno
= self
.lineno
+ string
.count(data
, '\n')
266 if rawdata
[i
] == '<':
267 if starttagopen
.match(rawdata
, i
):
270 self
.handle_data(data
)
271 self
.lineno
= self
.lineno
+ string
.count(data
, '\n')
274 k
= self
.parse_starttag(i
)
276 self
.__seen
_starttag
= 1
277 self
.lineno
= self
.lineno
+ string
.count(rawdata
[i
:k
], '\n')
280 if endtagopen
.match(rawdata
, i
):
281 k
= self
.parse_endtag(i
)
283 self
.lineno
= self
.lineno
+ string
.count(rawdata
[i
:k
], '\n')
286 if commentopen
.match(rawdata
, i
):
289 self
.handle_data(data
)
290 self
.lineno
= self
.lineno
+ string
.count(data
, '\n')
293 k
= self
.parse_comment(i
)
295 self
.lineno
= self
.lineno
+ string
.count(rawdata
[i
:k
], '\n')
298 if cdataopen
.match(rawdata
, i
):
299 k
= self
.parse_cdata(i
)
301 self
.lineno
= self
.lineno
+ string
.count(rawdata
[i
:k
], '\n')
304 res
= xmldecl
.match(rawdata
, i
)
306 if not self
.__at
_start
:
307 self
.syntax_error("<?xml?> declaration not at start of document")
308 version
, encoding
, standalone
= res
.group('version',
311 if version
[1:-1] != '1.0':
312 raise RuntimeError, 'only XML version 1.0 supported'
313 if encoding
: encoding
= encoding
[1:-1]
314 if standalone
: standalone
= standalone
[1:-1]
315 self
.handle_xml(encoding
, standalone
)
318 res
= procopen
.match(rawdata
, i
)
320 k
= self
.parse_proc(i
)
322 self
.lineno
= self
.lineno
+ string
.count(rawdata
[i
:k
], '\n')
325 res
= doctype
.match(rawdata
, i
)
329 self
.handle_data(data
)
330 self
.lineno
= self
.lineno
+ string
.count(data
, '\n')
333 if self
.__seen
_doctype
:
334 self
.syntax_error('multiple DOCTYPE elements')
335 if self
.__seen
_starttag
:
336 self
.syntax_error('DOCTYPE not at beginning of document')
337 k
= self
.parse_doctype(res
)
339 self
.__seen
_doctype
= res
.group('name')
341 self
.__seen
_doctype
= string
.lower(self
.__seen
_doctype
)
342 self
.lineno
= self
.lineno
+ string
.count(rawdata
[i
:k
], '\n')
345 elif rawdata
[i
] == '&':
348 self
.handle_data(data
)
351 res
= charref
.match(rawdata
, i
)
354 if rawdata
[i
-1] != ';':
355 self
.syntax_error("`;' missing in charref")
358 self
.syntax_error('data not in content')
359 self
.handle_charref(res
.group('char')[:-1])
360 self
.lineno
= self
.lineno
+ string
.count(res
.group(0), '\n')
362 res
= entityref
.match(rawdata
, i
)
365 if rawdata
[i
-1] != ';':
366 self
.syntax_error("`;' missing in entityref")
368 name
= res
.group('name')
370 name
= string
.lower(name
)
371 if self
.entitydefs
.has_key(name
):
372 self
.rawdata
= rawdata
= rawdata
[:res
.start(0)] + self
.entitydefs
[name
] + rawdata
[i
:]
376 self
.unknown_entityref(name
)
377 self
.lineno
= self
.lineno
+ string
.count(res
.group(0), '\n')
379 elif rawdata
[i
] == ']':
382 self
.handle_data(data
)
387 if cdataclose
.match(rawdata
, i
):
388 self
.syntax_error("bogus `]]>'")
389 self
.handle_data(rawdata
[i
])
393 raise RuntimeError, 'neither < nor & ??'
394 # We get here only if incomplete matches but
402 self
.syntax_error("bogus `%s'" % data
)
403 if not self
.__accept
_utf
8 and illegal
.search(data
):
404 self
.syntax_error('illegal character in content')
405 self
.handle_data(data
)
406 self
.lineno
= self
.lineno
+ string
.count(data
, '\n')
407 self
.rawdata
= rawdata
[i
+1:]
408 return self
.goahead(end
)
409 self
.rawdata
= rawdata
[i
:]
411 if not self
.__seen
_starttag
:
412 self
.syntax_error('no elements in file')
414 self
.syntax_error('missing end tags')
416 self
.finish_endtag(self
.stack
[-1][0])
418 # Internal -- parse comment, return length or -1 if not terminated
419 def parse_comment(self
, i
):
420 rawdata
= self
.rawdata
421 if rawdata
[i
:i
+4] <> '<!--':
422 raise RuntimeError, 'unexpected call to handle_comment'
423 res
= commentclose
.search(rawdata
, i
+4)
426 if doubledash
.search(rawdata
, i
+4, res
.start(0)):
427 self
.syntax_error("`--' inside comment")
428 if rawdata
[res
.start(0)-1] == '-':
429 self
.syntax_error('comment cannot end in three dashes')
430 if not self
.__accept
_utf
8 and \
431 illegal
.search(rawdata
, i
+4, res
.start(0)):
432 self
.syntax_error('illegal character in comment')
433 self
.handle_comment(rawdata
[i
+4: res
.start(0)])
436 # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
437 def parse_doctype(self
, res
):
438 rawdata
= self
.rawdata
440 name
= res
.group('name')
442 name
= string
.lower(name
)
443 pubid
, syslit
= res
.group('pubid', 'syslit')
444 if pubid
is not None:
445 pubid
= pubid
[1:-1] # remove quotes
446 pubid
= string
.join(string
.split(pubid
)) # normalize
447 if syslit
is not None: syslit
= syslit
[1:-1] # remove quotes
451 if rawdata
[k
] == '[':
457 if not sq
and c
== '"':
459 elif not dq
and c
== "'":
463 elif level
<= 0 and c
== ']':
464 res
= endbracket
.match(rawdata
, k
+1)
467 self
.handle_doctype(name
, pubid
, syslit
, rawdata
[j
+1:k
])
474 self
.syntax_error("bogus `>' in DOCTYPE")
476 res
= endbracketfind
.match(rawdata
, k
)
479 if endbracket
.match(rawdata
, k
) is None:
480 self
.syntax_error('garbage in DOCTYPE')
481 self
.handle_doctype(name
, pubid
, syslit
, None)
484 # Internal -- handle CDATA tag, return length or -1 if not terminated
485 def parse_cdata(self
, i
):
486 rawdata
= self
.rawdata
487 if rawdata
[i
:i
+9] <> '<![CDATA[':
488 raise RuntimeError, 'unexpected call to parse_cdata'
489 res
= cdataclose
.search(rawdata
, i
+9)
492 if not self
.__accept
_utf
8 and \
493 illegal
.search(rawdata
, i
+9, res
.start(0)):
494 self
.syntax_error('illegal character in CDATA')
496 self
.syntax_error('CDATA not in content')
497 self
.handle_cdata(rawdata
[i
+9:res
.start(0)])
500 __xml_namespace_attributes
= {'ns':None, 'src':None, 'prefix':None}
501 # Internal -- handle a processing instruction tag
502 def parse_proc(self
, i
):
503 rawdata
= self
.rawdata
504 end
= procclose
.search(rawdata
, i
)
508 if not self
.__accept
_utf
8 and illegal
.search(rawdata
, i
+2, j
):
509 self
.syntax_error('illegal character in processing instruction')
510 res
= tagfind
.match(rawdata
, i
+2)
512 raise RuntimeError, 'unexpected call to parse_proc'
516 name
= string
.lower(name
)
517 if name
== 'xml:namespace':
518 self
.syntax_error('old-fashioned namespace declaration')
519 self
.__use
_namespaces
= -1
520 # namespace declaration
521 # this must come after the <?xml?> declaration (if any)
522 # and before the <!DOCTYPE> (if any).
523 if self
.__seen
_doctype
or self
.__seen
_starttag
:
524 self
.syntax_error('xml:namespace declaration too late in document')
525 attrdict
, namespace
, k
= self
.parse_attributes(name
, k
, j
)
527 self
.syntax_error('namespace declaration inside namespace declaration')
528 for attrname
in attrdict
.keys():
529 if not self
.__xml
_namespace
_attributes
.has_key(attrname
):
530 self
.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname
)
531 if not attrdict
.has_key('ns') or not attrdict
.has_key('prefix'):
532 self
.syntax_error('xml:namespace without required attributes')
533 prefix
= attrdict
.get('prefix')
534 if ncname
.match(prefix
) is None:
535 self
.syntax_error('xml:namespace illegal prefix value')
537 if self
.__namespaces
.has_key(prefix
):
538 self
.syntax_error('xml:namespace prefix not unique')
539 self
.__namespaces
[prefix
] = attrdict
['ns']
541 if string
.lower(name
) == 'xml':
542 self
.syntax_error('illegal processing instruction target name')
543 self
.handle_proc(name
, rawdata
[k
:j
])
546 # Internal -- parse attributes between i and j
547 def parse_attributes(self
, tag
, i
, j
):
548 rawdata
= self
.rawdata
552 res
= attrfind
.match(rawdata
, i
)
555 attrname
, attrvalue
= res
.group('name', 'value')
557 attrname
= string
.lower(attrname
)
559 if attrvalue
is None:
560 self
.syntax_error("no value specified for attribute `%s'" % attrname
)
562 elif attrvalue
[:1] == "'" == attrvalue
[-1:] or \
563 attrvalue
[:1] == '"' == attrvalue
[-1:]:
564 attrvalue
= attrvalue
[1:-1]
565 elif not self
.__accept
_unquoted
_attributes
:
566 self
.syntax_error("attribute `%s' value not quoted" % attrname
)
567 res
= xmlns
.match(attrname
)
569 # namespace declaration
570 ncname
= res
.group('ncname')
571 namespace
[ncname
or ''] = attrvalue
or None
572 if not self
.__use
_namespaces
:
573 self
.__use
_namespaces
= len(self
.stack
)+1
576 self
.syntax_error("`<' illegal in attribute value")
577 if attrdict
.has_key(attrname
):
578 self
.syntax_error("attribute `%s' specified twice" % attrname
)
579 attrvalue
= string
.translate(attrvalue
, attrtrans
)
580 attrdict
[attrname
] = self
.translate_references(attrvalue
)
581 return attrdict
, namespace
, i
583 # Internal -- handle starttag, return length or -1 if not terminated
584 def parse_starttag(self
, i
):
585 rawdata
= self
.rawdata
586 # i points to start of tag
587 end
= endbracketfind
.match(rawdata
, i
+1)
590 tag
= starttagmatch
.match(rawdata
, i
)
591 if tag
is None or tag
.end(0) != end
.end(0):
592 self
.syntax_error('garbage in starttag')
594 nstag
= tagname
= tag
.group('tagname')
596 nstag
= tagname
= string
.lower(nstag
)
597 if not self
.__seen
_starttag
and self
.__seen
_doctype
and \
598 tagname
!= self
.__seen
_doctype
:
599 self
.syntax_error('starttag does not match DOCTYPE')
600 if self
.__seen
_starttag
and not self
.stack
:
601 self
.syntax_error('multiple elements on top level')
602 k
, j
= tag
.span('attrs')
603 attrdict
, nsdict
, k
= self
.parse_attributes(tagname
, k
, j
)
604 self
.stack
.append((tagname
, nsdict
, nstag
))
605 if self
.__use
_namespaces
:
606 res
= qname
.match(tagname
)
610 prefix
, nstag
= res
.group('prefix', 'local')
614 for t
, d
, nst
in self
.stack
:
615 if d
.has_key(prefix
):
617 if ns
is None and prefix
!= '':
618 ns
= self
.__namespaces
.get(prefix
)
620 nstag
= ns
+ ' ' + nstag
622 nstag
= prefix
+ ':' + nstag
# undo split
623 self
.stack
[-1] = tagname
, nsdict
, nstag
624 # translate namespace of attributes
625 if self
.__use
_namespaces
:
627 for key
, val
in attrdict
.items():
628 res
= qname
.match(key
)
630 aprefix
, key
= res
.group('prefix', 'local')
632 key
= string
.lower(key
)
636 for t
, d
, nst
in self
.stack
:
637 if d
.has_key(aprefix
):
639 if ans
is None and aprefix
!= '':
640 ans
= self
.__namespaces
.get(aprefix
)
642 key
= ans
+ ' ' + key
644 key
= aprefix
+ ':' + key
649 attributes
= self
.attributes
.get(nstag
)
650 if attributes
is not None:
651 for key
in attrdict
.keys():
652 if not attributes
.has_key(key
):
653 self
.syntax_error("unknown attribute `%s' in tag `%s'" % (key
, tagname
))
654 for key
, val
in attributes
.items():
655 if val
is not None and not attrdict
.has_key(key
):
657 method
= self
.elements
.get(nstag
, (None, None))[0]
658 self
.finish_starttag(nstag
, attrdict
, method
)
659 if tag
.group('slash') == '/':
660 self
.finish_endtag(tagname
)
663 # Internal -- parse endtag
664 def parse_endtag(self
, i
):
665 rawdata
= self
.rawdata
666 end
= endbracketfind
.match(rawdata
, i
+1)
669 res
= tagfind
.match(rawdata
, i
+2)
672 self
.handle_data(rawdata
[i
])
674 if not self
.__accept
_missing
_endtag
_name
:
675 self
.syntax_error('no name specified in end tag')
676 tag
= self
.stack
[-1][0]
681 tag
= string
.lower(tag
)
683 if not self
.stack
or tag
!= self
.stack
[-1][0]:
684 self
.handle_data(rawdata
[i
])
688 if endbracket
.match(rawdata
, k
) is None:
689 self
.syntax_error('garbage in end tag')
690 self
.finish_endtag(tag
)
693 # Internal -- finish processing of start tag
694 def finish_starttag(self
, tagname
, attrdict
, method
):
695 if method
is not None:
696 self
.handle_starttag(tagname
, method
, attrdict
)
698 self
.unknown_starttag(tagname
, attrdict
)
700 # Internal -- finish processing of end tag
701 def finish_endtag(self
, tag
):
703 self
.syntax_error('name-less end tag')
704 found
= len(self
.stack
) - 1
706 self
.unknown_endtag(tag
)
710 for i
in range(len(self
.stack
)):
711 if tag
== self
.stack
[i
][0]:
714 self
.syntax_error('unopened end tag')
716 while len(self
.stack
) > found
:
717 if found
< len(self
.stack
) - 1:
718 self
.syntax_error('missing close tag for %s' % self
.stack
[-1][2])
719 nstag
= self
.stack
[-1][2]
720 method
= self
.elements
.get(nstag
, (None, None))[1]
721 if method
is not None:
722 self
.handle_endtag(nstag
, method
)
724 self
.unknown_endtag(nstag
)
725 if self
.__use
_namespaces
== len(self
.stack
):
726 self
.__use
_namespaces
= 0
729 # Overridable -- handle xml processing instruction
730 def handle_xml(self
, encoding
, standalone
):
733 # Overridable -- handle DOCTYPE
734 def handle_doctype(self
, tag
, pubid
, syslit
, data
):
737 # Overridable -- handle start tag
738 def handle_starttag(self
, tag
, method
, attrs
):
741 # Overridable -- handle end tag
742 def handle_endtag(self
, tag
, method
):
745 # Example -- handle character reference, no need to override
746 def handle_charref(self
, name
):
749 n
= string
.atoi(name
[1:], 16)
751 n
= string
.atoi(name
)
752 except string
.atoi_error
:
753 self
.unknown_charref(name
)
755 if not 0 <= n
<= 255:
756 self
.unknown_charref(name
)
758 self
.handle_data(chr(n
))
760 # Definition of entities -- derived classes may override
761 entitydefs
= {'lt': '<', # must use charref
763 'amp': '&', # must use charref
768 # Example -- handle data, should be overridden
769 def handle_data(self
, data
):
772 # Example -- handle cdata, could be overridden
773 def handle_cdata(self
, data
):
776 # Example -- handle comment, could be overridden
777 def handle_comment(self
, data
):
780 # Example -- handle processing instructions, could be overridden
781 def handle_proc(self
, name
, data
):
784 # Example -- handle relatively harmless syntax errors, could be overridden
def syntax_error(self, message):
    """Report a syntax error by raising RuntimeError.

    The exception text has the form ``Syntax error at line N: message``,
    where N is the current value of ``self.lineno``.  Derived classes
    may override this method to handle errors differently.

    The exception is raised with the call form ``RuntimeError(...)``,
    which means exactly the same as the old ``raise RuntimeError, text``
    statement but is also accepted by newer interpreters.
    """
    raise RuntimeError(
        'Syntax error at line %d: %s' % (self.lineno, message))
788 # To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs):
    """Handler for an unknown start tag; meant to be overridden.

    Receives the tag name and its attributes.  This default
    implementation ignores both and does nothing.
    """
    return None
def unknown_endtag(self, tag):
    """Handler for an unknown end tag; meant to be overridden.

    This default implementation ignores the tag name and does nothing.
    """
    return None
def unknown_charref(self, ref):
    """Handler for an unknown character reference; meant to be overridden.

    This default implementation ignores the reference and does nothing.
    """
    return None
def unknown_entityref(self, name):
    """Handler for a reference to an unknown entity; may be overridden.

    This default implementation reports the reference through
    ``self.syntax_error()``, quoting the entity name in the message.
    """
    complaint = "reference to unknown entity `&%s;'" % name
    self.syntax_error(complaint)
796 class TestXMLParser(XMLParser
):
798 def __init__(self
, **kw
):
800 apply(XMLParser
.__init
__, (self
,), kw
)
802 def handle_xml(self
, encoding
, standalone
):
804 print 'xml: encoding =',encoding
,'standalone =',standalone
806 def handle_doctype(self
, tag
, pubid
, syslit
, data
):
808 print 'DOCTYPE:',tag
, `data`
810 def handle_data(self
, data
):
811 self
.testdata
= self
.testdata
+ data
812 if len(`self
.testdata`
) >= 70:
819 print 'data:', `data`
821 def handle_cdata(self
, data
):
823 print 'cdata:', `data`
825 def handle_proc(self
, name
, data
):
827 print 'processing:',name
,`data`
829 def handle_comment(self
, data
):
833 r
= r
[:32] + '...' + r
[-32:]
836 def syntax_error(self
, message
):
837 print 'error at line %d:' % self
.lineno
, message
839 def unknown_starttag(self
, tag
, attrs
):
842 print 'start tag: <' + tag
+ '>'
844 print 'start tag: <' + tag
,
845 for name
, value
in attrs
.items():
846 print name
+ '=' + '"' + value
+ '"',
849 def unknown_endtag(self
, tag
):
851 print 'end tag: </' + tag
+ '>'
853 def unknown_entityref(self
, ref
):
855 print '*** unknown entity ref: &' + ref
+ ';'
857 def unknown_charref(self
, ref
):
859 print '*** unknown char ref: &#' + ref
+ ';'
862 XMLParser
.close(self
)
865 def test(args
= None):
867 from time
import time
872 opts
, args
= getopt
.getopt(args
, 'st')
873 klass
= TestXMLParser
896 if f
is not sys
.stdin
:
909 except RuntimeError, msg
:
913 print 'total time: %g' % (t1
-t0
)
917 print 'total time: %g' % (t1
-t0
)
920 if __name__
== '__main__':