1 """Facility to use the Expat parser to load a minidom instance
4 This avoids all the overhead of SAX and pulldom to gain performance.
9 # This module is tightly bound to the implementation details of the
10 # minidom DOM and can't be used with other DOM implementations. This
11 # is due, in part, to a lack of appropriate methods in the DOM (there is
12 # no way to create Entity and Notation nodes via the DOM Level 2
13 # interface), and for performance. The latter is the cause of some fairly
18 # - .character_data_handler() has an extra case in which continuing
19 # data is appended to an existing Text node; this can be a
20 # speedup since pyexpat can break up character data into multiple
21 # callbacks even though we set the buffer_text attribute on the
22 # parser. This also gives us the advantage that we don't need a
23 # separate normalization pass.
25 # - Determining that a node exists is done using an identity comparison
26 # with None rather than a truth test; this avoids searching for and
27 # calling any methods on the node object if it exists. (A rather
28 # nice speedup is achieved this way as well!)
30 from xml
.dom
import xmlbuilder
, minidom
, Node
31 from xml
.dom
import EMPTY_NAMESPACE
, EMPTY_PREFIX
, XMLNS_NAMESPACE
32 from xml
.parsers
import expat
33 from xml
.dom
.minidom
import _append_child
, _set_attribute_node
34 from xml
.dom
.NodeFilter
import NodeFilter
36 from xml
.dom
.minicompat
import *
# Node-type codes used by the handlers below, bound to module-level names.
TEXT_NODE = Node.TEXT_NODE
CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
DOCUMENT_NODE = Node.DOCUMENT_NODE

# Return codes a DOMBuilderFilter may hand back to the builder.
FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT

# The minidom DOMImplementation instance used to create new documents.
theDOMImplementation = minidom.getDOMImplementation()
49 # Expat typename -> TypeInfo
51 "CDATA": minidom
.TypeInfo(None, "cdata"),
52 "ENUM": minidom
.TypeInfo(None, "enumeration"),
53 "ENTITY": minidom
.TypeInfo(None, "entity"),
54 "ENTITIES": minidom
.TypeInfo(None, "entities"),
55 "ID": minidom
.TypeInfo(None, "id"),
56 "IDREF": minidom
.TypeInfo(None, "idref"),
57 "IDREFS": minidom
.TypeInfo(None, "idrefs"),
58 "NMTOKEN": minidom
.TypeInfo(None, "nmtoken"),
59 "NMTOKENS": minidom
.TypeInfo(None, "nmtokens"),
62 class ElementInfo(NewStyle
):
63 __slots__
= '_attr_info', '_model', 'tagName'
65 def __init__(self
, tagName
, model
=None):
66 self
.tagName
= tagName
def __getstate__(self):
    """Return the pickle state as ``(_attr_info, _model, tagName)``."""
    return self._attr_info, self._model, self.tagName
def __setstate__(self, state):
    """Restore the instance from a ``(_attr_info, _model, tagName)`` tuple.

    *state* must unpack into exactly three items, in the order produced
    by ``__getstate__``.
    """
    self._attr_info, self._model, self.tagName = state
76 def getAttributeType(self
, aname
):
77 for info
in self
._attr
_info
:
81 return _typeinfo_map
["ENUM"]
83 return _typeinfo_map
[info
[-2]]
84 return minidom
._no
_type
def getAttributeTypeNS(self, namespaceURI, localName):
    """Return ``minidom._no_type``; both arguments are ignored."""
    return minidom._no_type
89 def isElementContent(self
):
92 return type not in (expat
.model
.XML_CTYPE_ANY
,
93 expat
.model
.XML_CTYPE_MIXED
)
99 return self
._model
[0] == expat
.model
.XML_CTYPE_EMPTY
103 def isId(self
, aname
):
104 for info
in self
._attr
_info
:
106 return info
[-2] == "ID"
def isIdNS(self, euri, ename, auri, aname):
    """Return ``self.isId((auri, aname))``.

    The element URI and name (*euri*, *ename*) are accepted but unused;
    the attribute is looked up under the ``(auri, aname)`` tuple.
    """
    # not sure this is meaningful
    return self.isId((auri, aname))
def _intern(builder, s):
    """Return the interned copy of *s* held by *builder*.

    Delegates to ``builder._intern_setdefault(s, s)``, so the first
    string seen for a given value is the one returned afterwards.
    """
    return builder._intern_setdefault(s, s)
116 def _parse_ns_name(builder
, name
):
118 parts
= name
.split(' ')
119 intern = builder
._intern
_setdefault
121 uri
, localname
, prefix
= parts
122 prefix
= intern(prefix
, prefix
)
123 qname
= "%s:%s" % (prefix
, localname
)
124 qname
= intern(qname
, qname
)
125 localname
= intern(localname
, localname
)
127 uri
, localname
= parts
128 prefix
= EMPTY_PREFIX
129 qname
= localname
= intern(localname
, localname
)
130 return intern(uri
, uri
), localname
, prefix
, qname
134 """Document builder that uses Expat to build a ParsedXML.DOM document
137 def __init__(self
, options
=None):
139 options
= xmlbuilder
.Options()
140 self
._options
= options
141 if self
._options
.filter is not None:
142 self
._filter
= FilterVisibilityController(self
._options
.filter)
145 # This *really* doesn't do anything in this case, so
146 # override it with something fast & minimal.
147 self
._finish
_start
_element
= id
def createParser(self):
    """Create a new parser object."""
    return expat.ParserCreate()
156 """Return the parser object, creating a new one if needed."""
158 self
._parser
= self
.createParser()
159 self
._intern
_setdefault
= self
._parser
.intern.setdefault
160 self
._parser
.buffer_text
= True
161 self
._parser
.ordered_attributes
= True
162 self
._parser
.specified_attributes
= True
163 self
.install(self
._parser
)
167 """Free all data structures used during DOM construction."""
168 self
.document
= theDOMImplementation
.createDocument(
169 EMPTY_NAMESPACE
, None, None)
170 self
.curNode
= self
.document
171 self
._elem
_info
= self
.document
._elem
_info
174 def install(self
, parser
):
175 """Install the callbacks needed to build the DOM into the parser."""
176 # This creates circular references!
177 parser
.StartDoctypeDeclHandler
= self
.start_doctype_decl_handler
178 parser
.StartElementHandler
= self
.first_element_handler
179 parser
.EndElementHandler
= self
.end_element_handler
180 parser
.ProcessingInstructionHandler
= self
.pi_handler
181 if self
._options
.entities
:
182 parser
.EntityDeclHandler
= self
.entity_decl_handler
183 parser
.NotationDeclHandler
= self
.notation_decl_handler
184 if self
._options
.comments
:
185 parser
.CommentHandler
= self
.comment_handler
186 if self
._options
.cdata_sections
:
187 parser
.StartCdataSectionHandler
= self
.start_cdata_section_handler
188 parser
.EndCdataSectionHandler
= self
.end_cdata_section_handler
189 parser
.CharacterDataHandler
= self
.character_data_handler_cdata
191 parser
.CharacterDataHandler
= self
.character_data_handler
192 parser
.ExternalEntityRefHandler
= self
.external_entity_ref_handler
193 parser
.XmlDeclHandler
= self
.xml_decl_handler
194 parser
.ElementDeclHandler
= self
.element_decl_handler
195 parser
.AttlistDeclHandler
= self
.attlist_decl_handler
197 def parseFile(self
, file):
198 """Parse a document from a file object, returning the document
200 parser
= self
.getParser()
204 buffer = file.read(16*1024)
207 parser
.Parse(buffer, 0)
208 if first_buffer
and self
.document
.documentElement
:
209 self
._setup
_subset
(buffer)
211 parser
.Parse("", True)
219 def parseString(self
, string
):
220 """Parse a document from a string, returning the document node."""
221 parser
= self
.getParser()
223 parser
.Parse(string
, True)
224 self
._setup
_subset
(string
)
def _setup_subset(self, buffer):
    """Load the internal subset if there might be one.

    When the document has a doctype, *buffer* is re-parsed with an
    InternalSubsetExtractor and the extracted text is stored on
    ``doctype.internalSubset``.  Without a doctype nothing happens.
    """
    if self.document.doctype:
        extractor = InternalSubsetExtractor()
        extractor.parseString(buffer)
        subset = extractor.getSubset()
        self.document.doctype.internalSubset = subset
240 def start_doctype_decl_handler(self
, doctypeName
, systemId
, publicId
,
241 has_internal_subset
):
242 doctype
= self
.document
.implementation
.createDocumentType(
243 doctypeName
, publicId
, systemId
)
244 doctype
.ownerDocument
= self
.document
245 self
.document
.childNodes
.append(doctype
)
246 self
.document
.doctype
= doctype
247 if self
._filter
and self
._filter
.acceptNode(doctype
) == FILTER_REJECT
:
248 self
.document
.doctype
= None
249 del self
.document
.childNodes
[-1]
251 self
._parser
.EntityDeclHandler
= None
252 self
._parser
.NotationDeclHandler
= None
253 if has_internal_subset
:
254 if doctype
is not None:
255 doctype
.entities
._seq
= []
256 doctype
.notations
._seq
= []
257 self
._parser
.CommentHandler
= None
258 self
._parser
.ProcessingInstructionHandler
= None
259 self
._parser
.EndDoctypeDeclHandler
= self
.end_doctype_decl_handler
def end_doctype_decl_handler(self):
    """Re-arm the comment and PI handlers once the doctype has ended."""
    if self._options.comments:
        self._parser.CommentHandler = self.comment_handler
    self._parser.ProcessingInstructionHandler = self.pi_handler
    if not (self._elem_info or self._filter):
        # No element info and no filter: swap the end-of-element hook for
        # the builtin id(), used here as a one-argument stand-in.
        self._finish_end_element = id
def pi_handler(self, target, data):
    """Append a processing-instruction node to the current node.

    The node is created from *target* and *data*, appended, and then
    removed again if a filter is installed and rejects it.
    """
    node = self.document.createProcessingInstruction(target, data)
    _append_child(self.curNode, node)
    if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
        self.curNode.removeChild(node)
274 def character_data_handler_cdata(self
, data
):
275 childNodes
= self
.curNode
.childNodes
277 if ( self
._cdata
_continue
278 and childNodes
[-1].nodeType
== CDATA_SECTION_NODE
):
279 childNodes
[-1].appendData(data
)
281 node
= self
.document
.createCDATASection(data
)
282 self
._cdata
_continue
= True
283 elif childNodes
and childNodes
[-1].nodeType
== TEXT_NODE
:
284 node
= childNodes
[-1]
285 value
= node
.data
+ data
287 d
['data'] = d
['nodeValue'] = value
290 node
= minidom
.Text()
292 d
['data'] = d
['nodeValue'] = data
293 d
['ownerDocument'] = self
.document
294 _append_child(self
.curNode
, node
)
296 def character_data_handler(self
, data
):
297 childNodes
= self
.curNode
.childNodes
298 if childNodes
and childNodes
[-1].nodeType
== TEXT_NODE
:
299 node
= childNodes
[-1]
301 d
['data'] = d
['nodeValue'] = node
.data
+ data
303 node
= minidom
.Text()
305 d
['data'] = d
['nodeValue'] = node
.data
+ data
306 d
['ownerDocument'] = self
.document
307 _append_child(self
.curNode
, node
)
309 def entity_decl_handler(self
, entityName
, is_parameter_entity
, value
,
310 base
, systemId
, publicId
, notationName
):
311 if is_parameter_entity
:
312 # we don't care about parameter entities for the DOM
314 if not self
._options
.entities
:
316 node
= self
.document
._create
_entity
(entityName
, publicId
,
317 systemId
, notationName
)
318 if value
is not None:
320 # node *should* be readonly, but we'll cheat
321 child
= self
.document
.createTextNode(value
)
322 node
.childNodes
.append(child
)
323 self
.document
.doctype
.entities
._seq
.append(node
)
324 if self
._filter
and self
._filter
.acceptNode(node
) == FILTER_REJECT
:
325 del self
.document
.doctype
.entities
._seq
[-1]
def notation_decl_handler(self, notationName, base, systemId, publicId):
    """Record a notation declaration on the document's doctype.

    The notation node is appended to ``doctype.notations._seq`` and then
    dropped again if a filter is installed and rejects it.  *base* is
    not used.
    """
    node = self.document._create_notation(notationName, publicId, systemId)
    self.document.doctype.notations._seq.append(node)
    # Drop the node on FILTER_REJECT, matching pi_handler, comment_handler
    # and entity_decl_handler; the previous FILTER_ACCEPT test was inverted
    # and discarded exactly the notations the filter wanted to keep.
    if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
        del self.document.doctype.notations._seq[-1]
def comment_handler(self, data):
    """Append a comment node holding *data* to the current node.

    The node is removed again if a filter is installed and rejects it.
    """
    node = self.document.createComment(data)
    _append_child(self.curNode, node)
    if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
        self.curNode.removeChild(node)
339 def start_cdata_section_handler(self
):
341 self
._cdata
_continue
= False
343 def end_cdata_section_handler(self
):
345 self
._cdata
_continue
= False
347 def external_entity_ref_handler(self
, context
, base
, systemId
, publicId
):
def first_element_handler(self, name, attributes):
    """Handle the first start tag, then hand over to start_element_handler.

    The parser's StartElementHandler is rebound to
    ``self.start_element_handler``, which is then called with the same
    *name* and *attributes*.
    """
    if self._filter is None and not self._elem_info:
        # No filter and no element info: swap the end-of-element hook for
        # the builtin id(), used here as a one-argument stand-in.
        self._finish_end_element = id
    self.getParser().StartElementHandler = self.start_element_handler
    self.start_element_handler(name, attributes)
356 def start_element_handler(self
, name
, attributes
):
357 node
= self
.document
.createElement(name
)
358 _append_child(self
.curNode
, node
)
362 for i
in range(0, len(attributes
), 2):
363 a
= minidom
.Attr(attributes
[i
], EMPTY_NAMESPACE
,
365 value
= attributes
[i
+1]
366 d
= a
.childNodes
[0].__dict
__
367 d
['data'] = d
['nodeValue'] = value
369 d
['value'] = d
['nodeValue'] = value
370 d
['ownerDocument'] = self
.document
371 _set_attribute_node(node
, a
)
373 if node
is not self
.document
.documentElement
:
374 self
._finish
_start
_element
(node
)
376 def _finish_start_element(self
, node
):
378 # To be general, we'd have to call isSameNode(), but this
379 # is sufficient for minidom:
380 if node
is self
.document
.documentElement
:
382 filt
= self
._filter
.startContainer(node
)
383 if filt
== FILTER_REJECT
:
384 # ignore this node & all descendents
386 elif filt
== FILTER_SKIP
:
387 # ignore this node, but make it's children become
388 # children of the parent node
392 self
.curNode
= node
.parentNode
393 node
.parentNode
.removeChild(node
)
396 # If this ever changes, Namespaces.end_element_handler() needs to
397 # be changed to match.
def end_element_handler(self, name):
    """Pop back to the parent node and finish the element just closed.

    *name* is not used; the element closed is whatever ``self.curNode``
    currently refers to.
    """
    curNode = self.curNode
    self.curNode = curNode.parentNode
    self._finish_end_element(curNode)
404 def _finish_end_element(self
, curNode
):
405 info
= self
._elem
_info
.get(curNode
.tagName
)
407 self
._handle
_white
_text
_nodes
(curNode
, info
)
409 if curNode
is self
.document
.documentElement
:
411 if self
._filter
.acceptNode(curNode
) == FILTER_REJECT
:
412 self
.curNode
.removeChild(curNode
)
415 def _handle_white_text_nodes(self
, node
, info
):
416 if (self
._options
.whitespace_in_element_content
417 or not info
.isElementContent()):
420 # We have element type information and should remove ignorable
421 # whitespace; identify for text nodes which contain only
424 for child
in node
.childNodes
:
425 if child
.nodeType
== TEXT_NODE
and not child
.data
.strip():
428 # Remove ignorable whitespace from the tree.
430 node
.removeChild(child
)
432 def element_decl_handler(self
, name
, model
):
433 info
= self
._elem
_info
.get(name
)
435 self
._elem
_info
[name
] = ElementInfo(name
, model
)
437 assert info
._model
is None
440 def attlist_decl_handler(self
, elem
, name
, type, default
, required
):
441 info
= self
._elem
_info
.get(elem
)
443 info
= ElementInfo(elem
)
444 self
._elem
_info
[elem
] = info
445 info
._attr
_info
.append(
446 [None, name
, None, None, default
, 0, type, required
])
448 def xml_decl_handler(self
, version
, encoding
, standalone
):
449 self
.document
.version
= version
450 self
.document
.encoding
= encoding
451 # This is still a little ugly, thanks to the pyexpat API. ;-(
454 self
.document
.standalone
= True
456 self
.document
.standalone
= False
459 # Don't include FILTER_INTERRUPT, since that's checked separately
# Values a filter callback may return besides FILTER_INTERRUPT.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
463 class FilterVisibilityController(NewStyle
):
464 """Wrapper around a DOMBuilderFilter which implements the checks
465 to make the whatToShow filter attribute work."""
467 __slots__
= 'filter',
469 def __init__(self
, filter):
472 def startContainer(self
, node
):
473 mask
= self
._nodetype
_mask
[node
.nodeType
]
474 if self
.filter.whatToShow
& mask
:
475 val
= self
.filter.startContainer(node
)
476 if val
== FILTER_INTERRUPT
:
478 if val
not in _ALLOWED_FILTER_RETURNS
:
480 "startContainer() returned illegal value: " + repr(val
)
485 def acceptNode(self
, node
):
486 mask
= self
._nodetype
_mask
[node
.nodeType
]
487 if self
.filter.whatToShow
& mask
:
488 val
= self
.filter.acceptNode(node
)
489 if val
== FILTER_INTERRUPT
:
491 if val
== FILTER_SKIP
:
492 # move all child nodes to the parent, and remove this node
493 parent
= node
.parentNode
494 for child
in node
.childNodes
[:]:
495 parent
.appendChild(child
)
496 # node is handled by the caller
498 if val
not in _ALLOWED_FILTER_RETURNS
:
500 "acceptNode() returned illegal value: " + repr(val
)
506 Node
.ELEMENT_NODE
: NodeFilter
.SHOW_ELEMENT
,
507 Node
.ATTRIBUTE_NODE
: NodeFilter
.SHOW_ATTRIBUTE
,
508 Node
.TEXT_NODE
: NodeFilter
.SHOW_TEXT
,
509 Node
.CDATA_SECTION_NODE
: NodeFilter
.SHOW_CDATA_SECTION
,
510 Node
.ENTITY_REFERENCE_NODE
: NodeFilter
.SHOW_ENTITY_REFERENCE
,
511 Node
.ENTITY_NODE
: NodeFilter
.SHOW_ENTITY
,
512 Node
.PROCESSING_INSTRUCTION_NODE
: NodeFilter
.SHOW_PROCESSING_INSTRUCTION
,
513 Node
.COMMENT_NODE
: NodeFilter
.SHOW_COMMENT
,
514 Node
.DOCUMENT_NODE
: NodeFilter
.SHOW_DOCUMENT
,
515 Node
.DOCUMENT_TYPE_NODE
: NodeFilter
.SHOW_DOCUMENT_TYPE
,
516 Node
.DOCUMENT_FRAGMENT_NODE
: NodeFilter
.SHOW_DOCUMENT_FRAGMENT
,
517 Node
.NOTATION_NODE
: NodeFilter
.SHOW_NOTATION
,
521 class FilterCrutch(NewStyle
):
522 __slots__
= '_builder', '_level', '_old_start', '_old_end'
524 def __init__(self
, builder
):
526 self
._builder
= builder
527 parser
= builder
._parser
528 self
._old
_start
= parser
.StartElementHandler
529 self
._old
_end
= parser
.EndElementHandler
530 parser
.StartElementHandler
= self
.start_element_handler
531 parser
.EndElementHandler
= self
.end_element_handler
533 class Rejecter(FilterCrutch
):
536 def __init__(self
, builder
):
537 FilterCrutch
.__init
__(self
, builder
)
538 parser
= builder
._parser
539 for name
in ("ProcessingInstructionHandler",
541 "CharacterDataHandler",
542 "StartCdataSectionHandler",
543 "EndCdataSectionHandler",
544 "ExternalEntityRefHandler",
546 setattr(parser
, name
, None)
def start_element_handler(self, *args):
    """Count one more nested start tag; the arguments are ignored."""
    self._level = self._level + 1
551 def end_element_handler(self
, *args
):
553 # restore the old handlers
554 parser
= self
._builder
._parser
555 self
._builder
.install(parser
)
556 parser
.StartElementHandler
= self
._old
_start
557 parser
.EndElementHandler
= self
._old
_end
559 self
._level
= self
._level
- 1
561 class Skipper(FilterCrutch
):
def start_element_handler(self, *args):
    """Forward the start tag to the saved handler and track nesting.

    The nesting level is incremented only when the saved handler left
    the builder's current node pointing at a different object.
    """
    node = self._builder.curNode
    self._old_start(*args)
    if self._builder.curNode is not node:
        self._level = self._level + 1
570 def end_element_handler(self
, *args
):
572 # We're popping back out of the node we're skipping, so we
573 # shouldn't need to do anything but reset the handlers.
574 self
._builder
._parser
.StartElementHandler
= self
._old
_start
575 self
._builder
._parser
.EndElementHandler
= self
._old
_end
578 self
._level
= self
._level
- 1
582 # framework document used by the fragment builder.
583 # Takes a string for the doctype, subset string, and namespace attrs string.
# System identifier of the internal entity declared in the fragment
# builder's framework document.
_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    "http://xml.python.org/entities/fragment-builder/internal"
588 _FRAGMENT_BUILDER_TEMPLATE
= (
592 <!ENTITY fragment-builder-internal
597 >&fragment-builder-internal;</wrapper>'''
598 % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID
)
601 class FragmentBuilder(ExpatBuilder
):
602 """Builder which constructs document fragments given XML source
603 text and a context node.
605 The context node is expected to provide information about the
606 namespace declarations which are in scope at the start of the
610 def __init__(self
, context
, options
=None):
611 if context
.nodeType
== DOCUMENT_NODE
:
612 self
.originalDocument
= context
613 self
.context
= context
615 self
.originalDocument
= context
.ownerDocument
616 self
.context
= context
617 ExpatBuilder
.__init
__(self
, options
)
620 ExpatBuilder
.reset(self
)
def parseFile(self, file):
    """Parse a document fragment from a file object.

    The whole of *file* is read at once and handed to ``parseString()``,
    whose result is returned.
    """
    return self.parseString(file.read())
628 def parseString(self
, string
):
629 """Parse a document fragment from a string, returning the
631 self
._source
= string
632 parser
= self
.getParser()
633 doctype
= self
.originalDocument
.doctype
636 subset
= doctype
.internalSubset
or self
._getDeclarations
()
638 ident
= ('PUBLIC "%s" "%s"'
639 % (doctype
.publicId
, doctype
.systemId
))
640 elif doctype
.systemId
:
641 ident
= 'SYSTEM "%s"' % doctype
.systemId
644 nsattrs
= self
._getNSattrs
() # get ns decls from node's ancestors
645 document
= _FRAGMENT_BUILDER_TEMPLATE
% (ident
, subset
, nsattrs
)
647 parser
.Parse(document
, 1)
651 fragment
= self
.fragment
653 ## self._parser = None
656 def _getDeclarations(self
):
657 """Re-create the internal subset from the DocumentType node.
659 This is only needed if we don't already have the
660 internalSubset as a string.
662 doctype
= self
.context
.ownerDocument
.doctype
665 for i
in range(doctype
.notations
.length
):
666 notation
= doctype
.notations
.item(i
)
669 s
= "%s<!NOTATION %s" % (s
, notation
.nodeName
)
670 if notation
.publicId
:
671 s
= '%s PUBLIC "%s"\n "%s">' \
672 % (s
, notation
.publicId
, notation
.systemId
)
674 s
= '%s SYSTEM "%s">' % (s
, notation
.systemId
)
675 for i
in range(doctype
.entities
.length
):
676 entity
= doctype
.entities
.item(i
)
679 s
= "%s<!ENTITY %s" % (s
, entity
.nodeName
)
681 s
= '%s PUBLIC "%s"\n "%s"' \
682 % (s
, entity
.publicId
, entity
.systemId
)
683 elif entity
.systemId
:
684 s
= '%s SYSTEM "%s"' % (s
, entity
.systemId
)
686 s
= '%s "%s"' % (s
, entity
.firstChild
.data
)
687 if entity
.notationName
:
688 s
= "%s NOTATION %s" % (s
, entity
.notationName
)
692 def _getNSattrs(self
):
695 def external_entity_ref_handler(self
, context
, base
, systemId
, publicId
):
696 if systemId
== _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID
:
697 # this entref is the one that we made to put the subtree
698 # in; all of our given input is parsed in here.
699 old_document
= self
.document
700 old_cur_node
= self
.curNode
701 parser
= self
._parser
.ExternalEntityParserCreate(context
)
702 # put the real document back, parse into the fragment to return
703 self
.document
= self
.originalDocument
704 self
.fragment
= self
.document
.createDocumentFragment()
705 self
.curNode
= self
.fragment
707 parser
.Parse(self
._source
, 1)
709 self
.curNode
= old_cur_node
710 self
.document
= old_document
714 return ExpatBuilder
.external_entity_ref_handler(
715 self
, context
, base
, systemId
, publicId
)
719 """Mix-in class for builders; adds support for namespaces."""
def _initNamespaces(self):
    """Start with an empty list of pending namespace declarations."""
    # list of (prefix, uri) ns declarations.  Namespace attrs are
    # constructed from this and added to the element's attrs.
    self._ns_ordered_prefixes = []
726 def createParser(self
):
727 """Create a new namespace-handling parser."""
728 parser
= expat
.ParserCreate(namespace_separator
=" ")
729 parser
.namespace_prefixes
= True
def install(self, parser):
    """Insert the namespace-handlers onto the parser.

    The ExpatBuilder callbacks are installed first; the namespace
    declaration handler is added only when the ``namespace_declarations``
    option is set.
    """
    ExpatBuilder.install(self, parser)
    if self._options.namespace_declarations:
        parser.StartNamespaceDeclHandler = (
            self.start_namespace_decl_handler)
def start_namespace_decl_handler(self, prefix, uri):
    """Push this namespace declaration on our storage."""
    self._ns_ordered_prefixes.append((prefix, uri))
743 def start_element_handler(self
, name
, attributes
):
745 uri
, localname
, prefix
, qname
= _parse_ns_name(self
, name
)
747 uri
= EMPTY_NAMESPACE
750 prefix
= EMPTY_PREFIX
751 node
= minidom
.Element(qname
, uri
, prefix
, localname
)
752 node
.ownerDocument
= self
.document
753 _append_child(self
.curNode
, node
)
756 if self
._ns
_ordered
_prefixes
:
757 for prefix
, uri
in self
._ns
_ordered
_prefixes
:
759 a
= minidom
.Attr(_intern(self
, 'xmlns:' + prefix
),
760 XMLNS_NAMESPACE
, prefix
, "xmlns")
762 a
= minidom
.Attr("xmlns", XMLNS_NAMESPACE
,
763 "xmlns", EMPTY_PREFIX
)
764 d
= a
.childNodes
[0].__dict
__
765 d
['data'] = d
['nodeValue'] = uri
767 d
['value'] = d
['nodeValue'] = uri
768 d
['ownerDocument'] = self
.document
769 _set_attribute_node(node
, a
)
770 del self
._ns
_ordered
_prefixes
[:]
774 _attrsNS
= node
._attrsNS
775 for i
in range(0, len(attributes
), 2):
776 aname
= attributes
[i
]
777 value
= attributes
[i
+1]
779 uri
, localname
, prefix
, qname
= _parse_ns_name(self
, aname
)
780 a
= minidom
.Attr(qname
, uri
, localname
, prefix
)
782 _attrsNS
[(uri
, localname
)] = a
784 a
= minidom
.Attr(aname
, EMPTY_NAMESPACE
,
787 _attrsNS
[(EMPTY_NAMESPACE
, aname
)] = a
788 d
= a
.childNodes
[0].__dict
__
789 d
['data'] = d
['nodeValue'] = value
791 d
['ownerDocument'] = self
.document
792 d
['value'] = d
['nodeValue'] = value
793 d
['ownerElement'] = node
796 # This only adds some asserts to the original
797 # end_element_handler(), so we only define this when -O is not
798 # used. If changing one, be sure to check the other to see if
799 # it needs to be changed as well.
801 def end_element_handler(self
, name
):
802 curNode
= self
.curNode
804 uri
, localname
, prefix
, qname
= _parse_ns_name(self
, name
)
805 assert (curNode
.namespaceURI
== uri
806 and curNode
.localName
== localname
807 and curNode
.prefix
== prefix
), \
808 "element stack messed up! (namespace)"
810 assert curNode
.nodeName
== name
, \
811 "element stack messed up - bad nodeName"
812 assert curNode
.namespaceURI
== EMPTY_NAMESPACE
, \
813 "element stack messed up - bad namespaceURI"
814 self
.curNode
= curNode
.parentNode
815 self
._finish
_end
_element
(curNode
)
818 class ExpatBuilderNS(Namespaces
, ExpatBuilder
):
819 """Document builder that supports namespaces."""
822 ExpatBuilder
.reset(self
)
823 self
._initNamespaces
()
826 class FragmentBuilderNS(Namespaces
, FragmentBuilder
):
827 """Fragment builder that supports namespaces."""
830 FragmentBuilder
.reset(self
)
831 self
._initNamespaces
()
833 def _getNSattrs(self
):
834 """Return string of namespace attributes from this element and
836 # XXX This needs to be re-written to walk the ancestors of the
837 # context to build up the namespace information from
838 # declarations, elements, and attributes found in context.
839 # Otherwise we have to store a bunch more data on the DOM
840 # (though that *might* be more reliable -- not clear).
842 context
= self
.context
845 if hasattr(context
, '_ns_prefix_uri'):
846 for prefix
, uri
in context
._ns
_prefix
_uri
.items():
847 # add every new NS decl from context to L and attrs string
852 declname
= "xmlns:" + prefix
856 attrs
= "%s\n %s='%s'" % (attrs
, declname
, uri
)
858 attrs
= " %s='%s'" % (declname
, uri
)
859 context
= context
.parentNode
class ParseEscape(Exception):
    """Exception raised to short-circuit parsing in InternalSubsetExtractor."""
867 class InternalSubsetExtractor(ExpatBuilder
):
868 """XML processor which can rip out the internal document type subset."""
873 """Return the internal subset as a string."""
876 def parseFile(self
, file):
878 ExpatBuilder
.parseFile(self
, file)
882 def parseString(self
, string
):
884 ExpatBuilder
.parseString(self
, string
)
def install(self, parser):
    """Install only the doctype-start and element-start callbacks."""
    parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
    parser.StartElementHandler = self.start_element_handler
892 def start_doctype_decl_handler(self
, name
, publicId
, systemId
,
893 has_internal_subset
):
894 if has_internal_subset
:
895 parser
= self
.getParser()
897 parser
.DefaultHandler
= self
.subset
.append
898 parser
.EndDoctypeDeclHandler
= self
.end_doctype_decl_handler
902 def end_doctype_decl_handler(self
):
903 s
= ''.join(self
.subset
).replace('\r\n', '\n').replace('\r', '\n')
907 def start_element_handler(self
, name
, attrs
):
911 def parse(file, namespaces
=1):
912 """Parse a document, returning the resulting Document node.
914 'file' may be either a file name or an open file object.
917 builder
= ExpatBuilderNS()
919 builder
= ExpatBuilder()
921 if isinstance(file, StringTypes
):
922 fp
= open(file, 'rb')
924 result
= builder
.parseFile(fp
)
928 result
= builder
.parseFile(file)
932 def parseString(string
, namespaces
=1):
933 """Parse a document from a string, returning the resulting
937 builder
= ExpatBuilderNS()
939 builder
= ExpatBuilder()
940 return builder
.parseString(string
)
943 def parseFragment(file, context
, namespaces
=1):
944 """Parse a fragment of a document, given the context from which it
945 was originally extracted. context should be the parent of the
946 node(s) which are in the fragment.
948 'file' may be either a file name or an open file object.
951 builder
= FragmentBuilderNS(context
)
953 builder
= FragmentBuilder(context
)
955 if isinstance(file, StringTypes
):
956 fp
= open(file, 'rb')
958 result
= builder
.parseFile(fp
)
962 result
= builder
.parseFile(file)
966 def parseFragmentString(string
, context
, namespaces
=1):
967 """Parse a fragment of a document from a string, given the context
968 from which it was originally extracted. context should be the
969 parent of the node(s) which are in the fragment.
972 builder
= FragmentBuilderNS(context
)
974 builder
= FragmentBuilder(context
)
975 return builder
.parseString(string
)
def makeBuilder(options):
    """Create a builder based on an Options object.

    Returns an ExpatBuilderNS when ``options.namespaces`` is true and a
    plain ExpatBuilder otherwise; *options* is passed to the constructor.
    """
    if options.namespaces:
        return ExpatBuilderNS(options)
    return ExpatBuilder(options)