3 * XML syntax and type checker.
5 * Since 1.24.2, it uses XMLReader instead of xml_parse, which gives us
6 * more control over the expansion of XML entities. When passed to the
7 * callback, entities will be fully expanded, but the XML may be reported as
8 * invalid if expanding the entities is likely to cause a DoS.
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License along
21 * with this program; if not, write to the Free Software Foundation, Inc.,
22 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 * http://www.gnu.org/copyleft/gpl.html
30 * Will be set to true or false to indicate whether the file is
31 * well-formed XML. Note that this doesn't check schema validity.
33 public $wellFormed = null;
36 * Will be set to true if the optional element filter returned
37 * a match at some point.
39 public $filterMatch = false;
42 * Will contain the type of filter hit if the optional element filter returned
43 * a match at some point.
46 public $filterMatchType = false;
49 * Name of the document's root element, including any namespace
52 public $rootElement = '';
55 * A stack of strings containing the data of each xml element as it's processed. Append
56 * data to the top string of the stack, then pop off the string and process it when the
59 protected $elementData = array();
62 * A stack of element names and attributes, as we process them.
64 protected $elementDataContext = array();
67 * Current depth of the data stack.
69 protected $stackDepth = 0;
72 * Additional parsing options
74 private $parserOptions = array(
75 'processing_instruction_handler' => '',
/**
 * Build a checker and immediately validate the given input.
 *
 * @param string $input A filename or a string containing the XML element
 * @param callable $filterCallback (optional)
 *        Function to call to do additional custom validity checks from the
 *        element handler event. This gives you access to the element
 *        namespace, name, attributes, and text contents.
 *        Filter should return 'true' to toggle on $this->filterMatch
 * @param bool $isFile (optional) Indicates if the first parameter is a
 *        filename (default, true) or if it is a string (false)
 * @param array $options List of additional parsing options, merged over
 *        the defaults held in $this->parserOptions:
 *        processing_instruction_handler: Callback consulted for each
 *        processing instruction found in the document
 */
public function __construct( $input, $filterCallback = null, $isFile = true, $options = array() ) {
	$this->filterCallback = $filterCallback;
	// On key collisions the caller-supplied options override the defaults.
	$this->parserOptions = array_merge( $this->parserOptions, $options );
	// Validation runs eagerly; results are exposed through the public properties.
	$this->validateFromInput( $input, $isFile );
}
/**
 * Alternative constructor: from filename
 *
 * @param string $fname The filename of an XML document
 * @param callable $filterCallback (optional)
 *        Function to call to do additional custom validity checks from the
 *        element handler event. It is forwarded unchanged to the
 *        constructor, so it receives whatever the constructor's filter
 *        callback receives.
 *        Filter should return 'true' to toggle on $this->filterMatch
 * @return XmlTypeCheck
 */
public static function newFromFilename( $fname, $filterCallback = null ) {
	// Flag the first argument as a path rather than raw XML.
	$isFile = true;
	return new self( $fname, $filterCallback, $isFile );
}
/**
 * Alternative constructor: from string
 *
 * @param string $string A string containing an XML element
 * @param callable $filterCallback (optional)
 *        Function to call to do additional custom validity checks from the
 *        element handler event. It is forwarded unchanged to the
 *        constructor, so it receives whatever the constructor's filter
 *        callback receives.
 *        Filter should return 'true' to toggle on $this->filterMatch
 * @return XmlTypeCheck
 */
public static function newFromString( $string, $filterCallback = null ) {
	// Flag the first argument as raw XML rather than a path.
	$isFile = false;
	return new self( $string, $filterCallback, $isFile );
}
/**
 * Get the root element. Simple accessor to $rootElement
 *
 * @return string Name of the root element recorded during validation,
 *  or '' if no element was recorded
 */
public function getRootElement() {
	$root = $this->rootElement;
	return $root;
}
137 * @param string $xml a filename, or a string of XML, depending on $isFile
139 private function validateFromInput( $xml, $isFile ) {
140 $reader = new XMLReader();
142 $s = $reader->open( $xml, null, LIBXML_NOERROR | LIBXML_NOWARNING
);
144 $s = $reader->XML( $xml, null, LIBXML_NOERROR | LIBXML_NOWARNING
);
147 // Couldn't open the XML
148 $this->wellFormed
= false;
150 $oldDisable = libxml_disable_entity_loader( true );
151 $reader->setParserProperty( XMLReader
::SUBST_ENTITIES
, true );
153 $this->validate( $reader );
154 } catch ( Exception
$e ) {
155 // Calling this malformed, because we didn't parse the whole
156 // thing. Maybe just an external entity reference.
157 $this->wellFormed
= false;
159 libxml_disable_entity_loader( $oldDisable );
163 libxml_disable_entity_loader( $oldDisable );
/**
 * Advance the reader to the next node.
 *
 * While the read is in progress, PHP errors are routed to
 * XmlErrorHandler(), which flags the document as malformed; the previous
 * error handler is restored afterwards.
 *
 * @param XMLReader $reader
 * @return bool Result of XMLReader::read()
 */
private function readNext( XMLReader $reader ) {
	set_error_handler( array( $this, 'XmlErrorHandler' ) );
	$ret = $reader->read();
	restore_error_handler();
	// validate() loops and branches on this value, so it must be handed back.
	return $ret;
}
/**
 * Error handler installed by readNext() around XMLReader::read().
 * Any error reported while it is active marks the document as malformed.
 *
 * @param int $errno Error level (unused)
 * @param string $errstr Error message (unused)
 */
public function XmlErrorHandler( $errno, $errstr ) {
	$this->wellFormed = false;
}
178 private function validate( $reader ) {
180 // First, move through anything that isn't an element, and
181 // handle any processing instructions with the callback
183 if ( !$this->readNext( $reader ) ) {
184 // Hit the end of the document before any elements
185 $this->wellFormed
= false;
188 if ( $reader->nodeType
=== XMLReader
::PI
) {
189 $this->processingInstructionHandler( $reader->name
, $reader->value
);
191 } while ( $reader->nodeType
!= XMLReader
::ELEMENT
);
193 // Process the rest of the document
195 switch ( $reader->nodeType
) {
196 case XMLReader
::ELEMENT
:
197 $name = $this->expandNS(
199 $reader->namespaceURI
201 if ( $this->rootElement
=== '' ) {
202 $this->rootElement
= $name;
204 $empty = $reader->isEmptyElement
;
205 $attrs = $this->getAttributesArray( $reader );
206 $this->elementOpen( $name, $attrs );
208 $this->elementClose();
212 case XMLReader
::END_ELEMENT
:
213 $this->elementClose();
216 case XMLReader
::WHITESPACE
:
217 case XMLReader
::SIGNIFICANT_WHITESPACE
:
218 case XMLReader
::CDATA
:
219 case XMLReader
::TEXT
:
220 $this->elementData( $reader->value
);
223 case XMLReader
::ENTITY_REF
:
224 // Unexpanded entity (maybe external?),
225 // don't send to the filter (xml_parse didn't)
228 case XMLReader
::COMMENT
:
229 // Don't send to the filter (xml_parse didn't)
233 // Processing instructions can happen after the header too
234 $this->processingInstructionHandler(
240 // One of DOC, DOC_TYPE, ENTITY, END_ENTITY,
241 // NOTATION, or XML_DECLARATION
242 // xml_parse didn't send these to the filter, so we won't.
245 } while ( $this->readNext( $reader ) );
247 if ( $this->stackDepth
!== 0 ) {
248 $this->wellFormed
= false;
249 } elseif ( $this->wellFormed
=== null ) {
250 $this->wellFormed
= true;
256 * Get all of the attributes for an XMLReader's current node
257 * @param XMLReader $r
258 * @return array of attributes
260 private function getAttributesArray( XMLReader
$r ) {
262 while ( $r->moveToNextAttribute() ) {
263 if ( $r->namespaceURI
=== 'http://www.w3.org/2000/xmlns/' ) {
264 // XMLReader treats xmlns attributes as normal
265 // attributes, while xml_parse doesn't
268 $name = $this->expandNS( $r->name
, $r->namespaceURI
);
269 $attrs[$name] = $r->value
;
/**
 * Qualify an element or attribute name with its namespace URI.
 *
 * @param string $name Element or attribute name, maybe with a full or short prefix
 * @param string $namespaceURI The namespace URI; empty when the name has none
 * @return string "<namespaceURI>:<local name>", or $name unchanged when
 *  no namespace URI is given
 */
private function expandNS( $name, $namespaceURI ) {
	if ( !$namespaceURI ) {
		// Nothing to qualify with: hand the name back as-is rather than
		// falling through without a return value.
		return $name;
	}
	// Only the segment after the last ':' is kept as the local name.
	$parts = explode( ':', $name );
	$localname = array_pop( $parts );
	return "$namespaceURI:$localname";
}
292 private function elementOpen( $name, $attribs ) {
293 $this->elementDataContext
[] = array( $name, $attribs );
294 $this->elementData
[] = '';
300 private function elementClose() {
301 list( $name, $attribs ) = array_pop( $this->elementDataContext
);
302 $data = array_pop( $this->elementData
);
304 $callbackReturn = false;
306 if ( is_callable( $this->filterCallback
) ) {
307 $callbackReturn = call_user_func(
308 $this->filterCallback
,
314 if ( $callbackReturn ) {
316 $this->filterMatch
= true;
317 $this->filterMatchType
= $callbackReturn;
/**
 * Buffer character data for the element currently being processed.
 *
 * The trimmed text is appended to the entry at index stackDepth - 1 of
 * the $elementData stack.
 *
 * @param string $data Text of the current node
 */
private function elementData( $data ) {
	// Only collect here; the callback is run from elementClose.
	$top = $this->stackDepth - 1;
	$this->elementData[$top] .= trim( $data );
}
333 private function processingInstructionHandler( $target, $data ) {
334 $callbackReturn = false;
335 if ( $this->parserOptions
['processing_instruction_handler'] ) {
336 $callbackReturn = call_user_func(
337 $this->parserOptions
['processing_instruction_handler'],
342 if ( $callbackReturn ) {
344 $this->filterMatch
= true;
345 $this->filterMatchType
= $callbackReturn;